1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
4 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "reload.h"
51 #include "cgraph.h"
52 #include "gimple.h"
53 #include "dwarf2.h"
54 #include "df.h"
55 #include "tm-constrs.h"
56 #include "params.h"
57 #include "cselib.h"
58 #include "debug.h"
59 #include "sched-int.h"
60 #include "sbitmap.h"
61 #include "fibheap.h"
62 #include "opts.h"
63 #include "diagnostic.h"
64
65 enum upper_128bits_state
66 {
67 unknown = 0,
68 unused,
69 used
70 };
71
72 typedef struct block_info_def
73 {
74 /* State of the upper 128bits of AVX registers at exit. */
75 enum upper_128bits_state state;
76 /* TRUE if state of the upper 128bits of AVX registers is unchanged
77 in this block. */
78 bool unchanged;
79 /* TRUE if block has been processed. */
80 bool processed;
81 /* TRUE if block has been scanned. */
82 bool scanned;
83 /* Previous state of the upper 128bits of AVX registers at entry. */
84 enum upper_128bits_state prev;
85 } *block_info;
86
87 #define BLOCK_INFO(B) ((block_info) (B)->aux)
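/* The aux field is allocated by alloc_aux_for_blocks in
   move_or_delete_vzeroupper before BLOCK_INFO is used, and released by
   free_aux_for_blocks afterwards.  */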
88
89 enum call_avx256_state
90 {
91 /* Callee returns 256bit AVX register. */
92 callee_return_avx256 = -1,
93 /* Callee returns and passes 256bit AVX register. */
94 callee_return_pass_avx256,
95 /* Callee passes 256bit AVX register. */
96 callee_pass_avx256,
97 /* Callee neither returns nor passes a 256bit AVX register, or no
98 256bit AVX register is involved in the function return. */
99 call_no_avx256,
100 /* vzeroupper intrinsic. */
101 vzeroupper_intrinsic
102 };
103
104 /* Check if a 256bit AVX register is referenced in stores. */
105
106 static void
107 check_avx256_stores (rtx dest, const_rtx set, void *data)
108 {
109 if ((REG_P (dest)
110 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
111 || (GET_CODE (set) == SET
112 && REG_P (SET_SRC (set))
113 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
114 {
115 enum upper_128bits_state *state
116 = (enum upper_128bits_state *) data;
117 *state = used;
118 }
119 }
120
121 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
122 in basic block BB. Delete it if upper 128bit AVX registers are
123 unused. If it isn't deleted, move it to just before a jump insn.
124
125 STATE is state of the upper 128bits of AVX registers at entry. */
126
127 static void
128 move_or_delete_vzeroupper_2 (basic_block bb,
129 enum upper_128bits_state state)
130 {
131 rtx insn, bb_end;
132 rtx vzeroupper_insn = NULL_RTX;
133 rtx pat;
134 int avx256;
135 bool unchanged;
136
137 if (BLOCK_INFO (bb)->unchanged)
138 {
139 if (dump_file)
140 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
141 bb->index, state);
142
143 BLOCK_INFO (bb)->state = state;
144 return;
145 }
146
147 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
148 {
149 if (dump_file)
150 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
151 bb->index, BLOCK_INFO (bb)->state);
152 return;
153 }
154
155 BLOCK_INFO (bb)->prev = state;
156
157 if (dump_file)
158 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
159 bb->index, state);
160
161 unchanged = true;
162
163 /* BB_END changes when it is deleted. */
164 bb_end = BB_END (bb);
165 insn = BB_HEAD (bb);
166 while (insn != bb_end)
167 {
168 insn = NEXT_INSN (insn);
169
170 if (!NONDEBUG_INSN_P (insn))
171 continue;
172
173 /* Move vzeroupper before jump/call. */
174 if (JUMP_P (insn) || CALL_P (insn))
175 {
176 if (!vzeroupper_insn)
177 continue;
178
179 if (PREV_INSN (insn) != vzeroupper_insn)
180 {
181 if (dump_file)
182 {
183 fprintf (dump_file, "Move vzeroupper after:\n");
184 print_rtl_single (dump_file, PREV_INSN (insn));
185 fprintf (dump_file, "before:\n");
186 print_rtl_single (dump_file, insn);
187 }
188 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
189 PREV_INSN (insn));
190 }
191 vzeroupper_insn = NULL_RTX;
192 continue;
193 }
194
195 pat = PATTERN (insn);
196
197 /* Check insn for vzeroupper intrinsic. */
198 if (GET_CODE (pat) == UNSPEC_VOLATILE
199 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
200 {
201 if (dump_file)
202 {
203 /* Found vzeroupper intrinsic. */
204 fprintf (dump_file, "Found vzeroupper:\n");
205 print_rtl_single (dump_file, insn);
206 }
207 }
208 else
209 {
210 /* Check insn for vzeroall intrinsic. */
211 if (GET_CODE (pat) == PARALLEL
212 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
213 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
214 {
215 state = unused;
216 unchanged = false;
217
218 /* Delete pending vzeroupper insertion. */
219 if (vzeroupper_insn)
220 {
221 delete_insn (vzeroupper_insn);
222 vzeroupper_insn = NULL_RTX;
223 }
224 }
225 else if (state != used)
226 {
227 note_stores (pat, check_avx256_stores, &state);
228 if (state == used)
229 unchanged = false;
230 }
231 continue;
232 }
233
234 /* Process vzeroupper intrinsic. */
235 avx256 = INTVAL (XVECEXP (pat, 0, 0));
236
237 if (state == unused)
238 {
239 /* Since the upper 128bits are cleared, callee must not pass
240 256bit AVX register. We only need to check if callee
241 returns 256bit AVX register. */
242 if (avx256 == callee_return_avx256)
243 {
244 state = used;
245 unchanged = false;
246 }
247
248 /* Remove unnecessary vzeroupper since upper 128bits are
249 cleared. */
250 if (dump_file)
251 {
252 fprintf (dump_file, "Delete redundant vzeroupper:\n");
253 print_rtl_single (dump_file, insn);
254 }
255 delete_insn (insn);
256 }
257 else
258 {
259 /* Set state to UNUSED if callee doesn't return 256bit AVX
260 register. */
261 if (avx256 != callee_return_pass_avx256)
262 state = unused;
263
264 if (avx256 == callee_return_pass_avx256
265 || avx256 == callee_pass_avx256)
266 {
267 /* Must remove vzeroupper since callee passes in 256bit
268 AVX register. */
269 if (dump_file)
270 {
271 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
272 print_rtl_single (dump_file, insn);
273 }
274 delete_insn (insn);
275 }
276 else
277 {
278 vzeroupper_insn = insn;
279 unchanged = false;
280 }
281 }
282 }
283
284 BLOCK_INFO (bb)->state = state;
285 BLOCK_INFO (bb)->unchanged = unchanged;
286 BLOCK_INFO (bb)->scanned = true;
287
288 if (dump_file)
289 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
290 bb->index, unchanged ? "unchanged" : "changed",
291 state);
292 }
293
294 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
295 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
296 as UNUSED if UNKNOWN_IS_UNUSED is true. Return TRUE if the exit
297 state is changed. */
298
299 static bool
300 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
301 {
302 edge e;
303 edge_iterator ei;
304 enum upper_128bits_state state, old_state, new_state;
305 bool seen_unknown;
306
307 if (dump_file)
308 fprintf (dump_file, " Process [bb %i]: status: %d\n",
309 block->index, BLOCK_INFO (block)->processed);
310
311 if (BLOCK_INFO (block)->processed)
312 return false;
313
314 state = unused;
315
316 /* Check all predecessor edges of this block. */
317 seen_unknown = false;
318 FOR_EACH_EDGE (e, ei, block->preds)
319 {
320 if (e->src == block)
321 continue;
322 switch (BLOCK_INFO (e->src)->state)
323 {
324 case unknown:
325 if (!unknown_is_unused)
326 seen_unknown = true;
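	      /* Fall through: an unknown predecessor needs no further
		 handling here, just like an unused one.  */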
327 case unused:
328 break;
329 case used:
330 state = used;
331 goto done;
332 }
333 }
334
335 if (seen_unknown)
336 state = unknown;
337
338 done:
339 old_state = BLOCK_INFO (block)->state;
340 move_or_delete_vzeroupper_2 (block, state);
341 new_state = BLOCK_INFO (block)->state;
342
343 if (state != unknown || new_state == used)
344 BLOCK_INFO (block)->processed = true;
345
346 /* Need to rescan if the upper 128bits of AVX registers are changed
347 to USED at exit. */
348 if (new_state != old_state)
349 {
350 if (new_state == used)
351 cfun->machine->rescan_vzeroupper_p = 1;
352 return true;
353 }
354 else
355 return false;
356 }
357
358 /* Go through the instruction stream looking for vzeroupper. Delete
359 it if upper 128bit AVX registers are unused. If it isn't deleted,
360 move it to just before a jump insn. */
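
/* The pass is organized as a forward data-flow problem over the CFG:
   move_or_delete_vzeroupper_2 computes each block's exit state (upper
   128 bits used, unused or unknown), move_or_delete_vzeroupper_1
   propagates states from predecessor blocks, and the worklist below
   iterates until a fixed point is reached; any remaining blocks are
   then processed treating UNKNOWN as UNUSED.  */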
361
362 static void
363 move_or_delete_vzeroupper (void)
364 {
365 edge e;
366 edge_iterator ei;
367 basic_block bb;
368 fibheap_t worklist, pending, fibheap_swap;
369 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
370 int *bb_order;
371 int *rc_order;
372 int i;
373
374 /* Set up block info for each basic block. */
375 alloc_aux_for_blocks (sizeof (struct block_info_def));
376
377 /* Process outgoing edges of entry point. */
378 if (dump_file)
379 fprintf (dump_file, "Process outgoing edges of entry point\n");
380
381 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
382 {
383 move_or_delete_vzeroupper_2 (e->dest,
384 cfun->machine->caller_pass_avx256_p
385 ? used : unused);
386 BLOCK_INFO (e->dest)->processed = true;
387 }
388
389 /* Compute the reverse completion order of a depth-first search of the
390 CFG so that the data-flow propagation converges faster. */
391 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
392 bb_order = XNEWVEC (int, last_basic_block);
393 pre_and_rev_post_order_compute (NULL, rc_order, false);
394 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
395 bb_order[rc_order[i]] = i;
396 free (rc_order);
397
398 worklist = fibheap_new ();
399 pending = fibheap_new ();
400 visited = sbitmap_alloc (last_basic_block);
401 in_worklist = sbitmap_alloc (last_basic_block);
402 in_pending = sbitmap_alloc (last_basic_block);
403 sbitmap_zero (in_worklist);
404
405 /* Don't check outgoing edges of entry point. */
406 sbitmap_ones (in_pending);
407 FOR_EACH_BB (bb)
408 if (BLOCK_INFO (bb)->processed)
409 RESET_BIT (in_pending, bb->index);
410 else
411 {
412 move_or_delete_vzeroupper_1 (bb, false);
413 fibheap_insert (pending, bb_order[bb->index], bb);
414 }
415
416 if (dump_file)
417 fprintf (dump_file, "Check remaining basic blocks\n");
418
419 while (!fibheap_empty (pending))
420 {
421 fibheap_swap = pending;
422 pending = worklist;
423 worklist = fibheap_swap;
424 sbitmap_swap = in_pending;
425 in_pending = in_worklist;
426 in_worklist = sbitmap_swap;
427
428 sbitmap_zero (visited);
429
430 cfun->machine->rescan_vzeroupper_p = 0;
431
432 while (!fibheap_empty (worklist))
433 {
434 bb = (basic_block) fibheap_extract_min (worklist);
435 RESET_BIT (in_worklist, bb->index);
436 gcc_assert (!TEST_BIT (visited, bb->index));
437 if (!TEST_BIT (visited, bb->index))
438 {
439 edge_iterator ei;
440
441 SET_BIT (visited, bb->index);
442
443 if (move_or_delete_vzeroupper_1 (bb, false))
444 FOR_EACH_EDGE (e, ei, bb->succs)
445 {
446 if (e->dest == EXIT_BLOCK_PTR
447 || BLOCK_INFO (e->dest)->processed)
448 continue;
449
450 if (TEST_BIT (visited, e->dest->index))
451 {
452 if (!TEST_BIT (in_pending, e->dest->index))
453 {
454 /* Send E->DEST to next round. */
455 SET_BIT (in_pending, e->dest->index);
456 fibheap_insert (pending,
457 bb_order[e->dest->index],
458 e->dest);
459 }
460 }
461 else if (!TEST_BIT (in_worklist, e->dest->index))
462 {
463 /* Add E->DEST to current round. */
464 SET_BIT (in_worklist, e->dest->index);
465 fibheap_insert (worklist, bb_order[e->dest->index],
466 e->dest);
467 }
468 }
469 }
470 }
471
472 if (!cfun->machine->rescan_vzeroupper_p)
473 break;
474 }
475
476 free (bb_order);
477 fibheap_delete (worklist);
478 fibheap_delete (pending);
479 sbitmap_free (visited);
480 sbitmap_free (in_worklist);
481 sbitmap_free (in_pending);
482
483 if (dump_file)
484 fprintf (dump_file, "Process remaining basic blocks\n");
485
486 FOR_EACH_BB (bb)
487 move_or_delete_vzeroupper_1 (bb, true);
488
489 free_aux_for_blocks ();
490 }
491
492 static rtx legitimize_dllimport_symbol (rtx, bool);
493
494 #ifndef CHECK_STACK_LIMIT
495 #define CHECK_STACK_LIMIT (-1)
496 #endif
497
498 /* Return index of given mode in mult and division cost tables. */
499 #define MODE_INDEX(mode) \
500 ((mode) == QImode ? 0 \
501 : (mode) == HImode ? 1 \
502 : (mode) == SImode ? 2 \
503 : (mode) == DImode ? 3 \
504 : 4)
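
/* For example, MODE_INDEX (SImode) is 2, selecting the SImode entry of
   the per-mode multiply and divide cost arrays in the tables below.  */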
505
506 /* Processor costs (relative to an add) */
507 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
508 #define COSTS_N_BYTES(N) ((N) * 2)
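
/* Under these assumptions COSTS_N_BYTES (3) == 6, i.e. three times the
   2-byte size of an add, which keeps the size costs below on the same
   relative scale as the COSTS_N_INSNS-based speed costs.  */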
509
510 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
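
/* DUMMY_STRINGOP_ALGS is used below as a placeholder for the second
   entry of the per-CPU string operation (memcpy/memset) algorithm
   tables, apparently the variant used for 64-bit code, on processors
   where that entry is not separately tuned.  */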
511
512 const
513 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
514 COSTS_N_BYTES (2), /* cost of an add instruction */
515 COSTS_N_BYTES (3), /* cost of a lea instruction */
516 COSTS_N_BYTES (2), /* variable shift costs */
517 COSTS_N_BYTES (3), /* constant shift costs */
518 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
519 COSTS_N_BYTES (3), /* HI */
520 COSTS_N_BYTES (3), /* SI */
521 COSTS_N_BYTES (3), /* DI */
522 COSTS_N_BYTES (5)}, /* other */
523 0, /* cost of multiply per each bit set */
524 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
525 COSTS_N_BYTES (3), /* HI */
526 COSTS_N_BYTES (3), /* SI */
527 COSTS_N_BYTES (3), /* DI */
528 COSTS_N_BYTES (5)}, /* other */
529 COSTS_N_BYTES (3), /* cost of movsx */
530 COSTS_N_BYTES (3), /* cost of movzx */
531 0, /* "large" insn */
532 2, /* MOVE_RATIO */
533 2, /* cost for loading QImode using movzbl */
534 {2, 2, 2}, /* cost of loading integer registers
535 in QImode, HImode and SImode.
536 Relative to reg-reg move (2). */
537 {2, 2, 2}, /* cost of storing integer registers */
538 2, /* cost of reg,reg fld/fst */
539 {2, 2, 2}, /* cost of loading fp registers
540 in SFmode, DFmode and XFmode */
541 {2, 2, 2}, /* cost of storing fp registers
542 in SFmode, DFmode and XFmode */
543 3, /* cost of moving MMX register */
544 {3, 3}, /* cost of loading MMX registers
545 in SImode and DImode */
546 {3, 3}, /* cost of storing MMX registers
547 in SImode and DImode */
548 3, /* cost of moving SSE register */
549 {3, 3, 3}, /* cost of loading SSE registers
550 in SImode, DImode and TImode */
551 {3, 3, 3}, /* cost of storing SSE registers
552 in SImode, DImode and TImode */
553 3, /* MMX or SSE register to integer */
554 0, /* size of l1 cache */
555 0, /* size of l2 cache */
556 0, /* size of prefetch block */
557 0, /* number of parallel prefetches */
558 2, /* Branch cost */
559 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
560 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
561 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
562 COSTS_N_BYTES (2), /* cost of FABS instruction. */
563 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
564 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
565 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
566 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
567 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
568 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
569 1, /* scalar_stmt_cost. */
570 1, /* scalar load_cost. */
571 1, /* scalar_store_cost. */
572 1, /* vec_stmt_cost. */
573 1, /* vec_to_scalar_cost. */
574 1, /* scalar_to_vec_cost. */
575 1, /* vec_align_load_cost. */
576 1, /* vec_unalign_load_cost. */
577 1, /* vec_store_cost. */
578 1, /* cond_taken_branch_cost. */
579 1, /* cond_not_taken_branch_cost. */
580 };
581
582 /* Processor costs (relative to an add) */
583 static const
584 struct processor_costs i386_cost = { /* 386 specific costs */
585 COSTS_N_INSNS (1), /* cost of an add instruction */
586 COSTS_N_INSNS (1), /* cost of a lea instruction */
587 COSTS_N_INSNS (3), /* variable shift costs */
588 COSTS_N_INSNS (2), /* constant shift costs */
589 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
590 COSTS_N_INSNS (6), /* HI */
591 COSTS_N_INSNS (6), /* SI */
592 COSTS_N_INSNS (6), /* DI */
593 COSTS_N_INSNS (6)}, /* other */
594 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
595 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
596 COSTS_N_INSNS (23), /* HI */
597 COSTS_N_INSNS (23), /* SI */
598 COSTS_N_INSNS (23), /* DI */
599 COSTS_N_INSNS (23)}, /* other */
600 COSTS_N_INSNS (3), /* cost of movsx */
601 COSTS_N_INSNS (2), /* cost of movzx */
602 15, /* "large" insn */
603 3, /* MOVE_RATIO */
604 4, /* cost for loading QImode using movzbl */
605 {2, 4, 2}, /* cost of loading integer registers
606 in QImode, HImode and SImode.
607 Relative to reg-reg move (2). */
608 {2, 4, 2}, /* cost of storing integer registers */
609 2, /* cost of reg,reg fld/fst */
610 {8, 8, 8}, /* cost of loading fp registers
611 in SFmode, DFmode and XFmode */
612 {8, 8, 8}, /* cost of storing fp registers
613 in SFmode, DFmode and XFmode */
614 2, /* cost of moving MMX register */
615 {4, 8}, /* cost of loading MMX registers
616 in SImode and DImode */
617 {4, 8}, /* cost of storing MMX registers
618 in SImode and DImode */
619 2, /* cost of moving SSE register */
620 {4, 8, 16}, /* cost of loading SSE registers
621 in SImode, DImode and TImode */
622 {4, 8, 16}, /* cost of storing SSE registers
623 in SImode, DImode and TImode */
624 3, /* MMX or SSE register to integer */
625 0, /* size of l1 cache */
626 0, /* size of l2 cache */
627 0, /* size of prefetch block */
628 0, /* number of parallel prefetches */
629 1, /* Branch cost */
630 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
631 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
632 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
633 COSTS_N_INSNS (22), /* cost of FABS instruction. */
634 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
635 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
636 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
637 DUMMY_STRINGOP_ALGS},
638 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
639 DUMMY_STRINGOP_ALGS},
640 1, /* scalar_stmt_cost. */
641 1, /* scalar load_cost. */
642 1, /* scalar_store_cost. */
643 1, /* vec_stmt_cost. */
644 1, /* vec_to_scalar_cost. */
645 1, /* scalar_to_vec_cost. */
646 1, /* vec_align_load_cost. */
647 2, /* vec_unalign_load_cost. */
648 1, /* vec_store_cost. */
649 3, /* cond_taken_branch_cost. */
650 1, /* cond_not_taken_branch_cost. */
651 };
652
653 static const
654 struct processor_costs i486_cost = { /* 486 specific costs */
655 COSTS_N_INSNS (1), /* cost of an add instruction */
656 COSTS_N_INSNS (1), /* cost of a lea instruction */
657 COSTS_N_INSNS (3), /* variable shift costs */
658 COSTS_N_INSNS (2), /* constant shift costs */
659 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
660 COSTS_N_INSNS (12), /* HI */
661 COSTS_N_INSNS (12), /* SI */
662 COSTS_N_INSNS (12), /* DI */
663 COSTS_N_INSNS (12)}, /* other */
664 1, /* cost of multiply per each bit set */
665 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
666 COSTS_N_INSNS (40), /* HI */
667 COSTS_N_INSNS (40), /* SI */
668 COSTS_N_INSNS (40), /* DI */
669 COSTS_N_INSNS (40)}, /* other */
670 COSTS_N_INSNS (3), /* cost of movsx */
671 COSTS_N_INSNS (2), /* cost of movzx */
672 15, /* "large" insn */
673 3, /* MOVE_RATIO */
674 4, /* cost for loading QImode using movzbl */
675 {2, 4, 2}, /* cost of loading integer registers
676 in QImode, HImode and SImode.
677 Relative to reg-reg move (2). */
678 {2, 4, 2}, /* cost of storing integer registers */
679 2, /* cost of reg,reg fld/fst */
680 {8, 8, 8}, /* cost of loading fp registers
681 in SFmode, DFmode and XFmode */
682 {8, 8, 8}, /* cost of storing fp registers
683 in SFmode, DFmode and XFmode */
684 2, /* cost of moving MMX register */
685 {4, 8}, /* cost of loading MMX registers
686 in SImode and DImode */
687 {4, 8}, /* cost of storing MMX registers
688 in SImode and DImode */
689 2, /* cost of moving SSE register */
690 {4, 8, 16}, /* cost of loading SSE registers
691 in SImode, DImode and TImode */
692 {4, 8, 16}, /* cost of storing SSE registers
693 in SImode, DImode and TImode */
694 3, /* MMX or SSE register to integer */
695 4, /* size of l1 cache. 486 has 8kB cache
696 shared for code and data, so 4kB is
697 not really precise. */
698 4, /* size of l2 cache */
699 0, /* size of prefetch block */
700 0, /* number of parallel prefetches */
701 1, /* Branch cost */
702 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
703 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
704 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
705 COSTS_N_INSNS (3), /* cost of FABS instruction. */
706 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
707 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
708 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
709 DUMMY_STRINGOP_ALGS},
710 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
711 DUMMY_STRINGOP_ALGS},
712 1, /* scalar_stmt_cost. */
713 1, /* scalar load_cost. */
714 1, /* scalar_store_cost. */
715 1, /* vec_stmt_cost. */
716 1, /* vec_to_scalar_cost. */
717 1, /* scalar_to_vec_cost. */
718 1, /* vec_align_load_cost. */
719 2, /* vec_unalign_load_cost. */
720 1, /* vec_store_cost. */
721 3, /* cond_taken_branch_cost. */
722 1, /* cond_not_taken_branch_cost. */
723 };
724
725 static const
726 struct processor_costs pentium_cost = {
727 COSTS_N_INSNS (1), /* cost of an add instruction */
728 COSTS_N_INSNS (1), /* cost of a lea instruction */
729 COSTS_N_INSNS (4), /* variable shift costs */
730 COSTS_N_INSNS (1), /* constant shift costs */
731 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
732 COSTS_N_INSNS (11), /* HI */
733 COSTS_N_INSNS (11), /* SI */
734 COSTS_N_INSNS (11), /* DI */
735 COSTS_N_INSNS (11)}, /* other */
736 0, /* cost of multiply per each bit set */
737 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
738 COSTS_N_INSNS (25), /* HI */
739 COSTS_N_INSNS (25), /* SI */
740 COSTS_N_INSNS (25), /* DI */
741 COSTS_N_INSNS (25)}, /* other */
742 COSTS_N_INSNS (3), /* cost of movsx */
743 COSTS_N_INSNS (2), /* cost of movzx */
744 8, /* "large" insn */
745 6, /* MOVE_RATIO */
746 6, /* cost for loading QImode using movzbl */
747 {2, 4, 2}, /* cost of loading integer registers
748 in QImode, HImode and SImode.
749 Relative to reg-reg move (2). */
750 {2, 4, 2}, /* cost of storing integer registers */
751 2, /* cost of reg,reg fld/fst */
752 {2, 2, 6}, /* cost of loading fp registers
753 in SFmode, DFmode and XFmode */
754 {4, 4, 6}, /* cost of storing fp registers
755 in SFmode, DFmode and XFmode */
756 8, /* cost of moving MMX register */
757 {8, 8}, /* cost of loading MMX registers
758 in SImode and DImode */
759 {8, 8}, /* cost of storing MMX registers
760 in SImode and DImode */
761 2, /* cost of moving SSE register */
762 {4, 8, 16}, /* cost of loading SSE registers
763 in SImode, DImode and TImode */
764 {4, 8, 16}, /* cost of storing SSE registers
765 in SImode, DImode and TImode */
766 3, /* MMX or SSE register to integer */
767 8, /* size of l1 cache. */
768 8, /* size of l2 cache */
769 0, /* size of prefetch block */
770 0, /* number of parallel prefetches */
771 2, /* Branch cost */
772 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
773 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
774 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
775 COSTS_N_INSNS (1), /* cost of FABS instruction. */
776 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
777 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
778 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
779 DUMMY_STRINGOP_ALGS},
780 {{libcall, {{-1, rep_prefix_4_byte}}},
781 DUMMY_STRINGOP_ALGS},
782 1, /* scalar_stmt_cost. */
783 1, /* scalar load_cost. */
784 1, /* scalar_store_cost. */
785 1, /* vec_stmt_cost. */
786 1, /* vec_to_scalar_cost. */
787 1, /* scalar_to_vec_cost. */
788 1, /* vec_align_load_cost. */
789 2, /* vec_unalign_load_cost. */
790 1, /* vec_store_cost. */
791 3, /* cond_taken_branch_cost. */
792 1, /* cond_not_taken_branch_cost. */
793 };
794
795 static const
796 struct processor_costs pentiumpro_cost = {
797 COSTS_N_INSNS (1), /* cost of an add instruction */
798 COSTS_N_INSNS (1), /* cost of a lea instruction */
799 COSTS_N_INSNS (1), /* variable shift costs */
800 COSTS_N_INSNS (1), /* constant shift costs */
801 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
802 COSTS_N_INSNS (4), /* HI */
803 COSTS_N_INSNS (4), /* SI */
804 COSTS_N_INSNS (4), /* DI */
805 COSTS_N_INSNS (4)}, /* other */
806 0, /* cost of multiply per each bit set */
807 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
808 COSTS_N_INSNS (17), /* HI */
809 COSTS_N_INSNS (17), /* SI */
810 COSTS_N_INSNS (17), /* DI */
811 COSTS_N_INSNS (17)}, /* other */
812 COSTS_N_INSNS (1), /* cost of movsx */
813 COSTS_N_INSNS (1), /* cost of movzx */
814 8, /* "large" insn */
815 6, /* MOVE_RATIO */
816 2, /* cost for loading QImode using movzbl */
817 {4, 4, 4}, /* cost of loading integer registers
818 in QImode, HImode and SImode.
819 Relative to reg-reg move (2). */
820 {2, 2, 2}, /* cost of storing integer registers */
821 2, /* cost of reg,reg fld/fst */
822 {2, 2, 6}, /* cost of loading fp registers
823 in SFmode, DFmode and XFmode */
824 {4, 4, 6}, /* cost of storing fp registers
825 in SFmode, DFmode and XFmode */
826 2, /* cost of moving MMX register */
827 {2, 2}, /* cost of loading MMX registers
828 in SImode and DImode */
829 {2, 2}, /* cost of storing MMX registers
830 in SImode and DImode */
831 2, /* cost of moving SSE register */
832 {2, 2, 8}, /* cost of loading SSE registers
833 in SImode, DImode and TImode */
834 {2, 2, 8}, /* cost of storing SSE registers
835 in SImode, DImode and TImode */
836 3, /* MMX or SSE register to integer */
837 8, /* size of l1 cache. */
838 256, /* size of l2 cache */
839 32, /* size of prefetch block */
840 6, /* number of parallel prefetches */
841 2, /* Branch cost */
842 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
843 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
844 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
845 COSTS_N_INSNS (2), /* cost of FABS instruction. */
846 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
847 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
848 /* PentiumPro has optimized rep instructions for blocks aligned to 8 bytes
849 (we ensure the alignment). For small blocks an inline loop is still a
850 noticeable win, for bigger blocks either rep movsl or rep movsb is the
851 way to go. Rep movsb apparently has a more expensive startup time in the CPU,
852 but after 4K the difference is down in the noise. */
853 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
854 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
855 DUMMY_STRINGOP_ALGS},
856 {{rep_prefix_4_byte, {{1024, unrolled_loop},
857 {8192, rep_prefix_4_byte}, {-1, libcall}}},
858 DUMMY_STRINGOP_ALGS},
859 1, /* scalar_stmt_cost. */
860 1, /* scalar load_cost. */
861 1, /* scalar_store_cost. */
862 1, /* vec_stmt_cost. */
863 1, /* vec_to_scalar_cost. */
864 1, /* scalar_to_vec_cost. */
865 1, /* vec_align_load_cost. */
866 2, /* vec_unalign_load_cost. */
867 1, /* vec_store_cost. */
868 3, /* cond_taken_branch_cost. */
869 1, /* cond_not_taken_branch_cost. */
870 };
871
872 static const
873 struct processor_costs geode_cost = {
874 COSTS_N_INSNS (1), /* cost of an add instruction */
875 COSTS_N_INSNS (1), /* cost of a lea instruction */
876 COSTS_N_INSNS (2), /* variable shift costs */
877 COSTS_N_INSNS (1), /* constant shift costs */
878 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
879 COSTS_N_INSNS (4), /* HI */
880 COSTS_N_INSNS (7), /* SI */
881 COSTS_N_INSNS (7), /* DI */
882 COSTS_N_INSNS (7)}, /* other */
883 0, /* cost of multiply per each bit set */
884 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
885 COSTS_N_INSNS (23), /* HI */
886 COSTS_N_INSNS (39), /* SI */
887 COSTS_N_INSNS (39), /* DI */
888 COSTS_N_INSNS (39)}, /* other */
889 COSTS_N_INSNS (1), /* cost of movsx */
890 COSTS_N_INSNS (1), /* cost of movzx */
891 8, /* "large" insn */
892 4, /* MOVE_RATIO */
893 1, /* cost for loading QImode using movzbl */
894 {1, 1, 1}, /* cost of loading integer registers
895 in QImode, HImode and SImode.
896 Relative to reg-reg move (2). */
897 {1, 1, 1}, /* cost of storing integer registers */
898 1, /* cost of reg,reg fld/fst */
899 {1, 1, 1}, /* cost of loading fp registers
900 in SFmode, DFmode and XFmode */
901 {4, 6, 6}, /* cost of storing fp registers
902 in SFmode, DFmode and XFmode */
903
904 1, /* cost of moving MMX register */
905 {1, 1}, /* cost of loading MMX registers
906 in SImode and DImode */
907 {1, 1}, /* cost of storing MMX registers
908 in SImode and DImode */
909 1, /* cost of moving SSE register */
910 {1, 1, 1}, /* cost of loading SSE registers
911 in SImode, DImode and TImode */
912 {1, 1, 1}, /* cost of storing SSE registers
913 in SImode, DImode and TImode */
914 1, /* MMX or SSE register to integer */
915 64, /* size of l1 cache. */
916 128, /* size of l2 cache. */
917 32, /* size of prefetch block */
918 1, /* number of parallel prefetches */
919 1, /* Branch cost */
920 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
921 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
922 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
923 COSTS_N_INSNS (1), /* cost of FABS instruction. */
924 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
925 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
926 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
927 DUMMY_STRINGOP_ALGS},
928 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
929 DUMMY_STRINGOP_ALGS},
930 1, /* scalar_stmt_cost. */
931 1, /* scalar load_cost. */
932 1, /* scalar_store_cost. */
933 1, /* vec_stmt_cost. */
934 1, /* vec_to_scalar_cost. */
935 1, /* scalar_to_vec_cost. */
936 1, /* vec_align_load_cost. */
937 2, /* vec_unalign_load_cost. */
938 1, /* vec_store_cost. */
939 3, /* cond_taken_branch_cost. */
940 1, /* cond_not_taken_branch_cost. */
941 };
942
943 static const
944 struct processor_costs k6_cost = {
945 COSTS_N_INSNS (1), /* cost of an add instruction */
946 COSTS_N_INSNS (2), /* cost of a lea instruction */
947 COSTS_N_INSNS (1), /* variable shift costs */
948 COSTS_N_INSNS (1), /* constant shift costs */
949 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
950 COSTS_N_INSNS (3), /* HI */
951 COSTS_N_INSNS (3), /* SI */
952 COSTS_N_INSNS (3), /* DI */
953 COSTS_N_INSNS (3)}, /* other */
954 0, /* cost of multiply per each bit set */
955 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
956 COSTS_N_INSNS (18), /* HI */
957 COSTS_N_INSNS (18), /* SI */
958 COSTS_N_INSNS (18), /* DI */
959 COSTS_N_INSNS (18)}, /* other */
960 COSTS_N_INSNS (2), /* cost of movsx */
961 COSTS_N_INSNS (2), /* cost of movzx */
962 8, /* "large" insn */
963 4, /* MOVE_RATIO */
964 3, /* cost for loading QImode using movzbl */
965 {4, 5, 4}, /* cost of loading integer registers
966 in QImode, HImode and SImode.
967 Relative to reg-reg move (2). */
968 {2, 3, 2}, /* cost of storing integer registers */
969 4, /* cost of reg,reg fld/fst */
970 {6, 6, 6}, /* cost of loading fp registers
971 in SFmode, DFmode and XFmode */
972 {4, 4, 4}, /* cost of storing fp registers
973 in SFmode, DFmode and XFmode */
974 2, /* cost of moving MMX register */
975 {2, 2}, /* cost of loading MMX registers
976 in SImode and DImode */
977 {2, 2}, /* cost of storing MMX registers
978 in SImode and DImode */
979 2, /* cost of moving SSE register */
980 {2, 2, 8}, /* cost of loading SSE registers
981 in SImode, DImode and TImode */
982 {2, 2, 8}, /* cost of storing SSE registers
983 in SImode, DImode and TImode */
984 6, /* MMX or SSE register to integer */
985 32, /* size of l1 cache. */
986 32, /* size of l2 cache. Some models
987 have integrated l2 cache, but
988 optimizing for k6 is not important
989 enough to worry about that. */
990 32, /* size of prefetch block */
991 1, /* number of parallel prefetches */
992 1, /* Branch cost */
993 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
994 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
995 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
996 COSTS_N_INSNS (2), /* cost of FABS instruction. */
997 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
998 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
999 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1000 DUMMY_STRINGOP_ALGS},
1001 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1002 DUMMY_STRINGOP_ALGS},
1003 1, /* scalar_stmt_cost. */
1004 1, /* scalar load_cost. */
1005 1, /* scalar_store_cost. */
1006 1, /* vec_stmt_cost. */
1007 1, /* vec_to_scalar_cost. */
1008 1, /* scalar_to_vec_cost. */
1009 1, /* vec_align_load_cost. */
1010 2, /* vec_unalign_load_cost. */
1011 1, /* vec_store_cost. */
1012 3, /* cond_taken_branch_cost. */
1013 1, /* cond_not_taken_branch_cost. */
1014 };
1015
1016 static const
1017 struct processor_costs athlon_cost = {
1018 COSTS_N_INSNS (1), /* cost of an add instruction */
1019 COSTS_N_INSNS (2), /* cost of a lea instruction */
1020 COSTS_N_INSNS (1), /* variable shift costs */
1021 COSTS_N_INSNS (1), /* constant shift costs */
1022 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1023 COSTS_N_INSNS (5), /* HI */
1024 COSTS_N_INSNS (5), /* SI */
1025 COSTS_N_INSNS (5), /* DI */
1026 COSTS_N_INSNS (5)}, /* other */
1027 0, /* cost of multiply per each bit set */
1028 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1029 COSTS_N_INSNS (26), /* HI */
1030 COSTS_N_INSNS (42), /* SI */
1031 COSTS_N_INSNS (74), /* DI */
1032 COSTS_N_INSNS (74)}, /* other */
1033 COSTS_N_INSNS (1), /* cost of movsx */
1034 COSTS_N_INSNS (1), /* cost of movzx */
1035 8, /* "large" insn */
1036 9, /* MOVE_RATIO */
1037 4, /* cost for loading QImode using movzbl */
1038 {3, 4, 3}, /* cost of loading integer registers
1039 in QImode, HImode and SImode.
1040 Relative to reg-reg move (2). */
1041 {3, 4, 3}, /* cost of storing integer registers */
1042 4, /* cost of reg,reg fld/fst */
1043 {4, 4, 12}, /* cost of loading fp registers
1044 in SFmode, DFmode and XFmode */
1045 {6, 6, 8}, /* cost of storing fp registers
1046 in SFmode, DFmode and XFmode */
1047 2, /* cost of moving MMX register */
1048 {4, 4}, /* cost of loading MMX registers
1049 in SImode and DImode */
1050 {4, 4}, /* cost of storing MMX registers
1051 in SImode and DImode */
1052 2, /* cost of moving SSE register */
1053 {4, 4, 6}, /* cost of loading SSE registers
1054 in SImode, DImode and TImode */
1055 {4, 4, 5}, /* cost of storing SSE registers
1056 in SImode, DImode and TImode */
1057 5, /* MMX or SSE register to integer */
1058 64, /* size of l1 cache. */
1059 256, /* size of l2 cache. */
1060 64, /* size of prefetch block */
1061 6, /* number of parallel prefetches */
1062 5, /* Branch cost */
1063 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1064 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1065 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1066 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1067 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1068 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1069 /* For some reason, Athlon deals better with REP prefix (relative to loops)
1070 compared to K8. Alignment becomes important after 8 bytes for memcpy and
1071 128 bytes for memset. */
1072 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1073 DUMMY_STRINGOP_ALGS},
1074 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1075 DUMMY_STRINGOP_ALGS},
1076 1, /* scalar_stmt_cost. */
1077 1, /* scalar load_cost. */
1078 1, /* scalar_store_cost. */
1079 1, /* vec_stmt_cost. */
1080 1, /* vec_to_scalar_cost. */
1081 1, /* scalar_to_vec_cost. */
1082 1, /* vec_align_load_cost. */
1083 2, /* vec_unalign_load_cost. */
1084 1, /* vec_store_cost. */
1085 3, /* cond_taken_branch_cost. */
1086 1, /* cond_not_taken_branch_cost. */
1087 };
1088
1089 static const
1090 struct processor_costs k8_cost = {
1091 COSTS_N_INSNS (1), /* cost of an add instruction */
1092 COSTS_N_INSNS (2), /* cost of a lea instruction */
1093 COSTS_N_INSNS (1), /* variable shift costs */
1094 COSTS_N_INSNS (1), /* constant shift costs */
1095 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1096 COSTS_N_INSNS (4), /* HI */
1097 COSTS_N_INSNS (3), /* SI */
1098 COSTS_N_INSNS (4), /* DI */
1099 COSTS_N_INSNS (5)}, /* other */
1100 0, /* cost of multiply per each bit set */
1101 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1102 COSTS_N_INSNS (26), /* HI */
1103 COSTS_N_INSNS (42), /* SI */
1104 COSTS_N_INSNS (74), /* DI */
1105 COSTS_N_INSNS (74)}, /* other */
1106 COSTS_N_INSNS (1), /* cost of movsx */
1107 COSTS_N_INSNS (1), /* cost of movzx */
1108 8, /* "large" insn */
1109 9, /* MOVE_RATIO */
1110 4, /* cost for loading QImode using movzbl */
1111 {3, 4, 3}, /* cost of loading integer registers
1112 in QImode, HImode and SImode.
1113 Relative to reg-reg move (2). */
1114 {3, 4, 3}, /* cost of storing integer registers */
1115 4, /* cost of reg,reg fld/fst */
1116 {4, 4, 12}, /* cost of loading fp registers
1117 in SFmode, DFmode and XFmode */
1118 {6, 6, 8}, /* cost of storing fp registers
1119 in SFmode, DFmode and XFmode */
1120 2, /* cost of moving MMX register */
1121 {3, 3}, /* cost of loading MMX registers
1122 in SImode and DImode */
1123 {4, 4}, /* cost of storing MMX registers
1124 in SImode and DImode */
1125 2, /* cost of moving SSE register */
1126 {4, 3, 6}, /* cost of loading SSE registers
1127 in SImode, DImode and TImode */
1128 {4, 4, 5}, /* cost of storing SSE registers
1129 in SImode, DImode and TImode */
1130 5, /* MMX or SSE register to integer */
1131 64, /* size of l1 cache. */
1132 512, /* size of l2 cache. */
1133 64, /* size of prefetch block */
1134 /* New AMD processors never drop prefetches; if they cannot be performed
1135 immediately, they are queued. We set the number of simultaneous prefetches
1136 to a large constant to reflect this (it is probably not a good idea not
1137 to limit the number of prefetches at all, as their execution also takes some
1138 time). */
1139 100, /* number of parallel prefetches */
1140 3, /* Branch cost */
1141 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1142 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1143 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1144 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1145 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1146 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1147 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
1148 small blocks it is better to use a loop. For large blocks, a libcall can
1149 do non-temporal accesses and beat inline code considerably. */
1150 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1151 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1152 {{libcall, {{8, loop}, {24, unrolled_loop},
1153 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1154 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1155 4, /* scalar_stmt_cost. */
1156 2, /* scalar load_cost. */
1157 2, /* scalar_store_cost. */
1158 5, /* vec_stmt_cost. */
1159 0, /* vec_to_scalar_cost. */
1160 2, /* scalar_to_vec_cost. */
1161 2, /* vec_align_load_cost. */
1162 3, /* vec_unalign_load_cost. */
1163 3, /* vec_store_cost. */
1164 3, /* cond_taken_branch_cost. */
1165 2, /* cond_not_taken_branch_cost. */
1166 };
1167
1168 struct processor_costs amdfam10_cost = {
1169 COSTS_N_INSNS (1), /* cost of an add instruction */
1170 COSTS_N_INSNS (2), /* cost of a lea instruction */
1171 COSTS_N_INSNS (1), /* variable shift costs */
1172 COSTS_N_INSNS (1), /* constant shift costs */
1173 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1174 COSTS_N_INSNS (4), /* HI */
1175 COSTS_N_INSNS (3), /* SI */
1176 COSTS_N_INSNS (4), /* DI */
1177 COSTS_N_INSNS (5)}, /* other */
1178 0, /* cost of multiply per each bit set */
1179 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1180 COSTS_N_INSNS (35), /* HI */
1181 COSTS_N_INSNS (51), /* SI */
1182 COSTS_N_INSNS (83), /* DI */
1183 COSTS_N_INSNS (83)}, /* other */
1184 COSTS_N_INSNS (1), /* cost of movsx */
1185 COSTS_N_INSNS (1), /* cost of movzx */
1186 8, /* "large" insn */
1187 9, /* MOVE_RATIO */
1188 4, /* cost for loading QImode using movzbl */
1189 {3, 4, 3}, /* cost of loading integer registers
1190 in QImode, HImode and SImode.
1191 Relative to reg-reg move (2). */
1192 {3, 4, 3}, /* cost of storing integer registers */
1193 4, /* cost of reg,reg fld/fst */
1194 {4, 4, 12}, /* cost of loading fp registers
1195 in SFmode, DFmode and XFmode */
1196 {6, 6, 8}, /* cost of storing fp registers
1197 in SFmode, DFmode and XFmode */
1198 2, /* cost of moving MMX register */
1199 {3, 3}, /* cost of loading MMX registers
1200 in SImode and DImode */
1201 {4, 4}, /* cost of storing MMX registers
1202 in SImode and DImode */
1203 2, /* cost of moving SSE register */
1204 {4, 4, 3}, /* cost of loading SSE registers
1205 in SImode, DImode and TImode */
1206 {4, 4, 5}, /* cost of storing SSE registers
1207 in SImode, DImode and TImode */
1208 3, /* MMX or SSE register to integer */
1209 /* On K8:
1210 MOVD reg64, xmmreg Double FSTORE 4
1211 MOVD reg32, xmmreg Double FSTORE 4
1212 On AMDFAM10:
1213 MOVD reg64, xmmreg Double FADD 3
1214 1/1 1/1
1215 MOVD reg32, xmmreg Double FADD 3
1216 1/1 1/1 */
1217 64, /* size of l1 cache. */
1218 512, /* size of l2 cache. */
1219 64, /* size of prefetch block */
1220 /* New AMD processors never drop prefetches; if they cannot be performed
1221 immediately, they are queued. We set the number of simultaneous prefetches
1222 to a large constant to reflect this (it is probably not a good idea not
1223 to limit the number of prefetches at all, as their execution also takes some
1224 time). */
1225 100, /* number of parallel prefetches */
1226 2, /* Branch cost */
1227 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1228 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1229 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1230 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1231 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1232 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1233
1234 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
1235 very small blocks it is better to use a loop. For large blocks, a libcall can
1236 do non-temporal accesses and beat inline code considerably. */
1237 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1238 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1239 {{libcall, {{8, loop}, {24, unrolled_loop},
1240 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1241 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1242 4, /* scalar_stmt_cost. */
1243 2, /* scalar load_cost. */
1244 2, /* scalar_store_cost. */
1245 6, /* vec_stmt_cost. */
1246 0, /* vec_to_scalar_cost. */
1247 2, /* scalar_to_vec_cost. */
1248 2, /* vec_align_load_cost. */
1249 2, /* vec_unalign_load_cost. */
1250 2, /* vec_store_cost. */
1251 2, /* cond_taken_branch_cost. */
1252 1, /* cond_not_taken_branch_cost. */
1253 };
1254
1255 struct processor_costs bdver1_cost = {
1256 COSTS_N_INSNS (1), /* cost of an add instruction */
1257 COSTS_N_INSNS (1), /* cost of a lea instruction */
1258 COSTS_N_INSNS (1), /* variable shift costs */
1259 COSTS_N_INSNS (1), /* constant shift costs */
1260 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1261 COSTS_N_INSNS (4), /* HI */
1262 COSTS_N_INSNS (4), /* SI */
1263 COSTS_N_INSNS (6), /* DI */
1264 COSTS_N_INSNS (6)}, /* other */
1265 0, /* cost of multiply per each bit set */
1266 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1267 COSTS_N_INSNS (35), /* HI */
1268 COSTS_N_INSNS (51), /* SI */
1269 COSTS_N_INSNS (83), /* DI */
1270 COSTS_N_INSNS (83)}, /* other */
1271 COSTS_N_INSNS (1), /* cost of movsx */
1272 COSTS_N_INSNS (1), /* cost of movzx */
1273 8, /* "large" insn */
1274 9, /* MOVE_RATIO */
1275 4, /* cost for loading QImode using movzbl */
1276 {5, 5, 4}, /* cost of loading integer registers
1277 in QImode, HImode and SImode.
1278 Relative to reg-reg move (2). */
1279 {4, 4, 4}, /* cost of storing integer registers */
1280 2, /* cost of reg,reg fld/fst */
1281 {5, 5, 12}, /* cost of loading fp registers
1282 in SFmode, DFmode and XFmode */
1283 {4, 4, 8}, /* cost of storing fp registers
1284 in SFmode, DFmode and XFmode */
1285 2, /* cost of moving MMX register */
1286 {4, 4}, /* cost of loading MMX registers
1287 in SImode and DImode */
1288 {4, 4}, /* cost of storing MMX registers
1289 in SImode and DImode */
1290 2, /* cost of moving SSE register */
1291 {4, 4, 4}, /* cost of loading SSE registers
1292 in SImode, DImode and TImode */
1293 {4, 4, 4}, /* cost of storing SSE registers
1294 in SImode, DImode and TImode */
1295 2, /* MMX or SSE register to integer */
1296 /* On K8:
1297 MOVD reg64, xmmreg Double FSTORE 4
1298 MOVD reg32, xmmreg Double FSTORE 4
1299 On AMDFAM10:
1300 MOVD reg64, xmmreg Double FADD 3
1301 1/1 1/1
1302 MOVD reg32, xmmreg Double FADD 3
1303 1/1 1/1 */
1304 16, /* size of l1 cache. */
1305 2048, /* size of l2 cache. */
1306 64, /* size of prefetch block */
1307 /* New AMD processors never drop prefetches; if they cannot be performed
1308 immediately, they are queued. We set the number of simultaneous prefetches
1309 to a large constant to reflect this (it is probably not a good idea not
1310 to limit the number of prefetches at all, as their execution also takes some
1311 time). */
1312 100, /* number of parallel prefetches */
1313 2, /* Branch cost */
1314 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1315 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1316 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1317 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1318 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1319 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1320
1321 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
1322 very small blocks it is better to use a loop. For large blocks, a libcall
1323 can do non-temporal accesses and beat inline code considerably. */
1324 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1325 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1326 {{libcall, {{8, loop}, {24, unrolled_loop},
1327 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1328 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1329 6, /* scalar_stmt_cost. */
1330 4, /* scalar load_cost. */
1331 4, /* scalar_store_cost. */
1332 6, /* vec_stmt_cost. */
1333 0, /* vec_to_scalar_cost. */
1334 2, /* scalar_to_vec_cost. */
1335 4, /* vec_align_load_cost. */
1336 4, /* vec_unalign_load_cost. */
1337 4, /* vec_store_cost. */
1338 2, /* cond_taken_branch_cost. */
1339 1, /* cond_not_taken_branch_cost. */
1340 };
1341
1342 struct processor_costs bdver2_cost = {
1343 COSTS_N_INSNS (1), /* cost of an add instruction */
1344 COSTS_N_INSNS (1), /* cost of a lea instruction */
1345 COSTS_N_INSNS (1), /* variable shift costs */
1346 COSTS_N_INSNS (1), /* constant shift costs */
1347 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1348 COSTS_N_INSNS (4), /* HI */
1349 COSTS_N_INSNS (4), /* SI */
1350 COSTS_N_INSNS (6), /* DI */
1351 COSTS_N_INSNS (6)}, /* other */
1352 0, /* cost of multiply per each bit set */
1353 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1354 COSTS_N_INSNS (35), /* HI */
1355 COSTS_N_INSNS (51), /* SI */
1356 COSTS_N_INSNS (83), /* DI */
1357 COSTS_N_INSNS (83)}, /* other */
1358 COSTS_N_INSNS (1), /* cost of movsx */
1359 COSTS_N_INSNS (1), /* cost of movzx */
1360 8, /* "large" insn */
1361 9, /* MOVE_RATIO */
1362 4, /* cost for loading QImode using movzbl */
1363 {5, 5, 4}, /* cost of loading integer registers
1364 in QImode, HImode and SImode.
1365 Relative to reg-reg move (2). */
1366 {4, 4, 4}, /* cost of storing integer registers */
1367 2, /* cost of reg,reg fld/fst */
1368 {5, 5, 12}, /* cost of loading fp registers
1369 in SFmode, DFmode and XFmode */
1370 {4, 4, 8}, /* cost of storing fp registers
1371 in SFmode, DFmode and XFmode */
1372 2, /* cost of moving MMX register */
1373 {4, 4}, /* cost of loading MMX registers
1374 in SImode and DImode */
1375 {4, 4}, /* cost of storing MMX registers
1376 in SImode and DImode */
1377 2, /* cost of moving SSE register */
1378 {4, 4, 4}, /* cost of loading SSE registers
1379 in SImode, DImode and TImode */
1380 {4, 4, 4}, /* cost of storing SSE registers
1381 in SImode, DImode and TImode */
1382 2, /* MMX or SSE register to integer */
1383 /* On K8:
1384 MOVD reg64, xmmreg Double FSTORE 4
1385 MOVD reg32, xmmreg Double FSTORE 4
1386 On AMDFAM10:
1387 MOVD reg64, xmmreg Double FADD 3
1388 1/1 1/1
1389 MOVD reg32, xmmreg Double FADD 3
1390 1/1 1/1 */
1391 16, /* size of l1 cache. */
1392 2048, /* size of l2 cache. */
1393 64, /* size of prefetch block */
1394 /* New AMD processors never drop prefetches; if they cannot be performed
1395 immediately, they are queued. We set the number of simultaneous prefetches
1396 to a large constant to reflect this (it is probably not a good idea not
1397 to limit the number of prefetches at all, as their execution also takes some
1398 time). */
1399 100, /* number of parallel prefetches */
1400 2, /* Branch cost */
1401 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1402 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1403 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1404 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1405 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1406 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1407
1408 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1409 very small blocks it is better to use a loop. For large blocks, a libcall
1410 can do non-temporal accesses and beat inline code considerably. */
1411 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1412 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1413 {{libcall, {{8, loop}, {24, unrolled_loop},
1414 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1415 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1416 6, /* scalar_stmt_cost. */
1417 4, /* scalar load_cost. */
1418 4, /* scalar_store_cost. */
1419 6, /* vec_stmt_cost. */
1420 0, /* vec_to_scalar_cost. */
1421 2, /* scalar_to_vec_cost. */
1422 4, /* vec_align_load_cost. */
1423 4, /* vec_unalign_load_cost. */
1424 4, /* vec_store_cost. */
1425 2, /* cond_taken_branch_cost. */
1426 1, /* cond_not_taken_branch_cost. */
1427 };
1428
1429 struct processor_costs btver1_cost = {
1430 COSTS_N_INSNS (1), /* cost of an add instruction */
1431 COSTS_N_INSNS (2), /* cost of a lea instruction */
1432 COSTS_N_INSNS (1), /* variable shift costs */
1433 COSTS_N_INSNS (1), /* constant shift costs */
1434 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1435 COSTS_N_INSNS (4), /* HI */
1436 COSTS_N_INSNS (3), /* SI */
1437 COSTS_N_INSNS (4), /* DI */
1438 COSTS_N_INSNS (5)}, /* other */
1439 0, /* cost of multiply per each bit set */
1440 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1441 COSTS_N_INSNS (35), /* HI */
1442 COSTS_N_INSNS (51), /* SI */
1443 COSTS_N_INSNS (83), /* DI */
1444 COSTS_N_INSNS (83)}, /* other */
1445 COSTS_N_INSNS (1), /* cost of movsx */
1446 COSTS_N_INSNS (1), /* cost of movzx */
1447 8, /* "large" insn */
1448 9, /* MOVE_RATIO */
1449 4, /* cost for loading QImode using movzbl */
1450 {3, 4, 3}, /* cost of loading integer registers
1451 in QImode, HImode and SImode.
1452 Relative to reg-reg move (2). */
1453 {3, 4, 3}, /* cost of storing integer registers */
1454 4, /* cost of reg,reg fld/fst */
1455 {4, 4, 12}, /* cost of loading fp registers
1456 in SFmode, DFmode and XFmode */
1457 {6, 6, 8}, /* cost of storing fp registers
1458 in SFmode, DFmode and XFmode */
1459 2, /* cost of moving MMX register */
1460 {3, 3}, /* cost of loading MMX registers
1461 in SImode and DImode */
1462 {4, 4}, /* cost of storing MMX registers
1463 in SImode and DImode */
1464 2, /* cost of moving SSE register */
1465 {4, 4, 3}, /* cost of loading SSE registers
1466 in SImode, DImode and TImode */
1467 {4, 4, 5}, /* cost of storing SSE registers
1468 in SImode, DImode and TImode */
1469 3, /* MMX or SSE register to integer */
1470 /* On K8:
1471 MOVD reg64, xmmreg Double FSTORE 4
1472 MOVD reg32, xmmreg Double FSTORE 4
1473 On AMDFAM10:
1474 MOVD reg64, xmmreg Double FADD 3
1475 1/1 1/1
1476 MOVD reg32, xmmreg Double FADD 3
1477 1/1 1/1 */
1478 32, /* size of l1 cache. */
1479 512, /* size of l2 cache. */
1480 64, /* size of prefetch block */
1481 100, /* number of parallel prefetches */
1482 2, /* Branch cost */
1483 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1484 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1485 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1486 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1487 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1488 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1489
1490 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1491 very small blocks it is better to use a loop. For large blocks, a libcall can
1492 do non-temporal accesses and beat inline code considerably. */
1493 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1494 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1495 {{libcall, {{8, loop}, {24, unrolled_loop},
1496 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1497 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1498 4, /* scalar_stmt_cost. */
1499 2, /* scalar load_cost. */
1500 2, /* scalar_store_cost. */
1501 6, /* vec_stmt_cost. */
1502 0, /* vec_to_scalar_cost. */
1503 2, /* scalar_to_vec_cost. */
1504 2, /* vec_align_load_cost. */
1505 2, /* vec_unalign_load_cost. */
1506 2, /* vec_store_cost. */
1507 2, /* cond_taken_branch_cost. */
1508 1, /* cond_not_taken_branch_cost. */
1509 };
1510
1511 static const
1512 struct processor_costs pentium4_cost = {
1513 COSTS_N_INSNS (1), /* cost of an add instruction */
1514 COSTS_N_INSNS (3), /* cost of a lea instruction */
1515 COSTS_N_INSNS (4), /* variable shift costs */
1516 COSTS_N_INSNS (4), /* constant shift costs */
1517 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1518 COSTS_N_INSNS (15), /* HI */
1519 COSTS_N_INSNS (15), /* SI */
1520 COSTS_N_INSNS (15), /* DI */
1521 COSTS_N_INSNS (15)}, /* other */
1522 0, /* cost of multiply per each bit set */
1523 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1524 COSTS_N_INSNS (56), /* HI */
1525 COSTS_N_INSNS (56), /* SI */
1526 COSTS_N_INSNS (56), /* DI */
1527 COSTS_N_INSNS (56)}, /* other */
1528 COSTS_N_INSNS (1), /* cost of movsx */
1529 COSTS_N_INSNS (1), /* cost of movzx */
1530 16, /* "large" insn */
1531 6, /* MOVE_RATIO */
1532 2, /* cost for loading QImode using movzbl */
1533 {4, 5, 4}, /* cost of loading integer registers
1534 in QImode, HImode and SImode.
1535 Relative to reg-reg move (2). */
1536 {2, 3, 2}, /* cost of storing integer registers */
1537 2, /* cost of reg,reg fld/fst */
1538 {2, 2, 6}, /* cost of loading fp registers
1539 in SFmode, DFmode and XFmode */
1540 {4, 4, 6}, /* cost of storing fp registers
1541 in SFmode, DFmode and XFmode */
1542 2, /* cost of moving MMX register */
1543 {2, 2}, /* cost of loading MMX registers
1544 in SImode and DImode */
1545 {2, 2}, /* cost of storing MMX registers
1546 in SImode and DImode */
1547 12, /* cost of moving SSE register */
1548 {12, 12, 12}, /* cost of loading SSE registers
1549 in SImode, DImode and TImode */
1550 {2, 2, 8}, /* cost of storing SSE registers
1551 in SImode, DImode and TImode */
1552 10, /* MMX or SSE register to integer */
1553 8, /* size of l1 cache. */
1554 256, /* size of l2 cache. */
1555 64, /* size of prefetch block */
1556 6, /* number of parallel prefetches */
1557 2, /* Branch cost */
1558 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1559 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1560 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1561 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1562 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1563 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1564 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1565 DUMMY_STRINGOP_ALGS},
1566 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1567 {-1, libcall}}},
1568 DUMMY_STRINGOP_ALGS},
1569 1, /* scalar_stmt_cost. */
1570 1, /* scalar load_cost. */
1571 1, /* scalar_store_cost. */
1572 1, /* vec_stmt_cost. */
1573 1, /* vec_to_scalar_cost. */
1574 1, /* scalar_to_vec_cost. */
1575 1, /* vec_align_load_cost. */
1576 2, /* vec_unalign_load_cost. */
1577 1, /* vec_store_cost. */
1578 3, /* cond_taken_branch_cost. */
1579 1, /* cond_not_taken_branch_cost. */
1580 };
1581
1582 static const
1583 struct processor_costs nocona_cost = {
1584 COSTS_N_INSNS (1), /* cost of an add instruction */
1585 COSTS_N_INSNS (1), /* cost of a lea instruction */
1586 COSTS_N_INSNS (1), /* variable shift costs */
1587 COSTS_N_INSNS (1), /* constant shift costs */
1588 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1589 COSTS_N_INSNS (10), /* HI */
1590 COSTS_N_INSNS (10), /* SI */
1591 COSTS_N_INSNS (10), /* DI */
1592 COSTS_N_INSNS (10)}, /* other */
1593 0, /* cost of multiply per each bit set */
1594 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1595 COSTS_N_INSNS (66), /* HI */
1596 COSTS_N_INSNS (66), /* SI */
1597 COSTS_N_INSNS (66), /* DI */
1598 COSTS_N_INSNS (66)}, /* other */
1599 COSTS_N_INSNS (1), /* cost of movsx */
1600 COSTS_N_INSNS (1), /* cost of movzx */
1601 16, /* "large" insn */
1602 17, /* MOVE_RATIO */
1603 4, /* cost for loading QImode using movzbl */
1604 {4, 4, 4}, /* cost of loading integer registers
1605 in QImode, HImode and SImode.
1606 Relative to reg-reg move (2). */
1607 {4, 4, 4}, /* cost of storing integer registers */
1608 3, /* cost of reg,reg fld/fst */
1609 {12, 12, 12}, /* cost of loading fp registers
1610 in SFmode, DFmode and XFmode */
1611 {4, 4, 4}, /* cost of storing fp registers
1612 in SFmode, DFmode and XFmode */
1613 6, /* cost of moving MMX register */
1614 {12, 12}, /* cost of loading MMX registers
1615 in SImode and DImode */
1616 {12, 12}, /* cost of storing MMX registers
1617 in SImode and DImode */
1618 6, /* cost of moving SSE register */
1619 {12, 12, 12}, /* cost of loading SSE registers
1620 in SImode, DImode and TImode */
1621 {12, 12, 12}, /* cost of storing SSE registers
1622 in SImode, DImode and TImode */
1623 8, /* MMX or SSE register to integer */
1624 8, /* size of l1 cache. */
1625 1024, /* size of l2 cache. */
1626 128, /* size of prefetch block */
1627 8, /* number of parallel prefetches */
1628 1, /* Branch cost */
1629 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1630 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1631 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1632 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1633 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1634 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1635 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1636 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1637 {100000, unrolled_loop}, {-1, libcall}}}},
1638 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1639 {-1, libcall}}},
1640 {libcall, {{24, loop}, {64, unrolled_loop},
1641 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1642 1, /* scalar_stmt_cost. */
1643 1, /* scalar load_cost. */
1644 1, /* scalar_store_cost. */
1645 1, /* vec_stmt_cost. */
1646 1, /* vec_to_scalar_cost. */
1647 1, /* scalar_to_vec_cost. */
1648 1, /* vec_align_load_cost. */
1649 2, /* vec_unalign_load_cost. */
1650 1, /* vec_store_cost. */
1651 3, /* cond_taken_branch_cost. */
1652 1, /* cond_not_taken_branch_cost. */
1653 };
1654
1655 static const
1656 struct processor_costs atom_cost = {
1657 COSTS_N_INSNS (1), /* cost of an add instruction */
1658 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1659 COSTS_N_INSNS (1), /* variable shift costs */
1660 COSTS_N_INSNS (1), /* constant shift costs */
1661 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1662 COSTS_N_INSNS (4), /* HI */
1663 COSTS_N_INSNS (3), /* SI */
1664 COSTS_N_INSNS (4), /* DI */
1665 COSTS_N_INSNS (2)}, /* other */
1666 0, /* cost of multiply per each bit set */
1667 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1668 COSTS_N_INSNS (26), /* HI */
1669 COSTS_N_INSNS (42), /* SI */
1670 COSTS_N_INSNS (74), /* DI */
1671 COSTS_N_INSNS (74)}, /* other */
1672 COSTS_N_INSNS (1), /* cost of movsx */
1673 COSTS_N_INSNS (1), /* cost of movzx */
1674 8, /* "large" insn */
1675 17, /* MOVE_RATIO */
1676 4, /* cost for loading QImode using movzbl */
1677 {4, 4, 4}, /* cost of loading integer registers
1678 in QImode, HImode and SImode.
1679 Relative to reg-reg move (2). */
1680 {4, 4, 4}, /* cost of storing integer registers */
1681 4, /* cost of reg,reg fld/fst */
1682 {12, 12, 12}, /* cost of loading fp registers
1683 in SFmode, DFmode and XFmode */
1684 {6, 6, 8}, /* cost of storing fp registers
1685 in SFmode, DFmode and XFmode */
1686 2, /* cost of moving MMX register */
1687 {8, 8}, /* cost of loading MMX registers
1688 in SImode and DImode */
1689 {8, 8}, /* cost of storing MMX registers
1690 in SImode and DImode */
1691 2, /* cost of moving SSE register */
1692 {8, 8, 8}, /* cost of loading SSE registers
1693 in SImode, DImode and TImode */
1694 {8, 8, 8}, /* cost of storing SSE registers
1695 in SImode, DImode and TImode */
1696 5, /* MMX or SSE register to integer */
1697 32, /* size of l1 cache. */
1698 256, /* size of l2 cache. */
1699 64, /* size of prefetch block */
1700 6, /* number of parallel prefetches */
1701 3, /* Branch cost */
1702 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1703 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1704 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1705 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1706 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1707 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1708 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1709 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1710 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1711 {{libcall, {{8, loop}, {15, unrolled_loop},
1712 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1713 {libcall, {{24, loop}, {32, unrolled_loop},
1714 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1715 1, /* scalar_stmt_cost. */
1716 1, /* scalar load_cost. */
1717 1, /* scalar_store_cost. */
1718 1, /* vec_stmt_cost. */
1719 1, /* vec_to_scalar_cost. */
1720 1, /* scalar_to_vec_cost. */
1721 1, /* vec_align_load_cost. */
1722 2, /* vec_unalign_load_cost. */
1723 1, /* vec_store_cost. */
1724 3, /* cond_taken_branch_cost. */
1725 1, /* cond_not_taken_branch_cost. */
1726 };
1727
1728 /* Generic64 should produce code tuned for Nocona and K8. */
1729 static const
1730 struct processor_costs generic64_cost = {
1731 COSTS_N_INSNS (1), /* cost of an add instruction */
1732 /* On all chips taken into consideration, lea takes 2 cycles or more. With
1733 that cost, however, our current implementation of synth_mult results in
1734 the use of unnecessary temporary registers, causing regressions on several
1735 SPECfp benchmarks. */
1736 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1737 COSTS_N_INSNS (1), /* variable shift costs */
1738 COSTS_N_INSNS (1), /* constant shift costs */
1739 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1740 COSTS_N_INSNS (4), /* HI */
1741 COSTS_N_INSNS (3), /* SI */
1742 COSTS_N_INSNS (4), /* DI */
1743 COSTS_N_INSNS (2)}, /* other */
1744 0, /* cost of multiply per each bit set */
1745 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1746 COSTS_N_INSNS (26), /* HI */
1747 COSTS_N_INSNS (42), /* SI */
1748 COSTS_N_INSNS (74), /* DI */
1749 COSTS_N_INSNS (74)}, /* other */
1750 COSTS_N_INSNS (1), /* cost of movsx */
1751 COSTS_N_INSNS (1), /* cost of movzx */
1752 8, /* "large" insn */
1753 17, /* MOVE_RATIO */
1754 4, /* cost for loading QImode using movzbl */
1755 {4, 4, 4}, /* cost of loading integer registers
1756 in QImode, HImode and SImode.
1757 Relative to reg-reg move (2). */
1758 {4, 4, 4}, /* cost of storing integer registers */
1759 4, /* cost of reg,reg fld/fst */
1760 {12, 12, 12}, /* cost of loading fp registers
1761 in SFmode, DFmode and XFmode */
1762 {6, 6, 8}, /* cost of storing fp registers
1763 in SFmode, DFmode and XFmode */
1764 2, /* cost of moving MMX register */
1765 {8, 8}, /* cost of loading MMX registers
1766 in SImode and DImode */
1767 {8, 8}, /* cost of storing MMX registers
1768 in SImode and DImode */
1769 2, /* cost of moving SSE register */
1770 {8, 8, 8}, /* cost of loading SSE registers
1771 in SImode, DImode and TImode */
1772 {8, 8, 8}, /* cost of storing SSE registers
1773 in SImode, DImode and TImode */
1774 5, /* MMX or SSE register to integer */
1775 32, /* size of l1 cache. */
1776 512, /* size of l2 cache. */
1777 64, /* size of prefetch block */
1778 6, /* number of parallel prefetches */
1779 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1780 value is increased to the perhaps more appropriate value of 5. */
1781 3, /* Branch cost */
1782 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1783 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1784 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1785 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1786 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1787 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1788 {DUMMY_STRINGOP_ALGS,
1789 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1790 {DUMMY_STRINGOP_ALGS,
1791 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1792 1, /* scalar_stmt_cost. */
1793 1, /* scalar load_cost. */
1794 1, /* scalar_store_cost. */
1795 1, /* vec_stmt_cost. */
1796 1, /* vec_to_scalar_cost. */
1797 1, /* scalar_to_vec_cost. */
1798 1, /* vec_align_load_cost. */
1799 2, /* vec_unalign_load_cost. */
1800 1, /* vec_store_cost. */
1801 3, /* cond_taken_branch_cost. */
1802 1, /* cond_not_taken_branch_cost. */
1803 };
1804
1805 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1806 Athlon and K8. */
1807 static const
1808 struct processor_costs generic32_cost = {
1809 COSTS_N_INSNS (1), /* cost of an add instruction */
1810 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1811 COSTS_N_INSNS (1), /* variable shift costs */
1812 COSTS_N_INSNS (1), /* constant shift costs */
1813 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1814 COSTS_N_INSNS (4), /* HI */
1815 COSTS_N_INSNS (3), /* SI */
1816 COSTS_N_INSNS (4), /* DI */
1817 COSTS_N_INSNS (2)}, /* other */
1818 0, /* cost of multiply per each bit set */
1819 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1820 COSTS_N_INSNS (26), /* HI */
1821 COSTS_N_INSNS (42), /* SI */
1822 COSTS_N_INSNS (74), /* DI */
1823 COSTS_N_INSNS (74)}, /* other */
1824 COSTS_N_INSNS (1), /* cost of movsx */
1825 COSTS_N_INSNS (1), /* cost of movzx */
1826 8, /* "large" insn */
1827 17, /* MOVE_RATIO */
1828 4, /* cost for loading QImode using movzbl */
1829 {4, 4, 4}, /* cost of loading integer registers
1830 in QImode, HImode and SImode.
1831 Relative to reg-reg move (2). */
1832 {4, 4, 4}, /* cost of storing integer registers */
1833 4, /* cost of reg,reg fld/fst */
1834 {12, 12, 12}, /* cost of loading fp registers
1835 in SFmode, DFmode and XFmode */
1836 {6, 6, 8}, /* cost of storing fp registers
1837 in SFmode, DFmode and XFmode */
1838 2, /* cost of moving MMX register */
1839 {8, 8}, /* cost of loading MMX registers
1840 in SImode and DImode */
1841 {8, 8}, /* cost of storing MMX registers
1842 in SImode and DImode */
1843 2, /* cost of moving SSE register */
1844 {8, 8, 8}, /* cost of loading SSE registers
1845 in SImode, DImode and TImode */
1846 {8, 8, 8}, /* cost of storing SSE registers
1847 in SImode, DImode and TImode */
1848 5, /* MMX or SSE register to integer */
1849 32, /* size of l1 cache. */
1850 256, /* size of l2 cache. */
1851 64, /* size of prefetch block */
1852 6, /* number of parallel prefetches */
1853 3, /* Branch cost */
1854 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1855 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1856 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1857 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1858 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1859 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1860 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1861 DUMMY_STRINGOP_ALGS},
1862 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1863 DUMMY_STRINGOP_ALGS},
1864 1, /* scalar_stmt_cost. */
1865 1, /* scalar load_cost. */
1866 1, /* scalar_store_cost. */
1867 1, /* vec_stmt_cost. */
1868 1, /* vec_to_scalar_cost. */
1869 1, /* scalar_to_vec_cost. */
1870 1, /* vec_align_load_cost. */
1871 2, /* vec_unalign_load_cost. */
1872 1, /* vec_store_cost. */
1873 3, /* cond_taken_branch_cost. */
1874 1, /* cond_not_taken_branch_cost. */
1875 };
1876
1877 /* Set by -mtune. */
1878 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1879
1880 /* Set by -mtune or -Os. */
1881 const struct processor_costs *ix86_cost = &pentium_cost;
1882
1883 /* Processor feature/optimization bitmasks. */
1884 #define m_386 (1<<PROCESSOR_I386)
1885 #define m_486 (1<<PROCESSOR_I486)
1886 #define m_PENT (1<<PROCESSOR_PENTIUM)
1887 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1888 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1889 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1890 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1891 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1892 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1893 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1894 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1895 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1896 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1897 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1898 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1899 #define m_ATOM (1<<PROCESSOR_ATOM)
1900
1901 #define m_GEODE (1<<PROCESSOR_GEODE)
1902 #define m_K6 (1<<PROCESSOR_K6)
1903 #define m_K6_GEODE (m_K6 | m_GEODE)
1904 #define m_K8 (1<<PROCESSOR_K8)
1905 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1906 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1907 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1908 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1909 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1910 #define m_BDVER (m_BDVER1 | m_BDVER2)
1911 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1912 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1)
1913
1914 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1915 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1916
1917 /* Generic instruction choice should be the common subset of supported CPUs
1918 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1919 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1920
1921 /* Feature tests against the various tunings. */
1922 unsigned char ix86_tune_features[X86_TUNE_LAST];
1923
1924 /* Feature tests against the various tunings used to create ix86_tune_features
1925 based on the processor mask. */
1926 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1927 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1928 negatively, so enabling it for Generic64 seems like a good code size
1929 tradeoff. We can't enable it for 32-bit generic because it does not
1930 work well with PPro based chips. */
1931 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1932
1933 /* X86_TUNE_PUSH_MEMORY */
1934 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1935
1936 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1937 m_486 | m_PENT,
1938
1939 /* X86_TUNE_UNROLL_STRLEN */
1940 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1941
1942 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1943 on simulation results. But after P4 was made, no performance benefit
1944 was observed with branch hints. It also increases the code size.
1945 As a result, icc never generates branch hints. */
1946 0,
1947
1948 /* X86_TUNE_DOUBLE_WITH_ADD */
1949 ~m_386,
1950
1951 /* X86_TUNE_USE_SAHF */
1952 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC,
1953
1954 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1955 partial dependencies. */
1956 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1957
1958 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1959 register stalls on Generic32 compilation setting as well. However
1960 in current implementation the partial register stalls are not eliminated
1961 very well - they can be introduced via subregs synthesized by combine
1962 and can happen in caller/callee saving sequences. Because this option
1963 pays back little on PPro based chips and is in conflict with partial reg
1964 dependencies used by Athlon/P4 based chips, it is better to leave it off
1965 for generic32 for now. */
1966 m_PPRO,
1967
1968 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1969 m_CORE2I7 | m_GENERIC,
1970
1971 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
1972 on 16-bit immediate moves into memory on Core2 and Corei7. */
1973 m_CORE2I7 | m_GENERIC,
1974
1975 /* X86_TUNE_USE_HIMODE_FIOP */
1976 m_386 | m_486 | m_K6_GEODE,
1977
1978 /* X86_TUNE_USE_SIMODE_FIOP */
1979 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1980
1981 /* X86_TUNE_USE_MOV0 */
1982 m_K6,
1983
1984 /* X86_TUNE_USE_CLTD */
1985 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1986
1987 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1988 m_PENT4,
1989
1990 /* X86_TUNE_SPLIT_LONG_MOVES */
1991 m_PPRO,
1992
1993 /* X86_TUNE_READ_MODIFY_WRITE */
1994 ~m_PENT,
1995
1996 /* X86_TUNE_READ_MODIFY */
1997 ~(m_PENT | m_PPRO),
1998
1999 /* X86_TUNE_PROMOTE_QIMODE */
2000 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2001
2002 /* X86_TUNE_FAST_PREFIX */
2003 ~(m_386 | m_486 | m_PENT),
2004
2005 /* X86_TUNE_SINGLE_STRINGOP */
2006 m_386 | m_P4_NOCONA,
2007
2008 /* X86_TUNE_QIMODE_MATH */
2009 ~0,
2010
2011 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2012 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
2013 might be considered for Generic32 if our scheme for avoiding partial
2014 stalls were more effective. */
2015 ~m_PPRO,
2016
2017 /* X86_TUNE_PROMOTE_QI_REGS */
2018 0,
2019
2020 /* X86_TUNE_PROMOTE_HI_REGS */
2021 m_PPRO,
2022
2023 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2024 over esp addition. */
2025 m_386 | m_486 | m_PENT | m_PPRO,
2026
2027 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2028 over esp addition. */
2029 m_PENT,
2030
2031 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2032 over esp subtraction. */
2033 m_386 | m_486 | m_PENT | m_K6_GEODE,
2034
2035 /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
2036 over esp subtraction. */
2037 m_PENT | m_K6_GEODE,
2038
2039 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2040 for DFmode copies */
2041 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2042
2043 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2044 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2045
2046 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2047 conflict here between PPro/Pentium4 based chips that treat 128bit
2048 SSE registers as single units and K8 based chips that divide SSE
2049 registers into two 64bit halves. This knob promotes all store destinations
2050 to be 128bit to allow register renaming on 128bit SSE units, but usually
2051 results in one extra microop on 64bit SSE units. Experimental results
2052 show that disabling this option on P4 brings over a 20% SPECfp regression,
2053 while enabling it on K8 brings roughly a 2.4% regression that can be partly
2054 masked by careful scheduling of moves. */
2055 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2056
2057 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2058 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER1,
2059
2060 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2061 m_COREI7 | m_BDVER,
2062
2063 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2064 m_BDVER,
2065
2066 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2067 are resolved on SSE register parts instead of whole registers, so we may
2068 maintain just lower part of scalar values in proper format leaving the
2069 upper part undefined. */
2070 m_ATHLON_K8,
2071
2072 /* X86_TUNE_SSE_TYPELESS_STORES */
2073 m_AMD_MULTIPLE,
2074
2075 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2076 m_PPRO | m_P4_NOCONA,
2077
2078 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2079 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2080
2081 /* X86_TUNE_PROLOGUE_USING_MOVE */
2082 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2083
2084 /* X86_TUNE_EPILOGUE_USING_MOVE */
2085 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2086
2087 /* X86_TUNE_SHIFT1 */
2088 ~m_486,
2089
2090 /* X86_TUNE_USE_FFREEP */
2091 m_AMD_MULTIPLE,
2092
2093 /* X86_TUNE_INTER_UNIT_MOVES */
2094 ~(m_AMD_MULTIPLE | m_GENERIC),
2095
2096 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2097 ~(m_AMDFAM10 | m_BDVER),
2098
2099 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2100 than 4 branch instructions in the 16 byte window. */
2101 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2102
2103 /* X86_TUNE_SCHEDULE */
2104 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2105
2106 /* X86_TUNE_USE_BT */
2107 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2108
2109 /* X86_TUNE_USE_INCDEC */
2110 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2111
2112 /* X86_TUNE_PAD_RETURNS */
2113 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2114
2115 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
2116 m_ATOM,
2117
2118 /* X86_TUNE_EXT_80387_CONSTANTS */
2119 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2120
2121 /* X86_TUNE_SHORTEN_X87_SSE */
2122 ~m_K8,
2123
2124 /* X86_TUNE_AVOID_VECTOR_DECODE */
2125 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2126
2127 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2128 and SImode multiplies, but the 386 and 486 do HImode multiplies faster. */
2129 ~(m_386 | m_486),
2130
2131 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2132 vector path on AMD machines. */
2133 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2134
2135 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2136 machines. */
2137 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2138
2139 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2140 than a MOV. */
2141 m_PENT,
2142
2143 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2144 but one byte longer. */
2145 m_PENT,
2146
2147 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
2148 operand that cannot be represented using a modRM byte. The XOR
2149 replacement is long decoded, so this split helps here as well. */
2150 m_K6,
2151
2152 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2153 from FP to FP. */
2154 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2155
2156 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2157 from integer to FP. */
2158 m_AMDFAM10,
2159
2160 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2161 with a subsequent conditional jump instruction into a single
2162 compare-and-branch uop. */
2163 m_BDVER,
2164
2165 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2166 will impact LEA instruction selection. */
2167 m_ATOM,
2168
2169 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2170 instructions. */
2171 ~m_ATOM,
2172
2173 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2174 at -O3. For the moment, the prefetching seems badly tuned for Intel
2175 chips. */
2176 m_K6_GEODE | m_AMD_MULTIPLE,
2177
2178 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2179 the auto-vectorizer. */
2180 m_BDVER,
2181
2182 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2183 during reassociation of integer computation. */
2184 m_ATOM,
2185
2186 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2187 during reassociation of fp computation. */
2188 m_ATOM
2189 };
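/* Illustrative sketch (not part of the table above): the per-processor masks
   are folded into the boolean ix86_tune_features[] array once the tuning
   target is known, roughly as done later in ix86_option_override_internal:

     ix86_tune_mask = 1u << ix86_tune;
     for (i = 0; i < X86_TUNE_LAST; ++i)
       ix86_tune_features[i]
         = !!(initial_ix86_tune_features[i] & ix86_tune_mask);

   so a flag such as X86_TUNE_USE_LEAVE is enabled exactly when the bit of the
   selected processor (e.g. m_K8) is present in its mask.  */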
2190
2191 /* Feature tests against the various architecture variations. */
2192 unsigned char ix86_arch_features[X86_ARCH_LAST];
2193
2194 /* Feature tests against the various architecture variations, used to create
2195 ix86_arch_features based on the processor mask. */
2196 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2197 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2198 ~(m_386 | m_486 | m_PENT | m_K6),
2199
2200 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2201 ~m_386,
2202
2203 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2204 ~(m_386 | m_486),
2205
2206 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2207 ~m_386,
2208
2209 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2210 ~m_386,
2211 };
2212
2213 static const unsigned int x86_accumulate_outgoing_args
2214 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2215
2216 static const unsigned int x86_arch_always_fancy_math_387
2217 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2218
2219 static const unsigned int x86_avx256_split_unaligned_load
2220 = m_COREI7 | m_GENERIC;
2221
2222 static const unsigned int x86_avx256_split_unaligned_store
2223 = m_COREI7 | m_BDVER | m_GENERIC;
2224
2225 /* In case the average insn count for a single function invocation is
2226 lower than this constant, emit fast (but longer) prologue and
2227 epilogue code. */
2228 #define FAST_PROLOGUE_INSN_COUNT 20
2229
2230 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
2231 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2232 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2233 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2234
2235 /* Array of the smallest class containing reg number REGNO, indexed by
2236 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2237
2238 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2239 {
2240 /* ax, dx, cx, bx */
2241 AREG, DREG, CREG, BREG,
2242 /* si, di, bp, sp */
2243 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2244 /* FP registers */
2245 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2246 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2247 /* arg pointer */
2248 NON_Q_REGS,
2249 /* flags, fpsr, fpcr, frame */
2250 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2251 /* SSE registers */
2252 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2253 SSE_REGS, SSE_REGS,
2254 /* MMX registers */
2255 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2256 MMX_REGS, MMX_REGS,
2257 /* REX registers */
2258 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2259 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2260 /* SSE REX registers */
2261 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2262 SSE_REGS, SSE_REGS,
2263 };
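/* For example (illustrative only), REGNO_REG_CLASS (AX_REG) yields AREG,
   while REGNO_REG_CLASS (SP_REG) yields NON_Q_REGS since %esp has no
   addressable 8-bit subregister.  */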
2264
2265 /* The "default" register map used in 32bit mode. */
2266
2267 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2268 {
2269 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2270 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2271 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2272 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2273 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2274 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2275 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2276 };
2277
2278 /* The "default" register map used in 64bit mode. */
2279
2280 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2281 {
2282 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2283 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2284 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2285 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2286 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2287 8,9,10,11,12,13,14,15, /* extended integer registers */
2288 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2289 };
2290
2291 /* Define the register numbers to be used in Dwarf debugging information.
2292 The SVR4 reference port C compiler uses the following register numbers
2293 in its Dwarf output code:
2294 0 for %eax (gcc regno = 0)
2295 1 for %ecx (gcc regno = 2)
2296 2 for %edx (gcc regno = 1)
2297 3 for %ebx (gcc regno = 3)
2298 4 for %esp (gcc regno = 7)
2299 5 for %ebp (gcc regno = 6)
2300 6 for %esi (gcc regno = 4)
2301 7 for %edi (gcc regno = 5)
2302 The following three DWARF register numbers are never generated by
2303 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2304 believes these numbers have these meanings.
2305 8 for %eip (no gcc equivalent)
2306 9 for %eflags (gcc regno = 17)
2307 10 for %trapno (no gcc equivalent)
2308 It is not at all clear how we should number the FP stack registers
2309 for the x86 architecture. If the version of SDB on x86/svr4 were
2310 a bit less brain dead with respect to floating-point then we would
2311 have a precedent to follow with respect to DWARF register numbers
2312 for x86 FP registers, but the SDB on x86/svr4 is so completely
2313 broken with respect to FP registers that it is hardly worth thinking
2314 of it as something to strive for compatibility with.
2315 The version of x86/svr4 SDB I have at the moment does (partially)
2316 seem to believe that DWARF register number 11 is associated with
2317 the x86 register %st(0), but that's about all. Higher DWARF
2318 register numbers don't seem to be associated with anything in
2319 particular, and even for DWARF regno 11, SDB only seems to under-
2320 stand that it should say that a variable lives in %st(0) (when
2321 asked via an `=' command) if we said it was in DWARF regno 11,
2322 but SDB still prints garbage when asked for the value of the
2323 variable in question (via a `/' command).
2324 (Also note that the labels SDB prints for various FP stack regs
2325 when doing an `x' command are all wrong.)
2326 Note that these problems generally don't affect the native SVR4
2327 C compiler because it doesn't allow the use of -O with -g and
2328 because when it is *not* optimizing, it allocates a memory
2329 location for each floating-point variable, and the memory
2330 location is what gets described in the DWARF AT_location
2331 attribute for the variable in question.
2332 Regardless of the severe mental illness of the x86/svr4 SDB, we
2333 do something sensible here and we use the following DWARF
2334 register numbers. Note that these are all stack-top-relative
2335 numbers.
2336 11 for %st(0) (gcc regno = 8)
2337 12 for %st(1) (gcc regno = 9)
2338 13 for %st(2) (gcc regno = 10)
2339 14 for %st(3) (gcc regno = 11)
2340 15 for %st(4) (gcc regno = 12)
2341 16 for %st(5) (gcc regno = 13)
2342 17 for %st(6) (gcc regno = 14)
2343 18 for %st(7) (gcc regno = 15)
2344 */
2345 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2346 {
2347 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2348 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2349 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2350 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2351 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2352 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2353 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2354 };
2355
2356 /* Define parameter passing and return registers. */
2357
2358 static int const x86_64_int_parameter_registers[6] =
2359 {
2360 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2361 };
2362
2363 static int const x86_64_ms_abi_int_parameter_registers[4] =
2364 {
2365 CX_REG, DX_REG, R8_REG, R9_REG
2366 };
2367
2368 static int const x86_64_int_return_registers[4] =
2369 {
2370 AX_REG, DX_REG, DI_REG, SI_REG
2371 };
2372
2373 /* Define the structure for the machine field in struct function. */
2374
2375 struct GTY(()) stack_local_entry {
2376 unsigned short mode;
2377 unsigned short n;
2378 rtx rtl;
2379 struct stack_local_entry *next;
2380 };
2381
2382 /* Structure describing stack frame layout.
2383 Stack grows downward:
2384
2385 [arguments]
2386 <- ARG_POINTER
2387 saved pc
2388
2389 saved static chain if ix86_static_chain_on_stack
2390
2391 saved frame pointer if frame_pointer_needed
2392 <- HARD_FRAME_POINTER
2393 [saved regs]
2394 <- regs_save_offset
2395 [padding0]
2396
2397 [saved SSE regs]
2398 <- sse_regs_save_offset
2399 [padding1] |
2400 | <- FRAME_POINTER
2401 [va_arg registers] |
2402 |
2403 [frame] |
2404 |
2405 [padding2] | = to_allocate
2406 <- STACK_POINTER
2407 */
2408 struct ix86_frame
2409 {
2410 int nsseregs;
2411 int nregs;
2412 int va_arg_size;
2413 int red_zone_size;
2414 int outgoing_arguments_size;
2415
2416 /* The offsets relative to ARG_POINTER. */
2417 HOST_WIDE_INT frame_pointer_offset;
2418 HOST_WIDE_INT hard_frame_pointer_offset;
2419 HOST_WIDE_INT stack_pointer_offset;
2420 HOST_WIDE_INT hfp_save_offset;
2421 HOST_WIDE_INT reg_save_offset;
2422 HOST_WIDE_INT sse_reg_save_offset;
2423
2424 /* When save_regs_using_mov is set, emit prologue using
2425 move instead of push instructions. */
2426 bool save_regs_using_mov;
2427 };
2428
2429 /* Which CPU we are scheduling for. */
2430 enum attr_cpu ix86_schedule;
2431
2432 /* Which CPU we are optimizing for. */
2433 enum processor_type ix86_tune;
2434
2435 /* Which instruction set architecture to use. */
2436 enum processor_type ix86_arch;
2437
2438 /* True if the SSE prefetch instruction is not a NOP. */
2439 int x86_prefetch_sse;
2440
2441 /* -mstackrealign option */
2442 static const char ix86_force_align_arg_pointer_string[]
2443 = "force_align_arg_pointer";
2444
2445 static rtx (*ix86_gen_leave) (void);
2446 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2447 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2448 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2449 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2450 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2451 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2452 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2453 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2454 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2455 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2456 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2457
2458 /* Preferred alignment for stack boundary in bits. */
2459 unsigned int ix86_preferred_stack_boundary;
2460
2461 /* Alignment for incoming stack boundary in bits specified at
2462 command line. */
2463 static unsigned int ix86_user_incoming_stack_boundary;
2464
2465 /* Default alignment for incoming stack boundary in bits. */
2466 static unsigned int ix86_default_incoming_stack_boundary;
2467
2468 /* Alignment for incoming stack boundary in bits. */
2469 unsigned int ix86_incoming_stack_boundary;
2470
2471 /* Calling abi specific va_list type nodes. */
2472 static GTY(()) tree sysv_va_list_type_node;
2473 static GTY(()) tree ms_va_list_type_node;
2474
2475 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2476 char internal_label_prefix[16];
2477 int internal_label_prefix_len;
2478
2479 /* Fence to use after loop using movnt. */
2480 tree x86_mfence;
2481
2482 /* Register class used for passing a given 64bit part of the argument.
2483 These represent classes as documented by the psABI, with the exception
2484 of the SSESF and SSEDF classes, which are basically the SSE class; gcc
2485 just uses SFmode or DFmode moves instead of DImode to avoid reformatting penalties.
2486 
2487 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2488 whenever possible (the upper half contains padding). */
2489 enum x86_64_reg_class
2490 {
2491 X86_64_NO_CLASS,
2492 X86_64_INTEGER_CLASS,
2493 X86_64_INTEGERSI_CLASS,
2494 X86_64_SSE_CLASS,
2495 X86_64_SSESF_CLASS,
2496 X86_64_SSEDF_CLASS,
2497 X86_64_SSEUP_CLASS,
2498 X86_64_X87_CLASS,
2499 X86_64_X87UP_CLASS,
2500 X86_64_COMPLEX_X87_CLASS,
2501 X86_64_MEMORY_CLASS
2502 };
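/* For example (a sketch of the psABI classification, not code from this
   file): a structure such as

     struct s { double d; int i; };

   passed by value in 64-bit mode occupies two eightbytes; the first is
   classified as X86_64_SSEDF_CLASS (so D goes in an SSE register) and the
   second as X86_64_INTEGERSI_CLASS (so I goes in a general register and is
   moved in SImode).  */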
2503
2504 #define MAX_CLASSES 4
2505
2506 /* Table of constants used by fldpi, fldln2, etc.... */
2507 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2508 static bool ext_80387_constants_init = 0;
2509
2510 \f
2511 static struct machine_function * ix86_init_machine_status (void);
2512 static rtx ix86_function_value (const_tree, const_tree, bool);
2513 static bool ix86_function_value_regno_p (const unsigned int);
2514 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2515 const_tree);
2516 static rtx ix86_static_chain (const_tree, bool);
2517 static int ix86_function_regparm (const_tree, const_tree);
2518 static void ix86_compute_frame_layout (struct ix86_frame *);
2519 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2520 rtx, rtx, int);
2521 static void ix86_add_new_builtins (HOST_WIDE_INT);
2522 static tree ix86_canonical_va_list_type (tree);
2523 static void predict_jump (int);
2524 static unsigned int split_stack_prologue_scratch_regno (void);
2525 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2526
2527 enum ix86_function_specific_strings
2528 {
2529 IX86_FUNCTION_SPECIFIC_ARCH,
2530 IX86_FUNCTION_SPECIFIC_TUNE,
2531 IX86_FUNCTION_SPECIFIC_MAX
2532 };
2533
2534 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2535 const char *, enum fpmath_unit, bool);
2536 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2537 static void ix86_function_specific_save (struct cl_target_option *);
2538 static void ix86_function_specific_restore (struct cl_target_option *);
2539 static void ix86_function_specific_print (FILE *, int,
2540 struct cl_target_option *);
2541 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2542 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2543 struct gcc_options *);
2544 static bool ix86_can_inline_p (tree, tree);
2545 static void ix86_set_current_function (tree);
2546 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2547
2548 static enum calling_abi ix86_function_abi (const_tree);
2549
2550 \f
2551 #ifndef SUBTARGET32_DEFAULT_CPU
2552 #define SUBTARGET32_DEFAULT_CPU "i386"
2553 #endif
2554
2555 /* The svr4 ABI for the i386 says that records and unions are returned
2556 in memory. */
2557 #ifndef DEFAULT_PCC_STRUCT_RETURN
2558 #define DEFAULT_PCC_STRUCT_RETURN 1
2559 #endif
2560
2561 /* Whether -mtune= or -march= were specified */
2562 static int ix86_tune_defaulted;
2563 static int ix86_arch_specified;
2564
2565 /* Vectorization library interface and handlers. */
2566 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2567
2568 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2569 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2570
2571 /* Processor target table, indexed by processor number */
2572 struct ptt
2573 {
2574 const struct processor_costs *cost; /* Processor costs */
2575 const int align_loop; /* Default alignments. */
2576 const int align_loop_max_skip;
2577 const int align_jump;
2578 const int align_jump_max_skip;
2579 const int align_func;
2580 };
2581
2582 static const struct ptt processor_target_table[PROCESSOR_max] =
2583 {
2584 {&i386_cost, 4, 3, 4, 3, 4},
2585 {&i486_cost, 16, 15, 16, 15, 16},
2586 {&pentium_cost, 16, 7, 16, 7, 16},
2587 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2588 {&geode_cost, 0, 0, 0, 0, 0},
2589 {&k6_cost, 32, 7, 32, 7, 32},
2590 {&athlon_cost, 16, 7, 16, 7, 16},
2591 {&pentium4_cost, 0, 0, 0, 0, 0},
2592 {&k8_cost, 16, 7, 16, 7, 16},
2593 {&nocona_cost, 0, 0, 0, 0, 0},
2594 /* Core 2 32-bit. */
2595 {&generic32_cost, 16, 10, 16, 10, 16},
2596 /* Core 2 64-bit. */
2597 {&generic64_cost, 16, 10, 16, 10, 16},
2598 /* Core i7 32-bit. */
2599 {&generic32_cost, 16, 10, 16, 10, 16},
2600 /* Core i7 64-bit. */
2601 {&generic64_cost, 16, 10, 16, 10, 16},
2602 {&generic32_cost, 16, 7, 16, 7, 16},
2603 {&generic64_cost, 16, 10, 16, 10, 16},
2604 {&amdfam10_cost, 32, 24, 32, 7, 32},
2605 {&bdver1_cost, 32, 24, 32, 7, 32},
2606 {&bdver2_cost, 32, 24, 32, 7, 32},
2607 {&btver1_cost, 32, 24, 32, 7, 32},
2608 {&atom_cost, 16, 15, 16, 7, 16}
2609 };
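/* Illustrative use (a sketch): the alignment defaults above are looked up by
   the active tuning, e.g. when the user did not specify -falign-loops:

     align_loops = processor_target_table[ix86_tune].align_loop;
     align_loops_max_skip
       = processor_target_table[ix86_tune].align_loop_max_skip;
 */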
2610
2611 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2612 {
2613 "generic",
2614 "i386",
2615 "i486",
2616 "pentium",
2617 "pentium-mmx",
2618 "pentiumpro",
2619 "pentium2",
2620 "pentium3",
2621 "pentium4",
2622 "pentium-m",
2623 "prescott",
2624 "nocona",
2625 "core2",
2626 "corei7",
2627 "atom",
2628 "geode",
2629 "k6",
2630 "k6-2",
2631 "k6-3",
2632 "athlon",
2633 "athlon-4",
2634 "k8",
2635 "amdfam10",
2636 "bdver1",
2637 "bdver2",
2638 "btver1"
2639 };
2640 \f
2641 /* Return true if a red-zone is in use. */
2642
2643 static inline bool
2644 ix86_using_red_zone (void)
2645 {
2646 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2647 }
2648 \f
2649 /* Return a string that documents the current -m options. The caller is
2650 responsible for freeing the string. */
2651
2652 static char *
2653 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2654 const char *tune, enum fpmath_unit fpmath,
2655 bool add_nl_p)
2656 {
2657 struct ix86_target_opts
2658 {
2659 const char *option; /* option string */
2660 HOST_WIDE_INT mask; /* isa mask options */
2661 };
2662
2663 /* This table is ordered so that options like -msse4.2 that imply
2664 preceding options are matched first. */
2665 static struct ix86_target_opts isa_opts[] =
2666 {
2667 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2668 { "-mfma", OPTION_MASK_ISA_FMA },
2669 { "-mxop", OPTION_MASK_ISA_XOP },
2670 { "-mlwp", OPTION_MASK_ISA_LWP },
2671 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2672 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2673 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2674 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2675 { "-msse3", OPTION_MASK_ISA_SSE3 },
2676 { "-msse2", OPTION_MASK_ISA_SSE2 },
2677 { "-msse", OPTION_MASK_ISA_SSE },
2678 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2679 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2680 { "-mmmx", OPTION_MASK_ISA_MMX },
2681 { "-mabm", OPTION_MASK_ISA_ABM },
2682 { "-mbmi", OPTION_MASK_ISA_BMI },
2683 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2684 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2685 { "-mhle", OPTION_MASK_ISA_HLE },
2686 { "-mtbm", OPTION_MASK_ISA_TBM },
2687 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2688 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2689 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2690 { "-maes", OPTION_MASK_ISA_AES },
2691 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2692 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2693 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2694 { "-mf16c", OPTION_MASK_ISA_F16C },
2695 { "-mrtm", OPTION_MASK_ISA_RTM },
2696 };
2697
2698 /* Flag options. */
2699 static struct ix86_target_opts flag_opts[] =
2700 {
2701 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2702 { "-m80387", MASK_80387 },
2703 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2704 { "-malign-double", MASK_ALIGN_DOUBLE },
2705 { "-mcld", MASK_CLD },
2706 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2707 { "-mieee-fp", MASK_IEEE_FP },
2708 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2709 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2710 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2711 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2712 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2713 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2714 { "-mno-red-zone", MASK_NO_RED_ZONE },
2715 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2716 { "-mrecip", MASK_RECIP },
2717 { "-mrtd", MASK_RTD },
2718 { "-msseregparm", MASK_SSEREGPARM },
2719 { "-mstack-arg-probe", MASK_STACK_PROBE },
2720 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2721 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2722 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2723 { "-mvzeroupper", MASK_VZEROUPPER },
2724 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2725 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2726 { "-mprefer-avx128", MASK_PREFER_AVX128},
2727 };
2728
2729 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2730
2731 char isa_other[40];
2732 char target_other[40];
2733 unsigned num = 0;
2734 unsigned i, j;
2735 char *ret;
2736 char *ptr;
2737 size_t len;
2738 size_t line_len;
2739 size_t sep_len;
2740 const char *abi;
2741
2742 memset (opts, '\0', sizeof (opts));
2743
2744 /* Add -march= option. */
2745 if (arch)
2746 {
2747 opts[num][0] = "-march=";
2748 opts[num++][1] = arch;
2749 }
2750
2751 /* Add -mtune= option. */
2752 if (tune)
2753 {
2754 opts[num][0] = "-mtune=";
2755 opts[num++][1] = tune;
2756 }
2757
2758 /* Add -m32/-m64/-mx32. */
2759 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2760 {
2761 if ((isa & OPTION_MASK_ABI_64) != 0)
2762 abi = "-m64";
2763 else
2764 abi = "-mx32";
2765 isa &= ~ (OPTION_MASK_ISA_64BIT
2766 | OPTION_MASK_ABI_64
2767 | OPTION_MASK_ABI_X32);
2768 }
2769 else
2770 abi = "-m32";
2771 opts[num++][0] = abi;
2772
2773 /* Pick out the ISA options. */
2774 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2775 {
2776 if ((isa & isa_opts[i].mask) != 0)
2777 {
2778 opts[num++][0] = isa_opts[i].option;
2779 isa &= ~ isa_opts[i].mask;
2780 }
2781 }
2782
2783 if (isa && add_nl_p)
2784 {
2785 opts[num++][0] = isa_other;
2786 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2787 isa);
2788 }
2789
2790 /* Add flag options. */
2791 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2792 {
2793 if ((flags & flag_opts[i].mask) != 0)
2794 {
2795 opts[num++][0] = flag_opts[i].option;
2796 flags &= ~ flag_opts[i].mask;
2797 }
2798 }
2799
2800 if (flags && add_nl_p)
2801 {
2802 opts[num++][0] = target_other;
2803 sprintf (target_other, "(other flags: %#x)", flags);
2804 }
2805
2806 /* Add -fpmath= option. */
2807 if (fpmath)
2808 {
2809 opts[num][0] = "-mfpmath=";
2810 switch ((int) fpmath)
2811 {
2812 case FPMATH_387:
2813 opts[num++][1] = "387";
2814 break;
2815
2816 case FPMATH_SSE:
2817 opts[num++][1] = "sse";
2818 break;
2819
2820 case FPMATH_387 | FPMATH_SSE:
2821 opts[num++][1] = "sse+387";
2822 break;
2823
2824 default:
2825 gcc_unreachable ();
2826 }
2827 }
2828
2829 /* Any options? */
2830 if (num == 0)
2831 return NULL;
2832
2833 gcc_assert (num < ARRAY_SIZE (opts));
2834
2835 /* Size the string. */
2836 len = 0;
2837 sep_len = (add_nl_p) ? 3 : 1;
2838 for (i = 0; i < num; i++)
2839 {
2840 len += sep_len;
2841 for (j = 0; j < 2; j++)
2842 if (opts[i][j])
2843 len += strlen (opts[i][j]);
2844 }
2845
2846 /* Build the string. */
2847 ret = ptr = (char *) xmalloc (len);
2848 line_len = 0;
2849
2850 for (i = 0; i < num; i++)
2851 {
2852 size_t len2[2];
2853
2854 for (j = 0; j < 2; j++)
2855 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2856
2857 if (i != 0)
2858 {
2859 *ptr++ = ' ';
2860 line_len++;
2861
2862 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2863 {
2864 *ptr++ = '\\';
2865 *ptr++ = '\n';
2866 line_len = 0;
2867 }
2868 }
2869
2870 for (j = 0; j < 2; j++)
2871 if (opts[i][j])
2872 {
2873 memcpy (ptr, opts[i][j], len2[j]);
2874 ptr += len2[j];
2875 line_len += len2[j];
2876 }
2877 }
2878
2879 *ptr = '\0';
2880 gcc_assert (ret + len >= ptr);
2881
2882 return ret;
2883 }
2884
2885 /* Return true if profiling code should be emitted before the
2886 prologue, otherwise false. On x86 this is the case when -mfentry
2887 is in effect (e.g. for "hotfix"/hot-patching support). */
2888 static bool
2889 ix86_profile_before_prologue (void)
2890 {
2891 return flag_fentry != 0;
2892 }
2893
2894 /* Function that is callable from the debugger to print the current
2895 options. */
2896 void
2897 ix86_debug_options (void)
2898 {
2899 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2900 ix86_arch_string, ix86_tune_string,
2901 ix86_fpmath, true);
2902
2903 if (opts)
2904 {
2905 fprintf (stderr, "%s\n\n", opts);
2906 free (opts);
2907 }
2908 else
2909 fputs ("<no options>\n\n", stderr);
2910
2911 return;
2912 }
2913 \f
2914 /* Override various settings based on options. If MAIN_ARGS_P, the
2915 options are from the command line, otherwise they are from
2916 attributes. */
2917
2918 static void
2919 ix86_option_override_internal (bool main_args_p)
2920 {
2921 int i;
2922 unsigned int ix86_arch_mask, ix86_tune_mask;
2923 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2924 const char *prefix;
2925 const char *suffix;
2926 const char *sw;
2927
2928 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2929 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2930 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2931 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2932 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2933 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2934 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2935 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2936 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2937 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2938 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2939 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2940 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2941 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2942 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2943 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2944 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2945 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2946 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2947 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2948 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2949 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2950 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2951 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2952 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2953 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2954 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2955 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2956 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2957 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2958 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2959 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2960 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2961 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
2962 /* if this reaches 64, need to widen struct pta flags below */
2963
2964 static struct pta
2965 {
2966 const char *const name; /* processor name or nickname. */
2967 const enum processor_type processor;
2968 const enum attr_cpu schedule;
2969 const unsigned HOST_WIDE_INT flags;
2970 }
2971 const processor_alias_table[] =
2972 {
2973 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2974 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2975 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2976 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2977 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2978 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2979 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2980 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2981 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2982 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2983 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2984 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
2985 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2986 PTA_MMX | PTA_SSE},
2987 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2988 PTA_MMX | PTA_SSE},
2989 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2990 PTA_MMX | PTA_SSE | PTA_SSE2},
2991 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2992 PTA_MMX | PTA_SSE | PTA_SSE2},
2993 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2994 PTA_MMX | PTA_SSE | PTA_SSE2},
2995 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2996 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2997 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2998 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2999 | PTA_CX16 | PTA_NO_SAHF},
3000 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
3001 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3002 | PTA_SSSE3 | PTA_CX16},
3003 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
3004 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3005 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16},
3006 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
3007 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3008 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3009 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
3010 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
3011 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3012 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3013 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3014 | PTA_RDRND | PTA_F16C},
3015 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
3016 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3017 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
3018 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3019 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
3020 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE},
3021 {"atom", PROCESSOR_ATOM, CPU_ATOM,
3022 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3023 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
3024 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3025 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3026 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3027 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3028 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3029 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3030 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3031 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3032 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3033 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3034 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3035 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3036 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3037 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3038 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3039 {"x86-64", PROCESSOR_K8, CPU_K8,
3040 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3041 {"k8", PROCESSOR_K8, CPU_K8,
3042 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3043 | PTA_SSE2 | PTA_NO_SAHF},
3044 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3045 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3046 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3047 {"opteron", PROCESSOR_K8, CPU_K8,
3048 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3049 | PTA_SSE2 | PTA_NO_SAHF},
3050 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3051 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3052 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3053 {"athlon64", PROCESSOR_K8, CPU_K8,
3054 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3055 | PTA_SSE2 | PTA_NO_SAHF},
3056 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3057 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3058 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3059 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3060 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3061 | PTA_SSE2 | PTA_NO_SAHF},
3062 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3063 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3064 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3065 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3066 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3067 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3068 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3069 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3070 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3071 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3072 | PTA_XOP | PTA_LWP},
3073 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3074 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3075 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3076 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3077 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3078 | PTA_FMA},
3079 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3080 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3081 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16},
3082 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3083 PTA_HLE /* flags are only used for -march switch. */ },
3084 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3085 PTA_64BIT
3086 | PTA_HLE /* flags are only used for -march switch. */ },
3087 };
3088
3089 /* -mrecip options. */
3090 static struct
3091 {
3092 const char *string; /* option name */
3093 unsigned int mask; /* mask bits to set */
3094 }
3095 const recip_options[] =
3096 {
3097 { "all", RECIP_MASK_ALL },
3098 { "none", RECIP_MASK_NONE },
3099 { "div", RECIP_MASK_DIV },
3100 { "sqrt", RECIP_MASK_SQRT },
3101 { "vec-div", RECIP_MASK_VEC_DIV },
3102 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3103 };
3104
3105 int const pta_size = ARRAY_SIZE (processor_alias_table);
3106
3107 /* Set up prefix/suffix so the error messages refer to either the command
3108 line argument, or the attribute(target). */
3109 if (main_args_p)
3110 {
3111 prefix = "-m";
3112 suffix = "";
3113 sw = "switch";
3114 }
3115 else
3116 {
3117 prefix = "option(\"";
3118 suffix = "\")";
3119 sw = "attribute";
3120 }
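/* For example, with these settings the diagnostic
   error ("bad value (%s) for %stune=%s %s", ...) used further down
   reads roughly as "bad value (foo) for -mtune= switch" when the bad
   value came from the command line, and as
   "bad value (foo) for option("tune=") attribute" when it came from
   attribute((target("tune=foo"))); "foo" is just a placeholder here.  */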
3121
3122 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3123 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3124 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
3125 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3126 #ifdef TARGET_BI_ARCH
3127 else
3128 {
3129 #if TARGET_BI_ARCH == 1
3130 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3131 is on and OPTION_MASK_ABI_X32 is off. We turn off
3132 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3133 -mx32. */
3134 if (TARGET_X32)
3135 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3136 #else
3137 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3138 on and OPTION_MASK_ABI_64 is off. We turn off
3139 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3140 -m64. */
3141 if (TARGET_LP64)
3142 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3143 #endif
3144 }
3145 #endif
3146
3147 if (TARGET_X32)
3148 {
3149 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3150 OPTION_MASK_ABI_64 for TARGET_X32. */
3151 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3152 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3153 }
3154 else if (TARGET_LP64)
3155 {
3156 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3157 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3158 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3159 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3160 }
3161
3162 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3163 SUBTARGET_OVERRIDE_OPTIONS;
3164 #endif
3165
3166 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3167 SUBSUBTARGET_OVERRIDE_OPTIONS;
3168 #endif
3169
3170 /* -fPIC is the default for x86_64. */
3171 if (TARGET_MACHO && TARGET_64BIT)
3172 flag_pic = 2;
3173
3174 /* Need to check -mtune=generic first. */
3175 if (ix86_tune_string)
3176 {
3177 if (!strcmp (ix86_tune_string, "generic")
3178 || !strcmp (ix86_tune_string, "i686")
3179 /* As special support for cross compilers we read -mtune=native
3180 as -mtune=generic. With native compilers we won't see the
3181 -mtune=native, as it was changed by the driver. */
3182 || !strcmp (ix86_tune_string, "native"))
3183 {
3184 if (TARGET_64BIT)
3185 ix86_tune_string = "generic64";
3186 else
3187 ix86_tune_string = "generic32";
3188 }
3189 /* If this call is for setting the option attribute, allow the
3190 generic32/generic64 that was previously set. */
3191 else if (!main_args_p
3192 && (!strcmp (ix86_tune_string, "generic32")
3193 || !strcmp (ix86_tune_string, "generic64")))
3194 ;
3195 else if (!strncmp (ix86_tune_string, "generic", 7))
3196 error ("bad value (%s) for %stune=%s %s",
3197 ix86_tune_string, prefix, suffix, sw);
3198 else if (!strcmp (ix86_tune_string, "x86-64"))
3199 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3200 "%stune=k8%s or %stune=generic%s instead as appropriate",
3201 prefix, suffix, prefix, suffix, prefix, suffix);
3202 }
3203 else
3204 {
3205 if (ix86_arch_string)
3206 ix86_tune_string = ix86_arch_string;
3207 if (!ix86_tune_string)
3208 {
3209 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3210 ix86_tune_defaulted = 1;
3211 }
3212
3213 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3214 need to use a sensible tune option. */
3215 if (!strcmp (ix86_tune_string, "generic")
3216 || !strcmp (ix86_tune_string, "x86-64")
3217 || !strcmp (ix86_tune_string, "i686"))
3218 {
3219 if (TARGET_64BIT)
3220 ix86_tune_string = "generic64";
3221 else
3222 ix86_tune_string = "generic32";
3223 }
3224 }
3225
3226 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3227 {
3228 /* rep; movq isn't available in 32-bit code. */
3229 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3230 ix86_stringop_alg = no_stringop;
3231 }
3232
3233 if (!ix86_arch_string)
3234 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3235 else
3236 ix86_arch_specified = 1;
3237
3238 if (global_options_set.x_ix86_pmode)
3239 {
3240 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3241 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3242 error ("address mode %qs not supported in the %s bit mode",
3243 TARGET_64BIT ? "short" : "long",
3244 TARGET_64BIT ? "64" : "32");
3245 }
3246 else
3247 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3248
3249 if (!global_options_set.x_ix86_abi)
3250 ix86_abi = DEFAULT_ABI;
3251
3252 if (global_options_set.x_ix86_cmodel)
3253 {
3254 switch (ix86_cmodel)
3255 {
3256 case CM_SMALL:
3257 case CM_SMALL_PIC:
3258 if (flag_pic)
3259 ix86_cmodel = CM_SMALL_PIC;
3260 if (!TARGET_64BIT)
3261 error ("code model %qs not supported in the %s bit mode",
3262 "small", "32");
3263 break;
3264
3265 case CM_MEDIUM:
3266 case CM_MEDIUM_PIC:
3267 if (flag_pic)
3268 ix86_cmodel = CM_MEDIUM_PIC;
3269 if (!TARGET_64BIT)
3270 error ("code model %qs not supported in the %s bit mode",
3271 "medium", "32");
3272 else if (TARGET_X32)
3273 error ("code model %qs not supported in x32 mode",
3274 "medium");
3275 break;
3276
3277 case CM_LARGE:
3278 case CM_LARGE_PIC:
3279 if (flag_pic)
3280 ix86_cmodel = CM_LARGE_PIC;
3281 if (!TARGET_64BIT)
3282 error ("code model %qs not supported in the %s bit mode",
3283 "large", "32");
3284 else if (TARGET_X32)
3285 error ("code model %qs not supported in x32 mode",
3286 "medium");
3287 break;
3288
3289 case CM_32:
3290 if (flag_pic)
3291 error ("code model %s does not support PIC mode", "32");
3292 if (TARGET_64BIT)
3293 error ("code model %qs not supported in the %s bit mode",
3294 "32", "64");
3295 break;
3296
3297 case CM_KERNEL:
3298 if (flag_pic)
3299 {
3300 error ("code model %s does not support PIC mode", "kernel");
3301 ix86_cmodel = CM_32;
3302 }
3303 if (!TARGET_64BIT)
3304 error ("code model %qs not supported in the %s bit mode",
3305 "kernel", "32");
3306 break;
3307
3308 default:
3309 gcc_unreachable ();
3310 }
3311 }
3312 else
3313 {
3314 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3315 use of rip-relative addressing. This eliminates fixups that
3316 would otherwise be needed if this object is to be placed in a
3317 DLL, and is essentially just as efficient as direct addressing. */
3318 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3319 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3320 else if (TARGET_64BIT)
3321 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3322 else
3323 ix86_cmodel = CM_32;
3324 }
3325 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3326 {
3327 error ("-masm=intel not supported in this configuration");
3328 ix86_asm_dialect = ASM_ATT;
3329 }
3330 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3331 sorry ("%i-bit mode not compiled in",
3332 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3333
3334 for (i = 0; i < pta_size; i++)
3335 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3336 {
3337 ix86_schedule = processor_alias_table[i].schedule;
3338 ix86_arch = processor_alias_table[i].processor;
3339 /* Default cpu tuning to the architecture. */
3340 ix86_tune = ix86_arch;
3341
3342 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3343 error ("CPU you selected does not support x86-64 "
3344 "instruction set");
3345
3346 if (processor_alias_table[i].flags & PTA_MMX
3347 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3348 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3349 if (processor_alias_table[i].flags & PTA_3DNOW
3350 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3351 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3352 if (processor_alias_table[i].flags & PTA_3DNOW_A
3353 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3354 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3355 if (processor_alias_table[i].flags & PTA_SSE
3356 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3357 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3358 if (processor_alias_table[i].flags & PTA_SSE2
3359 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3360 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3361 if (processor_alias_table[i].flags & PTA_SSE3
3362 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3363 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3364 if (processor_alias_table[i].flags & PTA_SSSE3
3365 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3366 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3367 if (processor_alias_table[i].flags & PTA_SSE4_1
3368 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3369 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3370 if (processor_alias_table[i].flags & PTA_SSE4_2
3371 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3372 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3373 if (processor_alias_table[i].flags & PTA_AVX
3374 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3375 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3376 if (processor_alias_table[i].flags & PTA_AVX2
3377 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3378 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3379 if (processor_alias_table[i].flags & PTA_FMA
3380 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3381 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3382 if (processor_alias_table[i].flags & PTA_SSE4A
3383 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3384 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3385 if (processor_alias_table[i].flags & PTA_FMA4
3386 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3387 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3388 if (processor_alias_table[i].flags & PTA_XOP
3389 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3390 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3391 if (processor_alias_table[i].flags & PTA_LWP
3392 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3393 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3394 if (processor_alias_table[i].flags & PTA_ABM
3395 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3396 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3397 if (processor_alias_table[i].flags & PTA_BMI
3398 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3399 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3400 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3401 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3402 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3403 if (processor_alias_table[i].flags & PTA_TBM
3404 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3405 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3406 if (processor_alias_table[i].flags & PTA_BMI2
3407 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3408 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3409 if (processor_alias_table[i].flags & PTA_CX16
3410 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3411 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3412 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3413 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3414 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3415 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3416 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3417 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3418 if (processor_alias_table[i].flags & PTA_MOVBE
3419 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3420 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3421 if (processor_alias_table[i].flags & PTA_AES
3422 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3423 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3424 if (processor_alias_table[i].flags & PTA_PCLMUL
3425 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3426 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3427 if (processor_alias_table[i].flags & PTA_FSGSBASE
3428 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3429 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3430 if (processor_alias_table[i].flags & PTA_RDRND
3431 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3432 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3433 if (processor_alias_table[i].flags & PTA_F16C
3434 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3435 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3436 if (processor_alias_table[i].flags & PTA_RTM
3437 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3438 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3439 if (processor_alias_table[i].flags & PTA_HLE
3440 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3441 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3442 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3443 x86_prefetch_sse = true;
3444
3445 break;
3446 }
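/* As a concrete example, -march=corei7-avx carries PTA_AVX in
   processor_alias_table, so the block above turns on
   OPTION_MASK_ISA_AVX; pairing it with an explicit -mno-avx leaves the
   flag off, because the corresponding bit is then already recorded in
   ix86_isa_flags_explicit.  */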
3447
3448 if (!strcmp (ix86_arch_string, "generic"))
3449 error ("generic CPU can be used only for %stune=%s %s",
3450 prefix, suffix, sw);
3451 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3452 error ("bad value (%s) for %sarch=%s %s",
3453 ix86_arch_string, prefix, suffix, sw);
3454
3455 ix86_arch_mask = 1u << ix86_arch;
3456 for (i = 0; i < X86_ARCH_LAST; ++i)
3457 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3458
3459 for (i = 0; i < pta_size; i++)
3460 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3461 {
3462 ix86_schedule = processor_alias_table[i].schedule;
3463 ix86_tune = processor_alias_table[i].processor;
3464 if (TARGET_64BIT)
3465 {
3466 if (!(processor_alias_table[i].flags & PTA_64BIT))
3467 {
3468 if (ix86_tune_defaulted)
3469 {
3470 ix86_tune_string = "x86-64";
3471 for (i = 0; i < pta_size; i++)
3472 if (! strcmp (ix86_tune_string,
3473 processor_alias_table[i].name))
3474 break;
3475 ix86_schedule = processor_alias_table[i].schedule;
3476 ix86_tune = processor_alias_table[i].processor;
3477 }
3478 else
3479 error ("CPU you selected does not support x86-64 "
3480 "instruction set");
3481 }
3482 }
3483 else
3484 {
3485 /* Adjust tuning when compiling for 32-bit ABI. */
3486 switch (ix86_tune)
3487 {
3488 case PROCESSOR_GENERIC64:
3489 ix86_tune = PROCESSOR_GENERIC32;
3490 ix86_schedule = CPU_PENTIUMPRO;
3491 break;
3492
3493 case PROCESSOR_CORE2_64:
3494 ix86_tune = PROCESSOR_CORE2_32;
3495 break;
3496
3497 case PROCESSOR_COREI7_64:
3498 ix86_tune = PROCESSOR_COREI7_32;
3499 break;
3500
3501 default:
3502 break;
3503 }
3504 }
3505 /* Intel CPUs have always interpreted SSE prefetch instructions as
3506 NOPs; so, we can enable SSE prefetch instructions even when
3507 -mtune (rather than -march) points us to a processor that has them.
3508 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3509 higher processors. */
3510 if (TARGET_CMOV
3511 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3512 x86_prefetch_sse = true;
3513 break;
3514 }
3515
3516 if (ix86_tune_specified && i == pta_size)
3517 error ("bad value (%s) for %stune=%s %s",
3518 ix86_tune_string, prefix, suffix, sw);
3519
3520 ix86_tune_mask = 1u << ix86_tune;
3521 for (i = 0; i < X86_TUNE_LAST; ++i)
3522 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3523
3524 #ifndef USE_IX86_FRAME_POINTER
3525 #define USE_IX86_FRAME_POINTER 0
3526 #endif
3527
3528 #ifndef USE_X86_64_FRAME_POINTER
3529 #define USE_X86_64_FRAME_POINTER 0
3530 #endif
3531
3532 /* Set the default values for switches whose default depends on TARGET_64BIT
3533 in case they weren't overwritten by command line options. */
3534 if (TARGET_64BIT)
3535 {
3536 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3537 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3538 if (flag_asynchronous_unwind_tables == 2)
3539 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3540 if (flag_pcc_struct_return == 2)
3541 flag_pcc_struct_return = 0;
3542 }
3543 else
3544 {
3545 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3546 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3547 if (flag_asynchronous_unwind_tables == 2)
3548 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3549 if (flag_pcc_struct_return == 2)
3550 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3551 }
3552
3553 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3554 if (optimize_size)
3555 ix86_cost = &ix86_size_cost;
3556 else
3557 ix86_cost = ix86_tune_cost;
3558
3559 /* Arrange to set up i386_stack_locals for all functions. */
3560 init_machine_status = ix86_init_machine_status;
3561
3562 /* Validate -mregparm= value. */
3563 if (global_options_set.x_ix86_regparm)
3564 {
3565 if (TARGET_64BIT)
3566 warning (0, "-mregparm is ignored in 64-bit mode");
3567 if (ix86_regparm > REGPARM_MAX)
3568 {
3569 error ("-mregparm=%d is not between 0 and %d",
3570 ix86_regparm, REGPARM_MAX);
3571 ix86_regparm = 0;
3572 }
3573 }
3574 if (TARGET_64BIT)
3575 ix86_regparm = REGPARM_MAX;
3576
3577 /* Default align_* from the processor table. */
3578 if (align_loops == 0)
3579 {
3580 align_loops = processor_target_table[ix86_tune].align_loop;
3581 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3582 }
3583 if (align_jumps == 0)
3584 {
3585 align_jumps = processor_target_table[ix86_tune].align_jump;
3586 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3587 }
3588 if (align_functions == 0)
3589 {
3590 align_functions = processor_target_table[ix86_tune].align_func;
3591 }
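/* For instance, without any explicit -falign-* option these pick up the
   per-CPU values from processor_target_table, while a user-supplied
   -falign-loops=16 leaves align_loops nonzero and therefore untouched
   here.  */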
3592
3593 /* Provide default for -mbranch-cost= value. */
3594 if (!global_options_set.x_ix86_branch_cost)
3595 ix86_branch_cost = ix86_cost->branch_cost;
3596
3597 if (TARGET_64BIT)
3598 {
3599 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3600
3601 /* Enable by default the SSE and MMX builtins. Do allow the user to
3602 explicitly disable any of these. In particular, disabling SSE and
3603 MMX for kernel code is extremely useful. */
3604 if (!ix86_arch_specified)
3605 ix86_isa_flags
3606 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3607 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3608
3609 if (TARGET_RTD)
3610 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3611 }
3612 else
3613 {
3614 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3615
3616 if (!ix86_arch_specified)
3617 ix86_isa_flags
3618 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3619
3620 /* The i386 ABI does not specify a red zone. It still makes sense to use it
3621 when the programmer takes care to keep the stack from being destroyed. */
3622 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3623 target_flags |= MASK_NO_RED_ZONE;
3624 }
3625
3626 /* Keep nonleaf frame pointers. */
3627 if (flag_omit_frame_pointer)
3628 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3629 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3630 flag_omit_frame_pointer = 1;
3631
3632 /* If we're doing fast math, we don't care about comparison order
3633 wrt NaNs. This lets us use a shorter comparison sequence. */
3634 if (flag_finite_math_only)
3635 target_flags &= ~MASK_IEEE_FP;
3636
3637 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3638 since the insns won't need emulation. */
3639 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3640 target_flags &= ~MASK_NO_FANCY_MATH_387;
3641
3642 /* Likewise, if the target doesn't have a 387, or we've specified
3643 software floating point, don't use 387 inline intrinsics. */
3644 if (!TARGET_80387)
3645 target_flags |= MASK_NO_FANCY_MATH_387;
3646
3647 /* Turn on MMX builtins for -msse. */
3648 if (TARGET_SSE)
3649 {
3650 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3651 x86_prefetch_sse = true;
3652 }
3653
3654 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3655 if (TARGET_SSE4_2 || TARGET_ABM)
3656 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3657
3658 /* Turn on lzcnt instruction for -mabm. */
3659 if (TARGET_ABM)
3660 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3661
3662 /* Validate -mpreferred-stack-boundary= value or default it to
3663 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3664 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3665 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3666 {
3667 int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
3668 int max = (TARGET_SEH ? 4 : 12);
3669
3670 if (ix86_preferred_stack_boundary_arg < min
3671 || ix86_preferred_stack_boundary_arg > max)
3672 {
3673 if (min == max)
3674 error ("-mpreferred-stack-boundary is not supported "
3675 "for this target");
3676 else
3677 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3678 ix86_preferred_stack_boundary_arg, min, max);
3679 }
3680 else
3681 ix86_preferred_stack_boundary
3682 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3683 }
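/* Worked example: -mpreferred-stack-boundary=4 gives
   (1 << 4) * BITS_PER_UNIT = 128 bits, i.e. a 16-byte aligned stack;
   4 is also the smallest accepted value for 64-bit code with SSE
   enabled (the "min" computed above).  */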
3684
3685 /* Set the default value for -mstackrealign. */
3686 if (ix86_force_align_arg_pointer == -1)
3687 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3688
3689 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3690
3691 /* Validate -mincoming-stack-boundary= value or default it to
3692 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3693 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3694 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3695 {
3696 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3697 || ix86_incoming_stack_boundary_arg > 12)
3698 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3699 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3700 else
3701 {
3702 ix86_user_incoming_stack_boundary
3703 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3704 ix86_incoming_stack_boundary
3705 = ix86_user_incoming_stack_boundary;
3706 }
3707 }
3708
3709 /* Accept -msseregparm only if at least SSE support is enabled. */
3710 if (TARGET_SSEREGPARM
3711 && ! TARGET_SSE)
3712 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3713
3714 if (global_options_set.x_ix86_fpmath)
3715 {
3716 if (ix86_fpmath & FPMATH_SSE)
3717 {
3718 if (!TARGET_SSE)
3719 {
3720 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3721 ix86_fpmath = FPMATH_387;
3722 }
3723 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3724 {
3725 warning (0, "387 instruction set disabled, using SSE arithmetics");
3726 ix86_fpmath = FPMATH_SSE;
3727 }
3728 }
3729 }
3730 else
3731 ix86_fpmath = TARGET_FPMATH_DEFAULT;
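/* For example, -m32 -mno-sse -mfpmath=sse takes the warning path above
   and falls back to FPMATH_387, while omitting -mfpmath altogether
   simply uses TARGET_FPMATH_DEFAULT.  */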
3732
3733 /* If the i387 is disabled, then do not return values in it. */
3734 if (!TARGET_80387)
3735 target_flags &= ~MASK_FLOAT_RETURNS;
3736
3737 /* Use an external vectorized library when vectorizing intrinsics. */
3738 if (global_options_set.x_ix86_veclibabi_type)
3739 switch (ix86_veclibabi_type)
3740 {
3741 case ix86_veclibabi_type_svml:
3742 ix86_veclib_handler = ix86_veclibabi_svml;
3743 break;
3744
3745 case ix86_veclibabi_type_acml:
3746 ix86_veclib_handler = ix86_veclibabi_acml;
3747 break;
3748
3749 default:
3750 gcc_unreachable ();
3751 }
3752
3753 if ((!USE_IX86_FRAME_POINTER
3754 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3755 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3756 && !optimize_size)
3757 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3758
3759 /* ??? Unwind info is not correct around the CFG unless either a frame
3760 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3761 unwind info generation to be aware of the CFG and propagating states
3762 around edges. */
3763 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3764 || flag_exceptions || flag_non_call_exceptions)
3765 && flag_omit_frame_pointer
3766 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3767 {
3768 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3769 warning (0, "unwind tables currently require either a frame pointer "
3770 "or %saccumulate-outgoing-args%s for correctness",
3771 prefix, suffix);
3772 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3773 }
3774
3775 /* If stack probes are required, the space used for large function
3776 arguments on the stack must also be probed, so enable
3777 -maccumulate-outgoing-args so this happens in the prologue. */
3778 if (TARGET_STACK_PROBE
3779 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3780 {
3781 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3782 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3783 "for correctness", prefix, suffix);
3784 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3785 }
3786
3787 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3788 {
3789 char *p;
3790 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3791 p = strchr (internal_label_prefix, 'X');
3792 internal_label_prefix_len = p - internal_label_prefix;
3793 *p = '\0';
3794 }
3795
3796 /* When a scheduling description is not available, disable the scheduler pass
3797 so it won't slow down compilation and make x87 code slower. */
3798 if (!TARGET_SCHEDULE)
3799 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3800
3801 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3802 ix86_tune_cost->simultaneous_prefetches,
3803 global_options.x_param_values,
3804 global_options_set.x_param_values);
3805 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3806 ix86_tune_cost->prefetch_block,
3807 global_options.x_param_values,
3808 global_options_set.x_param_values);
3809 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3810 ix86_tune_cost->l1_cache_size,
3811 global_options.x_param_values,
3812 global_options_set.x_param_values);
3813 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3814 ix86_tune_cost->l2_cache_size,
3815 global_options.x_param_values,
3816 global_options_set.x_param_values);
3817
3818 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3819 if (flag_prefetch_loop_arrays < 0
3820 && HAVE_prefetch
3821 && optimize >= 3
3822 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3823 flag_prefetch_loop_arrays = 1;
3824
3825 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3826 can be optimized to ap = __builtin_next_arg (0). */
3827 if (!TARGET_64BIT && !flag_split_stack)
3828 targetm.expand_builtin_va_start = NULL;
3829
3830 if (TARGET_64BIT)
3831 {
3832 ix86_gen_leave = gen_leave_rex64;
3833 if (Pmode == DImode)
3834 {
3835 ix86_gen_monitor = gen_sse3_monitor64_di;
3836 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3837 ix86_gen_tls_local_dynamic_base_64
3838 = gen_tls_local_dynamic_base_64_di;
3839 }
3840 else
3841 {
3842 ix86_gen_monitor = gen_sse3_monitor64_si;
3843 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3844 ix86_gen_tls_local_dynamic_base_64
3845 = gen_tls_local_dynamic_base_64_si;
3846 }
3847 }
3848 else
3849 {
3850 ix86_gen_leave = gen_leave;
3851 ix86_gen_monitor = gen_sse3_monitor;
3852 }
3853
3854 if (Pmode == DImode)
3855 {
3856 ix86_gen_add3 = gen_adddi3;
3857 ix86_gen_sub3 = gen_subdi3;
3858 ix86_gen_sub3_carry = gen_subdi3_carry;
3859 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3860 ix86_gen_andsp = gen_anddi3;
3861 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3862 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3863 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3864 }
3865 else
3866 {
3867 ix86_gen_add3 = gen_addsi3;
3868 ix86_gen_sub3 = gen_subsi3;
3869 ix86_gen_sub3_carry = gen_subsi3_carry;
3870 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3871 ix86_gen_andsp = gen_andsi3;
3872 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3873 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3874 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3875 }
3876
3877 #ifdef USE_IX86_CLD
3878 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3879 if (!TARGET_64BIT)
3880 target_flags |= MASK_CLD & ~target_flags_explicit;
3881 #endif
3882
3883 if (!TARGET_64BIT && flag_pic)
3884 {
3885 if (flag_fentry > 0)
3886 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3887 "with -fpic");
3888 flag_fentry = 0;
3889 }
3890 else if (TARGET_SEH)
3891 {
3892 if (flag_fentry == 0)
3893 sorry ("-mno-fentry isn%'t compatible with SEH");
3894 flag_fentry = 1;
3895 }
3896 else if (flag_fentry < 0)
3897 {
3898 #if defined(PROFILE_BEFORE_PROLOGUE)
3899 flag_fentry = 1;
3900 #else
3901 flag_fentry = 0;
3902 #endif
3903 }
3904
3905 if (TARGET_AVX)
3906 {
3907 /* When not optimizing for size, enable the vzeroupper optimization for
3908 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3909 AVX unaligned loads/stores. */
3910 if (!optimize_size)
3911 {
3912 if (flag_expensive_optimizations
3913 && !(target_flags_explicit & MASK_VZEROUPPER))
3914 target_flags |= MASK_VZEROUPPER;
3915 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3916 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3917 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3918 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3919 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3920 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3921 /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */
3922 if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
3923 target_flags |= MASK_PREFER_AVX128;
3924 }
3925 }
3926 else
3927 {
3928 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3929 target_flags &= ~MASK_VZEROUPPER;
3930 }
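/* In practice this means that, say, -O2 -mavx implicitly enables
   -mvzeroupper, since flag_expensive_optimizations is on at -O2,
   unless the user passed -mno-vzeroupper explicitly; the 32-byte
   unaligned load/store splitting likewise follows the tuning unless
   overridden.  */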
3931
3932 if (ix86_recip_name)
3933 {
3934 char *p = ASTRDUP (ix86_recip_name);
3935 char *q;
3936 unsigned int mask, i;
3937 bool invert;
3938
3939 while ((q = strtok (p, ",")) != NULL)
3940 {
3941 p = NULL;
3942 if (*q == '!')
3943 {
3944 invert = true;
3945 q++;
3946 }
3947 else
3948 invert = false;
3949
3950 if (!strcmp (q, "default"))
3951 mask = RECIP_MASK_ALL;
3952 else
3953 {
3954 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3955 if (!strcmp (q, recip_options[i].string))
3956 {
3957 mask = recip_options[i].mask;
3958 break;
3959 }
3960
3961 if (i == ARRAY_SIZE (recip_options))
3962 {
3963 error ("unknown option for -mrecip=%s", q);
3964 invert = false;
3965 mask = RECIP_MASK_NONE;
3966 }
3967 }
3968
3969 recip_mask_explicit |= mask;
3970 if (invert)
3971 recip_mask &= ~mask;
3972 else
3973 recip_mask |= mask;
3974 }
3975 }
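/* For example, -mrecip=all,!sqrt first sets every RECIP_MASK_* bit and
   then clears RECIP_MASK_SQRT, while -mrecip=vec-div enables only the
   vectorized division approximation; a plain "default" token behaves
   like "all".  */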
3976
3977 if (TARGET_RECIP)
3978 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3979 else if (target_flags_explicit & MASK_RECIP)
3980 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
3981
3982 /* Save the initial options in case the user provides function-specific
3983 options later. */
3984 if (main_args_p)
3985 target_option_default_node = target_option_current_node
3986 = build_target_option_node ();
3987 }
3988
3989 /* Return TRUE if VAL is passed in register with 256bit AVX modes. */
3990
3991 static bool
3992 function_pass_avx256_p (const_rtx val)
3993 {
3994 if (!val)
3995 return false;
3996
3997 if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
3998 return true;
3999
4000 if (GET_CODE (val) == PARALLEL)
4001 {
4002 int i;
4003 rtx r;
4004
4005 for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
4006 {
4007 r = XVECEXP (val, 0, i);
4008 if (GET_CODE (r) == EXPR_LIST
4009 && XEXP (r, 0)
4010 && REG_P (XEXP (r, 0))
4011 && (GET_MODE (XEXP (r, 0)) == OImode
4012 || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
4013 return true;
4014 }
4015 }
4016
4017 return false;
4018 }
4019
4020 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4021
4022 static void
4023 ix86_option_override (void)
4024 {
4025 ix86_option_override_internal (true);
4026 }
4027
4028 /* Update register usage after having seen the compiler flags. */
4029
4030 static void
4031 ix86_conditional_register_usage (void)
4032 {
4033 int i;
4034 unsigned int j;
4035
4036 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4037 {
4038 if (fixed_regs[i] > 1)
4039 fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2));
4040 if (call_used_regs[i] > 1)
4041 call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2));
4042 }
4043
4044 /* The PIC register, if it exists, is fixed. */
4045 j = PIC_OFFSET_TABLE_REGNUM;
4046 if (j != INVALID_REGNUM)
4047 fixed_regs[j] = call_used_regs[j] = 1;
4048
4049 /* The 64-bit MS_ABI changes the set of call-used registers. */
4050 if (TARGET_64BIT_MS_ABI)
4051 {
4052 call_used_regs[SI_REG] = 0;
4053 call_used_regs[DI_REG] = 0;
4054 call_used_regs[XMM6_REG] = 0;
4055 call_used_regs[XMM7_REG] = 0;
4056 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4057 call_used_regs[i] = 0;
4058 }
4059
4060 /* The default setting of CLOBBERED_REGS is for 32-bit; add in the
4061 other call-clobbered regs for 64-bit. */
4062 if (TARGET_64BIT)
4063 {
4064 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4065
4066 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4067 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4068 && call_used_regs[i])
4069 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4070 }
4071
4072 /* If MMX is disabled, squash the registers. */
4073 if (! TARGET_MMX)
4074 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4075 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4076 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4077
4078 /* If SSE is disabled, squash the registers. */
4079 if (! TARGET_SSE)
4080 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4081 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4082 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4083
4084 /* If the FPU is disabled, squash the registers. */
4085 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4086 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4087 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4088 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4089
4090 /* If 32-bit, squash the 64-bit registers. */
4091 if (! TARGET_64BIT)
4092 {
4093 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4094 reg_names[i] = "";
4095 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4096 reg_names[i] = "";
4097 }
4098 }
4099
4100 \f
4101 /* Save the current options */
4102
4103 static void
4104 ix86_function_specific_save (struct cl_target_option *ptr)
4105 {
4106 ptr->arch = ix86_arch;
4107 ptr->schedule = ix86_schedule;
4108 ptr->tune = ix86_tune;
4109 ptr->branch_cost = ix86_branch_cost;
4110 ptr->tune_defaulted = ix86_tune_defaulted;
4111 ptr->arch_specified = ix86_arch_specified;
4112 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4113 ptr->ix86_target_flags_explicit = target_flags_explicit;
4114 ptr->x_recip_mask_explicit = recip_mask_explicit;
4115
4116 /* The fields are char but the variables are not; make sure the
4117 values fit in the fields. */
4118 gcc_assert (ptr->arch == ix86_arch);
4119 gcc_assert (ptr->schedule == ix86_schedule);
4120 gcc_assert (ptr->tune == ix86_tune);
4121 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4122 }
4123
4124 /* Restore the current options */
4125
4126 static void
4127 ix86_function_specific_restore (struct cl_target_option *ptr)
4128 {
4129 enum processor_type old_tune = ix86_tune;
4130 enum processor_type old_arch = ix86_arch;
4131 unsigned int ix86_arch_mask, ix86_tune_mask;
4132 int i;
4133
4134 ix86_arch = (enum processor_type) ptr->arch;
4135 ix86_schedule = (enum attr_cpu) ptr->schedule;
4136 ix86_tune = (enum processor_type) ptr->tune;
4137 ix86_branch_cost = ptr->branch_cost;
4138 ix86_tune_defaulted = ptr->tune_defaulted;
4139 ix86_arch_specified = ptr->arch_specified;
4140 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4141 target_flags_explicit = ptr->ix86_target_flags_explicit;
4142 recip_mask_explicit = ptr->x_recip_mask_explicit;
4143
4144 /* Recreate the arch feature tests if the arch changed */
4145 if (old_arch != ix86_arch)
4146 {
4147 ix86_arch_mask = 1u << ix86_arch;
4148 for (i = 0; i < X86_ARCH_LAST; ++i)
4149 ix86_arch_features[i]
4150 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4151 }
4152
4153 /* Recreate the tune optimization tests */
4154 if (old_tune != ix86_tune)
4155 {
4156 ix86_tune_mask = 1u << ix86_tune;
4157 for (i = 0; i < X86_TUNE_LAST; ++i)
4158 ix86_tune_features[i]
4159 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4160 }
4161 }
4162
4163 /* Print the current options */
4164
4165 static void
4166 ix86_function_specific_print (FILE *file, int indent,
4167 struct cl_target_option *ptr)
4168 {
4169 char *target_string
4170 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4171 NULL, NULL, ptr->x_ix86_fpmath, false);
4172
4173 fprintf (file, "%*sarch = %d (%s)\n",
4174 indent, "",
4175 ptr->arch,
4176 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4177 ? cpu_names[ptr->arch]
4178 : "<unknown>"));
4179
4180 fprintf (file, "%*stune = %d (%s)\n",
4181 indent, "",
4182 ptr->tune,
4183 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4184 ? cpu_names[ptr->tune]
4185 : "<unknown>"));
4186
4187 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4188
4189 if (target_string)
4190 {
4191 fprintf (file, "%*s%s\n", indent, "", target_string);
4192 free (target_string);
4193 }
4194 }
4195
4196 \f
4197 /* Inner function to process the attribute((target(...))); take an argument and
4198 set the current options from that argument. If we have a list, recursively go
4199 over the list. */
4200
4201 static bool
4202 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4203 struct gcc_options *enum_opts_set)
4204 {
4205 char *next_optstr;
4206 bool ret = true;
4207
4208 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4209 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4210 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4211 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4212 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4213
4214 enum ix86_opt_type
4215 {
4216 ix86_opt_unknown,
4217 ix86_opt_yes,
4218 ix86_opt_no,
4219 ix86_opt_str,
4220 ix86_opt_enum,
4221 ix86_opt_isa
4222 };
4223
4224 static const struct
4225 {
4226 const char *string;
4227 size_t len;
4228 enum ix86_opt_type type;
4229 int opt;
4230 int mask;
4231 } attrs[] = {
4232 /* isa options */
4233 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4234 IX86_ATTR_ISA ("abm", OPT_mabm),
4235 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4236 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4237 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4238 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4239 IX86_ATTR_ISA ("aes", OPT_maes),
4240 IX86_ATTR_ISA ("avx", OPT_mavx),
4241 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4242 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4243 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4244 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4245 IX86_ATTR_ISA ("sse", OPT_msse),
4246 IX86_ATTR_ISA ("sse2", OPT_msse2),
4247 IX86_ATTR_ISA ("sse3", OPT_msse3),
4248 IX86_ATTR_ISA ("sse4", OPT_msse4),
4249 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4250 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4251 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4252 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4253 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4254 IX86_ATTR_ISA ("fma", OPT_mfma),
4255 IX86_ATTR_ISA ("xop", OPT_mxop),
4256 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4257 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4258 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4259 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4260 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4261 IX86_ATTR_ISA ("hle", OPT_mhle),
4262
4263 /* enum options */
4264 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4265
4266 /* string options */
4267 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4268 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4269
4270 /* flag options */
4271 IX86_ATTR_YES ("cld",
4272 OPT_mcld,
4273 MASK_CLD),
4274
4275 IX86_ATTR_NO ("fancy-math-387",
4276 OPT_mfancy_math_387,
4277 MASK_NO_FANCY_MATH_387),
4278
4279 IX86_ATTR_YES ("ieee-fp",
4280 OPT_mieee_fp,
4281 MASK_IEEE_FP),
4282
4283 IX86_ATTR_YES ("inline-all-stringops",
4284 OPT_minline_all_stringops,
4285 MASK_INLINE_ALL_STRINGOPS),
4286
4287 IX86_ATTR_YES ("inline-stringops-dynamically",
4288 OPT_minline_stringops_dynamically,
4289 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4290
4291 IX86_ATTR_NO ("align-stringops",
4292 OPT_mno_align_stringops,
4293 MASK_NO_ALIGN_STRINGOPS),
4294
4295 IX86_ATTR_YES ("recip",
4296 OPT_mrecip,
4297 MASK_RECIP),
4298
4299 };
4300
4301 /* If this is a list, recurse to get the options. */
4302 if (TREE_CODE (args) == TREE_LIST)
4303 {
4304 bool ret = true;
4305
4306 for (; args; args = TREE_CHAIN (args))
4307 if (TREE_VALUE (args)
4308 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4309 p_strings, enum_opts_set))
4310 ret = false;
4311
4312 return ret;
4313 }
4314
4315 else if (TREE_CODE (args) != STRING_CST)
4316 gcc_unreachable ();
4317
4318 /* Handle multiple arguments separated by commas. */
4319 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4320
4321 while (next_optstr && *next_optstr != '\0')
4322 {
4323 char *p = next_optstr;
4324 char *orig_p = p;
4325 char *comma = strchr (next_optstr, ',');
4326 const char *opt_string;
4327 size_t len, opt_len;
4328 int opt;
4329 bool opt_set_p;
4330 char ch;
4331 unsigned i;
4332 enum ix86_opt_type type = ix86_opt_unknown;
4333 int mask = 0;
4334
4335 if (comma)
4336 {
4337 *comma = '\0';
4338 len = comma - next_optstr;
4339 next_optstr = comma + 1;
4340 }
4341 else
4342 {
4343 len = strlen (p);
4344 next_optstr = NULL;
4345 }
4346
4347 /* Recognize no-xxx. */
4348 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4349 {
4350 opt_set_p = false;
4351 p += 3;
4352 len -= 3;
4353 }
4354 else
4355 opt_set_p = true;
4356
4357 /* Find the option. */
4358 ch = *p;
4359 opt = N_OPTS;
4360 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4361 {
4362 type = attrs[i].type;
4363 opt_len = attrs[i].len;
4364 if (ch == attrs[i].string[0]
4365 && ((type != ix86_opt_str && type != ix86_opt_enum)
4366 ? len == opt_len
4367 : len > opt_len)
4368 && memcmp (p, attrs[i].string, opt_len) == 0)
4369 {
4370 opt = attrs[i].opt;
4371 mask = attrs[i].mask;
4372 opt_string = attrs[i].string;
4373 break;
4374 }
4375 }
4376
4377 /* Process the option. */
4378 if (opt == N_OPTS)
4379 {
4380 error ("attribute(target(\"%s\")) is unknown", orig_p);
4381 ret = false;
4382 }
4383
4384 else if (type == ix86_opt_isa)
4385 {
4386 struct cl_decoded_option decoded;
4387
4388 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4389 ix86_handle_option (&global_options, &global_options_set,
4390 &decoded, input_location);
4391 }
4392
4393 else if (type == ix86_opt_yes || type == ix86_opt_no)
4394 {
4395 if (type == ix86_opt_no)
4396 opt_set_p = !opt_set_p;
4397
4398 if (opt_set_p)
4399 target_flags |= mask;
4400 else
4401 target_flags &= ~mask;
4402 }
4403
4404 else if (type == ix86_opt_str)
4405 {
4406 if (p_strings[opt])
4407 {
4408 error ("option(\"%s\") was already specified", opt_string);
4409 ret = false;
4410 }
4411 else
4412 p_strings[opt] = xstrdup (p + opt_len);
4413 }
4414
4415 else if (type == ix86_opt_enum)
4416 {
4417 bool arg_ok;
4418 int value;
4419
4420 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4421 if (arg_ok)
4422 set_option (&global_options, enum_opts_set, opt, value,
4423 p + opt_len, DK_UNSPECIFIED, input_location,
4424 global_dc);
4425 else
4426 {
4427 error ("attribute(target(\"%s\")) is unknown", orig_p);
4428 ret = false;
4429 }
4430 }
4431
4432 else
4433 gcc_unreachable ();
4434 }
4435
4436 return ret;
4437 }
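/* As a usage sketch, a declaration such as

     int foo (void) __attribute__ ((target ("avx2,no-fma,arch=core-avx2")));

   is handled by the parser above: "avx2" and "no-fma" match ix86_opt_isa
   entries in attrs[] (the "no-" prefix flips opt_set_p), while
   "arch=core-avx2" is an ix86_opt_str entry whose value ends up in
   p_strings[IX86_FUNCTION_SPECIFIC_ARCH].  */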
4438
4439 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4440
4441 tree
4442 ix86_valid_target_attribute_tree (tree args)
4443 {
4444 const char *orig_arch_string = ix86_arch_string;
4445 const char *orig_tune_string = ix86_tune_string;
4446 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4447 int orig_tune_defaulted = ix86_tune_defaulted;
4448 int orig_arch_specified = ix86_arch_specified;
4449 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4450 tree t = NULL_TREE;
4451 int i;
4452 struct cl_target_option *def
4453 = TREE_TARGET_OPTION (target_option_default_node);
4454 struct gcc_options enum_opts_set;
4455
4456 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4457
4458 /* Process each of the options on the chain. */
4459 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4460 &enum_opts_set))
4461 return NULL_TREE;
4462
4463 /* If the changed options are different from the default, rerun
4464 ix86_option_override_internal, and then save the options away.
4465 The string options are attribute options, and will be undone
4466 when we copy the save structure. */
4467 if (ix86_isa_flags != def->x_ix86_isa_flags
4468 || target_flags != def->x_target_flags
4469 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4470 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4471 || enum_opts_set.x_ix86_fpmath)
4472 {
4473 /* If we are using the default tune= or arch=, undo the string assigned,
4474 and use the default. */
4475 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4476 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4477 else if (!orig_arch_specified)
4478 ix86_arch_string = NULL;
4479
4480 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4481 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4482 else if (orig_tune_defaulted)
4483 ix86_tune_string = NULL;
4484
4485 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4486 if (enum_opts_set.x_ix86_fpmath)
4487 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4488 else if (!TARGET_64BIT && TARGET_SSE)
4489 {
4490 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4491 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4492 }
4493
4494 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4495 ix86_option_override_internal (false);
4496
4497 /* Add any builtin functions with the new isa if any. */
4498 ix86_add_new_builtins (ix86_isa_flags);
4499
4500 /* Save the current options unless we are validating options for
4501 #pragma. */
4502 t = build_target_option_node ();
4503
4504 ix86_arch_string = orig_arch_string;
4505 ix86_tune_string = orig_tune_string;
4506 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4507
4508 /* Free up memory allocated to hold the strings */
4509 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4510 free (option_strings[i]);
4511 }
4512
4513 return t;
4514 }
4515
4516 /* Hook to validate attribute((target("string"))). */
4517
4518 static bool
4519 ix86_valid_target_attribute_p (tree fndecl,
4520 tree ARG_UNUSED (name),
4521 tree args,
4522 int ARG_UNUSED (flags))
4523 {
4524 struct cl_target_option cur_target;
4525 bool ret = true;
4526 tree old_optimize = build_optimization_node ();
4527 tree new_target, new_optimize;
4528 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4529
4530 /* If the function changed the optimization levels as well as setting target
4531 options, start with the optimizations specified. */
4532 if (func_optimize && func_optimize != old_optimize)
4533 cl_optimization_restore (&global_options,
4534 TREE_OPTIMIZATION (func_optimize));
4535
4536 /* The target attributes may also change some optimization flags, so update
4537 the optimization options if necessary. */
4538 cl_target_option_save (&cur_target, &global_options);
4539 new_target = ix86_valid_target_attribute_tree (args);
4540 new_optimize = build_optimization_node ();
4541
4542 if (!new_target)
4543 ret = false;
4544
4545 else if (fndecl)
4546 {
4547 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4548
4549 if (old_optimize != new_optimize)
4550 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4551 }
4552
4553 cl_target_option_restore (&global_options, &cur_target);
4554
4555 if (old_optimize != new_optimize)
4556 cl_optimization_restore (&global_options,
4557 TREE_OPTIMIZATION (old_optimize));
4558
4559 return ret;
4560 }
4561
4562 \f
4563 /* Hook to determine if one function can safely inline another. */
4564
4565 static bool
4566 ix86_can_inline_p (tree caller, tree callee)
4567 {
4568 bool ret = false;
4569 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4570 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4571
4572 /* If callee has no option attributes, then it is ok to inline. */
4573 if (!callee_tree)
4574 ret = true;
4575
4576 /* If the caller has no option attributes but the callee does, then it is not
4577 ok to inline. */
4578 else if (!caller_tree)
4579 ret = false;
4580
4581 else
4582 {
4583 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4584 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4585
4586 /* The callee's ISA options should be a subset of the caller's, i.e. an SSE4
4587 function can inline an SSE2 function, but an SSE2 function can't inline
4588 an SSE4 function. */
4589 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4590 != callee_opts->x_ix86_isa_flags)
4591 ret = false;
4592
4593 /* See if we have the same non-isa options. */
4594 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4595 ret = false;
4596
4597 /* See if arch, tune, etc. are the same. */
4598 else if (caller_opts->arch != callee_opts->arch)
4599 ret = false;
4600
4601 else if (caller_opts->tune != callee_opts->tune)
4602 ret = false;
4603
4604 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4605 ret = false;
4606
4607 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4608 ret = false;
4609
4610 else
4611 ret = true;
4612 }
4613
4614 return ret;
4615 }
4616
4617 \f
4618 /* Remember the last target of ix86_set_current_function. */
4619 static GTY(()) tree ix86_previous_fndecl;
4620
4621 /* Establish appropriate back-end context for processing the function
4622 FNDECL. The argument might be NULL to indicate processing at top
4623 level, outside of any function scope. */
4624 static void
4625 ix86_set_current_function (tree fndecl)
4626 {
4627 /* Only change the context if the function changes. This hook is called
4628 several times in the course of compiling a function, and we don't want to
4629 slow things down too much or call target_reinit when it isn't safe. */
4630 if (fndecl && fndecl != ix86_previous_fndecl)
4631 {
4632 tree old_tree = (ix86_previous_fndecl
4633 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4634 : NULL_TREE);
4635
4636 tree new_tree = (fndecl
4637 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4638 : NULL_TREE);
4639
4640 ix86_previous_fndecl = fndecl;
4641 if (old_tree == new_tree)
4642 ;
4643
4644 else if (new_tree)
4645 {
4646 cl_target_option_restore (&global_options,
4647 TREE_TARGET_OPTION (new_tree));
4648 target_reinit ();
4649 }
4650
4651 else if (old_tree)
4652 {
4653 struct cl_target_option *def
4654 = TREE_TARGET_OPTION (target_option_current_node);
4655
4656 cl_target_option_restore (&global_options, def);
4657 target_reinit ();
4658 }
4659 }
4660 }
4661
4662 \f
4663 /* Return true if this goes in large data/bss. */
4664
4665 static bool
4666 ix86_in_large_data_p (tree exp)
4667 {
4668 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4669 return false;
4670
4671 /* Functions are never large data. */
4672 if (TREE_CODE (exp) == FUNCTION_DECL)
4673 return false;
4674
4675 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4676 {
4677 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4678 if (strcmp (section, ".ldata") == 0
4679 || strcmp (section, ".lbss") == 0)
4680 return true;
4681 return false;
4682 }
4683 else
4684 {
4685 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4686
4687 /* If this is an incomplete type with size 0, then we can't put it
4688 in data because it might be too big when completed. */
4689 if (!size || size > ix86_section_threshold)
4690 return true;
4691 }
4692
4693 return false;
4694 }
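/* For instance, with -mcmodel=medium a variable larger than
   ix86_section_threshold (presumably the -mlarge-data-threshold= value),
   or one explicitly placed in ".ldata"/".lbss", is treated as large
   data; everything is small data under the other code models.  */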
4695
4696 /* Switch to the appropriate section for output of DECL.
4697 DECL is either a `VAR_DECL' node or a constant of some sort.
4698 RELOC indicates whether forming the initial value of DECL requires
4699 link-time relocations. */
4700
4701 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4702 ATTRIBUTE_UNUSED;
4703
4704 static section *
4705 x86_64_elf_select_section (tree decl, int reloc,
4706 unsigned HOST_WIDE_INT align)
4707 {
4708 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4709 && ix86_in_large_data_p (decl))
4710 {
4711 const char *sname = NULL;
4712 unsigned int flags = SECTION_WRITE;
4713 switch (categorize_decl_for_section (decl, reloc))
4714 {
4715 case SECCAT_DATA:
4716 sname = ".ldata";
4717 break;
4718 case SECCAT_DATA_REL:
4719 sname = ".ldata.rel";
4720 break;
4721 case SECCAT_DATA_REL_LOCAL:
4722 sname = ".ldata.rel.local";
4723 break;
4724 case SECCAT_DATA_REL_RO:
4725 sname = ".ldata.rel.ro";
4726 break;
4727 case SECCAT_DATA_REL_RO_LOCAL:
4728 sname = ".ldata.rel.ro.local";
4729 break;
4730 case SECCAT_BSS:
4731 sname = ".lbss";
4732 flags |= SECTION_BSS;
4733 break;
4734 case SECCAT_RODATA:
4735 case SECCAT_RODATA_MERGE_STR:
4736 case SECCAT_RODATA_MERGE_STR_INIT:
4737 case SECCAT_RODATA_MERGE_CONST:
4738 sname = ".lrodata";
4739 flags = 0;
4740 break;
4741 case SECCAT_SRODATA:
4742 case SECCAT_SDATA:
4743 case SECCAT_SBSS:
4744 gcc_unreachable ();
4745 case SECCAT_TEXT:
4746 case SECCAT_TDATA:
4747 case SECCAT_TBSS:
4748 /* We don't split these for the medium model. Place them into
4749 default sections and hope for the best. */
4750 break;
4751 }
4752 if (sname)
4753 {
4754 /* We might get called with string constants, but get_named_section
4755 doesn't like them as they are not DECLs. Also, we need to set
4756 flags in that case. */
4757 if (!DECL_P (decl))
4758 return get_section (sname, flags, NULL);
4759 return get_named_section (decl, sname, reloc);
4760 }
4761 }
4762 return default_elf_select_section (decl, reloc, align);
4763 }
4764
4765 /* Build up a unique section name, expressed as a
4766 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4767 RELOC indicates whether the initial value of EXP requires
4768 link-time relocations. */
4769
4770 static void ATTRIBUTE_UNUSED
4771 x86_64_elf_unique_section (tree decl, int reloc)
4772 {
4773 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4774 && ix86_in_large_data_p (decl))
4775 {
4776 const char *prefix = NULL;
4777 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4778 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4779
4780 switch (categorize_decl_for_section (decl, reloc))
4781 {
4782 case SECCAT_DATA:
4783 case SECCAT_DATA_REL:
4784 case SECCAT_DATA_REL_LOCAL:
4785 case SECCAT_DATA_REL_RO:
4786 case SECCAT_DATA_REL_RO_LOCAL:
4787 prefix = one_only ? ".ld" : ".ldata";
4788 break;
4789 case SECCAT_BSS:
4790 prefix = one_only ? ".lb" : ".lbss";
4791 break;
4792 case SECCAT_RODATA:
4793 case SECCAT_RODATA_MERGE_STR:
4794 case SECCAT_RODATA_MERGE_STR_INIT:
4795 case SECCAT_RODATA_MERGE_CONST:
4796 prefix = one_only ? ".lr" : ".lrodata";
4797 break;
4798 case SECCAT_SRODATA:
4799 case SECCAT_SDATA:
4800 case SECCAT_SBSS:
4801 gcc_unreachable ();
4802 case SECCAT_TEXT:
4803 case SECCAT_TDATA:
4804 case SECCAT_TBSS:
4805 /* We don't split these for the medium model. Place them into
4806 default sections and hope for the best. */
4807 break;
4808 }
4809 if (prefix)
4810 {
4811 const char *name, *linkonce;
4812 char *string;
4813
4814 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4815 name = targetm.strip_name_encoding (name);
4816
4817 /* If we're using one_only, then there needs to be a .gnu.linkonce
4818 prefix to the section name. */
4819 linkonce = one_only ? ".gnu.linkonce" : "";
4820
4821 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4822
4823 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4824 return;
4825 }
4826 }
4827 default_unique_section (decl, reloc);
4828 }
4829
4830 #ifdef COMMON_ASM_OP
4831 /* This says how to output assembler code to declare an
4832 uninitialized external linkage data object.
4833
4834 For the medium model on x86-64 we need to use the .largecomm directive
4835 for large objects. */
4836 void
4837 x86_elf_aligned_common (FILE *file,
4838 const char *name, unsigned HOST_WIDE_INT size,
4839 int align)
4840 {
4841 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4842 && size > (unsigned int)ix86_section_threshold)
4843 fputs (".largecomm\t", file);
4844 else
4845 fputs (COMMON_ASM_OP, file);
4846 assemble_name (file, name);
4847 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4848 size, align / BITS_PER_UNIT);
4849 }
4850 #endif
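
/* Rough illustration (hypothetical object name and sizes, assuming the
   default section threshold): for a 131072-byte common symbol "big_buf"
   requested with 256-bit (32-byte) alignment under -mcmodel=medium, the
   function above emits

       .largecomm	big_buf,131072,32

   since ALIGN arrives in bits and is printed in bytes; smaller objects
   fall back to the ordinary COMMON_ASM_OP form.  */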
4851
4852 /* Utility function for targets to use in implementing
4853 ASM_OUTPUT_ALIGNED_BSS. */
4854
4855 void
4856 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4857 const char *name, unsigned HOST_WIDE_INT size,
4858 int align)
4859 {
4860 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4861 && size > (unsigned int)ix86_section_threshold)
4862 switch_to_section (get_named_section (decl, ".lbss", 0));
4863 else
4864 switch_to_section (bss_section);
4865 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4866 #ifdef ASM_DECLARE_OBJECT_NAME
4867 last_assemble_variable_decl = decl;
4868 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4869 #else
4870 /* Standard thing is just output label for the object. */
4871 ASM_OUTPUT_LABEL (file, name);
4872 #endif /* ASM_DECLARE_OBJECT_NAME */
4873 ASM_OUTPUT_SKIP (file, size ? size : 1);
4874 }
4875 \f
4876 /* Decide whether we must probe the stack before any space allocation
4877 on this target. It's essentially TARGET_STACK_PROBE except when
4878 -fstack-check causes the stack to be already probed differently. */
4879
4880 bool
4881 ix86_target_stack_probe (void)
4882 {
4883 /* Do not probe the stack twice if static stack checking is enabled. */
4884 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4885 return false;
4886
4887 return TARGET_STACK_PROBE;
4888 }
4889 \f
4890 /* Decide whether we can make a sibling call to a function. DECL is the
4891 declaration of the function being targeted by the call and EXP is the
4892 CALL_EXPR representing the call. */
4893
4894 static bool
4895 ix86_function_ok_for_sibcall (tree decl, tree exp)
4896 {
4897 tree type, decl_or_type;
4898 rtx a, b;
4899
4900 /* If we are generating position-independent code, we cannot sibcall
4901 optimize any indirect call, or a direct call to a global function,
4902 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4903 if (!TARGET_MACHO
4904 && !TARGET_64BIT
4905 && flag_pic
4906 && (!decl || !targetm.binds_local_p (decl)))
4907 return false;
4908
4909 /* If we need to align the outgoing stack, then sibcalling would
4910 unalign the stack, which may break the called function. */
4911 if (ix86_minimum_incoming_stack_boundary (true)
4912 < PREFERRED_STACK_BOUNDARY)
4913 return false;
4914
4915 if (decl)
4916 {
4917 decl_or_type = decl;
4918 type = TREE_TYPE (decl);
4919 }
4920 else
4921 {
4922 /* We're looking at the CALL_EXPR, we need the type of the function. */
4923 type = CALL_EXPR_FN (exp); /* pointer expression */
4924 type = TREE_TYPE (type); /* pointer type */
4925 type = TREE_TYPE (type); /* function type */
4926 decl_or_type = type;
4927 }
4928
4929 /* Check that the return value locations are the same. Like
4930 if we are returning floats on the 80387 register stack, we cannot
4931 make a sibcall from a function that doesn't return a float to a
4932 function that does or, conversely, from a function that does return
4933 a float to a function that doesn't; the necessary stack adjustment
4934 would not be executed. This is also the place we notice
4935 differences in the return value ABI. Note that it is ok for one
4936 of the functions to have void return type as long as the return
4937 value of the other is passed in a register. */
4938 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4939 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4940 cfun->decl, false);
4941 if (STACK_REG_P (a) || STACK_REG_P (b))
4942 {
4943 if (!rtx_equal_p (a, b))
4944 return false;
4945 }
4946 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4947 {
4948 /* Disable sibcall if we need to generate vzeroupper after
4949 callee returns. */
4950 if (TARGET_VZEROUPPER
4951 && cfun->machine->callee_return_avx256_p
4952 && !cfun->machine->caller_return_avx256_p)
4953 return false;
4954 }
4955 else if (!rtx_equal_p (a, b))
4956 return false;
4957
4958 if (TARGET_64BIT)
4959 {
4960 /* The SYSV ABI has more call-clobbered registers;
4961 disallow sibcalls from MS to SYSV. */
4962 if (cfun->machine->call_abi == MS_ABI
4963 && ix86_function_type_abi (type) == SYSV_ABI)
4964 return false;
4965 }
4966 else
4967 {
4968 /* If this call is indirect, we'll need to be able to use a
4969 call-clobbered register for the address of the target function.
4970 Make sure that all such registers are not used for passing
4971 parameters. Note that DLLIMPORT functions are indirect. */
4972 if (!decl
4973 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4974 {
4975 if (ix86_function_regparm (type, NULL) >= 3)
4976 {
4977 /* ??? Need to count the actual number of registers to be used,
4978 not the possible number of registers. Fix later. */
4979 return false;
4980 }
4981 }
4982 }
4983
4984 /* Otherwise okay. That also includes certain types of indirect calls. */
4985 return true;
4986 }
4987
4988 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4989 and "sseregparm" calling convention attributes;
4990 arguments as in struct attribute_spec.handler. */
4991
4992 static tree
4993 ix86_handle_cconv_attribute (tree *node, tree name,
4994 tree args,
4995 int flags ATTRIBUTE_UNUSED,
4996 bool *no_add_attrs)
4997 {
4998 if (TREE_CODE (*node) != FUNCTION_TYPE
4999 && TREE_CODE (*node) != METHOD_TYPE
5000 && TREE_CODE (*node) != FIELD_DECL
5001 && TREE_CODE (*node) != TYPE_DECL)
5002 {
5003 warning (OPT_Wattributes, "%qE attribute only applies to functions",
5004 name);
5005 *no_add_attrs = true;
5006 return NULL_TREE;
5007 }
5008
5009 /* Can combine regparm with all attributes but fastcall, and thiscall. */
5010 if (is_attribute_p ("regparm", name))
5011 {
5012 tree cst;
5013
5014 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5015 {
5016 error ("fastcall and regparm attributes are not compatible");
5017 }
5018
5019 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5020 {
5021 	  error ("regparm and thiscall attributes are not compatible");
5022 }
5023
5024 cst = TREE_VALUE (args);
5025 if (TREE_CODE (cst) != INTEGER_CST)
5026 {
5027 warning (OPT_Wattributes,
5028 "%qE attribute requires an integer constant argument",
5029 name);
5030 *no_add_attrs = true;
5031 }
5032 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
5033 {
5034 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
5035 name, REGPARM_MAX);
5036 *no_add_attrs = true;
5037 }
5038
5039 return NULL_TREE;
5040 }
5041
5042 if (TARGET_64BIT)
5043 {
5044 /* Do not warn when emulating the MS ABI. */
5045 if ((TREE_CODE (*node) != FUNCTION_TYPE
5046 && TREE_CODE (*node) != METHOD_TYPE)
5047 || ix86_function_type_abi (*node) != MS_ABI)
5048 warning (OPT_Wattributes, "%qE attribute ignored",
5049 name);
5050 *no_add_attrs = true;
5051 return NULL_TREE;
5052 }
5053
5054 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
5055 if (is_attribute_p ("fastcall", name))
5056 {
5057 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5058 {
5059 error ("fastcall and cdecl attributes are not compatible");
5060 }
5061 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5062 {
5063 error ("fastcall and stdcall attributes are not compatible");
5064 }
5065 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5066 {
5067 error ("fastcall and regparm attributes are not compatible");
5068 }
5069 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5070 {
5071 error ("fastcall and thiscall attributes are not compatible");
5072 }
5073 }
5074
5075 /* Can combine stdcall with fastcall (redundant), regparm and
5076 sseregparm. */
5077 else if (is_attribute_p ("stdcall", name))
5078 {
5079 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5080 {
5081 error ("stdcall and cdecl attributes are not compatible");
5082 }
5083 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5084 {
5085 error ("stdcall and fastcall attributes are not compatible");
5086 }
5087 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5088 {
5089 error ("stdcall and thiscall attributes are not compatible");
5090 }
5091 }
5092
5093 /* Can combine cdecl with regparm and sseregparm. */
5094 else if (is_attribute_p ("cdecl", name))
5095 {
5096 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5097 {
5098 error ("stdcall and cdecl attributes are not compatible");
5099 }
5100 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5101 {
5102 error ("fastcall and cdecl attributes are not compatible");
5103 }
5104 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5105 {
5106 error ("cdecl and thiscall attributes are not compatible");
5107 }
5108 }
5109 else if (is_attribute_p ("thiscall", name))
5110 {
5111 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5112     warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5113 name);
5114 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5115 {
5116 error ("stdcall and thiscall attributes are not compatible");
5117 }
5118 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5119 {
5120 error ("fastcall and thiscall attributes are not compatible");
5121 }
5122 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5123 {
5124 error ("cdecl and thiscall attributes are not compatible");
5125 }
5126 }
5127
5128 /* Can combine sseregparm with all attributes. */
5129
5130 return NULL_TREE;
5131 }
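
/* Illustrative sketch of what the handler above accepts and rejects on
   32-bit targets (hypothetical user declarations, not part of GCC):

       int __attribute__ ((regparm (3))) f (int, int, int);   accepted
       int __attribute__ ((fastcall, regparm (2))) g (int);   rejected:
	   "fastcall and regparm attributes are not compatible"
       int __attribute__ ((stdcall, sseregparm)) h (float);   accepted

   since sseregparm combines with all of the other attributes.  */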
5132
5133 /* The transactional memory builtins are implicitly regparm or fastcall
5134 depending on the ABI. Override the generic do-nothing attribute that
5135 these builtins were declared with, and replace it with one of the two
5136 attributes that we expect elsewhere. */
5137
5138 static tree
5139 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5140 tree args ATTRIBUTE_UNUSED,
5141 int flags ATTRIBUTE_UNUSED,
5142 bool *no_add_attrs)
5143 {
5144 tree alt;
5145
5146 /* In no case do we want to add the placeholder attribute. */
5147 *no_add_attrs = true;
5148
5149 /* The 64-bit ABI is unchanged for transactional memory. */
5150 if (TARGET_64BIT)
5151 return NULL_TREE;
5152
5153   /* ??? Is there a better way to validate 32-bit Windows?  We have
5154 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5155 if (CHECK_STACK_LIMIT > 0)
5156 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5157 else
5158 {
5159 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5160 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5161 }
5162 decl_attributes (node, alt, flags);
5163
5164 return NULL_TREE;
5165 }
5166
5167 /* This function determines from TYPE the calling-convention. */
5168
5169 unsigned int
5170 ix86_get_callcvt (const_tree type)
5171 {
5172 unsigned int ret = 0;
5173 bool is_stdarg;
5174 tree attrs;
5175
5176 if (TARGET_64BIT)
5177 return IX86_CALLCVT_CDECL;
5178
5179 attrs = TYPE_ATTRIBUTES (type);
5180 if (attrs != NULL_TREE)
5181 {
5182 if (lookup_attribute ("cdecl", attrs))
5183 ret |= IX86_CALLCVT_CDECL;
5184 else if (lookup_attribute ("stdcall", attrs))
5185 ret |= IX86_CALLCVT_STDCALL;
5186 else if (lookup_attribute ("fastcall", attrs))
5187 ret |= IX86_CALLCVT_FASTCALL;
5188 else if (lookup_attribute ("thiscall", attrs))
5189 ret |= IX86_CALLCVT_THISCALL;
5190
5191       /* Regparm isn't allowed for thiscall and fastcall.  */
5192 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5193 {
5194 if (lookup_attribute ("regparm", attrs))
5195 ret |= IX86_CALLCVT_REGPARM;
5196 if (lookup_attribute ("sseregparm", attrs))
5197 ret |= IX86_CALLCVT_SSEREGPARM;
5198 }
5199
5200 if (IX86_BASE_CALLCVT(ret) != 0)
5201 return ret;
5202 }
5203
5204 is_stdarg = stdarg_p (type);
5205 if (TARGET_RTD && !is_stdarg)
5206 return IX86_CALLCVT_STDCALL | ret;
5207
5208 if (ret != 0
5209 || is_stdarg
5210 || TREE_CODE (type) != METHOD_TYPE
5211 || ix86_function_type_abi (type) != MS_ABI)
5212 return IX86_CALLCVT_CDECL | ret;
5213
5214 return IX86_CALLCVT_THISCALL;
5215 }
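
/* Rough illustration of the returned bits (hypothetical types): a plain
   "int (int)" function type yields IX86_CALLCVT_CDECL, or
   IX86_CALLCVT_STDCALL when -mrtd is in effect and the type is not
   stdarg; a type carrying both fastcall and sseregparm yields only
   IX86_CALLCVT_FASTCALL, because the regparm/sseregparm bits are dropped
   for fastcall and thiscall above.  */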
5216
5217 /* Return 0 if the attributes for two types are incompatible, 1 if they
5218 are compatible, and 2 if they are nearly compatible (which causes a
5219 warning to be generated). */
5220
5221 static int
5222 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5223 {
5224 unsigned int ccvt1, ccvt2;
5225
5226 if (TREE_CODE (type1) != FUNCTION_TYPE
5227 && TREE_CODE (type1) != METHOD_TYPE)
5228 return 1;
5229
5230 ccvt1 = ix86_get_callcvt (type1);
5231 ccvt2 = ix86_get_callcvt (type2);
5232 if (ccvt1 != ccvt2)
5233 return 0;
5234 if (ix86_function_regparm (type1, NULL)
5235 != ix86_function_regparm (type2, NULL))
5236 return 0;
5237
5238 return 1;
5239 }
5240 \f
5241 /* Return the regparm value for a function with the indicated TYPE and DECL.
5242 DECL may be NULL when calling function indirectly
5243 or considering a libcall. */
5244
5245 static int
5246 ix86_function_regparm (const_tree type, const_tree decl)
5247 {
5248 tree attr;
5249 int regparm;
5250 unsigned int ccvt;
5251
5252 if (TARGET_64BIT)
5253 return (ix86_function_type_abi (type) == SYSV_ABI
5254 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5255 ccvt = ix86_get_callcvt (type);
5256 regparm = ix86_regparm;
5257
5258 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5259 {
5260 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5261 if (attr)
5262 {
5263 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5264 return regparm;
5265 }
5266 }
5267 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5268 return 2;
5269 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5270 return 1;
5271
5272 /* Use register calling convention for local functions when possible. */
5273 if (decl
5274 && TREE_CODE (decl) == FUNCTION_DECL
5275 && optimize
5276 && !(profile_flag && !flag_fentry))
5277 {
5278 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5279 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5280 if (i && i->local && i->can_change_signature)
5281 {
5282 int local_regparm, globals = 0, regno;
5283
5284 /* Make sure no regparm register is taken by a
5285 fixed register variable. */
5286 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5287 if (fixed_regs[local_regparm])
5288 break;
5289
5290 /* We don't want to use regparm(3) for nested functions as
5291 these use a static chain pointer in the third argument. */
5292 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5293 local_regparm = 2;
5294
5295 /* In 32-bit mode save a register for the split stack. */
5296 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5297 local_regparm = 2;
5298
5299 /* Each fixed register usage increases register pressure,
5300 	     so fewer registers should be used for argument passing.
5301 	     This functionality can be overridden by an explicit
5302 regparm value. */
5303 for (regno = 0; regno <= DI_REG; regno++)
5304 if (fixed_regs[regno])
5305 globals++;
5306
5307 local_regparm
5308 = globals < local_regparm ? local_regparm - globals : 0;
5309
5310 if (local_regparm > regparm)
5311 regparm = local_regparm;
5312 }
5313 }
5314
5315 return regparm;
5316 }
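
/* Sketch of the local-function promotion above (hypothetical example):

       static int add3 (int a, int b, int c) { return a + b + c; }

   When cgraph marks add3 as local and able to change its signature and
   optimization is enabled, the code above promotes it to the regparm(3)
   convention even without an explicit attribute, unless a regparm
   register is fixed, the function needs a static chain, or -fsplit-stack
   reserves a register.  */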
5317
5318 /* Return 1 or 2 if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5319 DFmode (2) arguments in SSE registers for a function with the
5320 indicated TYPE and DECL. DECL may be NULL when calling function
5321 indirectly or considering a libcall. Otherwise return 0. */
5322
5323 static int
5324 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5325 {
5326 gcc_assert (!TARGET_64BIT);
5327
5328 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5329 by the sseregparm attribute. */
5330 if (TARGET_SSEREGPARM
5331 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5332 {
5333 if (!TARGET_SSE)
5334 {
5335 if (warn)
5336 {
5337 if (decl)
5338 error ("calling %qD with attribute sseregparm without "
5339 "SSE/SSE2 enabled", decl);
5340 else
5341 error ("calling %qT with attribute sseregparm without "
5342 "SSE/SSE2 enabled", type);
5343 }
5344 return 0;
5345 }
5346
5347 return 2;
5348 }
5349
5350 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5351 (and DFmode for SSE2) arguments in SSE registers. */
5352 if (decl && TARGET_SSE_MATH && optimize
5353 && !(profile_flag && !flag_fentry))
5354 {
5355 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5356 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5357 if (i && i->local && i->can_change_signature)
5358 return TARGET_SSE2 ? 2 : 1;
5359 }
5360
5361 return 0;
5362 }
5363
5364 /* Return true if EAX is live at the start of the function. Used by
5365 ix86_expand_prologue to determine if we need special help before
5366 calling allocate_stack_worker. */
5367
5368 static bool
5369 ix86_eax_live_at_start_p (void)
5370 {
5371 /* Cheat. Don't bother working forward from ix86_function_regparm
5372 to the function type to whether an actual argument is located in
5373 eax. Instead just look at cfg info, which is still close enough
5374 to correct at this point. This gives false positives for broken
5375 functions that might use uninitialized data that happens to be
5376 allocated in eax, but who cares? */
5377 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5378 }
5379
5380 static bool
5381 ix86_keep_aggregate_return_pointer (tree fntype)
5382 {
5383 tree attr;
5384
5385 if (!TARGET_64BIT)
5386 {
5387 attr = lookup_attribute ("callee_pop_aggregate_return",
5388 TYPE_ATTRIBUTES (fntype));
5389 if (attr)
5390 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5391
5392 /* For 32-bit MS-ABI the default is to keep aggregate
5393 return pointer. */
5394 if (ix86_function_type_abi (fntype) == MS_ABI)
5395 return true;
5396 }
5397 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5398 }
5399
5400 /* Value is the number of bytes of arguments automatically
5401 popped when returning from a subroutine call.
5402 FUNDECL is the declaration node of the function (as a tree),
5403 FUNTYPE is the data type of the function (as a tree),
5404 or for a library call it is an identifier node for the subroutine name.
5405 SIZE is the number of bytes of arguments passed on the stack.
5406
5407 On the 80386, the RTD insn may be used to pop them if the number
5408 of args is fixed, but if the number is variable then the caller
5409 must pop them all. RTD can't be used for library calls now
5410 because the library is compiled with the Unix compiler.
5411 Use of RTD is a selectable option, since it is incompatible with
5412 standard Unix calling sequences. If the option is not selected,
5413 the caller must always pop the args.
5414
5415 The attribute stdcall is equivalent to RTD on a per module basis. */
5416
5417 static int
5418 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5419 {
5420 unsigned int ccvt;
5421
5422 /* None of the 64-bit ABIs pop arguments. */
5423 if (TARGET_64BIT)
5424 return 0;
5425
5426 ccvt = ix86_get_callcvt (funtype);
5427
5428 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5429 | IX86_CALLCVT_THISCALL)) != 0
5430 && ! stdarg_p (funtype))
5431 return size;
5432
5433 /* Lose any fake structure return argument if it is passed on the stack. */
5434 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5435 && !ix86_keep_aggregate_return_pointer (funtype))
5436 {
5437 int nregs = ix86_function_regparm (funtype, fundecl);
5438 if (nregs == 0)
5439 return GET_MODE_SIZE (Pmode);
5440 }
5441
5442 return 0;
5443 }
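
/* Worked example (hypothetical prototype): for

       int __attribute__ ((stdcall)) f (int a, int b);

   SIZE is 8 on a 32-bit target and the function above returns 8, so the
   callee pops its own arguments (e.g. with "ret $8"); a stdarg or plain
   cdecl function returns 0 and leaves the popping to the caller.  */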
5444 \f
5445 /* Argument support functions. */
5446
5447 /* Return true when register may be used to pass function parameters. */
5448 bool
5449 ix86_function_arg_regno_p (int regno)
5450 {
5451 int i;
5452 const int *parm_regs;
5453
5454 if (!TARGET_64BIT)
5455 {
5456 if (TARGET_MACHO)
5457 return (regno < REGPARM_MAX
5458 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5459 else
5460 return (regno < REGPARM_MAX
5461 || (TARGET_MMX && MMX_REGNO_P (regno)
5462 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5463 || (TARGET_SSE && SSE_REGNO_P (regno)
5464 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5465 }
5466
5467 if (TARGET_MACHO)
5468 {
5469 if (SSE_REGNO_P (regno) && TARGET_SSE)
5470 return true;
5471 }
5472 else
5473 {
5474 if (TARGET_SSE && SSE_REGNO_P (regno)
5475 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5476 return true;
5477 }
5478
5479   /* TODO: The function should depend on the current function's ABI, but
5480      builtins.c would then need updating.  Therefore we use the
5481      default ABI.  */
5482
5483 /* RAX is used as hidden argument to va_arg functions. */
5484 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5485 return true;
5486
5487 if (ix86_abi == MS_ABI)
5488 parm_regs = x86_64_ms_abi_int_parameter_registers;
5489 else
5490 parm_regs = x86_64_int_parameter_registers;
5491 for (i = 0; i < (ix86_abi == MS_ABI
5492 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5493 if (regno == parm_regs[i])
5494 return true;
5495 return false;
5496 }
5497
5498 /* Return true if we do not know how to pass TYPE solely in registers.  */
5499
5500 static bool
5501 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5502 {
5503 if (must_pass_in_stack_var_size_or_pad (mode, type))
5504 return true;
5505
5506 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5507 The layout_type routine is crafty and tries to trick us into passing
5508 currently unsupported vector types on the stack by using TImode. */
5509 return (!TARGET_64BIT && mode == TImode
5510 && type && TREE_CODE (type) != VECTOR_TYPE);
5511 }
5512
5513 /* Return the size, in bytes, of the area reserved for arguments passed
5514    in registers for the function represented by FNDECL, depending on the
5515    ABI used.  */
5516 int
5517 ix86_reg_parm_stack_space (const_tree fndecl)
5518 {
5519 enum calling_abi call_abi = SYSV_ABI;
5520 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5521 call_abi = ix86_function_abi (fndecl);
5522 else
5523 call_abi = ix86_function_type_abi (fndecl);
5524 if (TARGET_64BIT && call_abi == MS_ABI)
5525 return 32;
5526 return 0;
5527 }
5528
5529 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5530    calling ABI used.  */
5531 enum calling_abi
5532 ix86_function_type_abi (const_tree fntype)
5533 {
5534 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5535 {
5536 enum calling_abi abi = ix86_abi;
5537 if (abi == SYSV_ABI)
5538 {
5539 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5540 abi = MS_ABI;
5541 }
5542 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5543 abi = SYSV_ABI;
5544 return abi;
5545 }
5546 return ix86_abi;
5547 }
5548
5549 static bool
5550 ix86_function_ms_hook_prologue (const_tree fn)
5551 {
5552 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5553 {
5554 if (decl_function_context (fn) != NULL_TREE)
5555 error_at (DECL_SOURCE_LOCATION (fn),
5556 "ms_hook_prologue is not compatible with nested function");
5557 else
5558 return true;
5559 }
5560 return false;
5561 }
5562
5563 static enum calling_abi
5564 ix86_function_abi (const_tree fndecl)
5565 {
5566 if (! fndecl)
5567 return ix86_abi;
5568 return ix86_function_type_abi (TREE_TYPE (fndecl));
5569 }
5570
5571 /* Return SYSV_ABI or MS_ABI, depending on cfun, specifying the
5572    calling ABI used.  */
5573 enum calling_abi
5574 ix86_cfun_abi (void)
5575 {
5576 if (! cfun)
5577 return ix86_abi;
5578 return cfun->machine->call_abi;
5579 }
5580
5581 /* Write the extra assembler code needed to declare a function properly. */
5582
5583 void
5584 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5585 tree decl)
5586 {
5587 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5588
5589 if (is_ms_hook)
5590 {
5591 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5592 unsigned int filler_cc = 0xcccccccc;
5593
5594 for (i = 0; i < filler_count; i += 4)
5595 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5596 }
5597
5598 #ifdef SUBTARGET_ASM_UNWIND_INIT
5599 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5600 #endif
5601
5602 ASM_OUTPUT_LABEL (asm_out_file, fname);
5603
5604 /* Output magic byte marker, if hot-patch attribute is set. */
5605 if (is_ms_hook)
5606 {
5607 if (TARGET_64BIT)
5608 {
5609 /* leaq [%rsp + 0], %rsp */
5610 asm_fprintf (asm_out_file, ASM_BYTE
5611 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5612 }
5613 else
5614 {
5615 /* movl.s %edi, %edi
5616 push %ebp
5617 movl.s %esp, %ebp */
5618 asm_fprintf (asm_out_file, ASM_BYTE
5619 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5620 }
5621 }
5622 }
5623
5624 /* regclass.c */
5625 extern void init_regs (void);
5626
5627 /* Implementation of the call ABI switching target hook.  Set up the
5628    call register sets specific to FNDECL.  See also
5629    ix86_conditional_register_usage for more details.  */
5630 void
5631 ix86_call_abi_override (const_tree fndecl)
5632 {
5633 if (fndecl == NULL_TREE)
5634 cfun->machine->call_abi = ix86_abi;
5635 else
5636 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5637 }
5638
5639 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5640    Avoid expensive re-initialization of init_regs each time we switch
5641    function context, since this is needed only during RTL expansion.  */
5642 static void
5643 ix86_maybe_switch_abi (void)
5644 {
5645 if (TARGET_64BIT &&
5646 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5647 reinit_regs ();
5648 }
5649
5650 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5651 for a call to a function whose data type is FNTYPE.
5652 For a library call, FNTYPE is 0. */
5653
5654 void
5655 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5656 tree fntype, /* tree ptr for function decl */
5657 rtx libname, /* SYMBOL_REF of library name or 0 */
5658 tree fndecl,
5659 int caller)
5660 {
5661 struct cgraph_local_info *i;
5662 tree fnret_type;
5663
5664 memset (cum, 0, sizeof (*cum));
5665
5666 /* Initialize for the current callee. */
5667 if (caller)
5668 {
5669 cfun->machine->callee_pass_avx256_p = false;
5670 cfun->machine->callee_return_avx256_p = false;
5671 }
5672
5673 if (fndecl)
5674 {
5675 i = cgraph_local_info (fndecl);
5676 cum->call_abi = ix86_function_abi (fndecl);
5677 fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5678 }
5679 else
5680 {
5681 i = NULL;
5682 cum->call_abi = ix86_function_type_abi (fntype);
5683 if (fntype)
5684 fnret_type = TREE_TYPE (fntype);
5685 else
5686 fnret_type = NULL;
5687 }
5688
5689 if (TARGET_VZEROUPPER && fnret_type)
5690 {
5691 rtx fnret_value = ix86_function_value (fnret_type, fntype,
5692 false);
5693 if (function_pass_avx256_p (fnret_value))
5694 {
5695 /* The return value of this function uses 256bit AVX modes. */
5696 if (caller)
5697 cfun->machine->callee_return_avx256_p = true;
5698 else
5699 cfun->machine->caller_return_avx256_p = true;
5700 }
5701 }
5702
5703 cum->caller = caller;
5704
5705 /* Set up the number of registers to use for passing arguments. */
5706
5707 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5708 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5709 "or subtarget optimization implying it");
5710 cum->nregs = ix86_regparm;
5711 if (TARGET_64BIT)
5712 {
5713 cum->nregs = (cum->call_abi == SYSV_ABI
5714 ? X86_64_REGPARM_MAX
5715 : X86_64_MS_REGPARM_MAX);
5716 }
5717 if (TARGET_SSE)
5718 {
5719 cum->sse_nregs = SSE_REGPARM_MAX;
5720 if (TARGET_64BIT)
5721 {
5722 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5723 ? X86_64_SSE_REGPARM_MAX
5724 : X86_64_MS_SSE_REGPARM_MAX);
5725 }
5726 }
5727 if (TARGET_MMX)
5728 cum->mmx_nregs = MMX_REGPARM_MAX;
5729 cum->warn_avx = true;
5730 cum->warn_sse = true;
5731 cum->warn_mmx = true;
5732
5733   /* Because the type might mismatch between caller and callee, we need to
5734      use the actual type of the function for local calls.
5735      FIXME: cgraph_analyze can be told to actually record whether a function
5736      uses va_start, so for local functions maybe_vaarg can be made more
5737      aggressive, helping K&R code.
5738      FIXME: once the type system is fixed, we won't need this code anymore.  */
5739 if (i && i->local && i->can_change_signature)
5740 fntype = TREE_TYPE (fndecl);
5741 cum->maybe_vaarg = (fntype
5742 ? (!prototype_p (fntype) || stdarg_p (fntype))
5743 : !libname);
5744
5745 if (!TARGET_64BIT)
5746 {
5747 /* If there are variable arguments, then we won't pass anything
5748 in registers in 32-bit mode. */
5749 if (stdarg_p (fntype))
5750 {
5751 cum->nregs = 0;
5752 cum->sse_nregs = 0;
5753 cum->mmx_nregs = 0;
5754 cum->warn_avx = 0;
5755 cum->warn_sse = 0;
5756 cum->warn_mmx = 0;
5757 return;
5758 }
5759
5760 /* Use ecx and edx registers if function has fastcall attribute,
5761 else look for regparm information. */
5762 if (fntype)
5763 {
5764 unsigned int ccvt = ix86_get_callcvt (fntype);
5765 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5766 {
5767 cum->nregs = 1;
5768 cum->fastcall = 1; /* Same first register as in fastcall. */
5769 }
5770 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5771 {
5772 cum->nregs = 2;
5773 cum->fastcall = 1;
5774 }
5775 else
5776 cum->nregs = ix86_function_regparm (fntype, fndecl);
5777 }
5778
5779 /* Set up the number of SSE registers used for passing SFmode
5780 and DFmode arguments. Warn for mismatching ABI. */
5781 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5782 }
5783 }
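
/* Example of the 32-bit register setup above (hypothetical prototype):

       void __attribute__ ((fastcall)) f (int a, int b, int c);

   starts with cum->nregs == 2 and cum->fastcall == 1, so A and B land in
   ECX and EDX (see function_arg_32 below) while C goes on the stack; a
   stdarg prototype instead zeroes all of the register counts.  */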
5784
5785 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5786 But in the case of vector types, it is some vector mode.
5787
5788 When we have only some of our vector isa extensions enabled, then there
5789 are some modes for which vector_mode_supported_p is false. For these
5790 modes, the generic vector support in gcc will choose some non-vector mode
5791 in order to implement the type. By computing the natural mode, we'll
5792 select the proper ABI location for the operand and not depend on whatever
5793 the middle-end decides to do with these vector types.
5794
5795    The middle-end can't deal with vector types > 16 bytes.  In this
5796    case, we return the original mode and warn about the ABI change if
5797    CUM isn't NULL.  */
5798
5799 static enum machine_mode
5800 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5801 {
5802 enum machine_mode mode = TYPE_MODE (type);
5803
5804 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5805 {
5806 HOST_WIDE_INT size = int_size_in_bytes (type);
5807 if ((size == 8 || size == 16 || size == 32)
5808 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5809 && TYPE_VECTOR_SUBPARTS (type) > 1)
5810 {
5811 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5812
5813 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5814 mode = MIN_MODE_VECTOR_FLOAT;
5815 else
5816 mode = MIN_MODE_VECTOR_INT;
5817
5818 /* Get the mode which has this inner mode and number of units. */
5819 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5820 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5821 && GET_MODE_INNER (mode) == innermode)
5822 {
5823 if (size == 32 && !TARGET_AVX)
5824 {
5825 static bool warnedavx;
5826
5827 if (cum
5828 && !warnedavx
5829 && cum->warn_avx)
5830 {
5831 warnedavx = true;
5832 warning (0, "AVX vector argument without AVX "
5833 "enabled changes the ABI");
5834 }
5835 return TYPE_MODE (type);
5836 }
5837 else if ((size == 8 || size == 16) && !TARGET_SSE)
5838 {
5839 static bool warnedsse;
5840
5841 if (cum
5842 && !warnedsse
5843 && cum->warn_sse)
5844 {
5845 warnedsse = true;
5846 warning (0, "SSE vector argument without SSE "
5847 "enabled changes the ABI");
5848 }
5849 return mode;
5850 }
5851 else
5852 return mode;
5853 }
5854
5855 gcc_unreachable ();
5856 }
5857 }
5858
5859 return mode;
5860 }
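
/* Illustrative sketch (hypothetical typedef): for

       typedef float v8sf __attribute__ ((vector_size (32)));

   compiled without -mavx, generic vector lowering leaves the type with a
   non-vector TYPE_MODE; the function above finds V8SFmode as the natural
   mode but, because TARGET_AVX is off, may warn that the ABI changes and
   returns TYPE_MODE (type) unchanged.  */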
5861
5862 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5863 this may not agree with the mode that the type system has chosen for the
5864 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5865 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5866
5867 static rtx
5868 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5869 unsigned int regno)
5870 {
5871 rtx tmp;
5872
5873 if (orig_mode != BLKmode)
5874 tmp = gen_rtx_REG (orig_mode, regno);
5875 else
5876 {
5877 tmp = gen_rtx_REG (mode, regno);
5878 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5879 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5880 }
5881
5882 return tmp;
5883 }
5884
5885 /* x86-64 register passing implementation.  See the x86-64 psABI for
5886    details.  The goal of this code is to classify each eightbyte of the
5887    incoming argument by register class and assign registers accordingly.  */
5888
5889 /* Return the union class of CLASS1 and CLASS2.
5890 See the x86-64 PS ABI for details. */
5891
5892 static enum x86_64_reg_class
5893 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5894 {
5895 /* Rule #1: If both classes are equal, this is the resulting class. */
5896 if (class1 == class2)
5897 return class1;
5898
5899 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5900 the other class. */
5901 if (class1 == X86_64_NO_CLASS)
5902 return class2;
5903 if (class2 == X86_64_NO_CLASS)
5904 return class1;
5905
5906 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5907 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5908 return X86_64_MEMORY_CLASS;
5909
5910 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5911 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5912 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5913 return X86_64_INTEGERSI_CLASS;
5914 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5915 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5916 return X86_64_INTEGER_CLASS;
5917
5918 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5919 MEMORY is used. */
5920 if (class1 == X86_64_X87_CLASS
5921 || class1 == X86_64_X87UP_CLASS
5922 || class1 == X86_64_COMPLEX_X87_CLASS
5923 || class2 == X86_64_X87_CLASS
5924 || class2 == X86_64_X87UP_CLASS
5925 || class2 == X86_64_COMPLEX_X87_CLASS)
5926 return X86_64_MEMORY_CLASS;
5927
5928 /* Rule #6: Otherwise class SSE is used. */
5929 return X86_64_SSE_CLASS;
5930 }
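
/* Illustrative merges under the rules above: INTEGERSI merged with SSESF
   gives INTEGERSI (rule #4), anything merged with NO_CLASS gives the
   other class (rule #2), and anything merged with MEMORY stays MEMORY
   (rule #3).  */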
5931
5932 /* Classify the argument of type TYPE and mode MODE.
5933 CLASSES will be filled by the register class used to pass each word
5934 of the operand. The number of words is returned. In case the parameter
5935 should be passed in memory, 0 is returned. As a special case for zero
5936 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5937
5938    BIT_OFFSET is used internally for handling records and specifies the
5939    offset in bits modulo 256 to avoid overflow cases.
5940
5941 See the x86-64 PS ABI for details.
5942 */
5943
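/* Worked example (hypothetical struct): on x86-64,

       struct s { double d; int i; };

   occupies two eightbytes; the first is classified X86_64_SSEDF_CLASS
   (from the double) and the second X86_64_INTEGERSI_CLASS (from the int),
   so the struct ends up split between one SSE and one integer register.  */
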
5944 static int
5945 classify_argument (enum machine_mode mode, const_tree type,
5946 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5947 {
5948 HOST_WIDE_INT bytes =
5949 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5950 int words
5951 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5952
5953 /* Variable sized entities are always passed/returned in memory. */
5954 if (bytes < 0)
5955 return 0;
5956
5957 if (mode != VOIDmode
5958 && targetm.calls.must_pass_in_stack (mode, type))
5959 return 0;
5960
5961 if (type && AGGREGATE_TYPE_P (type))
5962 {
5963 int i;
5964 tree field;
5965 enum x86_64_reg_class subclasses[MAX_CLASSES];
5966
5967 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5968 if (bytes > 32)
5969 return 0;
5970
5971 for (i = 0; i < words; i++)
5972 classes[i] = X86_64_NO_CLASS;
5973
5974       /* Zero-sized arrays or structures are NO_CLASS.  We return 0 to
5975 	 signal the memory class, so handle this as a special case.  */
5976 if (!words)
5977 {
5978 classes[0] = X86_64_NO_CLASS;
5979 return 1;
5980 }
5981
5982 /* Classify each field of record and merge classes. */
5983 switch (TREE_CODE (type))
5984 {
5985 case RECORD_TYPE:
5986 /* And now merge the fields of structure. */
5987 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5988 {
5989 if (TREE_CODE (field) == FIELD_DECL)
5990 {
5991 int num;
5992
5993 if (TREE_TYPE (field) == error_mark_node)
5994 continue;
5995
5996 /* Bitfields are always classified as integer. Handle them
5997 early, since later code would consider them to be
5998 misaligned integers. */
5999 if (DECL_BIT_FIELD (field))
6000 {
6001 for (i = (int_bit_position (field)
6002 + (bit_offset % 64)) / 8 / 8;
6003 i < ((int_bit_position (field) + (bit_offset % 64))
6004 + tree_low_cst (DECL_SIZE (field), 0)
6005 + 63) / 8 / 8; i++)
6006 classes[i] =
6007 merge_classes (X86_64_INTEGER_CLASS,
6008 classes[i]);
6009 }
6010 else
6011 {
6012 int pos;
6013
6014 type = TREE_TYPE (field);
6015
6016 /* Flexible array member is ignored. */
6017 if (TYPE_MODE (type) == BLKmode
6018 && TREE_CODE (type) == ARRAY_TYPE
6019 && TYPE_SIZE (type) == NULL_TREE
6020 && TYPE_DOMAIN (type) != NULL_TREE
6021 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6022 == NULL_TREE))
6023 {
6024 static bool warned;
6025
6026 if (!warned && warn_psabi)
6027 {
6028 warned = true;
6029 inform (input_location,
6030 "the ABI of passing struct with"
6031 " a flexible array member has"
6032 " changed in GCC 4.4");
6033 }
6034 continue;
6035 }
6036 num = classify_argument (TYPE_MODE (type), type,
6037 subclasses,
6038 (int_bit_position (field)
6039 + bit_offset) % 256);
6040 if (!num)
6041 return 0;
6042 pos = (int_bit_position (field)
6043 + (bit_offset % 64)) / 8 / 8;
6044 for (i = 0; i < num && (i + pos) < words; i++)
6045 classes[i + pos] =
6046 merge_classes (subclasses[i], classes[i + pos]);
6047 }
6048 }
6049 }
6050 break;
6051
6052 case ARRAY_TYPE:
6053 /* Arrays are handled as small records. */
6054 {
6055 int num;
6056 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6057 TREE_TYPE (type), subclasses, bit_offset);
6058 if (!num)
6059 return 0;
6060
6061 /* The partial classes are now full classes. */
6062 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6063 subclasses[0] = X86_64_SSE_CLASS;
6064 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6065 && !((bit_offset % 64) == 0 && bytes == 4))
6066 subclasses[0] = X86_64_INTEGER_CLASS;
6067
6068 for (i = 0; i < words; i++)
6069 classes[i] = subclasses[i % num];
6070
6071 break;
6072 }
6073 case UNION_TYPE:
6074 case QUAL_UNION_TYPE:
6075       /* Unions are similar to RECORD_TYPE but the offset is always 0.  */
6077 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6078 {
6079 if (TREE_CODE (field) == FIELD_DECL)
6080 {
6081 int num;
6082
6083 if (TREE_TYPE (field) == error_mark_node)
6084 continue;
6085
6086 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6087 TREE_TYPE (field), subclasses,
6088 bit_offset);
6089 if (!num)
6090 return 0;
6091 for (i = 0; i < num; i++)
6092 classes[i] = merge_classes (subclasses[i], classes[i]);
6093 }
6094 }
6095 break;
6096
6097 default:
6098 gcc_unreachable ();
6099 }
6100
6101 if (words > 2)
6102 {
6103 	  /* When the size exceeds 16 bytes, if the first class isn't
6104 	     X86_64_SSE_CLASS or any of the other classes isn't
6105 	     X86_64_SSEUP_CLASS, everything should be passed in
6106 	     memory.  */
6107 if (classes[0] != X86_64_SSE_CLASS)
6108 return 0;
6109
6110 for (i = 1; i < words; i++)
6111 if (classes[i] != X86_64_SSEUP_CLASS)
6112 return 0;
6113 }
6114
6115 /* Final merger cleanup. */
6116 for (i = 0; i < words; i++)
6117 {
6118 /* If one class is MEMORY, everything should be passed in
6119 memory. */
6120 if (classes[i] == X86_64_MEMORY_CLASS)
6121 return 0;
6122
6123 	  /* X86_64_SSEUP_CLASS should always be preceded by
6124 	     X86_64_SSE_CLASS or X86_64_SSEUP_CLASS.  */
6125 if (classes[i] == X86_64_SSEUP_CLASS
6126 && classes[i - 1] != X86_64_SSE_CLASS
6127 && classes[i - 1] != X86_64_SSEUP_CLASS)
6128 {
6129 /* The first one should never be X86_64_SSEUP_CLASS. */
6130 gcc_assert (i != 0);
6131 classes[i] = X86_64_SSE_CLASS;
6132 }
6133
6134 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6135 everything should be passed in memory. */
6136 if (classes[i] == X86_64_X87UP_CLASS
6137 && (classes[i - 1] != X86_64_X87_CLASS))
6138 {
6139 static bool warned;
6140
6141 /* The first one should never be X86_64_X87UP_CLASS. */
6142 gcc_assert (i != 0);
6143 if (!warned && warn_psabi)
6144 {
6145 warned = true;
6146 inform (input_location,
6147 "the ABI of passing union with long double"
6148 " has changed in GCC 4.4");
6149 }
6150 return 0;
6151 }
6152 }
6153 return words;
6154 }
6155
6156   /* Compute the alignment needed.  We align all types to their natural
6157      boundaries, with the exception of XFmode, which is aligned to 64 bits.  */
6158 if (mode != VOIDmode && mode != BLKmode)
6159 {
6160 int mode_alignment = GET_MODE_BITSIZE (mode);
6161
6162 if (mode == XFmode)
6163 mode_alignment = 128;
6164 else if (mode == XCmode)
6165 mode_alignment = 256;
6166 if (COMPLEX_MODE_P (mode))
6167 mode_alignment /= 2;
6168 /* Misaligned fields are always returned in memory. */
6169 if (bit_offset % mode_alignment)
6170 return 0;
6171 }
6172
6173   /* For V1xx modes, just use the base mode.  */
6174 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6175 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6176 mode = GET_MODE_INNER (mode);
6177
6178 /* Classification of atomic types. */
6179 switch (mode)
6180 {
6181 case SDmode:
6182 case DDmode:
6183 classes[0] = X86_64_SSE_CLASS;
6184 return 1;
6185 case TDmode:
6186 classes[0] = X86_64_SSE_CLASS;
6187 classes[1] = X86_64_SSEUP_CLASS;
6188 return 2;
6189 case DImode:
6190 case SImode:
6191 case HImode:
6192 case QImode:
6193 case CSImode:
6194 case CHImode:
6195 case CQImode:
6196 {
6197 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6198
6199 if (size <= 32)
6200 {
6201 classes[0] = X86_64_INTEGERSI_CLASS;
6202 return 1;
6203 }
6204 else if (size <= 64)
6205 {
6206 classes[0] = X86_64_INTEGER_CLASS;
6207 return 1;
6208 }
6209 else if (size <= 64+32)
6210 {
6211 classes[0] = X86_64_INTEGER_CLASS;
6212 classes[1] = X86_64_INTEGERSI_CLASS;
6213 return 2;
6214 }
6215 else if (size <= 64+64)
6216 {
6217 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6218 return 2;
6219 }
6220 else
6221 gcc_unreachable ();
6222 }
6223 case CDImode:
6224 case TImode:
6225 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6226 return 2;
6227 case COImode:
6228 case OImode:
6229 /* OImode shouldn't be used directly. */
6230 gcc_unreachable ();
6231 case CTImode:
6232 return 0;
6233 case SFmode:
6234 if (!(bit_offset % 64))
6235 classes[0] = X86_64_SSESF_CLASS;
6236 else
6237 classes[0] = X86_64_SSE_CLASS;
6238 return 1;
6239 case DFmode:
6240 classes[0] = X86_64_SSEDF_CLASS;
6241 return 1;
6242 case XFmode:
6243 classes[0] = X86_64_X87_CLASS;
6244 classes[1] = X86_64_X87UP_CLASS;
6245 return 2;
6246 case TFmode:
6247 classes[0] = X86_64_SSE_CLASS;
6248 classes[1] = X86_64_SSEUP_CLASS;
6249 return 2;
6250 case SCmode:
6251 classes[0] = X86_64_SSE_CLASS;
6252 if (!(bit_offset % 64))
6253 return 1;
6254 else
6255 {
6256 static bool warned;
6257
6258 if (!warned && warn_psabi)
6259 {
6260 warned = true;
6261 inform (input_location,
6262 "the ABI of passing structure with complex float"
6263 " member has changed in GCC 4.4");
6264 }
6265 classes[1] = X86_64_SSESF_CLASS;
6266 return 2;
6267 }
6268 case DCmode:
6269 classes[0] = X86_64_SSEDF_CLASS;
6270 classes[1] = X86_64_SSEDF_CLASS;
6271 return 2;
6272 case XCmode:
6273 classes[0] = X86_64_COMPLEX_X87_CLASS;
6274 return 1;
6275 case TCmode:
6276       /* This mode is larger than 16 bytes.  */
6277 return 0;
6278 case V8SFmode:
6279 case V8SImode:
6280 case V32QImode:
6281 case V16HImode:
6282 case V4DFmode:
6283 case V4DImode:
6284 classes[0] = X86_64_SSE_CLASS;
6285 classes[1] = X86_64_SSEUP_CLASS;
6286 classes[2] = X86_64_SSEUP_CLASS;
6287 classes[3] = X86_64_SSEUP_CLASS;
6288 return 4;
6289 case V4SFmode:
6290 case V4SImode:
6291 case V16QImode:
6292 case V8HImode:
6293 case V2DFmode:
6294 case V2DImode:
6295 classes[0] = X86_64_SSE_CLASS;
6296 classes[1] = X86_64_SSEUP_CLASS;
6297 return 2;
6298 case V1TImode:
6299 case V1DImode:
6300 case V2SFmode:
6301 case V2SImode:
6302 case V4HImode:
6303 case V8QImode:
6304 classes[0] = X86_64_SSE_CLASS;
6305 return 1;
6306 case BLKmode:
6307 case VOIDmode:
6308 return 0;
6309 default:
6310 gcc_assert (VECTOR_MODE_P (mode));
6311
6312 if (bytes > 16)
6313 return 0;
6314
6315 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6316
6317 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6318 classes[0] = X86_64_INTEGERSI_CLASS;
6319 else
6320 classes[0] = X86_64_INTEGER_CLASS;
6321 classes[1] = X86_64_INTEGER_CLASS;
6322 return 1 + (bytes > 8);
6323 }
6324 }
6325
6326 /* Examine the argument and set the number of registers required in each
6327    class.  Return 0 iff the parameter should be passed in memory.  */
6328 static int
6329 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6330 int *int_nregs, int *sse_nregs)
6331 {
6332 enum x86_64_reg_class regclass[MAX_CLASSES];
6333 int n = classify_argument (mode, type, regclass, 0);
6334
6335 *int_nregs = 0;
6336 *sse_nregs = 0;
6337 if (!n)
6338 return 0;
6339 for (n--; n >= 0; n--)
6340 switch (regclass[n])
6341 {
6342 case X86_64_INTEGER_CLASS:
6343 case X86_64_INTEGERSI_CLASS:
6344 (*int_nregs)++;
6345 break;
6346 case X86_64_SSE_CLASS:
6347 case X86_64_SSESF_CLASS:
6348 case X86_64_SSEDF_CLASS:
6349 (*sse_nregs)++;
6350 break;
6351 case X86_64_NO_CLASS:
6352 case X86_64_SSEUP_CLASS:
6353 break;
6354 case X86_64_X87_CLASS:
6355 case X86_64_X87UP_CLASS:
6356 if (!in_return)
6357 return 0;
6358 break;
6359 case X86_64_COMPLEX_X87_CLASS:
6360 return in_return ? 2 : 0;
6361 case X86_64_MEMORY_CLASS:
6362 gcc_unreachable ();
6363 }
6364 return 1;
6365 }
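
/* Continuing the struct example above (hypothetical struct s with a
   double and an int): examine_argument reports *int_nregs == 1 and
   *sse_nregs == 1, so passing it consumes one integer and one SSE
   register from the CUMULATIVE_ARGS counters.  */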
6366
6367 /* Construct container for the argument used by GCC interface. See
6368 FUNCTION_ARG for the detailed description. */
6369
6370 static rtx
6371 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6372 const_tree type, int in_return, int nintregs, int nsseregs,
6373 const int *intreg, int sse_regno)
6374 {
6375 /* The following variables hold the static issued_error state. */
6376 static bool issued_sse_arg_error;
6377 static bool issued_sse_ret_error;
6378 static bool issued_x87_ret_error;
6379
6380 enum machine_mode tmpmode;
6381 int bytes =
6382 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6383 enum x86_64_reg_class regclass[MAX_CLASSES];
6384 int n;
6385 int i;
6386 int nexps = 0;
6387 int needed_sseregs, needed_intregs;
6388 rtx exp[MAX_CLASSES];
6389 rtx ret;
6390
6391 n = classify_argument (mode, type, regclass, 0);
6392 if (!n)
6393 return NULL;
6394 if (!examine_argument (mode, type, in_return, &needed_intregs,
6395 &needed_sseregs))
6396 return NULL;
6397 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6398 return NULL;
6399
6400 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6401 some less clueful developer tries to use floating-point anyway. */
6402 if (needed_sseregs && !TARGET_SSE)
6403 {
6404 if (in_return)
6405 {
6406 if (!issued_sse_ret_error)
6407 {
6408 error ("SSE register return with SSE disabled");
6409 issued_sse_ret_error = true;
6410 }
6411 }
6412 else if (!issued_sse_arg_error)
6413 {
6414 error ("SSE register argument with SSE disabled");
6415 issued_sse_arg_error = true;
6416 }
6417 return NULL;
6418 }
6419
6420 /* Likewise, error if the ABI requires us to return values in the
6421 x87 registers and the user specified -mno-80387. */
6422 if (!TARGET_80387 && in_return)
6423 for (i = 0; i < n; i++)
6424 if (regclass[i] == X86_64_X87_CLASS
6425 || regclass[i] == X86_64_X87UP_CLASS
6426 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6427 {
6428 if (!issued_x87_ret_error)
6429 {
6430 error ("x87 register return with x87 disabled");
6431 issued_x87_ret_error = true;
6432 }
6433 return NULL;
6434 }
6435
6436   /* First construct the simple cases.  Avoid SCmode, since we want to use
6437      a single register to pass this type.  */
6438 if (n == 1 && mode != SCmode)
6439 switch (regclass[0])
6440 {
6441 case X86_64_INTEGER_CLASS:
6442 case X86_64_INTEGERSI_CLASS:
6443 return gen_rtx_REG (mode, intreg[0]);
6444 case X86_64_SSE_CLASS:
6445 case X86_64_SSESF_CLASS:
6446 case X86_64_SSEDF_CLASS:
6447 if (mode != BLKmode)
6448 return gen_reg_or_parallel (mode, orig_mode,
6449 SSE_REGNO (sse_regno));
6450 break;
6451 case X86_64_X87_CLASS:
6452 case X86_64_COMPLEX_X87_CLASS:
6453 return gen_rtx_REG (mode, FIRST_STACK_REG);
6454 case X86_64_NO_CLASS:
6455 /* Zero sized array, struct or class. */
6456 return NULL;
6457 default:
6458 gcc_unreachable ();
6459 }
6460 if (n == 2
6461 && regclass[0] == X86_64_SSE_CLASS
6462 && regclass[1] == X86_64_SSEUP_CLASS
6463 && mode != BLKmode)
6464 return gen_reg_or_parallel (mode, orig_mode,
6465 SSE_REGNO (sse_regno));
6466 if (n == 4
6467 && regclass[0] == X86_64_SSE_CLASS
6468 && regclass[1] == X86_64_SSEUP_CLASS
6469 && regclass[2] == X86_64_SSEUP_CLASS
6470 && regclass[3] == X86_64_SSEUP_CLASS
6471 && mode != BLKmode)
6472 return gen_reg_or_parallel (mode, orig_mode,
6473 SSE_REGNO (sse_regno));
6474 if (n == 2
6475 && regclass[0] == X86_64_X87_CLASS
6476 && regclass[1] == X86_64_X87UP_CLASS)
6477 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6478
6479 if (n == 2
6480 && regclass[0] == X86_64_INTEGER_CLASS
6481 && regclass[1] == X86_64_INTEGER_CLASS
6482 && (mode == CDImode || mode == TImode || mode == TFmode)
6483 && intreg[0] + 1 == intreg[1])
6484 return gen_rtx_REG (mode, intreg[0]);
6485
6486 /* Otherwise figure out the entries of the PARALLEL. */
6487 for (i = 0; i < n; i++)
6488 {
6489 int pos;
6490
6491 switch (regclass[i])
6492 {
6493 case X86_64_NO_CLASS:
6494 break;
6495 case X86_64_INTEGER_CLASS:
6496 case X86_64_INTEGERSI_CLASS:
6497 /* Merge TImodes on aligned occasions here too. */
6498 if (i * 8 + 8 > bytes)
6499 tmpmode
6500 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6501 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6502 tmpmode = SImode;
6503 else
6504 tmpmode = DImode;
6505 	  /* We've requested 24 bytes for which we
6506 	     don't have a mode.  Use DImode.  */
6507 if (tmpmode == BLKmode)
6508 tmpmode = DImode;
6509 exp [nexps++]
6510 = gen_rtx_EXPR_LIST (VOIDmode,
6511 gen_rtx_REG (tmpmode, *intreg),
6512 GEN_INT (i*8));
6513 intreg++;
6514 break;
6515 case X86_64_SSESF_CLASS:
6516 exp [nexps++]
6517 = gen_rtx_EXPR_LIST (VOIDmode,
6518 gen_rtx_REG (SFmode,
6519 SSE_REGNO (sse_regno)),
6520 GEN_INT (i*8));
6521 sse_regno++;
6522 break;
6523 case X86_64_SSEDF_CLASS:
6524 exp [nexps++]
6525 = gen_rtx_EXPR_LIST (VOIDmode,
6526 gen_rtx_REG (DFmode,
6527 SSE_REGNO (sse_regno)),
6528 GEN_INT (i*8));
6529 sse_regno++;
6530 break;
6531 case X86_64_SSE_CLASS:
6532 pos = i;
6533 switch (n)
6534 {
6535 case 1:
6536 tmpmode = DImode;
6537 break;
6538 case 2:
6539 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6540 {
6541 tmpmode = TImode;
6542 i++;
6543 }
6544 else
6545 tmpmode = DImode;
6546 break;
6547 case 4:
6548 gcc_assert (i == 0
6549 && regclass[1] == X86_64_SSEUP_CLASS
6550 && regclass[2] == X86_64_SSEUP_CLASS
6551 && regclass[3] == X86_64_SSEUP_CLASS);
6552 tmpmode = OImode;
6553 i += 3;
6554 break;
6555 default:
6556 gcc_unreachable ();
6557 }
6558 exp [nexps++]
6559 = gen_rtx_EXPR_LIST (VOIDmode,
6560 gen_rtx_REG (tmpmode,
6561 SSE_REGNO (sse_regno)),
6562 GEN_INT (pos*8));
6563 sse_regno++;
6564 break;
6565 default:
6566 gcc_unreachable ();
6567 }
6568 }
6569
6570 /* Empty aligned struct, union or class. */
6571 if (nexps == 0)
6572 return NULL;
6573
6574 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6575 for (i = 0; i < nexps; i++)
6576 XVECEXP (ret, 0, i) = exp [i];
6577 return ret;
6578 }
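
/* Continuing the same hypothetical example, struct s { double d; int i; }
   makes the function above build a PARALLEL holding two EXPR_LISTs,
   (reg:DF xmm0) at offset 0 and (reg:SI di) at offset 8, assuming those
   are the next free SSE and integer argument registers.  */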
6579
6580 /* Update the data in CUM to advance over an argument of mode MODE
6581 and data type TYPE. (TYPE is null for libcalls where that information
6582 may not be available.) */
6583
6584 static void
6585 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6586 const_tree type, HOST_WIDE_INT bytes,
6587 HOST_WIDE_INT words)
6588 {
6589 switch (mode)
6590 {
6591 default:
6592 break;
6593
6594 case BLKmode:
6595 if (bytes < 0)
6596 break;
6597 /* FALLTHRU */
6598
6599 case DImode:
6600 case SImode:
6601 case HImode:
6602 case QImode:
6603 cum->words += words;
6604 cum->nregs -= words;
6605 cum->regno += words;
6606
6607 if (cum->nregs <= 0)
6608 {
6609 cum->nregs = 0;
6610 cum->regno = 0;
6611 }
6612 break;
6613
6614 case OImode:
6615 /* OImode shouldn't be used directly. */
6616 gcc_unreachable ();
6617
6618 case DFmode:
6619 if (cum->float_in_sse < 2)
6620 break;
6621 case SFmode:
6622 if (cum->float_in_sse < 1)
6623 break;
6624 /* FALLTHRU */
6625
6626 case V8SFmode:
6627 case V8SImode:
6628 case V32QImode:
6629 case V16HImode:
6630 case V4DFmode:
6631 case V4DImode:
6632 case TImode:
6633 case V16QImode:
6634 case V8HImode:
6635 case V4SImode:
6636 case V2DImode:
6637 case V4SFmode:
6638 case V2DFmode:
6639 if (!type || !AGGREGATE_TYPE_P (type))
6640 {
6641 cum->sse_words += words;
6642 cum->sse_nregs -= 1;
6643 cum->sse_regno += 1;
6644 if (cum->sse_nregs <= 0)
6645 {
6646 cum->sse_nregs = 0;
6647 cum->sse_regno = 0;
6648 }
6649 }
6650 break;
6651
6652 case V8QImode:
6653 case V4HImode:
6654 case V2SImode:
6655 case V2SFmode:
6656 case V1TImode:
6657 case V1DImode:
6658 if (!type || !AGGREGATE_TYPE_P (type))
6659 {
6660 cum->mmx_words += words;
6661 cum->mmx_nregs -= 1;
6662 cum->mmx_regno += 1;
6663 if (cum->mmx_nregs <= 0)
6664 {
6665 cum->mmx_nregs = 0;
6666 cum->mmx_regno = 0;
6667 }
6668 }
6669 break;
6670 }
6671 }
6672
6673 static void
6674 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6675 const_tree type, HOST_WIDE_INT words, bool named)
6676 {
6677 int int_nregs, sse_nregs;
6678
6679   /* Unnamed 256bit vector mode parameters are passed on the stack.  */
6680 if (!named && VALID_AVX256_REG_MODE (mode))
6681 return;
6682
6683 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6684 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6685 {
6686 cum->nregs -= int_nregs;
6687 cum->sse_nregs -= sse_nregs;
6688 cum->regno += int_nregs;
6689 cum->sse_regno += sse_nregs;
6690 }
6691 else
6692 {
6693 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6694 cum->words = (cum->words + align - 1) & ~(align - 1);
6695 cum->words += words;
6696 }
6697 }
6698
6699 static void
6700 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6701 HOST_WIDE_INT words)
6702 {
6703   /* Otherwise, this should be passed indirectly.  */
6704 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6705
6706 cum->words += words;
6707 if (cum->nregs > 0)
6708 {
6709 cum->nregs -= 1;
6710 cum->regno += 1;
6711 }
6712 }
6713
6714 /* Update the data in CUM to advance over an argument of mode MODE and
6715 data type TYPE. (TYPE is null for libcalls where that information
6716 may not be available.) */
6717
6718 static void
6719 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6720 const_tree type, bool named)
6721 {
6722 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6723 HOST_WIDE_INT bytes, words;
6724
6725 if (mode == BLKmode)
6726 bytes = int_size_in_bytes (type);
6727 else
6728 bytes = GET_MODE_SIZE (mode);
6729 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6730
6731 if (type)
6732 mode = type_natural_mode (type, NULL);
6733
6734 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6735 function_arg_advance_ms_64 (cum, bytes, words);
6736 else if (TARGET_64BIT)
6737 function_arg_advance_64 (cum, mode, type, words, named);
6738 else
6739 function_arg_advance_32 (cum, mode, type, bytes, words);
6740 }
6741
6742 /* Define where to put the arguments to a function.
6743 Value is zero to push the argument on the stack,
6744 or a hard register in which to store the argument.
6745
6746 MODE is the argument's machine mode.
6747 TYPE is the data type of the argument (as a tree).
6748 This is null for libcalls where that information may
6749 not be available.
6750 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6751 the preceding args and about the function being called.
6752 NAMED is nonzero if this argument is a named parameter
6753 (otherwise it is an extra parameter matching an ellipsis). */
6754
6755 static rtx
6756 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6757 enum machine_mode orig_mode, const_tree type,
6758 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6759 {
6760 static bool warnedsse, warnedmmx;
6761
6762 /* Avoid the AL settings for the Unix64 ABI. */
6763 if (mode == VOIDmode)
6764 return constm1_rtx;
6765
6766 switch (mode)
6767 {
6768 default:
6769 break;
6770
6771 case BLKmode:
6772 if (bytes < 0)
6773 break;
6774 /* FALLTHRU */
6775 case DImode:
6776 case SImode:
6777 case HImode:
6778 case QImode:
6779 if (words <= cum->nregs)
6780 {
6781 int regno = cum->regno;
6782
6783 /* Fastcall allocates the first two DWORD (SImode) or
6784 smaller arguments to ECX and EDX if it isn't an
6785 aggregate type.  */
6786 if (cum->fastcall)
6787 {
6788 if (mode == BLKmode
6789 || mode == DImode
6790 || (type && AGGREGATE_TYPE_P (type)))
6791 break;
6792
6793 /* ECX not EAX is the first allocated register. */
6794 if (regno == AX_REG)
6795 regno = CX_REG;
6796 }
6797 return gen_rtx_REG (mode, regno);
6798 }
6799 break;
6800
6801 case DFmode:
6802 if (cum->float_in_sse < 2)
6803 break;
6804 case SFmode:
6805 if (cum->float_in_sse < 1)
6806 break;
6807 /* FALLTHRU */
6808 case TImode:
6809 /* In 32bit, we pass TImode in xmm registers. */
6810 case V16QImode:
6811 case V8HImode:
6812 case V4SImode:
6813 case V2DImode:
6814 case V4SFmode:
6815 case V2DFmode:
6816 if (!type || !AGGREGATE_TYPE_P (type))
6817 {
6818 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6819 {
6820 warnedsse = true;
6821 warning (0, "SSE vector argument without SSE enabled "
6822 "changes the ABI");
6823 }
6824 if (cum->sse_nregs)
6825 return gen_reg_or_parallel (mode, orig_mode,
6826 cum->sse_regno + FIRST_SSE_REG);
6827 }
6828 break;
6829
6830 case OImode:
6831 /* OImode shouldn't be used directly. */
6832 gcc_unreachable ();
6833
6834 case V8SFmode:
6835 case V8SImode:
6836 case V32QImode:
6837 case V16HImode:
6838 case V4DFmode:
6839 case V4DImode:
6840 if (!type || !AGGREGATE_TYPE_P (type))
6841 {
6842 if (cum->sse_nregs)
6843 return gen_reg_or_parallel (mode, orig_mode,
6844 cum->sse_regno + FIRST_SSE_REG);
6845 }
6846 break;
6847
6848 case V8QImode:
6849 case V4HImode:
6850 case V2SImode:
6851 case V2SFmode:
6852 case V1TImode:
6853 case V1DImode:
6854 if (!type || !AGGREGATE_TYPE_P (type))
6855 {
6856 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6857 {
6858 warnedmmx = true;
6859 warning (0, "MMX vector argument without MMX enabled "
6860 "changes the ABI");
6861 }
6862 if (cum->mmx_nregs)
6863 return gen_reg_or_parallel (mode, orig_mode,
6864 cum->mmx_regno + FIRST_MMX_REG);
6865 }
6866 break;
6867 }
6868
6869 return NULL_RTX;
6870 }
6871
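/* Handle ix86_function_arg for the 64-bit SysV ABI.  Return an rtx
   describing where to pass the argument, or NULL to pass it on the
   stack.  */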
6872 static rtx
6873 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6874 enum machine_mode orig_mode, const_tree type, bool named)
6875 {
6876 /* Handle a hidden AL argument containing the number of registers
6877 for varargs x86-64 functions. */
6878 if (mode == VOIDmode)
6879 return GEN_INT (cum->maybe_vaarg
6880 ? (cum->sse_nregs < 0
6881 ? X86_64_SSE_REGPARM_MAX
6882 : cum->sse_regno)
6883 : -1);
6884
6885 switch (mode)
6886 {
6887 default:
6888 break;
6889
6890 case V8SFmode:
6891 case V8SImode:
6892 case V32QImode:
6893 case V16HImode:
6894 case V4DFmode:
6895 case V4DImode:
6896 /* Unnamed 256bit vector mode parameters are passed on stack. */
6897 if (!named)
6898 return NULL;
6899 break;
6900 }
6901
6902 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6903 cum->sse_nregs,
6904 &x86_64_int_parameter_registers [cum->regno],
6905 cum->sse_regno);
6906 }
6907
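/* Handle ix86_function_arg for the 64-bit MS ABI.  */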
6908 static rtx
6909 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6910 enum machine_mode orig_mode, bool named,
6911 HOST_WIDE_INT bytes)
6912 {
6913 unsigned int regno;
6914
6915 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
6916 We use the value -2 to specify that the current function call is MSABI.  */
6917 if (mode == VOIDmode)
6918 return GEN_INT (-2);
6919
6920 /* If we've run out of registers, it goes on the stack. */
6921 if (cum->nregs == 0)
6922 return NULL_RTX;
6923
6924 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6925
6926 /* Only floating point modes are passed in anything but integer regs. */
6927 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6928 {
6929 if (named)
6930 regno = cum->regno + FIRST_SSE_REG;
6931 else
6932 {
6933 rtx t1, t2;
6934
6935 /* Unnamed floating parameters are passed in both the
6936 SSE and integer registers. */
6937 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6938 t2 = gen_rtx_REG (mode, regno);
6939 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6940 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6941 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6942 }
6943 }
6944 /* Handle aggregate types passed in registers.  */
6945 if (orig_mode == BLKmode)
6946 {
6947 if (bytes > 0 && bytes <= 8)
6948 mode = (bytes > 4 ? DImode : SImode);
6949 if (mode == BLKmode)
6950 mode = DImode;
6951 }
6952
6953 return gen_reg_or_parallel (mode, orig_mode, regno);
6954 }
6955
6956 /* Return where to put the arguments to a function.
6957 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6958
6959 MODE is the argument's machine mode. TYPE is the data type of the
6960 argument. It is null for libcalls where that information may not be
6961 available. CUM gives information about the preceding args and about
6962 the function being called. NAMED is nonzero if this argument is a
6963 named parameter (otherwise it is an extra parameter matching an
6964 ellipsis). */
6965
6966 static rtx
6967 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6968 const_tree type, bool named)
6969 {
6970 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6971 enum machine_mode mode = omode;
6972 HOST_WIDE_INT bytes, words;
6973 rtx arg;
6974
6975 if (mode == BLKmode)
6976 bytes = int_size_in_bytes (type);
6977 else
6978 bytes = GET_MODE_SIZE (mode);
6979 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6980
6981 /* To simplify the code below, represent vector types with a vector mode
6982 even if MMX/SSE are not active. */
6983 if (type && TREE_CODE (type) == VECTOR_TYPE)
6984 mode = type_natural_mode (type, cum);
6985
6986 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6987 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6988 else if (TARGET_64BIT)
6989 arg = function_arg_64 (cum, mode, omode, type, named);
6990 else
6991 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6992
6993 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
6994 {
6995 /* This argument uses 256bit AVX modes. */
6996 if (cum->caller)
6997 cfun->machine->callee_pass_avx256_p = true;
6998 else
6999 cfun->machine->caller_pass_avx256_p = true;
7000 }
7001
7002 return arg;
7003 }
7004
7005 /* A C expression that indicates when an argument must be passed by
7006 reference. If nonzero for an argument, a copy of that argument is
7007 made in memory and a pointer to the argument is passed instead of
7008 the argument itself. The pointer is passed in whatever way is
7009 appropriate for passing a pointer to that type. */
7010
7011 static bool
7012 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
7013 enum machine_mode mode ATTRIBUTE_UNUSED,
7014 const_tree type, bool named ATTRIBUTE_UNUSED)
7015 {
7016 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7017
7018 /* See Windows x64 Software Convention. */
7019 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7020 {
7021 int msize = (int) GET_MODE_SIZE (mode);
7022 if (type)
7023 {
7024 /* Arrays are passed by reference. */
7025 if (TREE_CODE (type) == ARRAY_TYPE)
7026 return true;
7027
7028 if (AGGREGATE_TYPE_P (type))
7029 {
7030 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7031 are passed by reference. */
7032 msize = int_size_in_bytes (type);
7033 }
7034 }
7035
7036 /* __m128 is passed by reference. */
7037 switch (msize) {
7038 case 1: case 2: case 4: case 8:
7039 break;
7040 default:
7041 return true;
7042 }
7043 }
7044 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7045 return true;
7046
7047 return false;
7048 }
7049
7050 /* Return true when TYPE should be 128bit aligned for 32bit argument
7051 passing ABI. XXX: This function is obsolete and is only used for
7052 checking psABI compatibility with previous versions of GCC. */
7053
7054 static bool
7055 ix86_compat_aligned_value_p (const_tree type)
7056 {
7057 enum machine_mode mode = TYPE_MODE (type);
7058 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7059 || mode == TDmode
7060 || mode == TFmode
7061 || mode == TCmode)
7062 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7063 return true;
7064 if (TYPE_ALIGN (type) < 128)
7065 return false;
7066
7067 if (AGGREGATE_TYPE_P (type))
7068 {
7069 /* Walk the aggregates recursively. */
7070 switch (TREE_CODE (type))
7071 {
7072 case RECORD_TYPE:
7073 case UNION_TYPE:
7074 case QUAL_UNION_TYPE:
7075 {
7076 tree field;
7077
7078 /* Walk all the structure fields. */
7079 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7080 {
7081 if (TREE_CODE (field) == FIELD_DECL
7082 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7083 return true;
7084 }
7085 break;
7086 }
7087
7088 case ARRAY_TYPE:
7089 /* Just for use if some languages pass arrays by value. */
7090 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7091 return true;
7092 break;
7093
7094 default:
7095 gcc_unreachable ();
7096 }
7097 }
7098 return false;
7099 }
7100
7101 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7102 XXX: This function is obsolete and is only used for checking psABI
7103 compatibility with previous versions of GCC. */
7104
7105 static unsigned int
7106 ix86_compat_function_arg_boundary (enum machine_mode mode,
7107 const_tree type, unsigned int align)
7108 {
7109 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7110 natural boundaries. */
7111 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7112 {
7113 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7114 make an exception for SSE modes since these require 128bit
7115 alignment.
7116
7117 The handling here differs from field_alignment. ICC aligns MMX
7118 arguments to 4 byte boundaries, while structure fields are aligned
7119 to 8 byte boundaries. */
7120 if (!type)
7121 {
7122 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7123 align = PARM_BOUNDARY;
7124 }
7125 else
7126 {
7127 if (!ix86_compat_aligned_value_p (type))
7128 align = PARM_BOUNDARY;
7129 }
7130 }
7131 if (align > BIGGEST_ALIGNMENT)
7132 align = BIGGEST_ALIGNMENT;
7133 return align;
7134 }
7135
7136 /* Return true when TYPE should be 128bit aligned for 32bit argument
7137 passing ABI. */
7138
7139 static bool
7140 ix86_contains_aligned_value_p (const_tree type)
7141 {
7142 enum machine_mode mode = TYPE_MODE (type);
7143
7144 if (mode == XFmode || mode == XCmode)
7145 return false;
7146
7147 if (TYPE_ALIGN (type) < 128)
7148 return false;
7149
7150 if (AGGREGATE_TYPE_P (type))
7151 {
7152 /* Walk the aggregates recursively. */
7153 switch (TREE_CODE (type))
7154 {
7155 case RECORD_TYPE:
7156 case UNION_TYPE:
7157 case QUAL_UNION_TYPE:
7158 {
7159 tree field;
7160
7161 /* Walk all the structure fields. */
7162 for (field = TYPE_FIELDS (type);
7163 field;
7164 field = DECL_CHAIN (field))
7165 {
7166 if (TREE_CODE (field) == FIELD_DECL
7167 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7168 return true;
7169 }
7170 break;
7171 }
7172
7173 case ARRAY_TYPE:
7174 /* Just for use if some languages pass arrays by value. */
7175 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7176 return true;
7177 break;
7178
7179 default:
7180 gcc_unreachable ();
7181 }
7182 }
7183 else
7184 return TYPE_ALIGN (type) >= 128;
7185
7186 return false;
7187 }
7188
7189 /* Gives the alignment boundary, in bits, of an argument with the
7190 specified mode and type. */
7191
7192 static unsigned int
7193 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7194 {
7195 unsigned int align;
7196 if (type)
7197 {
7198 /* Since the main variant type is used for the call, convert the
7199 type to its main variant.  */
7200 type = TYPE_MAIN_VARIANT (type);
7201 align = TYPE_ALIGN (type);
7202 }
7203 else
7204 align = GET_MODE_ALIGNMENT (mode);
7205 if (align < PARM_BOUNDARY)
7206 align = PARM_BOUNDARY;
7207 else
7208 {
7209 static bool warned;
7210 unsigned int saved_align = align;
7211
7212 if (!TARGET_64BIT)
7213 {
7214 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7215 if (!type)
7216 {
7217 if (mode == XFmode || mode == XCmode)
7218 align = PARM_BOUNDARY;
7219 }
7220 else if (!ix86_contains_aligned_value_p (type))
7221 align = PARM_BOUNDARY;
7222
7223 if (align < 128)
7224 align = PARM_BOUNDARY;
7225 }
7226
7227 if (warn_psabi
7228 && !warned
7229 && align != ix86_compat_function_arg_boundary (mode, type,
7230 saved_align))
7231 {
7232 warned = true;
7233 inform (input_location,
7234 "The ABI for passing parameters with %d-byte"
7235 " alignment has changed in GCC 4.6",
7236 align / BITS_PER_UNIT);
7237 }
7238 }
7239
7240 return align;
7241 }
7242
7243 /* Return true if N is a possible register number of function value. */
7244
7245 static bool
7246 ix86_function_value_regno_p (const unsigned int regno)
7247 {
7248 switch (regno)
7249 {
7250 case AX_REG:
7251 return true;
7252
7253 case FIRST_FLOAT_REG:
7254 /* TODO: The function should depend on current function ABI but
7255 builtins.c would need updating then. Therefore we use the
7256 default ABI. */
7257 if (TARGET_64BIT && ix86_abi == MS_ABI)
7258 return false;
7259 return TARGET_FLOAT_RETURNS_IN_80387;
7260
7261 case FIRST_SSE_REG:
7262 return TARGET_SSE;
7263
7264 case FIRST_MMX_REG:
7265 if (TARGET_MACHO || TARGET_64BIT)
7266 return false;
7267 return TARGET_MMX;
7268 }
7269
7270 return false;
7271 }
7272
7273 /* Define how to find the value returned by a function.
7274 VALTYPE is the data type of the value (as a tree).
7275 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7276 otherwise, FUNC is 0. */
7277
7278 static rtx
7279 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7280 const_tree fntype, const_tree fn)
7281 {
7282 unsigned int regno;
7283
7284 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7285 we normally prevent this case when mmx is not available. However
7286 some ABIs may require the result to be returned like DImode. */
7287 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7288 regno = FIRST_MMX_REG;
7289
7290 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7291 we prevent this case when sse is not available. However some ABIs
7292 may require the result to be returned like integer TImode. */
7293 else if (mode == TImode
7294 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7295 regno = FIRST_SSE_REG;
7296
7297 /* 32-byte vector modes in %ymm0. */
7298 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7299 regno = FIRST_SSE_REG;
7300
7301 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7302 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7303 regno = FIRST_FLOAT_REG;
7304 else
7305 /* Most things go in %eax. */
7306 regno = AX_REG;
7307
7308 /* Override FP return register with %xmm0 for local functions when
7309 SSE math is enabled or for functions with sseregparm attribute. */
7310 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7311 {
7312 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7313 if ((sse_level >= 1 && mode == SFmode)
7314 || (sse_level == 2 && mode == DFmode))
7315 regno = FIRST_SSE_REG;
7316 }
7317
7318 /* OImode shouldn't be used directly. */
7319 gcc_assert (mode != OImode);
7320
7321 return gen_rtx_REG (orig_mode, regno);
7322 }
7323
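/* Handle function return values for the 64-bit SysV ABI.  VALTYPE is
   NULL for libcalls, in which case the register is chosen from MODE
   alone.  */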
7324 static rtx
7325 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7326 const_tree valtype)
7327 {
7328 rtx ret;
7329
7330 /* Handle libcalls, which don't provide a type node. */
7331 if (valtype == NULL)
7332 {
7333 unsigned int regno;
7334
7335 switch (mode)
7336 {
7337 case SFmode:
7338 case SCmode:
7339 case DFmode:
7340 case DCmode:
7341 case TFmode:
7342 case SDmode:
7343 case DDmode:
7344 case TDmode:
7345 regno = FIRST_SSE_REG;
7346 break;
7347 case XFmode:
7348 case XCmode:
7349 regno = FIRST_FLOAT_REG;
7350 break;
7351 case TCmode:
7352 return NULL;
7353 default:
7354 regno = AX_REG;
7355 }
7356
7357 return gen_rtx_REG (mode, regno);
7358 }
7359 else if (POINTER_TYPE_P (valtype))
7360 {
7361 /* Pointers are always returned in word_mode. */
7362 mode = word_mode;
7363 }
7364
7365 ret = construct_container (mode, orig_mode, valtype, 1,
7366 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7367 x86_64_int_return_registers, 0);
7368
7369 /* For zero-sized structures, construct_container returns NULL, but we
7370 need to keep the rest of the compiler happy by returning a meaningful value.  */
7371 if (!ret)
7372 ret = gen_rtx_REG (orig_mode, AX_REG);
7373
7374 return ret;
7375 }
7376
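/* Handle function return values for the 64-bit MS ABI.  */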
7377 static rtx
7378 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7379 {
7380 unsigned int regno = AX_REG;
7381
7382 if (TARGET_SSE)
7383 {
7384 switch (GET_MODE_SIZE (mode))
7385 {
7386 case 16:
7387 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7388 && !COMPLEX_MODE_P (mode))
7389 regno = FIRST_SSE_REG;
7390 break;
7391 case 8:
7392 case 4:
7393 if (mode == SFmode || mode == DFmode)
7394 regno = FIRST_SSE_REG;
7395 break;
7396 default:
7397 break;
7398 }
7399 }
7400 return gen_rtx_REG (orig_mode, regno);
7401 }
7402
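/* Common worker for ix86_function_value and ix86_libcall_value.
   FNTYPE_OR_DECL is the called function's type or its FUNCTION_DECL,
   or NULL for libcalls.  */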
7403 static rtx
7404 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7405 enum machine_mode orig_mode, enum machine_mode mode)
7406 {
7407 const_tree fn, fntype;
7408
7409 fn = NULL_TREE;
7410 if (fntype_or_decl && DECL_P (fntype_or_decl))
7411 fn = fntype_or_decl;
7412 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7413
7414 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7415 return function_value_ms_64 (orig_mode, mode);
7416 else if (TARGET_64BIT)
7417 return function_value_64 (orig_mode, mode, valtype);
7418 else
7419 return function_value_32 (orig_mode, mode, fntype, fn);
7420 }
7421
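/* Return the place where a value of type VALTYPE is returned, after
   converting VALTYPE to its natural mode.  */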
7422 static rtx
7423 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7424 bool outgoing ATTRIBUTE_UNUSED)
7425 {
7426 enum machine_mode mode, orig_mode;
7427
7428 orig_mode = TYPE_MODE (valtype);
7429 mode = type_natural_mode (valtype, NULL);
7430 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7431 }
7432
7433 /* Pointer function arguments and return values are promoted to
7434 word_mode. */
7435
7436 static enum machine_mode
7437 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7438 int *punsignedp, const_tree fntype,
7439 int for_return)
7440 {
7441 if (type != NULL_TREE && POINTER_TYPE_P (type))
7442 {
7443 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7444 return word_mode;
7445 }
7446 return default_promote_function_mode (type, mode, punsignedp, fntype,
7447 for_return);
7448 }
7449
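/* Define how to find the value returned by a library function, given
   that it returns a value of mode MODE.  */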
7450 rtx
7451 ix86_libcall_value (enum machine_mode mode)
7452 {
7453 return ix86_function_value_1 (NULL, NULL, mode, mode);
7454 }
7455
7456 /* Return true iff type is returned in memory. */
7457
7458 static bool ATTRIBUTE_UNUSED
7459 return_in_memory_32 (const_tree type, enum machine_mode mode)
7460 {
7461 HOST_WIDE_INT size;
7462
7463 if (mode == BLKmode)
7464 return true;
7465
7466 size = int_size_in_bytes (type);
7467
7468 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7469 return false;
7470
7471 if (VECTOR_MODE_P (mode) || mode == TImode)
7472 {
7473 /* User-created vectors small enough to fit in EAX. */
7474 if (size < 8)
7475 return false;
7476
7477 /* MMX/3dNow values are returned in MM0,
7478 except when it doesn't exist or the ABI prescribes otherwise. */
7479 if (size == 8)
7480 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7481
7482 /* SSE values are returned in XMM0, except when it doesn't exist. */
7483 if (size == 16)
7484 return !TARGET_SSE;
7485
7486 /* AVX values are returned in YMM0, except when it doesn't exist. */
7487 if (size == 32)
7488 return !TARGET_AVX;
7489 }
7490
7491 if (mode == XFmode)
7492 return false;
7493
7494 if (size > 12)
7495 return true;
7496
7497 /* OImode shouldn't be used directly. */
7498 gcc_assert (mode != OImode);
7499
7500 return false;
7501 }
7502
7503 static bool ATTRIBUTE_UNUSED
7504 return_in_memory_64 (const_tree type, enum machine_mode mode)
7505 {
7506 int needed_intregs, needed_sseregs;
7507 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7508 }
7509
7510 static bool ATTRIBUTE_UNUSED
7511 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7512 {
7513 HOST_WIDE_INT size = int_size_in_bytes (type);
7514
7515 /* __m128 is returned in xmm0. */
7516 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7517 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7518 return false;
7519
7520 /* Otherwise, the size must be exactly 1, 2, 4, or 8 bytes. */
7521 return size != 1 && size != 2 && size != 4 && size != 8;
7522 }
7523
7524 static bool
7525 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7526 {
7527 #ifdef SUBTARGET_RETURN_IN_MEMORY
7528 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7529 #else
7530 const enum machine_mode mode = type_natural_mode (type, NULL);
7531
7532 if (TARGET_64BIT)
7533 {
7534 if (ix86_function_type_abi (fntype) == MS_ABI)
7535 return return_in_memory_ms_64 (type, mode);
7536 else
7537 return return_in_memory_64 (type, mode);
7538 }
7539 else
7540 return return_in_memory_32 (type, mode);
7541 #endif
7542 }
7543
7544 /* When returning SSE vector types, we have a choice of either
7545 (1) being abi incompatible with a -march switch, or
7546 (2) generating an error.
7547 Given no good solution, I think the safest thing is one warning.
7548 The user won't be able to use -Werror, but....
7549
7550 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7551 called in response to actually generating a caller or callee that
7552 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7553 via aggregate_value_p for general type probing from tree-ssa. */
7554
7555 static rtx
7556 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7557 {
7558 static bool warnedsse, warnedmmx;
7559
7560 if (!TARGET_64BIT && type)
7561 {
7562 /* Look at the return type of the function, not the function type. */
7563 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7564
7565 if (!TARGET_SSE && !warnedsse)
7566 {
7567 if (mode == TImode
7568 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7569 {
7570 warnedsse = true;
7571 warning (0, "SSE vector return without SSE enabled "
7572 "changes the ABI");
7573 }
7574 }
7575
7576 if (!TARGET_MMX && !warnedmmx)
7577 {
7578 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7579 {
7580 warnedmmx = true;
7581 warning (0, "MMX vector return without MMX enabled "
7582 "changes the ABI");
7583 }
7584 }
7585 }
7586
7587 return NULL;
7588 }
7589
7590 \f
7591 /* Create the va_list data type. */
7592
7593 /* Returns the calling convention specific va_list data type.
7594 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7595
7596 static tree
7597 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7598 {
7599 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7600
7601 /* For i386 we use plain pointer to argument area. */
7602 if (!TARGET_64BIT || abi == MS_ABI)
7603 return build_pointer_type (char_type_node);
7604
7605 record = lang_hooks.types.make_type (RECORD_TYPE);
7606 type_decl = build_decl (BUILTINS_LOCATION,
7607 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7608
7609 f_gpr = build_decl (BUILTINS_LOCATION,
7610 FIELD_DECL, get_identifier ("gp_offset"),
7611 unsigned_type_node);
7612 f_fpr = build_decl (BUILTINS_LOCATION,
7613 FIELD_DECL, get_identifier ("fp_offset"),
7614 unsigned_type_node);
7615 f_ovf = build_decl (BUILTINS_LOCATION,
7616 FIELD_DECL, get_identifier ("overflow_arg_area"),
7617 ptr_type_node);
7618 f_sav = build_decl (BUILTINS_LOCATION,
7619 FIELD_DECL, get_identifier ("reg_save_area"),
7620 ptr_type_node);
7621
7622 va_list_gpr_counter_field = f_gpr;
7623 va_list_fpr_counter_field = f_fpr;
7624
7625 DECL_FIELD_CONTEXT (f_gpr) = record;
7626 DECL_FIELD_CONTEXT (f_fpr) = record;
7627 DECL_FIELD_CONTEXT (f_ovf) = record;
7628 DECL_FIELD_CONTEXT (f_sav) = record;
7629
7630 TYPE_STUB_DECL (record) = type_decl;
7631 TYPE_NAME (record) = type_decl;
7632 TYPE_FIELDS (record) = f_gpr;
7633 DECL_CHAIN (f_gpr) = f_fpr;
7634 DECL_CHAIN (f_fpr) = f_ovf;
7635 DECL_CHAIN (f_ovf) = f_sav;
7636
7637 layout_type (record);
7638
7639 /* The correct type is an array type of one element. */
7640 return build_array_type (record, build_index_type (size_zero_node));
7641 }
7642
7643 /* Setup the builtin va_list data type and for 64-bit the additional
7644 calling convention specific va_list data types. */
7645
7646 static tree
7647 ix86_build_builtin_va_list (void)
7648 {
7649 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7650
7651 /* Initialize abi specific va_list builtin types. */
7652 if (TARGET_64BIT)
7653 {
7654 tree t;
7655 if (ix86_abi == MS_ABI)
7656 {
7657 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7658 if (TREE_CODE (t) != RECORD_TYPE)
7659 t = build_variant_type_copy (t);
7660 sysv_va_list_type_node = t;
7661 }
7662 else
7663 {
7664 t = ret;
7665 if (TREE_CODE (t) != RECORD_TYPE)
7666 t = build_variant_type_copy (t);
7667 sysv_va_list_type_node = t;
7668 }
7669 if (ix86_abi != MS_ABI)
7670 {
7671 t = ix86_build_builtin_va_list_abi (MS_ABI);
7672 if (TREE_CODE (t) != RECORD_TYPE)
7673 t = build_variant_type_copy (t);
7674 ms_va_list_type_node = t;
7675 }
7676 else
7677 {
7678 t = ret;
7679 if (TREE_CODE (t) != RECORD_TYPE)
7680 t = build_variant_type_copy (t);
7681 ms_va_list_type_node = t;
7682 }
7683 }
7684
7685 return ret;
7686 }
7687
7688 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7689
7690 static void
7691 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7692 {
7693 rtx save_area, mem;
7694 alias_set_type set;
7695 int i, max;
7696
7697 /* GPR size of varargs save area. */
7698 if (cfun->va_list_gpr_size)
7699 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7700 else
7701 ix86_varargs_gpr_size = 0;
7702
7703 /* FPR size of varargs save area. We don't need it if we don't pass
7704 anything in SSE registers. */
7705 if (TARGET_SSE && cfun->va_list_fpr_size)
7706 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7707 else
7708 ix86_varargs_fpr_size = 0;
7709
7710 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7711 return;
7712
7713 save_area = frame_pointer_rtx;
7714 set = get_varargs_alias_set ();
7715
7716 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7717 if (max > X86_64_REGPARM_MAX)
7718 max = X86_64_REGPARM_MAX;
7719
7720 for (i = cum->regno; i < max; i++)
7721 {
7722 mem = gen_rtx_MEM (word_mode,
7723 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7724 MEM_NOTRAP_P (mem) = 1;
7725 set_mem_alias_set (mem, set);
7726 emit_move_insn (mem,
7727 gen_rtx_REG (word_mode,
7728 x86_64_int_parameter_registers[i]));
7729 }
7730
7731 if (ix86_varargs_fpr_size)
7732 {
7733 enum machine_mode smode;
7734 rtx label, test;
7735
7736 /* Now emit code to save SSE registers.  The AX parameter contains the
7737 number of SSE parameter registers used to call this function, though all we
7738 actually check here is the zero/non-zero status. */
7739
7740 label = gen_label_rtx ();
7741 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7742 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7743 label));
7744
7745 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7746 we used movdqa (i.e. TImode) instead? Perhaps even better would
7747 be if we could determine the real mode of the data, via a hook
7748 into pass_stdarg. Ignore all that for now. */
7749 smode = V4SFmode;
7750 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7751 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7752
7753 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7754 if (max > X86_64_SSE_REGPARM_MAX)
7755 max = X86_64_SSE_REGPARM_MAX;
7756
7757 for (i = cum->sse_regno; i < max; ++i)
7758 {
7759 mem = plus_constant (Pmode, save_area,
7760 i * 16 + ix86_varargs_gpr_size);
7761 mem = gen_rtx_MEM (smode, mem);
7762 MEM_NOTRAP_P (mem) = 1;
7763 set_mem_alias_set (mem, set);
7764 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7765
7766 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7767 }
7768
7769 emit_label (label);
7770 }
7771 }
7772
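/* Worker for the MS ABI: spill the remaining integer parameter registers
   to their slots in the incoming argument area.  */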
7773 static void
7774 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7775 {
7776 alias_set_type set = get_varargs_alias_set ();
7777 int i;
7778
7779 /* Reset to zero, as there might have been a SYSV va_arg used
7780 before.  */
7781 ix86_varargs_gpr_size = 0;
7782 ix86_varargs_fpr_size = 0;
7783
7784 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7785 {
7786 rtx reg, mem;
7787
7788 mem = gen_rtx_MEM (Pmode,
7789 plus_constant (Pmode, virtual_incoming_args_rtx,
7790 i * UNITS_PER_WORD));
7791 MEM_NOTRAP_P (mem) = 1;
7792 set_mem_alias_set (mem, set);
7793
7794 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7795 emit_move_insn (mem, reg);
7796 }
7797 }
7798
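/* Implement TARGET_SETUP_INCOMING_VARARGS: on 64-bit targets, advance past
   the last named argument and dispatch to the ABI-specific worker above.  */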
7799 static void
7800 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7801 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7802 int no_rtl)
7803 {
7804 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7805 CUMULATIVE_ARGS next_cum;
7806 tree fntype;
7807
7808 /* This argument doesn't appear to be used anymore, which is good,
7809 because the old code here didn't suppress rtl generation.  */
7810 gcc_assert (!no_rtl);
7811
7812 if (!TARGET_64BIT)
7813 return;
7814
7815 fntype = TREE_TYPE (current_function_decl);
7816
7817 /* For varargs, we do not want to skip the dummy va_dcl argument.
7818 For stdargs, we do want to skip the last named argument. */
7819 next_cum = *cum;
7820 if (stdarg_p (fntype))
7821 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7822 true);
7823
7824 if (cum->call_abi == MS_ABI)
7825 setup_incoming_varargs_ms_64 (&next_cum);
7826 else
7827 setup_incoming_varargs_64 (&next_cum);
7828 }
7829
7830 /* Return true if TYPE is a va_list type represented as a plain char pointer.  */
7831
7832 static bool
7833 is_va_list_char_pointer (tree type)
7834 {
7835 tree canonic;
7836
7837 /* For 32-bit it is always true. */
7838 if (!TARGET_64BIT)
7839 return true;
7840 canonic = ix86_canonical_va_list_type (type);
7841 return (canonic == ms_va_list_type_node
7842 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7843 }
7844
7845 /* Implement va_start. */
7846
7847 static void
7848 ix86_va_start (tree valist, rtx nextarg)
7849 {
7850 HOST_WIDE_INT words, n_gpr, n_fpr;
7851 tree f_gpr, f_fpr, f_ovf, f_sav;
7852 tree gpr, fpr, ovf, sav, t;
7853 tree type;
7854 rtx ovf_rtx;
7855
7856 if (flag_split_stack
7857 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7858 {
7859 unsigned int scratch_regno;
7860
7861 /* When we are splitting the stack, we can't refer to the stack
7862 arguments using internal_arg_pointer, because they may be on
7863 the old stack. The split stack prologue will arrange to
7864 leave a pointer to the old stack arguments in a scratch
7865 register, which we here copy to a pseudo-register. The split
7866 stack prologue can't set the pseudo-register directly because
7867 it (the prologue) runs before any registers have been saved. */
7868
7869 scratch_regno = split_stack_prologue_scratch_regno ();
7870 if (scratch_regno != INVALID_REGNUM)
7871 {
7872 rtx reg, seq;
7873
7874 reg = gen_reg_rtx (Pmode);
7875 cfun->machine->split_stack_varargs_pointer = reg;
7876
7877 start_sequence ();
7878 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7879 seq = get_insns ();
7880 end_sequence ();
7881
7882 push_topmost_sequence ();
7883 emit_insn_after (seq, entry_of_function ());
7884 pop_topmost_sequence ();
7885 }
7886 }
7887
7888 /* Only the 64-bit target needs something special.  */
7889 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7890 {
7891 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7892 std_expand_builtin_va_start (valist, nextarg);
7893 else
7894 {
7895 rtx va_r, next;
7896
7897 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7898 next = expand_binop (ptr_mode, add_optab,
7899 cfun->machine->split_stack_varargs_pointer,
7900 crtl->args.arg_offset_rtx,
7901 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7902 convert_move (va_r, next, 0);
7903 }
7904 return;
7905 }
7906
7907 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7908 f_fpr = DECL_CHAIN (f_gpr);
7909 f_ovf = DECL_CHAIN (f_fpr);
7910 f_sav = DECL_CHAIN (f_ovf);
7911
7912 valist = build_simple_mem_ref (valist);
7913 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7914 /* The following should be folded into the MEM_REF offset. */
7915 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7916 f_gpr, NULL_TREE);
7917 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7918 f_fpr, NULL_TREE);
7919 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7920 f_ovf, NULL_TREE);
7921 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7922 f_sav, NULL_TREE);
7923
7924 /* Count number of gp and fp argument registers used. */
7925 words = crtl->args.info.words;
7926 n_gpr = crtl->args.info.regno;
7927 n_fpr = crtl->args.info.sse_regno;
7928
7929 if (cfun->va_list_gpr_size)
7930 {
7931 type = TREE_TYPE (gpr);
7932 t = build2 (MODIFY_EXPR, type,
7933 gpr, build_int_cst (type, n_gpr * 8));
7934 TREE_SIDE_EFFECTS (t) = 1;
7935 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7936 }
7937
7938 if (TARGET_SSE && cfun->va_list_fpr_size)
7939 {
7940 type = TREE_TYPE (fpr);
7941 t = build2 (MODIFY_EXPR, type, fpr,
7942 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7943 TREE_SIDE_EFFECTS (t) = 1;
7944 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7945 }
7946
7947 /* Find the overflow area. */
7948 type = TREE_TYPE (ovf);
7949 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7950 ovf_rtx = crtl->args.internal_arg_pointer;
7951 else
7952 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7953 t = make_tree (type, ovf_rtx);
7954 if (words != 0)
7955 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7956 t = build2 (MODIFY_EXPR, type, ovf, t);
7957 TREE_SIDE_EFFECTS (t) = 1;
7958 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7959
7960 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7961 {
7962 /* Find the register save area.
7963 The function prologue saves it right above the stack frame.  */
7964 type = TREE_TYPE (sav);
7965 t = make_tree (type, frame_pointer_rtx);
7966 if (!ix86_varargs_gpr_size)
7967 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7968 t = build2 (MODIFY_EXPR, type, sav, t);
7969 TREE_SIDE_EFFECTS (t) = 1;
7970 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7971 }
7972 }
7973
7974 /* Implement va_arg. */
7975
7976 static tree
7977 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7978 gimple_seq *post_p)
7979 {
7980 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7981 tree f_gpr, f_fpr, f_ovf, f_sav;
7982 tree gpr, fpr, ovf, sav, t;
7983 int size, rsize;
7984 tree lab_false, lab_over = NULL_TREE;
7985 tree addr, t2;
7986 rtx container;
7987 int indirect_p = 0;
7988 tree ptrtype;
7989 enum machine_mode nat_mode;
7990 unsigned int arg_boundary;
7991
7992 /* Only the 64-bit target needs something special.  */
7993 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7994 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7995
7996 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7997 f_fpr = DECL_CHAIN (f_gpr);
7998 f_ovf = DECL_CHAIN (f_fpr);
7999 f_sav = DECL_CHAIN (f_ovf);
8000
8001 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
8002 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
8003 valist = build_va_arg_indirect_ref (valist);
8004 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
8005 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
8006 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
8007
8008 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8009 if (indirect_p)
8010 type = build_pointer_type (type);
8011 size = int_size_in_bytes (type);
8012 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8013
8014 nat_mode = type_natural_mode (type, NULL);
8015 switch (nat_mode)
8016 {
8017 case V8SFmode:
8018 case V8SImode:
8019 case V32QImode:
8020 case V16HImode:
8021 case V4DFmode:
8022 case V4DImode:
8023 /* Unnamed 256bit vector mode parameters are passed on stack. */
8024 if (!TARGET_64BIT_MS_ABI)
8025 {
8026 container = NULL;
8027 break;
8028 }
8029
8030 default:
8031 container = construct_container (nat_mode, TYPE_MODE (type),
8032 type, 0, X86_64_REGPARM_MAX,
8033 X86_64_SSE_REGPARM_MAX, intreg,
8034 0);
8035 break;
8036 }
8037
8038 /* Pull the value out of the saved registers. */
8039
8040 addr = create_tmp_var (ptr_type_node, "addr");
8041
8042 if (container)
8043 {
8044 int needed_intregs, needed_sseregs;
8045 bool need_temp;
8046 tree int_addr, sse_addr;
8047
8048 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8049 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8050
8051 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8052
8053 need_temp = (!REG_P (container)
8054 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8055 || TYPE_ALIGN (type) > 128));
8056
8057 /* In case we are passing a structure, verify that it is a consecutive
8058 block in the register save area.  If not, we need to do moves.  */
8059 if (!need_temp && !REG_P (container))
8060 {
8061 /* Verify that all registers are strictly consecutive.  */
8062 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8063 {
8064 int i;
8065
8066 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8067 {
8068 rtx slot = XVECEXP (container, 0, i);
8069 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8070 || INTVAL (XEXP (slot, 1)) != i * 16)
8071 need_temp = 1;
8072 }
8073 }
8074 else
8075 {
8076 int i;
8077
8078 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8079 {
8080 rtx slot = XVECEXP (container, 0, i);
8081 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8082 || INTVAL (XEXP (slot, 1)) != i * 8)
8083 need_temp = 1;
8084 }
8085 }
8086 }
8087 if (!need_temp)
8088 {
8089 int_addr = addr;
8090 sse_addr = addr;
8091 }
8092 else
8093 {
8094 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8095 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8096 }
8097
8098 /* First ensure that we fit completely in registers. */
8099 if (needed_intregs)
8100 {
8101 t = build_int_cst (TREE_TYPE (gpr),
8102 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8103 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8104 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8105 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8106 gimplify_and_add (t, pre_p);
8107 }
8108 if (needed_sseregs)
8109 {
8110 t = build_int_cst (TREE_TYPE (fpr),
8111 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8112 + X86_64_REGPARM_MAX * 8);
8113 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8114 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8115 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8116 gimplify_and_add (t, pre_p);
8117 }
8118
8119 /* Compute index to start of area used for integer regs. */
8120 if (needed_intregs)
8121 {
8122 /* int_addr = gpr + sav; */
8123 t = fold_build_pointer_plus (sav, gpr);
8124 gimplify_assign (int_addr, t, pre_p);
8125 }
8126 if (needed_sseregs)
8127 {
8128 /* sse_addr = fpr + sav; */
8129 t = fold_build_pointer_plus (sav, fpr);
8130 gimplify_assign (sse_addr, t, pre_p);
8131 }
8132 if (need_temp)
8133 {
8134 int i, prev_size = 0;
8135 tree temp = create_tmp_var (type, "va_arg_tmp");
8136
8137 /* addr = &temp; */
8138 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8139 gimplify_assign (addr, t, pre_p);
8140
8141 for (i = 0; i < XVECLEN (container, 0); i++)
8142 {
8143 rtx slot = XVECEXP (container, 0, i);
8144 rtx reg = XEXP (slot, 0);
8145 enum machine_mode mode = GET_MODE (reg);
8146 tree piece_type;
8147 tree addr_type;
8148 tree daddr_type;
8149 tree src_addr, src;
8150 int src_offset;
8151 tree dest_addr, dest;
8152 int cur_size = GET_MODE_SIZE (mode);
8153
8154 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8155 prev_size = INTVAL (XEXP (slot, 1));
8156 if (prev_size + cur_size > size)
8157 {
8158 cur_size = size - prev_size;
8159 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8160 if (mode == BLKmode)
8161 mode = QImode;
8162 }
8163 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8164 if (mode == GET_MODE (reg))
8165 addr_type = build_pointer_type (piece_type);
8166 else
8167 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8168 true);
8169 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8170 true);
8171
8172 if (SSE_REGNO_P (REGNO (reg)))
8173 {
8174 src_addr = sse_addr;
8175 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8176 }
8177 else
8178 {
8179 src_addr = int_addr;
8180 src_offset = REGNO (reg) * 8;
8181 }
8182 src_addr = fold_convert (addr_type, src_addr);
8183 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8184
8185 dest_addr = fold_convert (daddr_type, addr);
8186 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8187 if (cur_size == GET_MODE_SIZE (mode))
8188 {
8189 src = build_va_arg_indirect_ref (src_addr);
8190 dest = build_va_arg_indirect_ref (dest_addr);
8191
8192 gimplify_assign (dest, src, pre_p);
8193 }
8194 else
8195 {
8196 tree copy
8197 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8198 3, dest_addr, src_addr,
8199 size_int (cur_size));
8200 gimplify_and_add (copy, pre_p);
8201 }
8202 prev_size += cur_size;
8203 }
8204 }
8205
8206 if (needed_intregs)
8207 {
8208 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8209 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8210 gimplify_assign (gpr, t, pre_p);
8211 }
8212
8213 if (needed_sseregs)
8214 {
8215 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8216 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8217 gimplify_assign (fpr, t, pre_p);
8218 }
8219
8220 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8221
8222 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8223 }
8224
8225 /* ... otherwise out of the overflow area. */
8226
8227 /* When we align a parameter on the stack for the caller, if its
8228 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
8229 aligned at MAX_SUPPORTED_STACK_ALIGNMENT.  Match the callee
8230 here with the caller.  */
8231 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8232 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8233 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8234
8235 /* Care for on-stack alignment if needed. */
8236 if (arg_boundary <= 64 || size == 0)
8237 t = ovf;
8238 else
8239 {
8240 HOST_WIDE_INT align = arg_boundary / 8;
8241 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8242 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8243 build_int_cst (TREE_TYPE (t), -align));
8244 }
8245
8246 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8247 gimplify_assign (addr, t, pre_p);
8248
8249 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8250 gimplify_assign (unshare_expr (ovf), t, pre_p);
8251
8252 if (container)
8253 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8254
8255 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8256 addr = fold_convert (ptrtype, addr);
8257
8258 if (indirect_p)
8259 addr = build_va_arg_indirect_ref (addr);
8260 return build_va_arg_indirect_ref (addr);
8261 }
8262 \f
8263 /* Return true if OPNUM's MEM should be matched
8264 in movabs* patterns. */
8265
8266 bool
8267 ix86_check_movabs (rtx insn, int opnum)
8268 {
8269 rtx set, mem;
8270
8271 set = PATTERN (insn);
8272 if (GET_CODE (set) == PARALLEL)
8273 set = XVECEXP (set, 0, 0);
8274 gcc_assert (GET_CODE (set) == SET);
8275 mem = XEXP (set, opnum);
8276 while (GET_CODE (mem) == SUBREG)
8277 mem = SUBREG_REG (mem);
8278 gcc_assert (MEM_P (mem));
8279 return volatile_ok || !MEM_VOLATILE_P (mem);
8280 }
8281 \f
8282 /* Initialize the table of extra 80387 mathematical constants. */
8283
8284 static void
8285 init_ext_80387_constants (void)
8286 {
8287 static const char * cst[5] =
8288 {
8289 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8290 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8291 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8292 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8293 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8294 };
8295 int i;
8296
8297 for (i = 0; i < 5; i++)
8298 {
8299 real_from_string (&ext_80387_constants_table[i], cst[i]);
8300 /* Ensure each constant is rounded to XFmode precision. */
8301 real_convert (&ext_80387_constants_table[i],
8302 XFmode, &ext_80387_constants_table[i]);
8303 }
8304
8305 ext_80387_constants_init = 1;
8306 }
8307
8308 /* Return non-zero if the constant is something that
8309 can be loaded with a special instruction. */
8310
8311 int
8312 standard_80387_constant_p (rtx x)
8313 {
8314 enum machine_mode mode = GET_MODE (x);
8315
8316 REAL_VALUE_TYPE r;
8317
8318 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8319 return -1;
8320
8321 if (x == CONST0_RTX (mode))
8322 return 1;
8323 if (x == CONST1_RTX (mode))
8324 return 2;
8325
8326 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8327
8328 /* For XFmode constants, try to find a special 80387 instruction when
8329 optimizing for size or on those CPUs that benefit from them. */
8330 if (mode == XFmode
8331 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8332 {
8333 int i;
8334
8335 if (! ext_80387_constants_init)
8336 init_ext_80387_constants ();
8337
8338 for (i = 0; i < 5; i++)
8339 if (real_identical (&r, &ext_80387_constants_table[i]))
8340 return i + 3;
8341 }
8342
8343 /* A load of the constant -0.0 or -1.0 will be split into an
8344 fldz;fchs or fld1;fchs sequence.  */
8345 if (real_isnegzero (&r))
8346 return 8;
8347 if (real_identical (&r, &dconstm1))
8348 return 9;
8349
8350 return 0;
8351 }
8352
8353 /* Return the opcode of the special instruction to be used to load
8354 the constant X. */
8355
8356 const char *
8357 standard_80387_constant_opcode (rtx x)
8358 {
8359 switch (standard_80387_constant_p (x))
8360 {
8361 case 1:
8362 return "fldz";
8363 case 2:
8364 return "fld1";
8365 case 3:
8366 return "fldlg2";
8367 case 4:
8368 return "fldln2";
8369 case 5:
8370 return "fldl2e";
8371 case 6:
8372 return "fldl2t";
8373 case 7:
8374 return "fldpi";
8375 case 8:
8376 case 9:
8377 return "#";
8378 default:
8379 gcc_unreachable ();
8380 }
8381 }
8382
8383 /* Return the CONST_DOUBLE representing the 80387 constant that is
8384 loaded by the specified special instruction. The argument IDX
8385 matches the return value from standard_80387_constant_p. */
8386
8387 rtx
8388 standard_80387_constant_rtx (int idx)
8389 {
8390 int i;
8391
8392 if (! ext_80387_constants_init)
8393 init_ext_80387_constants ();
8394
8395 switch (idx)
8396 {
8397 case 3:
8398 case 4:
8399 case 5:
8400 case 6:
8401 case 7:
8402 i = idx - 3;
8403 break;
8404
8405 default:
8406 gcc_unreachable ();
8407 }
8408
8409 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8410 XFmode);
8411 }
8412
8413 /* Return 1 if X is all zeros and 2 if X is all ones
8414 in a supported SSE/AVX vector mode.  */
8415
8416 int
8417 standard_sse_constant_p (rtx x)
8418 {
8419 enum machine_mode mode = GET_MODE (x);
8420
8421 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8422 return 1;
8423 if (vector_all_ones_operand (x, mode))
8424 switch (mode)
8425 {
8426 case V16QImode:
8427 case V8HImode:
8428 case V4SImode:
8429 case V2DImode:
8430 if (TARGET_SSE2)
8431 return 2;
8432 case V32QImode:
8433 case V16HImode:
8434 case V8SImode:
8435 case V4DImode:
8436 if (TARGET_AVX2)
8437 return 2;
8438 default:
8439 break;
8440 }
8441
8442 return 0;
8443 }
8444
8445 /* Return the opcode of the special instruction to be used to load
8446 the constant X. */
8447
8448 const char *
8449 standard_sse_constant_opcode (rtx insn, rtx x)
8450 {
8451 switch (standard_sse_constant_p (x))
8452 {
8453 case 1:
8454 switch (get_attr_mode (insn))
8455 {
8456 case MODE_TI:
8457 return "%vpxor\t%0, %d0";
8458 case MODE_V2DF:
8459 return "%vxorpd\t%0, %d0";
8460 case MODE_V4SF:
8461 return "%vxorps\t%0, %d0";
8462
8463 case MODE_OI:
8464 return "vpxor\t%x0, %x0, %x0";
8465 case MODE_V4DF:
8466 return "vxorpd\t%x0, %x0, %x0";
8467 case MODE_V8SF:
8468 return "vxorps\t%x0, %x0, %x0";
8469
8470 default:
8471 break;
8472 }
8473
8474 case 2:
8475 if (TARGET_AVX)
8476 return "vpcmpeqd\t%0, %0, %0";
8477 else
8478 return "pcmpeqd\t%0, %0";
8479
8480 default:
8481 break;
8482 }
8483 gcc_unreachable ();
8484 }
8485
8486 /* Returns true if OP contains a symbol reference.  */
8487
8488 bool
8489 symbolic_reference_mentioned_p (rtx op)
8490 {
8491 const char *fmt;
8492 int i;
8493
8494 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8495 return true;
8496
8497 fmt = GET_RTX_FORMAT (GET_CODE (op));
8498 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8499 {
8500 if (fmt[i] == 'E')
8501 {
8502 int j;
8503
8504 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8505 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8506 return true;
8507 }
8508
8509 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8510 return true;
8511 }
8512
8513 return false;
8514 }
8515
8516 /* Return true if it is appropriate to emit `ret' instructions in the
8517 body of a function. Do this only if the epilogue is simple, needing a
8518 couple of insns. Prior to reloading, we can't tell how many registers
8519 must be saved, so return false then. Return false if there is no frame
8520 marker to de-allocate. */
8521
8522 bool
8523 ix86_can_use_return_insn_p (void)
8524 {
8525 struct ix86_frame frame;
8526
8527 if (! reload_completed || frame_pointer_needed)
8528 return 0;
8529
8530 /* Don't allow more than 32k pop, since that's all we can do
8531 with one instruction. */
8532 if (crtl->args.pops_args && crtl->args.size >= 32768)
8533 return 0;
8534
8535 ix86_compute_frame_layout (&frame);
8536 return (frame.stack_pointer_offset == UNITS_PER_WORD
8537 && (frame.nregs + frame.nsseregs) == 0);
8538 }
8539 \f
8540 /* Value should be nonzero if functions must have frame pointers.
8541 Zero means the frame pointer need not be set up (and parms may
8542 be accessed via the stack pointer) in functions that seem suitable. */
8543
8544 static bool
8545 ix86_frame_pointer_required (void)
8546 {
8547 /* If we accessed previous frames, then the generated code expects
8548 to be able to access the saved ebp value in our frame. */
8549 if (cfun->machine->accesses_prev_frame)
8550 return true;
8551
8552 /* Several x86 OSes need a frame pointer for other reasons,
8553 usually pertaining to setjmp.  */
8554 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8555 return true;
8556
8557 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8558 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8559 return true;
8560
8561 /* Under Win64 SEH, very large frames need a frame pointer, as the
8562 maximum stack allocation is 4GB.  */
8563 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8564 return true;
8565
8566 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8567 turns off the frame pointer by default. Turn it back on now if
8568 we've not got a leaf function. */
8569 if (TARGET_OMIT_LEAF_FRAME_POINTER
8570 && (!crtl->is_leaf
8571 || ix86_current_function_calls_tls_descriptor))
8572 return true;
8573
8574 if (crtl->profile && !flag_fentry)
8575 return true;
8576
8577 return false;
8578 }
8579
8580 /* Record that the current function accesses previous call frames. */
8581
8582 void
8583 ix86_setup_frame_addresses (void)
8584 {
8585 cfun->machine->accesses_prev_frame = 1;
8586 }
8587 \f
8588 #ifndef USE_HIDDEN_LINKONCE
8589 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8590 # define USE_HIDDEN_LINKONCE 1
8591 # else
8592 # define USE_HIDDEN_LINKONCE 0
8593 # endif
8594 #endif
8595
8596 static int pic_labels_used;
8597
8598 /* Fills in the label name that should be used for a pc thunk for
8599 the given register. */
8600
8601 static void
8602 get_pc_thunk_name (char name[32], unsigned int regno)
8603 {
8604 gcc_assert (!TARGET_64BIT);
8605
8606 if (USE_HIDDEN_LINKONCE)
8607 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8608 else
8609 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8610 }
8611
8612
8613 /* This function generates code for -fpic that loads %ebx with
8614 the return address of the caller and then returns. */
8615
8616 static void
8617 ix86_code_end (void)
8618 {
8619 rtx xops[2];
8620 int regno;
8621
8622 for (regno = AX_REG; regno <= SP_REG; regno++)
8623 {
8624 char name[32];
8625 tree decl;
8626
8627 if (!(pic_labels_used & (1 << regno)))
8628 continue;
8629
8630 get_pc_thunk_name (name, regno);
8631
8632 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8633 get_identifier (name),
8634 build_function_type_list (void_type_node, NULL_TREE));
8635 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8636 NULL_TREE, void_type_node);
8637 TREE_PUBLIC (decl) = 1;
8638 TREE_STATIC (decl) = 1;
8639 DECL_IGNORED_P (decl) = 1;
8640
8641 #if TARGET_MACHO
8642 if (TARGET_MACHO)
8643 {
8644 switch_to_section (darwin_sections[text_coal_section]);
8645 fputs ("\t.weak_definition\t", asm_out_file);
8646 assemble_name (asm_out_file, name);
8647 fputs ("\n\t.private_extern\t", asm_out_file);
8648 assemble_name (asm_out_file, name);
8649 putc ('\n', asm_out_file);
8650 ASM_OUTPUT_LABEL (asm_out_file, name);
8651 DECL_WEAK (decl) = 1;
8652 }
8653 else
8654 #endif
8655 if (USE_HIDDEN_LINKONCE)
8656 {
8657 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8658
8659 targetm.asm_out.unique_section (decl, 0);
8660 switch_to_section (get_named_section (decl, NULL, 0));
8661
8662 targetm.asm_out.globalize_label (asm_out_file, name);
8663 fputs ("\t.hidden\t", asm_out_file);
8664 assemble_name (asm_out_file, name);
8665 putc ('\n', asm_out_file);
8666 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8667 }
8668 else
8669 {
8670 switch_to_section (text_section);
8671 ASM_OUTPUT_LABEL (asm_out_file, name);
8672 }
8673
8674 DECL_INITIAL (decl) = make_node (BLOCK);
8675 current_function_decl = decl;
8676 init_function_start (decl);
8677 first_function_block_is_cold = false;
8678 /* Make sure unwind info is emitted for the thunk if needed. */
8679 final_start_function (emit_barrier (), asm_out_file, 1);
8680
8681 /* Pad stack IP move with 4 instructions (two NOPs count
8682 as one instruction). */
8683 if (TARGET_PAD_SHORT_FUNCTION)
8684 {
8685 int i = 8;
8686
8687 while (i--)
8688 fputs ("\tnop\n", asm_out_file);
8689 }
8690
8691 xops[0] = gen_rtx_REG (Pmode, regno);
8692 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8693 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8694 fputs ("\tret\n", asm_out_file);
8695 final_end_function ();
8696 init_insn_lengths ();
8697 free_after_compilation (cfun);
8698 set_cfun (NULL);
8699 current_function_decl = NULL;
8700 }
8701
8702 if (flag_split_stack)
8703 file_end_indicate_split_stack ();
8704 }
8705
8706 /* Emit code for the SET_GOT patterns. */
8707
8708 const char *
8709 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8710 {
8711 rtx xops[3];
8712
8713 xops[0] = dest;
8714
8715 if (TARGET_VXWORKS_RTP && flag_pic)
8716 {
8717 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8718 xops[2] = gen_rtx_MEM (Pmode,
8719 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8720 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8721
8722 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8723 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8724 an unadorned address. */
8725 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8726 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8727 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8728 return "";
8729 }
8730
8731 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8732
8733 if (!flag_pic)
8734 {
8735 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8736
8737 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8738
8739 #if TARGET_MACHO
8740 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8741 is what will be referenced by the Mach-O PIC subsystem. */
8742 if (!label)
8743 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8744 #endif
8745
8746 targetm.asm_out.internal_label (asm_out_file, "L",
8747 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8748 }
8749 else
8750 {
8751 char name[32];
8752 get_pc_thunk_name (name, REGNO (dest));
8753 pic_labels_used |= 1 << REGNO (dest);
8754
8755 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8756 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8757 output_asm_insn ("call\t%X2", xops);
8758 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8759 is what will be referenced by the Mach-O PIC subsystem. */
8760 #if TARGET_MACHO
8761 if (!label)
8762 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8763 else
8764 targetm.asm_out.internal_label (asm_out_file, "L",
8765 CODE_LABEL_NUMBER (label));
8766 #endif
8767 }
8768
8769 if (!TARGET_MACHO)
8770 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8771
8772 return "";
8773 }
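
/* Illustrative sketch of the PIC case above (AT&T syntax, assuming an ELF
   target and %ebx as DEST); the actual names come from get_pc_thunk_name
   and GOT_SYMBOL_NAME:

	call	__x86.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   which leaves the GOT pointer in %ebx.  */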
8774
8775 /* Generate a "push" pattern for input ARG. */
8776
8777 static rtx
8778 gen_push (rtx arg)
8779 {
8780 struct machine_function *m = cfun->machine;
8781
8782 if (m->fs.cfa_reg == stack_pointer_rtx)
8783 m->fs.cfa_offset += UNITS_PER_WORD;
8784 m->fs.sp_offset += UNITS_PER_WORD;
8785
8786 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8787 arg = gen_rtx_REG (word_mode, REGNO (arg));
8788
8789 return gen_rtx_SET (VOIDmode,
8790 gen_rtx_MEM (word_mode,
8791 gen_rtx_PRE_DEC (Pmode,
8792 stack_pointer_rtx)),
8793 arg);
8794 }
8795
8796 /* Generate a "pop" pattern for input ARG. */
8797
8798 static rtx
8799 gen_pop (rtx arg)
8800 {
8801 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8802 arg = gen_rtx_REG (word_mode, REGNO (arg));
8803
8804 return gen_rtx_SET (VOIDmode,
8805 arg,
8806 gen_rtx_MEM (word_mode,
8807 gen_rtx_POST_INC (Pmode,
8808 stack_pointer_rtx)));
8809 }
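
/* For reference, the RTL shapes produced by the two helpers above are
   (assuming word_mode == Pmode == SImode, as on ia32):

	gen_push:  (set (mem:SI (pre_dec:SI (reg:SI sp))) (reg:SI arg))
	gen_pop:   (set (reg:SI arg) (mem:SI (post_inc:SI (reg:SI sp))))

   gen_push also updates the tracked cfa_offset/sp_offset in
   cfun->machine->fs, while gen_pop deliberately leaves that state alone.  */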
8810
8811 /* Return >= 0 if there is an unused call-clobbered register available
8812 for the entire function. */
8813
8814 static unsigned int
8815 ix86_select_alt_pic_regnum (void)
8816 {
8817 if (crtl->is_leaf
8818 && !crtl->profile
8819 && !ix86_current_function_calls_tls_descriptor)
8820 {
8821 int i, drap;
8822 /* Can't use the same register for both PIC and DRAP. */
8823 if (crtl->drap_reg)
8824 drap = REGNO (crtl->drap_reg);
8825 else
8826 drap = -1;
8827 for (i = 2; i >= 0; --i)
8828 if (i != drap && !df_regs_ever_live_p (i))
8829 return i;
8830 }
8831
8832 return INVALID_REGNUM;
8833 }
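
/* Note: on ia32 hard registers 0..2 are %eax, %edx and %ecx, so the loop
   above prefers %ecx, then %edx, then %eax as the alternate PIC register.  */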
8834
8835 /* Return TRUE if we need to save REGNO. */
8836
8837 static bool
8838 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8839 {
8840 if (pic_offset_table_rtx
8841 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8842 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8843 || crtl->profile
8844 || crtl->calls_eh_return
8845 || crtl->uses_const_pool))
8846 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8847
8848 if (crtl->calls_eh_return && maybe_eh_return)
8849 {
8850 unsigned i;
8851 for (i = 0; ; i++)
8852 {
8853 unsigned test = EH_RETURN_DATA_REGNO (i);
8854 if (test == INVALID_REGNUM)
8855 break;
8856 if (test == regno)
8857 return true;
8858 }
8859 }
8860
8861 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8862 return true;
8863
8864 return (df_regs_ever_live_p (regno)
8865 && !call_used_regs[regno]
8866 && !fixed_regs[regno]
8867 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8868 }
8869
8870 /* Return the number of saved general purpose registers. */
8871
8872 static int
8873 ix86_nsaved_regs (void)
8874 {
8875 int nregs = 0;
8876 int regno;
8877
8878 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8879 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8880 nregs ++;
8881 return nregs;
8882 }
8883
8884 /* Return the number of saved SSE registers. */
8885
8886 static int
8887 ix86_nsaved_sseregs (void)
8888 {
8889 int nregs = 0;
8890 int regno;
8891
8892 if (!TARGET_64BIT_MS_ABI)
8893 return 0;
8894 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8895 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8896 nregs ++;
8897 return nregs;
8898 }
8899
8900 /* Given FROM and TO register numbers, say whether this elimination is
8901 allowed. If stack alignment is needed, we can only replace argument
8902 pointer with hard frame pointer, or replace frame pointer with stack
8903 pointer. Otherwise, frame pointer elimination is automatically
8904 handled and all other eliminations are valid. */
8905
8906 static bool
8907 ix86_can_eliminate (const int from, const int to)
8908 {
8909 if (stack_realign_fp)
8910 return ((from == ARG_POINTER_REGNUM
8911 && to == HARD_FRAME_POINTER_REGNUM)
8912 || (from == FRAME_POINTER_REGNUM
8913 && to == STACK_POINTER_REGNUM));
8914 else
8915 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8916 }
8917
8918 /* Return the offset between two registers, one to be eliminated, and the other
8919 its replacement, at the start of a routine. */
8920
8921 HOST_WIDE_INT
8922 ix86_initial_elimination_offset (int from, int to)
8923 {
8924 struct ix86_frame frame;
8925 ix86_compute_frame_layout (&frame);
8926
8927 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8928 return frame.hard_frame_pointer_offset;
8929 else if (from == FRAME_POINTER_REGNUM
8930 && to == HARD_FRAME_POINTER_REGNUM)
8931 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8932 else
8933 {
8934 gcc_assert (to == STACK_POINTER_REGNUM);
8935
8936 if (from == ARG_POINTER_REGNUM)
8937 return frame.stack_pointer_offset;
8938
8939 gcc_assert (from == FRAME_POINTER_REGNUM);
8940 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8941 }
8942 }
8943
8944 /* In a dynamically-aligned function, we can't know the offset from
8945 stack pointer to frame pointer, so we must ensure that setjmp
8946 eliminates fp against the hard fp (%ebp) rather than trying to
8947 index from %esp up to the top of the frame across a gap that is
8948 of unknown (at compile-time) size. */
8949 static rtx
8950 ix86_builtin_setjmp_frame_value (void)
8951 {
8952 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8953 }
8954
8955 /* When using -fsplit-stack, the allocation routines set a field in
8956 the TCB to the bottom of the stack plus this much space, measured
8957 in bytes. */
8958
8959 #define SPLIT_STACK_AVAILABLE 256
8960
8961 /* Fill in the ix86_frame structure for the frame of the function being compiled. */
8962
8963 static void
8964 ix86_compute_frame_layout (struct ix86_frame *frame)
8965 {
8966 unsigned HOST_WIDE_INT stack_alignment_needed;
8967 HOST_WIDE_INT offset;
8968 unsigned HOST_WIDE_INT preferred_alignment;
8969 HOST_WIDE_INT size = get_frame_size ();
8970 HOST_WIDE_INT to_allocate;
8971
8972 frame->nregs = ix86_nsaved_regs ();
8973 frame->nsseregs = ix86_nsaved_sseregs ();
8974
8975 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8976 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8977
8978 /* The 64-bit MS ABI seems to require stack alignment to always be 16, except
8979 in function prologues and leaf functions. */
8980 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8981 && (!crtl->is_leaf || cfun->calls_alloca != 0
8982 || ix86_current_function_calls_tls_descriptor))
8983 {
8984 preferred_alignment = 16;
8985 stack_alignment_needed = 16;
8986 crtl->preferred_stack_boundary = 128;
8987 crtl->stack_alignment_needed = 128;
8988 }
8989
8990 gcc_assert (!size || stack_alignment_needed);
8991 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8992 gcc_assert (preferred_alignment <= stack_alignment_needed);
8993
8994 /* For SEH we have to limit the amount of code movement into the prologue.
8995 At present we do this via a BLOCKAGE, at which point there's very little
8996 scheduling that can be done, which means that there's very little point
8997 in doing anything except PUSHs. */
8998 if (TARGET_SEH)
8999 cfun->machine->use_fast_prologue_epilogue = false;
9000
9001 /* During reload iterations the number of registers saved can change.
9002 Recompute the value as needed. Do not recompute when the number of
9003 registers did not change, as reload makes multiple calls to this function
9004 and does not expect the decision to change within a single iteration. */
9005 else if (!optimize_function_for_size_p (cfun)
9006 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
9007 {
9008 int count = frame->nregs;
9009 struct cgraph_node *node = cgraph_get_node (current_function_decl);
9010
9011 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9012
9013 /* The fast prologue uses moves instead of pushes to save registers. This
9014 is significantly longer code, but it also executes faster, as modern
9015 hardware can execute the moves in parallel but cannot do so for push/pop.
9016
9017 Be careful about choosing which prologue to emit: when the function takes
9018 many instructions to execute, we may as well use the slow version, and
9019 likewise when the function is known to be outside a hot spot (this is
9020 known with feedback only). Weight the size of the function by the number
9021 of registers to save, as it is cheap to use one or two push instructions
9022 but very slow to use many of them. */
9023 if (count)
9024 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9025 if (node->frequency < NODE_FREQUENCY_NORMAL
9026 || (flag_branch_probabilities
9027 && node->frequency < NODE_FREQUENCY_HOT))
9028 cfun->machine->use_fast_prologue_epilogue = false;
9029 else
9030 cfun->machine->use_fast_prologue_epilogue
9031 = !expensive_function_p (count);
9032 }
9033
9034 frame->save_regs_using_mov
9035 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9036 /* If static stack checking is enabled and done with probes,
9037 the registers need to be saved before allocating the frame. */
9038 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9039
9040 /* Skip return address. */
9041 offset = UNITS_PER_WORD;
9042
9043 /* Skip pushed static chain. */
9044 if (ix86_static_chain_on_stack)
9045 offset += UNITS_PER_WORD;
9046
9047 /* Skip saved base pointer. */
9048 if (frame_pointer_needed)
9049 offset += UNITS_PER_WORD;
9050 frame->hfp_save_offset = offset;
9051
9052 /* The traditional frame pointer location is at the top of the frame. */
9053 frame->hard_frame_pointer_offset = offset;
9054
9055 /* Register save area */
9056 offset += frame->nregs * UNITS_PER_WORD;
9057 frame->reg_save_offset = offset;
9058
9059 /* On SEH target, registers are pushed just before the frame pointer
9060 location. */
9061 if (TARGET_SEH)
9062 frame->hard_frame_pointer_offset = offset;
9063
9064 /* Align and set SSE register save area. */
9065 if (frame->nsseregs)
9066 {
9067 /* The only ABI that has saved SSE registers (Win64) also has a
9068 16-byte aligned default stack, and thus we don't need to be
9069 within the re-aligned local stack frame to save them. */
9070 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9071 offset = (offset + 16 - 1) & -16;
9072 offset += frame->nsseregs * 16;
9073 }
9074 frame->sse_reg_save_offset = offset;
9075
9076 /* The re-aligned stack starts here. Values before this point are not
9077 directly comparable with values below this point. In order to make
9078 sure that no value happens to be the same before and after, force
9079 the alignment computation below to add a non-zero value. */
9080 if (stack_realign_fp)
9081 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9082
9083 /* Va-arg area */
9084 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9085 offset += frame->va_arg_size;
9086
9087 /* Align start of frame for local function. */
9088 if (stack_realign_fp
9089 || offset != frame->sse_reg_save_offset
9090 || size != 0
9091 || !crtl->is_leaf
9092 || cfun->calls_alloca
9093 || ix86_current_function_calls_tls_descriptor)
9094 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9095
9096 /* Frame pointer points here. */
9097 frame->frame_pointer_offset = offset;
9098
9099 offset += size;
9100
9101 /* Add the outgoing arguments area. It can be skipped if we eliminated
9102 all the function calls as dead code.
9103 Skipping is however impossible when the function calls alloca: the
9104 alloca expander assumes that the last crtl->outgoing_args_size bytes
9105 of the stack frame are unused. */
9106 if (ACCUMULATE_OUTGOING_ARGS
9107 && (!crtl->is_leaf || cfun->calls_alloca
9108 || ix86_current_function_calls_tls_descriptor))
9109 {
9110 offset += crtl->outgoing_args_size;
9111 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9112 }
9113 else
9114 frame->outgoing_arguments_size = 0;
9115
9116 /* Align stack boundary. Only needed if we're calling another function
9117 or using alloca. */
9118 if (!crtl->is_leaf || cfun->calls_alloca
9119 || ix86_current_function_calls_tls_descriptor)
9120 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9121
9122 /* We've reached end of stack frame. */
9123 frame->stack_pointer_offset = offset;
9124
9125 /* Size prologue needs to allocate. */
9126 to_allocate = offset - frame->sse_reg_save_offset;
9127
9128 if ((!to_allocate && frame->nregs <= 1)
9129 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9130 frame->save_regs_using_mov = false;
9131
9132 if (ix86_using_red_zone ()
9133 && crtl->sp_is_unchanging
9134 && crtl->is_leaf
9135 && !ix86_current_function_calls_tls_descriptor)
9136 {
9137 frame->red_zone_size = to_allocate;
9138 if (frame->save_regs_using_mov)
9139 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9140 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9141 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9142 }
9143 else
9144 frame->red_zone_size = 0;
9145 frame->stack_pointer_offset -= frame->red_zone_size;
9146
9147 /* The SEH frame pointer location is near the bottom of the frame.
9148 This is enforced by the fact that the difference between the
9149 stack pointer and the frame pointer is limited to 240 bytes in
9150 the unwind data structure. */
9151 if (TARGET_SEH)
9152 {
9153 HOST_WIDE_INT diff;
9154
9155 /* If we can leave the frame pointer where it is, do so. Also, returns
9156 the establisher frame for __builtin_frame_address (0). */
9157 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9158 if (diff <= SEH_MAX_FRAME_SIZE
9159 && (diff > 240 || (diff & 15) != 0)
9160 && !crtl->accesses_prior_frames)
9161 {
9162 /* Ideally we'd determine what portion of the local stack frame
9163 (within the constraint of the lowest 240) is most heavily used.
9164 But without that complication, simply bias the frame pointer
9165 by 128 bytes so as to maximize the amount of the local stack
9166 frame that is addressable with 8-bit offsets. */
9167 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9168 }
9169 }
9170 }
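
/* Illustrative summary of the layout computed above, going from the CFA
   towards lower addresses: return address, optional pushed static chain,
   saved frame pointer, GP register save area (reg_save_offset), SSE
   register save area (sse_reg_save_offset), optional realignment padding,
   va_arg save area, local variables (from frame_pointer_offset), outgoing
   arguments, ending at stack_pointer_offset; the red zone, when used, lies
   below that.  */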
9171
9172 /* This is semi-inlined memory_address_length, but simplified
9173 since we know that we're always dealing with reg+offset, and
9174 to avoid having to create and discard all that rtl. */
9175
9176 static inline int
9177 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9178 {
9179 int len = 4;
9180
9181 if (offset == 0)
9182 {
9183 /* EBP and R13 cannot be encoded without an offset. */
9184 len = (regno == BP_REG || regno == R13_REG);
9185 }
9186 else if (IN_RANGE (offset, -128, 127))
9187 len = 1;
9188
9189 /* ESP and R12 must be encoded with a SIB byte. */
9190 if (regno == SP_REG || regno == R12_REG)
9191 len++;
9192
9193 return len;
9194 }
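
/* Worked examples for choose_baseaddr_len (the result is the number of
   extra address bytes beyond the opcode and ModRM byte):

	(%eax, offset 0)    -> 0   no displacement needed
	(%ebp, offset 0)    -> 1   a zero disp8 is still required
	(%esp, offset 0)    -> 1   SIB byte
	(%ecx, offset 100)  -> 1   disp8
	(%ecx, offset 200)  -> 4   disp32
	(%r12, offset 200)  -> 5   disp32 + SIB byte                     */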
9195
9196 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9197 The valid base registers are taken from CFUN->MACHINE->FS. */
9198
9199 static rtx
9200 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9201 {
9202 const struct machine_function *m = cfun->machine;
9203 rtx base_reg = NULL;
9204 HOST_WIDE_INT base_offset = 0;
9205
9206 if (m->use_fast_prologue_epilogue)
9207 {
9208 /* Choose the base register most likely to allow the most scheduling
9209 opportunities. Generally FP is valid throughout the function,
9210 while DRAP must be reloaded within the epilogue. But choose either
9211 over the SP due to increased encoding size. */
9212
9213 if (m->fs.fp_valid)
9214 {
9215 base_reg = hard_frame_pointer_rtx;
9216 base_offset = m->fs.fp_offset - cfa_offset;
9217 }
9218 else if (m->fs.drap_valid)
9219 {
9220 base_reg = crtl->drap_reg;
9221 base_offset = 0 - cfa_offset;
9222 }
9223 else if (m->fs.sp_valid)
9224 {
9225 base_reg = stack_pointer_rtx;
9226 base_offset = m->fs.sp_offset - cfa_offset;
9227 }
9228 }
9229 else
9230 {
9231 HOST_WIDE_INT toffset;
9232 int len = 16, tlen;
9233
9234 /* Choose the base register with the smallest address encoding.
9235 With a tie, choose FP > DRAP > SP. */
9236 if (m->fs.sp_valid)
9237 {
9238 base_reg = stack_pointer_rtx;
9239 base_offset = m->fs.sp_offset - cfa_offset;
9240 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9241 }
9242 if (m->fs.drap_valid)
9243 {
9244 toffset = 0 - cfa_offset;
9245 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9246 if (tlen <= len)
9247 {
9248 base_reg = crtl->drap_reg;
9249 base_offset = toffset;
9250 len = tlen;
9251 }
9252 }
9253 if (m->fs.fp_valid)
9254 {
9255 toffset = m->fs.fp_offset - cfa_offset;
9256 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9257 if (tlen <= len)
9258 {
9259 base_reg = hard_frame_pointer_rtx;
9260 base_offset = toffset;
9261 len = tlen;
9262 }
9263 }
9264 }
9265 gcc_assert (base_reg != NULL);
9266
9267 return plus_constant (Pmode, base_reg, base_offset);
9268 }
9269
9270 /* Emit code to save registers in the prologue. */
9271
9272 static void
9273 ix86_emit_save_regs (void)
9274 {
9275 unsigned int regno;
9276 rtx insn;
9277
9278 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9279 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9280 {
9281 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9282 RTX_FRAME_RELATED_P (insn) = 1;
9283 }
9284 }
9285
9286 /* Emit a single register save at CFA - CFA_OFFSET. */
9287
9288 static void
9289 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9290 HOST_WIDE_INT cfa_offset)
9291 {
9292 struct machine_function *m = cfun->machine;
9293 rtx reg = gen_rtx_REG (mode, regno);
9294 rtx mem, addr, base, insn;
9295
9296 addr = choose_baseaddr (cfa_offset);
9297 mem = gen_frame_mem (mode, addr);
9298
9299 /* For SSE saves, we need to indicate the 128-bit alignment. */
9300 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9301
9302 insn = emit_move_insn (mem, reg);
9303 RTX_FRAME_RELATED_P (insn) = 1;
9304
9305 base = addr;
9306 if (GET_CODE (base) == PLUS)
9307 base = XEXP (base, 0);
9308 gcc_checking_assert (REG_P (base));
9309
9310 /* When saving registers into a re-aligned local stack frame, avoid
9311 any tricky guessing by dwarf2out. */
9312 if (m->fs.realigned)
9313 {
9314 gcc_checking_assert (stack_realign_drap);
9315
9316 if (regno == REGNO (crtl->drap_reg))
9317 {
9318 /* A bit of a hack. We force the DRAP register to be saved in
9319 the re-aligned stack frame, which provides us with a copy
9320 of the CFA that will last past the prologue. Install it. */
9321 gcc_checking_assert (cfun->machine->fs.fp_valid);
9322 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9323 cfun->machine->fs.fp_offset - cfa_offset);
9324 mem = gen_rtx_MEM (mode, addr);
9325 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9326 }
9327 else
9328 {
9329 /* The frame pointer is a stable reference within the
9330 aligned frame. Use it. */
9331 gcc_checking_assert (cfun->machine->fs.fp_valid);
9332 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9333 cfun->machine->fs.fp_offset - cfa_offset);
9334 mem = gen_rtx_MEM (mode, addr);
9335 add_reg_note (insn, REG_CFA_EXPRESSION,
9336 gen_rtx_SET (VOIDmode, mem, reg));
9337 }
9338 }
9339
9340 /* The memory may not be relative to the current CFA register,
9341 which means that we may need to generate a new pattern for
9342 use by the unwind info. */
9343 else if (base != m->fs.cfa_reg)
9344 {
9345 addr = plus_constant (Pmode, m->fs.cfa_reg,
9346 m->fs.cfa_offset - cfa_offset);
9347 mem = gen_rtx_MEM (mode, addr);
9348 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9349 }
9350 }
9351
9352 /* Emit code to save registers using MOV insns.
9353 First register is stored at CFA - CFA_OFFSET. */
9354 static void
9355 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9356 {
9357 unsigned int regno;
9358
9359 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9360 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9361 {
9362 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9363 cfa_offset -= UNITS_PER_WORD;
9364 }
9365 }
9366
9367 /* Emit code to save SSE registers using MOV insns.
9368 First register is stored at CFA - CFA_OFFSET. */
9369 static void
9370 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9371 {
9372 unsigned int regno;
9373
9374 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9375 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9376 {
9377 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9378 cfa_offset -= 16;
9379 }
9380 }
9381
9382 static GTY(()) rtx queued_cfa_restores;
9383
9384 /* Add a REG_CFA_RESTORE note for REG to INSN, or queue it until the next
9385 stack manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9386 Don't add the note if the previously saved value will be left untouched
9387 within the stack red zone until return, as unwinders can find the same
9388 value in the register and on the stack. */
9389
9390 static void
9391 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9392 {
9393 if (!crtl->shrink_wrapped
9394 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9395 return;
9396
9397 if (insn)
9398 {
9399 add_reg_note (insn, REG_CFA_RESTORE, reg);
9400 RTX_FRAME_RELATED_P (insn) = 1;
9401 }
9402 else
9403 queued_cfa_restores
9404 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9405 }
9406
9407 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9408
9409 static void
9410 ix86_add_queued_cfa_restore_notes (rtx insn)
9411 {
9412 rtx last;
9413 if (!queued_cfa_restores)
9414 return;
9415 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9416 ;
9417 XEXP (last, 1) = REG_NOTES (insn);
9418 REG_NOTES (insn) = queued_cfa_restores;
9419 queued_cfa_restores = NULL_RTX;
9420 RTX_FRAME_RELATED_P (insn) = 1;
9421 }
9422
9423 /* Expand prologue or epilogue stack adjustment.
9424 The pattern exists to put a dependency on all ebp-based memory accesses.
9425 STYLE should be negative if instructions should be marked as frame related,
9426 zero if %r11 register is live and cannot be freely used and positive
9427 otherwise. */
9428
9429 static void
9430 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9431 int style, bool set_cfa)
9432 {
9433 struct machine_function *m = cfun->machine;
9434 rtx insn;
9435 bool add_frame_related_expr = false;
9436
9437 if (Pmode == SImode)
9438 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9439 else if (x86_64_immediate_operand (offset, DImode))
9440 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9441 else
9442 {
9443 rtx tmp;
9444 /* r11 is used by indirect sibcall return as well, set before the
9445 epilogue and used after the epilogue. */
9446 if (style)
9447 tmp = gen_rtx_REG (DImode, R11_REG);
9448 else
9449 {
9450 gcc_assert (src != hard_frame_pointer_rtx
9451 && dest != hard_frame_pointer_rtx);
9452 tmp = hard_frame_pointer_rtx;
9453 }
9454 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9455 if (style < 0)
9456 add_frame_related_expr = true;
9457
9458 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9459 }
9460
9461 insn = emit_insn (insn);
9462 if (style >= 0)
9463 ix86_add_queued_cfa_restore_notes (insn);
9464
9465 if (set_cfa)
9466 {
9467 rtx r;
9468
9469 gcc_assert (m->fs.cfa_reg == src);
9470 m->fs.cfa_offset += INTVAL (offset);
9471 m->fs.cfa_reg = dest;
9472
9473 r = gen_rtx_PLUS (Pmode, src, offset);
9474 r = gen_rtx_SET (VOIDmode, dest, r);
9475 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9476 RTX_FRAME_RELATED_P (insn) = 1;
9477 }
9478 else if (style < 0)
9479 {
9480 RTX_FRAME_RELATED_P (insn) = 1;
9481 if (add_frame_related_expr)
9482 {
9483 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9484 r = gen_rtx_SET (VOIDmode, dest, r);
9485 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9486 }
9487 }
9488
9489 if (dest == stack_pointer_rtx)
9490 {
9491 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9492 bool valid = m->fs.sp_valid;
9493
9494 if (src == hard_frame_pointer_rtx)
9495 {
9496 valid = m->fs.fp_valid;
9497 ooffset = m->fs.fp_offset;
9498 }
9499 else if (src == crtl->drap_reg)
9500 {
9501 valid = m->fs.drap_valid;
9502 ooffset = 0;
9503 }
9504 else
9505 {
9506 /* Else there are two possibilities: SP itself, which we set
9507 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9508 taken care of by hand along the eh_return path. */
9509 gcc_checking_assert (src == stack_pointer_rtx
9510 || offset == const0_rtx);
9511 }
9512
9513 m->fs.sp_offset = ooffset - INTVAL (offset);
9514 m->fs.sp_valid = valid;
9515 }
9516 }
9517
9518 /* Find an available register to be used as a dynamic realign argument
9519 pointer register. Such a register will be written in the prologue and
9520 used at the beginning of the function body, so it must not be
9521 1. a parameter-passing register.
9522 2. the GOT pointer.
9523 We reuse the static-chain register if it is available. Otherwise, we
9524 use DI for i386 and R13 for x86-64. We chose R13 since it has a
9525 shorter encoding.
9526
9527 Return: the regno of the chosen register. */
9528
9529 static unsigned int
9530 find_drap_reg (void)
9531 {
9532 tree decl = cfun->decl;
9533
9534 if (TARGET_64BIT)
9535 {
9536 /* Use R13 for a nested function or a function that needs a static chain.
9537 Since a function with tail calls may use any caller-saved
9538 register in the epilogue, DRAP must not use a caller-saved
9539 register in that case. */
9540 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9541 return R13_REG;
9542
9543 return R10_REG;
9544 }
9545 else
9546 {
9547 /* Use DI for a nested function or a function that needs a static chain.
9548 Since a function with tail calls may use any caller-saved
9549 register in the epilogue, DRAP must not use a caller-saved
9550 register in that case. */
9551 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9552 return DI_REG;
9553
9554 /* Reuse static chain register if it isn't used for parameter
9555 passing. */
9556 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9557 {
9558 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9559 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9560 return CX_REG;
9561 }
9562 return DI_REG;
9563 }
9564 }
9565
9566 /* Return minimum incoming stack alignment. */
9567
9568 static unsigned int
9569 ix86_minimum_incoming_stack_boundary (bool sibcall)
9570 {
9571 unsigned int incoming_stack_boundary;
9572
9573 /* Prefer the one specified at command line. */
9574 if (ix86_user_incoming_stack_boundary)
9575 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9576 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9577 when -mstackrealign is in effect, this isn't a sibcall check, and the
9578 estimated stack alignment is 128 bits. */
9579 else if (!sibcall
9580 && !TARGET_64BIT
9581 && ix86_force_align_arg_pointer
9582 && crtl->stack_alignment_estimated == 128)
9583 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9584 else
9585 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9586
9587 /* Incoming stack alignment can be changed on individual functions
9588 via force_align_arg_pointer attribute. We use the smallest
9589 incoming stack boundary. */
9590 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9591 && lookup_attribute (ix86_force_align_arg_pointer_string,
9592 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9593 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9594
9595 /* The incoming stack frame has to be aligned at least at
9596 parm_stack_boundary. */
9597 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9598 incoming_stack_boundary = crtl->parm_stack_boundary;
9599
9600 /* The stack at the entry of main is aligned by the runtime. We use the
9601 smallest incoming stack boundary. */
9602 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9603 && DECL_NAME (current_function_decl)
9604 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9605 && DECL_FILE_SCOPE_P (current_function_decl))
9606 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9607
9608 return incoming_stack_boundary;
9609 }
9610
9611 /* Update incoming stack boundary and estimated stack alignment. */
9612
9613 static void
9614 ix86_update_stack_boundary (void)
9615 {
9616 ix86_incoming_stack_boundary
9617 = ix86_minimum_incoming_stack_boundary (false);
9618
9619 /* x86_64 varargs need 16-byte stack alignment for the register save
9620 area. */
9621 if (TARGET_64BIT
9622 && cfun->stdarg
9623 && crtl->stack_alignment_estimated < 128)
9624 crtl->stack_alignment_estimated = 128;
9625 }
9626
9627 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9628 needed or an rtx for DRAP otherwise. */
9629
9630 static rtx
9631 ix86_get_drap_rtx (void)
9632 {
9633 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9634 crtl->need_drap = true;
9635
9636 if (stack_realign_drap)
9637 {
9638 /* Assign DRAP to vDRAP and return vDRAP. */
9639 unsigned int regno = find_drap_reg ();
9640 rtx drap_vreg;
9641 rtx arg_ptr;
9642 rtx seq, insn;
9643
9644 arg_ptr = gen_rtx_REG (Pmode, regno);
9645 crtl->drap_reg = arg_ptr;
9646
9647 start_sequence ();
9648 drap_vreg = copy_to_reg (arg_ptr);
9649 seq = get_insns ();
9650 end_sequence ();
9651
9652 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9653 if (!optimize)
9654 {
9655 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9656 RTX_FRAME_RELATED_P (insn) = 1;
9657 }
9658 return drap_vreg;
9659 }
9660 else
9661 return NULL;
9662 }
9663
9664 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9665
9666 static rtx
9667 ix86_internal_arg_pointer (void)
9668 {
9669 return virtual_incoming_args_rtx;
9670 }
9671
9672 struct scratch_reg {
9673 rtx reg;
9674 bool saved;
9675 };
9676
9677 /* Return a short-lived scratch register for use on function entry.
9678 In 32-bit mode, it is valid only after the registers are saved
9679 in the prologue. This register must be released by means of
9680 release_scratch_register_on_entry once it is dead. */
9681
9682 static void
9683 get_scratch_register_on_entry (struct scratch_reg *sr)
9684 {
9685 int regno;
9686
9687 sr->saved = false;
9688
9689 if (TARGET_64BIT)
9690 {
9691 /* We always use R11 in 64-bit mode. */
9692 regno = R11_REG;
9693 }
9694 else
9695 {
9696 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9697 bool fastcall_p
9698 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9699 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9700 int regparm = ix86_function_regparm (fntype, decl);
9701 int drap_regno
9702 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9703
9704 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9705 for the static chain register. */
9706 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9707 && drap_regno != AX_REG)
9708 regno = AX_REG;
9709 else if (regparm < 2 && drap_regno != DX_REG)
9710 regno = DX_REG;
9711 /* ecx is the static chain register. */
9712 else if (regparm < 3 && !fastcall_p && !static_chain_p
9713 && drap_regno != CX_REG)
9714 regno = CX_REG;
9715 else if (ix86_save_reg (BX_REG, true))
9716 regno = BX_REG;
9717 /* esi is the static chain register. */
9718 else if (!(regparm == 3 && static_chain_p)
9719 && ix86_save_reg (SI_REG, true))
9720 regno = SI_REG;
9721 else if (ix86_save_reg (DI_REG, true))
9722 regno = DI_REG;
9723 else
9724 {
9725 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9726 sr->saved = true;
9727 }
9728 }
9729
9730 sr->reg = gen_rtx_REG (Pmode, regno);
9731 if (sr->saved)
9732 {
9733 rtx insn = emit_insn (gen_push (sr->reg));
9734 RTX_FRAME_RELATED_P (insn) = 1;
9735 }
9736 }
9737
9738 /* Release a scratch register obtained from the preceding function. */
9739
9740 static void
9741 release_scratch_register_on_entry (struct scratch_reg *sr)
9742 {
9743 if (sr->saved)
9744 {
9745 rtx x, insn = emit_insn (gen_pop (sr->reg));
9746
9747 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9748 RTX_FRAME_RELATED_P (insn) = 1;
9749 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9750 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9751 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9752 }
9753 }
9754
9755 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
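
/* With the default STACK_CHECK_PROBE_INTERVAL_EXP of 12 (see defaults.h),
   PROBE_INTERVAL is 4096 bytes, i.e. one page is probed at a time.  */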
9756
9757 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9758
9759 static void
9760 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9761 {
9762 /* We skip the probe for the first interval + a small dope of 4 words and
9763 probe that many bytes past the specified size to maintain a protection
9764 area at the bottom of the stack. */
9765 const int dope = 4 * UNITS_PER_WORD;
9766 rtx size_rtx = GEN_INT (size), last;
9767
9768 /* See if we have a constant small number of probes to generate. If so,
9769 that's the easy case. The run-time loop is made up of 11 insns in the
9770 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9771 for n # of intervals. */
9772 if (size <= 5 * PROBE_INTERVAL)
9773 {
9774 HOST_WIDE_INT i, adjust;
9775 bool first_probe = true;
9776
9777 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9778 values of N from 1 until it exceeds SIZE. If only one probe is
9779 needed, this will not generate any code. Then adjust and probe
9780 to PROBE_INTERVAL + SIZE. */
9781 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9782 {
9783 if (first_probe)
9784 {
9785 adjust = 2 * PROBE_INTERVAL + dope;
9786 first_probe = false;
9787 }
9788 else
9789 adjust = PROBE_INTERVAL;
9790
9791 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9792 plus_constant (Pmode, stack_pointer_rtx,
9793 -adjust)));
9794 emit_stack_probe (stack_pointer_rtx);
9795 }
9796
9797 if (first_probe)
9798 adjust = size + PROBE_INTERVAL + dope;
9799 else
9800 adjust = size + PROBE_INTERVAL - i;
9801
9802 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9803 plus_constant (Pmode, stack_pointer_rtx,
9804 -adjust)));
9805 emit_stack_probe (stack_pointer_rtx);
9806
9807 /* Adjust back to account for the additional first interval. */
9808 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9809 plus_constant (Pmode, stack_pointer_rtx,
9810 PROBE_INTERVAL + dope)));
9811 }
9812
9813 /* Otherwise, do the same as above, but in a loop. Note that we must be
9814 extra careful with variables wrapping around because we might be at
9815 the very top (or the very bottom) of the address space and we have
9816 to be able to handle this case properly; in particular, we use an
9817 equality test for the loop condition. */
9818 else
9819 {
9820 HOST_WIDE_INT rounded_size;
9821 struct scratch_reg sr;
9822
9823 get_scratch_register_on_entry (&sr);
9824
9825
9826 /* Step 1: round SIZE to the previous multiple of the interval. */
9827
9828 rounded_size = size & -PROBE_INTERVAL;
9829
9830
9831 /* Step 2: compute initial and final value of the loop counter. */
9832
9833 /* SP = SP_0 + PROBE_INTERVAL. */
9834 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9835 plus_constant (Pmode, stack_pointer_rtx,
9836 - (PROBE_INTERVAL + dope))));
9837
9838 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9839 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9840 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9841 gen_rtx_PLUS (Pmode, sr.reg,
9842 stack_pointer_rtx)));
9843
9844
9845 /* Step 3: the loop
9846
9847 while (SP != LAST_ADDR)
9848 {
9849 SP = SP + PROBE_INTERVAL
9850 probe at SP
9851 }
9852
9853 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9854 values of N from 1 until it is equal to ROUNDED_SIZE. */
9855
9856 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9857
9858
9859 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9860 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9861
9862 if (size != rounded_size)
9863 {
9864 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9865 plus_constant (Pmode, stack_pointer_rtx,
9866 rounded_size - size)));
9867 emit_stack_probe (stack_pointer_rtx);
9868 }
9869
9870 /* Adjust back to account for the additional first interval. */
9871 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9872 plus_constant (Pmode, stack_pointer_rtx,
9873 PROBE_INTERVAL + dope)));
9874
9875 release_scratch_register_on_entry (&sr);
9876 }
9877
9878 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9879
9880 /* Even if the stack pointer isn't the CFA register, we need to correctly
9881 describe the adjustments made to it, in particular differentiate the
9882 frame-related ones from the frame-unrelated ones. */
9883 if (size > 0)
9884 {
9885 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9886 XVECEXP (expr, 0, 0)
9887 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9888 plus_constant (Pmode, stack_pointer_rtx, -size));
9889 XVECEXP (expr, 0, 1)
9890 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9891 plus_constant (Pmode, stack_pointer_rtx,
9892 PROBE_INTERVAL + dope + size));
9893 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9894 RTX_FRAME_RELATED_P (last) = 1;
9895
9896 cfun->machine->fs.sp_offset += size;
9897 }
9898
9899 /* Make sure nothing is scheduled before we are done. */
9900 emit_insn (gen_blockage ());
9901 }
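
/* Worked example of the constant-size path above, assuming
   PROBE_INTERVAL == 4096, a 64-bit target (so dope == 32) and size == 10000:

	sub	$8224, %rsp	; 2*4096 + 32, then probe (%rsp)
	sub	$4096, %rsp	; probe (%rsp)
	sub	$1808, %rsp	; 10000 + 4096 - 12288, probe (%rsp)
	add	$4128, %rsp	; back off PROBE_INTERVAL + dope

   for a net adjustment of exactly -10000 bytes with three probes.  */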
9902
9903 /* Adjust the stack pointer up to REG while probing it. */
9904
9905 const char *
9906 output_adjust_stack_and_probe (rtx reg)
9907 {
9908 static int labelno = 0;
9909 char loop_lab[32], end_lab[32];
9910 rtx xops[2];
9911
9912 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9913 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9914
9915 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9916
9917 /* Jump to END_LAB if SP == LAST_ADDR. */
9918 xops[0] = stack_pointer_rtx;
9919 xops[1] = reg;
9920 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9921 fputs ("\tje\t", asm_out_file);
9922 assemble_name_raw (asm_out_file, end_lab);
9923 fputc ('\n', asm_out_file);
9924
9925 /* SP = SP + PROBE_INTERVAL. */
9926 xops[1] = GEN_INT (PROBE_INTERVAL);
9927 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9928
9929 /* Probe at SP. */
9930 xops[1] = const0_rtx;
9931 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9932
9933 fprintf (asm_out_file, "\tjmp\t");
9934 assemble_name_raw (asm_out_file, loop_lab);
9935 fputc ('\n', asm_out_file);
9936
9937 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9938
9939 return "";
9940 }
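
/* Illustrative output of the routine above on a 32-bit ELF target, with the
   scratch register in %eax and assuming a 4096-byte PROBE_INTERVAL (labels
   are internal and numbered):

	.LPSRL0:
		cmpl	%eax, %esp
		je	.LPSRE0
		subl	$4096, %esp
		orl	$0, (%esp)
		jmp	.LPSRL0
	.LPSRE0:
								*/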
9941
9942 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9943 inclusive. These are offsets from the current stack pointer. */
9944
9945 static void
9946 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9947 {
9948 /* See if we have a constant small number of probes to generate. If so,
9949 that's the easy case. The run-time loop is made up of 7 insns in the
9950 generic case while the compile-time loop is made up of n insns for n #
9951 of intervals. */
9952 if (size <= 7 * PROBE_INTERVAL)
9953 {
9954 HOST_WIDE_INT i;
9955
9956 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9957 it exceeds SIZE. If only one probe is needed, this will not
9958 generate any code. Then probe at FIRST + SIZE. */
9959 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9960 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9961 -(first + i)));
9962
9963 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9964 -(first + size)));
9965 }
9966
9967 /* Otherwise, do the same as above, but in a loop. Note that we must be
9968 extra careful with variables wrapping around because we might be at
9969 the very top (or the very bottom) of the address space and we have
9970 to be able to handle this case properly; in particular, we use an
9971 equality test for the loop condition. */
9972 else
9973 {
9974 HOST_WIDE_INT rounded_size, last;
9975 struct scratch_reg sr;
9976
9977 get_scratch_register_on_entry (&sr);
9978
9979
9980 /* Step 1: round SIZE to the previous multiple of the interval. */
9981
9982 rounded_size = size & -PROBE_INTERVAL;
9983
9984
9985 /* Step 2: compute initial and final value of the loop counter. */
9986
9987 /* TEST_OFFSET = FIRST. */
9988 emit_move_insn (sr.reg, GEN_INT (-first));
9989
9990 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9991 last = first + rounded_size;
9992
9993
9994 /* Step 3: the loop
9995
9996 while (TEST_ADDR != LAST_ADDR)
9997 {
9998 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9999 probe at TEST_ADDR
10000 }
10001
10002 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
10003 until it is equal to ROUNDED_SIZE. */
10004
10005 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10006
10007
10008 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10009 that SIZE is equal to ROUNDED_SIZE. */
10010
10011 if (size != rounded_size)
10012 emit_stack_probe (plus_constant (Pmode,
10013 gen_rtx_PLUS (Pmode,
10014 stack_pointer_rtx,
10015 sr.reg),
10016 rounded_size - size));
10017
10018 release_scratch_register_on_entry (&sr);
10019 }
10020
10021 /* Make sure nothing is scheduled before we are done. */
10022 emit_insn (gen_blockage ());
10023 }
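
/* Worked example of the constant-size path above, assuming
   PROBE_INTERVAL == 4096, FIRST == 4096 and SIZE == 10000: the stack
   pointer is left untouched and probes are emitted at

	-8192(%esp), -12288(%esp), -14096(%esp)

   i.e. at FIRST + N * PROBE_INTERVAL for N = 1, 2 and then at
   FIRST + SIZE.  */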
10024
10025 /* Probe a range of stack addresses from REG to END, inclusive. These are
10026 offsets from the current stack pointer. */
10027
10028 const char *
10029 output_probe_stack_range (rtx reg, rtx end)
10030 {
10031 static int labelno = 0;
10032 char loop_lab[32], end_lab[32];
10033 rtx xops[3];
10034
10035 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10036 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10037
10038 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10039
10040 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10041 xops[0] = reg;
10042 xops[1] = end;
10043 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10044 fputs ("\tje\t", asm_out_file);
10045 assemble_name_raw (asm_out_file, end_lab);
10046 fputc ('\n', asm_out_file);
10047
10048 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10049 xops[1] = GEN_INT (PROBE_INTERVAL);
10050 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10051
10052 /* Probe at TEST_ADDR. */
10053 xops[0] = stack_pointer_rtx;
10054 xops[1] = reg;
10055 xops[2] = const0_rtx;
10056 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10057
10058 fprintf (asm_out_file, "\tjmp\t");
10059 assemble_name_raw (asm_out_file, loop_lab);
10060 fputc ('\n', asm_out_file);
10061
10062 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10063
10064 return "";
10065 }
10066
10067 /* Finalize the stack_realign_needed flag, which guides the prologue and
10068 epilogue to be generated in the correct form. */
10069 static void
10070 ix86_finalize_stack_realign_flags (void)
10071 {
10072 /* Check if stack realignment is really needed after reload, and
10073 store the result in cfun. */
10074 unsigned int incoming_stack_boundary
10075 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10076 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10077 unsigned int stack_realign = (incoming_stack_boundary
10078 < (crtl->is_leaf
10079 ? crtl->max_used_stack_slot_alignment
10080 : crtl->stack_alignment_needed));
10081
10082 if (crtl->stack_realign_finalized)
10083 {
10084 /* After stack_realign_needed is finalized, we can no longer
10085 change it. */
10086 gcc_assert (crtl->stack_realign_needed == stack_realign);
10087 return;
10088 }
10089
10090 /* If the only reason for frame_pointer_needed is that we conservatively
10091 assumed stack realignment might be needed, but in the end nothing that
10092 needed the stack alignment had been spilled, clear frame_pointer_needed
10093 and say we don't need stack realignment. */
10094 if (stack_realign
10095 && !crtl->need_drap
10096 && frame_pointer_needed
10097 && crtl->is_leaf
10098 && flag_omit_frame_pointer
10099 && crtl->sp_is_unchanging
10100 && !ix86_current_function_calls_tls_descriptor
10101 && !crtl->accesses_prior_frames
10102 && !cfun->calls_alloca
10103 && !crtl->calls_eh_return
10104 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10105 && !ix86_frame_pointer_required ()
10106 && get_frame_size () == 0
10107 && ix86_nsaved_sseregs () == 0
10108 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10109 {
10110 HARD_REG_SET set_up_by_prologue, prologue_used;
10111 basic_block bb;
10112
10113 CLEAR_HARD_REG_SET (prologue_used);
10114 CLEAR_HARD_REG_SET (set_up_by_prologue);
10115 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10116 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10117 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10118 HARD_FRAME_POINTER_REGNUM);
10119 FOR_EACH_BB (bb)
10120 {
10121 rtx insn;
10122 FOR_BB_INSNS (bb, insn)
10123 if (NONDEBUG_INSN_P (insn)
10124 && requires_stack_frame_p (insn, prologue_used,
10125 set_up_by_prologue))
10126 {
10127 crtl->stack_realign_needed = stack_realign;
10128 crtl->stack_realign_finalized = true;
10129 return;
10130 }
10131 }
10132
10133 frame_pointer_needed = false;
10134 stack_realign = false;
10135 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10136 crtl->stack_alignment_needed = incoming_stack_boundary;
10137 crtl->stack_alignment_estimated = incoming_stack_boundary;
10138 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10139 crtl->preferred_stack_boundary = incoming_stack_boundary;
10140 df_finish_pass (true);
10141 df_scan_alloc (NULL);
10142 df_scan_blocks ();
10143 df_compute_regs_ever_live (true);
10144 df_analyze ();
10145 }
10146
10147 crtl->stack_realign_needed = stack_realign;
10148 crtl->stack_realign_finalized = true;
10149 }
10150
10151 /* Expand the prologue into a bunch of separate insns. */
10152
10153 void
10154 ix86_expand_prologue (void)
10155 {
10156 struct machine_function *m = cfun->machine;
10157 rtx insn, t;
10158 bool pic_reg_used;
10159 struct ix86_frame frame;
10160 HOST_WIDE_INT allocate;
10161 bool int_registers_saved;
10162 bool sse_registers_saved;
10163
10164 ix86_finalize_stack_realign_flags ();
10165
10166 /* DRAP should not coexist with stack_realign_fp */
10167 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10168
10169 memset (&m->fs, 0, sizeof (m->fs));
10170
10171 /* Initialize CFA state for before the prologue. */
10172 m->fs.cfa_reg = stack_pointer_rtx;
10173 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10174
10175 /* Track SP offset to the CFA. We continue tracking this after we've
10176 swapped the CFA register away from SP. In the case of re-alignment
10177 this is fudged; we're interested in offsets within the local frame. */
10178 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10179 m->fs.sp_valid = true;
10180
10181 ix86_compute_frame_layout (&frame);
10182
10183 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10184 {
10185 /* We should have already generated an error for any use of
10186 ms_hook on a nested function. */
10187 gcc_checking_assert (!ix86_static_chain_on_stack);
10188
10189 /* Check whether profiling is active and whether we shall use the
10190 profiling-before-prologue variant. If so, issue a sorry. */
10191 if (crtl->profile && flag_fentry != 0)
10192 sorry ("ms_hook_prologue attribute isn%'t compatible "
10193 "with -mfentry for 32-bit");
10194
10195 /* In ix86_asm_output_function_label we emitted:
10196 8b ff movl.s %edi,%edi
10197 55 push %ebp
10198 8b ec movl.s %esp,%ebp
10199
10200 This matches the hookable function prologue in Win32 API
10201 functions in Microsoft Windows XP Service Pack 2 and newer.
10202 Wine uses this to enable Windows apps to hook the Win32 API
10203 functions provided by Wine.
10204
10205 What that means is that we've already set up the frame pointer. */
10206
10207 if (frame_pointer_needed
10208 && !(crtl->drap_reg && crtl->stack_realign_needed))
10209 {
10210 rtx push, mov;
10211
10212 /* We've decided to use the frame pointer already set up.
10213 Describe this to the unwinder by pretending that both
10214 push and mov insns happen right here.
10215
10216 Putting the unwind info here at the end of the ms_hook
10217 is done so that we can make absolutely certain we get
10218 the required byte sequence at the start of the function,
10219 rather than relying on an assembler that can produce
10220 the exact encoding required.
10221
10222 However it does mean (in the unpatched case) that we have
10223 a 1 insn window where the asynchronous unwind info is
10224 incorrect. However, if we placed the unwind info at
10225 its correct location we would have incorrect unwind info
10226 in the patched case. Which is probably all moot since
10227 I don't expect Wine generates dwarf2 unwind info for the
10228 system libraries that use this feature. */
10229
10230 insn = emit_insn (gen_blockage ());
10231
10232 push = gen_push (hard_frame_pointer_rtx);
10233 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10234 stack_pointer_rtx);
10235 RTX_FRAME_RELATED_P (push) = 1;
10236 RTX_FRAME_RELATED_P (mov) = 1;
10237
10238 RTX_FRAME_RELATED_P (insn) = 1;
10239 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10240 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10241
10242 /* Note that gen_push incremented m->fs.cfa_offset, even
10243 though we didn't emit the push insn here. */
10244 m->fs.cfa_reg = hard_frame_pointer_rtx;
10245 m->fs.fp_offset = m->fs.cfa_offset;
10246 m->fs.fp_valid = true;
10247 }
10248 else
10249 {
10250 /* The frame pointer is not needed so pop %ebp again.
10251 This leaves us with a pristine state. */
10252 emit_insn (gen_pop (hard_frame_pointer_rtx));
10253 }
10254 }
10255
10256 /* The first insn of a function that accepts its static chain on the
10257 stack is to push the register that would be filled in by a direct
10258 call. This insn will be skipped by the trampoline. */
10259 else if (ix86_static_chain_on_stack)
10260 {
10261 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10262 emit_insn (gen_blockage ());
10263
10264 /* We don't want to interpret this push insn as a register save,
10265 only as a stack adjustment. The real copy of the register as
10266 a save will be done later, if needed. */
10267 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10268 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10269 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10270 RTX_FRAME_RELATED_P (insn) = 1;
10271 }
10272
10273 /* Emit prologue code to adjust the stack alignment and set up the DRAP,
10274 in case DRAP is needed and stack realignment is really needed after reload. */
10275 if (stack_realign_drap)
10276 {
10277 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10278
10279 /* Only need to push the parameter pointer reg if it is call-saved. */
10280 if (!call_used_regs[REGNO (crtl->drap_reg)])
10281 {
10282 /* Push arg pointer reg */
10283 insn = emit_insn (gen_push (crtl->drap_reg));
10284 RTX_FRAME_RELATED_P (insn) = 1;
10285 }
10286
10287 /* Grab the argument pointer. */
10288 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10289 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10290 RTX_FRAME_RELATED_P (insn) = 1;
10291 m->fs.cfa_reg = crtl->drap_reg;
10292 m->fs.cfa_offset = 0;
10293
10294 /* Align the stack. */
10295 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10296 stack_pointer_rtx,
10297 GEN_INT (-align_bytes)));
10298 RTX_FRAME_RELATED_P (insn) = 1;
10299
10300 /* Replicate the return address on the stack so that return
10301 address can be reached via (argp - 1) slot. This is needed
10302 to implement macro RETURN_ADDR_RTX and intrinsic function
10303 expand_builtin_return_addr etc. */
10304 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10305 t = gen_frame_mem (word_mode, t);
10306 insn = emit_insn (gen_push (t));
10307 RTX_FRAME_RELATED_P (insn) = 1;
10308
10309 /* For the purposes of frame and register save area addressing,
10310 we've started over with a new frame. */
10311 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10312 m->fs.realigned = true;
10313 }
10314
10315 int_registers_saved = (frame.nregs == 0);
10316 sse_registers_saved = (frame.nsseregs == 0);
10317
10318 if (frame_pointer_needed && !m->fs.fp_valid)
10319 {
10320 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10321 slower on all targets. Also sdb doesn't like it. */
10322 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10323 RTX_FRAME_RELATED_P (insn) = 1;
10324
10325 /* Push registers now, before setting the frame pointer
10326 on SEH target. */
10327 if (!int_registers_saved
10328 && TARGET_SEH
10329 && !frame.save_regs_using_mov)
10330 {
10331 ix86_emit_save_regs ();
10332 int_registers_saved = true;
10333 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10334 }
10335
10336 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10337 {
10338 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10339 RTX_FRAME_RELATED_P (insn) = 1;
10340
10341 if (m->fs.cfa_reg == stack_pointer_rtx)
10342 m->fs.cfa_reg = hard_frame_pointer_rtx;
10343 m->fs.fp_offset = m->fs.sp_offset;
10344 m->fs.fp_valid = true;
10345 }
10346 }
10347
10348 if (!int_registers_saved)
10349 {
10350 /* If saving registers via PUSH, do so now. */
10351 if (!frame.save_regs_using_mov)
10352 {
10353 ix86_emit_save_regs ();
10354 int_registers_saved = true;
10355 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10356 }
10357
10358 /* When using the red zone we may start register saving before allocating
10359 the stack frame, saving one cycle of the prologue. However, avoid
10360 doing this if we have to probe the stack; at least on x86_64 the
10361 stack probe can turn into a call that clobbers a red zone location. */
10362 else if (ix86_using_red_zone ()
10363 && (! TARGET_STACK_PROBE
10364 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10365 {
10366 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10367 int_registers_saved = true;
10368 }
10369 }
10370
10371 if (stack_realign_fp)
10372 {
10373 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10374 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10375
10376 /* The computation of the size of the re-aligned stack frame means
10377 that we must allocate the size of the register save area before
10378 performing the actual alignment. Otherwise we cannot guarantee
10379 that there's enough storage above the realignment point. */
10380 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10381 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10382 GEN_INT (m->fs.sp_offset
10383 - frame.sse_reg_save_offset),
10384 -1, false);
10385
10386 /* Align the stack. */
10387 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10388 stack_pointer_rtx,
10389 GEN_INT (-align_bytes)));
10390
10391 /* For the purposes of register save area addressing, the stack
10392 pointer is no longer valid. As for the value of sp_offset,
10393 see ix86_compute_frame_layout, which we need to match in order
10394 to pass verification of stack_pointer_offset at the end. */
10395 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10396 m->fs.sp_valid = false;
10397 }
10398
10399 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10400
10401 if (flag_stack_usage_info)
10402 {
10403 /* We start to count from ARG_POINTER. */
10404 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10405
10406 /* If it was realigned, take into account the fake frame. */
10407 if (stack_realign_drap)
10408 {
10409 if (ix86_static_chain_on_stack)
10410 stack_size += UNITS_PER_WORD;
10411
10412 if (!call_used_regs[REGNO (crtl->drap_reg)])
10413 stack_size += UNITS_PER_WORD;
10414
10415 /* This over-estimates by 1 minimal-stack-alignment-unit but
10416 mitigates that by counting in the new return address slot. */
10417 current_function_dynamic_stack_size
10418 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10419 }
10420
10421 current_function_static_stack_size = stack_size;
10422 }
10423
10424 /* On SEH target with very large frame size, allocate an area to save
10425 SSE registers (as the very large allocation won't be described). */
10426 if (TARGET_SEH
10427 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10428 && !sse_registers_saved)
10429 {
10430 HOST_WIDE_INT sse_size =
10431 frame.sse_reg_save_offset - frame.reg_save_offset;
10432
10433 gcc_assert (int_registers_saved);
10434
10435 /* No need to do stack checking as the area will be immediately
10436 written. */
10437 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10438 GEN_INT (-sse_size), -1,
10439 m->fs.cfa_reg == stack_pointer_rtx);
10440 allocate -= sse_size;
10441 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10442 sse_registers_saved = true;
10443 }
10444
10445 /* The stack has already been decremented by the instruction calling us,
10446 so probe if the size is non-negative to preserve the protection area. */
10447 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10448 {
10449 /* We expect the registers to be saved when probes are used. */
10450 gcc_assert (int_registers_saved);
10451
10452 if (STACK_CHECK_MOVING_SP)
10453 {
10454 ix86_adjust_stack_and_probe (allocate);
10455 allocate = 0;
10456 }
10457 else
10458 {
10459 HOST_WIDE_INT size = allocate;
10460
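	  /* Clamp SIZE so that SIZE + STACK_CHECK_PROTECT stays below 2**31;
	     only the low part of a larger frame is probed here.  */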
10461 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10462 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10463
10464 if (TARGET_STACK_PROBE)
10465 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10466 else
10467 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10468 }
10469 }
10470
10471 if (allocate == 0)
10472 ;
10473 else if (!ix86_target_stack_probe ()
10474 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10475 {
10476 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10477 GEN_INT (-allocate), -1,
10478 m->fs.cfa_reg == stack_pointer_rtx);
10479 }
10480 else
10481 {
10482 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10483 rtx r10 = NULL;
10484 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10485
10486 bool eax_live = false;
10487 bool r10_live = false;
10488
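      /* EAX is used below to pass the allocation size, and in 64-bit mode
	 R10 holds the static chain; if either is live here, push it so it
	 can be reloaded after the allocation.  */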
10489 if (TARGET_64BIT)
10490 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10491 if (!TARGET_64BIT_MS_ABI)
10492 eax_live = ix86_eax_live_at_start_p ();
10493
10494 if (eax_live)
10495 {
10496 emit_insn (gen_push (eax));
10497 allocate -= UNITS_PER_WORD;
10498 }
10499 if (r10_live)
10500 {
10501 r10 = gen_rtx_REG (Pmode, R10_REG);
10502 emit_insn (gen_push (r10));
10503 allocate -= UNITS_PER_WORD;
10504 }
10505
10506 emit_move_insn (eax, GEN_INT (allocate));
10507 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10508
10509 /* Use the fact that AX still contains ALLOCATE. */
10510 adjust_stack_insn = (Pmode == DImode
10511 ? gen_pro_epilogue_adjust_stack_di_sub
10512 : gen_pro_epilogue_adjust_stack_si_sub);
10513
10514 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10515 stack_pointer_rtx, eax));
10516
10517 /* Note that SEH directives need to continue tracking the stack
10518 pointer even after the frame pointer has been set up. */
10519 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10520 {
10521 if (m->fs.cfa_reg == stack_pointer_rtx)
10522 m->fs.cfa_offset += allocate;
10523
10524 RTX_FRAME_RELATED_P (insn) = 1;
10525 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10526 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10527 plus_constant (Pmode, stack_pointer_rtx,
10528 -allocate)));
10529 }
10530 m->fs.sp_offset += allocate;
10531
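      /* Reload the EAX/R10 values pushed above; their slots now sit just
	 above the newly allocated area.  */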
10532 if (r10_live && eax_live)
10533 {
10534 t = choose_baseaddr (m->fs.sp_offset - allocate);
10535 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10536 gen_frame_mem (word_mode, t));
10537 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10538 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10539 gen_frame_mem (word_mode, t));
10540 }
10541 else if (eax_live || r10_live)
10542 {
10543 t = choose_baseaddr (m->fs.sp_offset - allocate);
10544 emit_move_insn (gen_rtx_REG (word_mode,
10545 (eax_live ? AX_REG : R10_REG)),
10546 gen_frame_mem (word_mode, t));
10547 }
10548 }
10549 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10550
10551 /* If we haven't already set up the frame pointer, do so now. */
10552 if (frame_pointer_needed && !m->fs.fp_valid)
10553 {
10554 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10555 GEN_INT (frame.stack_pointer_offset
10556 - frame.hard_frame_pointer_offset));
10557 insn = emit_insn (insn);
10558 RTX_FRAME_RELATED_P (insn) = 1;
10559 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10560
10561 if (m->fs.cfa_reg == stack_pointer_rtx)
10562 m->fs.cfa_reg = hard_frame_pointer_rtx;
10563 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10564 m->fs.fp_valid = true;
10565 }
10566
10567 if (!int_registers_saved)
10568 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10569 if (!sse_registers_saved)
10570 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10571
10572 pic_reg_used = false;
10573 if (pic_offset_table_rtx
10574 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10575 || crtl->profile))
10576 {
10577 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10578
10579 if (alt_pic_reg_used != INVALID_REGNUM)
10580 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10581
10582 pic_reg_used = true;
10583 }
10584
10585 if (pic_reg_used)
10586 {
10587 if (TARGET_64BIT)
10588 {
10589 if (ix86_cmodel == CM_LARGE_PIC)
10590 {
10591 rtx label, tmp_reg;
10592
10593 gcc_assert (Pmode == DImode);
10594 label = gen_label_rtx ();
10595 emit_label (label);
10596 LABEL_PRESERVE_P (label) = 1;
10597 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10598 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10599 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10600 label));
10601 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10602 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10603 pic_offset_table_rtx, tmp_reg));
10604 }
10605 else
10606 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10607 }
10608 else
10609 {
10610 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10611 RTX_FRAME_RELATED_P (insn) = 1;
10612 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10613 }
10614 }
10615
10616 /* In the pic_reg_used case, make sure that the got load isn't deleted
10617 when mcount needs it. A blockage to avoid call movement across the
10618 mcount call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10619 note. */
10620 if (crtl->profile && !flag_fentry && pic_reg_used)
10621 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10622
10623 if (crtl->drap_reg && !crtl->stack_realign_needed)
10624 {
10625 /* vDRAP is set up, but after reload it turns out stack realignment
10626 isn't necessary; here we emit the prologue to set up DRAP
10627 without the stack realignment adjustment. */
10628 t = choose_baseaddr (0);
10629 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10630 }
10631
10632 /* Prevent instructions from being scheduled into the register save push
10633 sequence when access to the redzone area is done through the frame pointer.
10634 The offset between the frame pointer and the stack pointer is calculated
10635 relative to the value of the stack pointer at the end of the function
10636 prologue, and moving instructions that access the redzone area via the
10637 frame pointer inside the push sequence violates this assumption. */
10638 if (frame_pointer_needed && frame.red_zone_size)
10639 emit_insn (gen_memory_blockage ());
10640
10641 /* Emit cld instruction if stringops are used in the function. */
10642 if (TARGET_CLD && ix86_current_function_needs_cld)
10643 emit_insn (gen_cld ());
10644
10645 /* SEH requires that the prologue end within 256 bytes of the start of
10646 the function. Prevent instruction scheduling that would extend that.
10647 Further, prevent alloca modifications to the stack pointer from being
10648 combined with prologue modifications. */
10649 if (TARGET_SEH)
10650 emit_insn (gen_prologue_use (stack_pointer_rtx));
10651 }
10652
10653 /* Emit code to restore REG using a POP insn. */
10654
10655 static void
10656 ix86_emit_restore_reg_using_pop (rtx reg)
10657 {
10658 struct machine_function *m = cfun->machine;
10659 rtx insn = emit_insn (gen_pop (reg));
10660
10661 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10662 m->fs.sp_offset -= UNITS_PER_WORD;
10663
10664 if (m->fs.cfa_reg == crtl->drap_reg
10665 && REGNO (reg) == REGNO (crtl->drap_reg))
10666 {
10667 /* Previously we'd represented the CFA as an expression
10668 like *(%ebp - 8). We've just popped that value from
10669 the stack, which means we need to reset the CFA to
10670 the drap register. This will remain until we restore
10671 the stack pointer. */
10672 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10673 RTX_FRAME_RELATED_P (insn) = 1;
10674
10675 /* This means that the DRAP register is valid for addressing too. */
10676 m->fs.drap_valid = true;
10677 return;
10678 }
10679
10680 if (m->fs.cfa_reg == stack_pointer_rtx)
10681 {
10682 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10683 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10684 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10685 RTX_FRAME_RELATED_P (insn) = 1;
10686
10687 m->fs.cfa_offset -= UNITS_PER_WORD;
10688 }
10689
10690 /* When the frame pointer is the CFA, and we pop it, we are
10691 swapping back to the stack pointer as the CFA. This happens
10692 for stack frames that don't allocate other data, so we assume
10693 the stack pointer is now pointing at the return address, i.e.
10694 the function entry state, which makes the offset be 1 word. */
10695 if (reg == hard_frame_pointer_rtx)
10696 {
10697 m->fs.fp_valid = false;
10698 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10699 {
10700 m->fs.cfa_reg = stack_pointer_rtx;
10701 m->fs.cfa_offset -= UNITS_PER_WORD;
10702
10703 add_reg_note (insn, REG_CFA_DEF_CFA,
10704 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10705 GEN_INT (m->fs.cfa_offset)));
10706 RTX_FRAME_RELATED_P (insn) = 1;
10707 }
10708 }
10709 }
10710
10711 /* Emit code to restore saved registers using POP insns. */
10712
10713 static void
10714 ix86_emit_restore_regs_using_pop (void)
10715 {
10716 unsigned int regno;
10717
10718 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10719 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10720 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10721 }
10722
10723 /* Emit code and notes for the LEAVE instruction. */
10724
10725 static void
10726 ix86_emit_leave (void)
10727 {
10728 struct machine_function *m = cfun->machine;
10729 rtx insn = emit_insn (ix86_gen_leave ());
10730
10731 ix86_add_queued_cfa_restore_notes (insn);
10732
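  /* The leave insn copies the frame pointer into the stack pointer and
     pops the saved frame pointer, so afterwards the stack pointer sits
     one word above the slot that held the frame pointer.  */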
10733 gcc_assert (m->fs.fp_valid);
10734 m->fs.sp_valid = true;
10735 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10736 m->fs.fp_valid = false;
10737
10738 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10739 {
10740 m->fs.cfa_reg = stack_pointer_rtx;
10741 m->fs.cfa_offset = m->fs.sp_offset;
10742
10743 add_reg_note (insn, REG_CFA_DEF_CFA,
10744 plus_constant (Pmode, stack_pointer_rtx,
10745 m->fs.sp_offset));
10746 RTX_FRAME_RELATED_P (insn) = 1;
10747 }
10748 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10749 m->fs.fp_offset);
10750 }
10751
10752 /* Emit code to restore saved registers using MOV insns.
10753 First register is restored from CFA - CFA_OFFSET. */
10754 static void
10755 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10756 bool maybe_eh_return)
10757 {
10758 struct machine_function *m = cfun->machine;
10759 unsigned int regno;
10760
10761 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10762 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10763 {
10764 rtx reg = gen_rtx_REG (word_mode, regno);
10765 rtx insn, mem;
10766
10767 mem = choose_baseaddr (cfa_offset);
10768 mem = gen_frame_mem (word_mode, mem);
10769 insn = emit_move_insn (reg, mem);
10770
10771 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10772 {
10773 /* Previously we'd represented the CFA as an expression
10774 like *(%ebp - 8). We've just popped that value from
10775 the stack, which means we need to reset the CFA to
10776 the drap register. This will remain until we restore
10777 the stack pointer. */
10778 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10779 RTX_FRAME_RELATED_P (insn) = 1;
10780
10781 /* This means that the DRAP register is valid for addressing. */
10782 m->fs.drap_valid = true;
10783 }
10784 else
10785 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10786
10787 cfa_offset -= UNITS_PER_WORD;
10788 }
10789 }
10790
10791 /* Emit code to restore saved registers using MOV insns.
10792 First register is restored from CFA - CFA_OFFSET. */
10793 static void
10794 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10795 bool maybe_eh_return)
10796 {
10797 unsigned int regno;
10798
10799 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10800 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10801 {
10802 rtx reg = gen_rtx_REG (V4SFmode, regno);
10803 rtx mem;
10804
10805 mem = choose_baseaddr (cfa_offset);
10806 mem = gen_rtx_MEM (V4SFmode, mem);
10807 set_mem_align (mem, 128);
10808 emit_move_insn (reg, mem);
10809
10810 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10811
10812 cfa_offset -= 16;
10813 }
10814 }
10815
10816 /* Emit vzeroupper if needed. */
10817
10818 void
10819 ix86_maybe_emit_epilogue_vzeroupper (void)
10820 {
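  /* Skip the vzeroupper for functions that cannot return, and (presumably)
     when the function's own 256bit AVX return value would otherwise have
     its upper half clobbered.  */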
10821 if (TARGET_VZEROUPPER
10822 && !TREE_THIS_VOLATILE (cfun->decl)
10823 && !cfun->machine->caller_return_avx256_p)
10824 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
10825 }
10826
10827 /* Restore function stack, frame, and registers. */
10828
10829 void
10830 ix86_expand_epilogue (int style)
10831 {
10832 struct machine_function *m = cfun->machine;
10833 struct machine_frame_state frame_state_save = m->fs;
10834 struct ix86_frame frame;
10835 bool restore_regs_via_mov;
10836 bool using_drap;
10837
10838 ix86_finalize_stack_realign_flags ();
10839 ix86_compute_frame_layout (&frame);
10840
10841 m->fs.sp_valid = (!frame_pointer_needed
10842 || (crtl->sp_is_unchanging
10843 && !stack_realign_fp));
10844 gcc_assert (!m->fs.sp_valid
10845 || m->fs.sp_offset == frame.stack_pointer_offset);
10846
10847 /* The FP must be valid if the frame pointer is present. */
10848 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10849 gcc_assert (!m->fs.fp_valid
10850 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10851
10852 /* We must have *some* valid pointer to the stack frame. */
10853 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10854
10855 /* The DRAP is never valid at this point. */
10856 gcc_assert (!m->fs.drap_valid);
10857
10858 /* See the comment about red zone and frame
10859 pointer usage in ix86_expand_prologue. */
10860 if (frame_pointer_needed && frame.red_zone_size)
10861 emit_insn (gen_memory_blockage ());
10862
10863 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10864 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10865
10866 /* Determine the CFA offset of the end of the red-zone. */
10867 m->fs.red_zone_offset = 0;
10868 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10869 {
10870 /* The red-zone begins below the return address. */
10871 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10872
10873 /* When the register save area is in the aligned portion of
10874 the stack, determine the maximum runtime displacement that
10875 matches up with the aligned frame. */
10876 if (stack_realign_drap)
10877 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10878 + UNITS_PER_WORD);
10879 }
10880
10881 /* Special care must be taken for the normal return case of a function
10882 using eh_return: the eax and edx registers are marked as saved, but
10883 not restored along this path. Adjust the save location to match. */
10884 if (crtl->calls_eh_return && style != 2)
10885 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10886
10887 /* EH_RETURN requires the use of moves to function properly. */
10888 if (crtl->calls_eh_return)
10889 restore_regs_via_mov = true;
10890 /* SEH requires the use of pops to identify the epilogue. */
10891 else if (TARGET_SEH)
10892 restore_regs_via_mov = false;
10893 /* If we're only restoring one register and sp is not valid, then
10894 use a move instruction to restore the register, since it's
10895 less work than reloading sp and popping the register. */
10896 else if (!m->fs.sp_valid && frame.nregs <= 1)
10897 restore_regs_via_mov = true;
10898 else if (TARGET_EPILOGUE_USING_MOVE
10899 && cfun->machine->use_fast_prologue_epilogue
10900 && (frame.nregs > 1
10901 || m->fs.sp_offset != frame.reg_save_offset))
10902 restore_regs_via_mov = true;
10903 else if (frame_pointer_needed
10904 && !frame.nregs
10905 && m->fs.sp_offset != frame.reg_save_offset)
10906 restore_regs_via_mov = true;
10907 else if (frame_pointer_needed
10908 && TARGET_USE_LEAVE
10909 && cfun->machine->use_fast_prologue_epilogue
10910 && frame.nregs == 1)
10911 restore_regs_via_mov = true;
10912 else
10913 restore_regs_via_mov = false;
10914
10915 if (restore_regs_via_mov || frame.nsseregs)
10916 {
10917 /* Ensure that the entire register save area is addressable via
10918 the stack pointer, if we will restore via sp. */
10919 if (TARGET_64BIT
10920 && m->fs.sp_offset > 0x7fffffff
10921 && !(m->fs.fp_valid || m->fs.drap_valid)
10922 && (frame.nsseregs + frame.nregs) != 0)
10923 {
10924 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10925 GEN_INT (m->fs.sp_offset
10926 - frame.sse_reg_save_offset),
10927 style,
10928 m->fs.cfa_reg == stack_pointer_rtx);
10929 }
10930 }
10931
10932 /* If there are any SSE registers to restore, then we have to do it
10933 via moves, since there's obviously no pop for SSE regs. */
10934 if (frame.nsseregs)
10935 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10936 style == 2);
10937
10938 if (restore_regs_via_mov)
10939 {
10940 rtx t;
10941
10942 if (frame.nregs)
10943 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10944
10945 /* eh_return epilogues need %ecx added to the stack pointer. */
10946 if (style == 2)
10947 {
10948 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10949
10950 /* Stack align doesn't work with eh_return. */
10951 gcc_assert (!stack_realign_drap);
10952 /* Neither do regparm nested functions. */
10953 gcc_assert (!ix86_static_chain_on_stack);
10954
10955 if (frame_pointer_needed)
10956 {
10957 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10958 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
10959 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10960
10961 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10962 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10963
10964 /* Note that we use SA as a temporary CFA, as the return
10965 address is at the proper place relative to it. We
10966 pretend this happens at the FP restore insn because
10967 prior to this insn the FP would be stored at the wrong
10968 offset relative to SA, and after this insn we have no
10969 other reasonable register to use for the CFA. We don't
10970 bother resetting the CFA to the SP for the duration of
10971 the return insn. */
10972 add_reg_note (insn, REG_CFA_DEF_CFA,
10973 plus_constant (Pmode, sa, UNITS_PER_WORD));
10974 ix86_add_queued_cfa_restore_notes (insn);
10975 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10976 RTX_FRAME_RELATED_P (insn) = 1;
10977
10978 m->fs.cfa_reg = sa;
10979 m->fs.cfa_offset = UNITS_PER_WORD;
10980 m->fs.fp_valid = false;
10981
10982 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10983 const0_rtx, style, false);
10984 }
10985 else
10986 {
10987 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10988 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
10989 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10990 ix86_add_queued_cfa_restore_notes (insn);
10991
10992 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10993 if (m->fs.cfa_offset != UNITS_PER_WORD)
10994 {
10995 m->fs.cfa_offset = UNITS_PER_WORD;
10996 add_reg_note (insn, REG_CFA_DEF_CFA,
10997 plus_constant (Pmode, stack_pointer_rtx,
10998 UNITS_PER_WORD));
10999 RTX_FRAME_RELATED_P (insn) = 1;
11000 }
11001 }
11002 m->fs.sp_offset = UNITS_PER_WORD;
11003 m->fs.sp_valid = true;
11004 }
11005 }
11006 else
11007 {
11008 /* SEH requires that the function end with (1) a stack adjustment
11009 if necessary, (2) a sequence of pops, and (3) a return or
11010 jump instruction. Prevent insns from the function body from
11011 being scheduled into this sequence. */
11012 if (TARGET_SEH)
11013 {
11014 /* Prevent a catch region from being adjacent to the standard
11015 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
11016 several other flags that would be interesting to test are
11017 set up yet. */
11018 if (flag_non_call_exceptions)
11019 emit_insn (gen_nops (const1_rtx));
11020 else
11021 emit_insn (gen_blockage ());
11022 }
11023
11024 /* First step is to deallocate the stack frame so that we can
11025 pop the registers. Also do it on SEH target for very large
11026 frame as the emitted instructions aren't allowed by the ABI in
11027 epilogues. */
11028 if (!m->fs.sp_valid
11029 || (TARGET_SEH
11030 && (m->fs.sp_offset - frame.reg_save_offset
11031 >= SEH_MAX_FRAME_SIZE)))
11032 {
11033 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11034 GEN_INT (m->fs.fp_offset
11035 - frame.reg_save_offset),
11036 style, false);
11037 }
11038 else if (m->fs.sp_offset != frame.reg_save_offset)
11039 {
11040 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11041 GEN_INT (m->fs.sp_offset
11042 - frame.reg_save_offset),
11043 style,
11044 m->fs.cfa_reg == stack_pointer_rtx);
11045 }
11046
11047 ix86_emit_restore_regs_using_pop ();
11048 }
11049
11050 /* If we used a frame pointer and haven't already got rid of it,
11051 then do so now. */
11052 if (m->fs.fp_valid)
11053 {
11054 /* If the stack pointer is valid and pointing at the frame
11055 pointer store address, then we only need a pop. */
11056 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11057 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11058 /* Leave results in shorter dependency chains on CPUs that are
11059 able to grok it fast. */
11060 else if (TARGET_USE_LEAVE
11061 || optimize_function_for_size_p (cfun)
11062 || !cfun->machine->use_fast_prologue_epilogue)
11063 ix86_emit_leave ();
11064 else
11065 {
11066 pro_epilogue_adjust_stack (stack_pointer_rtx,
11067 hard_frame_pointer_rtx,
11068 const0_rtx, style, !using_drap);
11069 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11070 }
11071 }
11072
11073 if (using_drap)
11074 {
11075 int param_ptr_offset = UNITS_PER_WORD;
11076 rtx insn;
11077
11078 gcc_assert (stack_realign_drap);
11079
11080 if (ix86_static_chain_on_stack)
11081 param_ptr_offset += UNITS_PER_WORD;
11082 if (!call_used_regs[REGNO (crtl->drap_reg)])
11083 param_ptr_offset += UNITS_PER_WORD;
11084
11085 insn = emit_insn (gen_rtx_SET
11086 (VOIDmode, stack_pointer_rtx,
11087 gen_rtx_PLUS (Pmode,
11088 crtl->drap_reg,
11089 GEN_INT (-param_ptr_offset))));
11090 m->fs.cfa_reg = stack_pointer_rtx;
11091 m->fs.cfa_offset = param_ptr_offset;
11092 m->fs.sp_offset = param_ptr_offset;
11093 m->fs.realigned = false;
11094
11095 add_reg_note (insn, REG_CFA_DEF_CFA,
11096 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11097 GEN_INT (param_ptr_offset)));
11098 RTX_FRAME_RELATED_P (insn) = 1;
11099
11100 if (!call_used_regs[REGNO (crtl->drap_reg)])
11101 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11102 }
11103
11104 /* At this point the stack pointer must be valid, and we must have
11105 restored all of the registers. We may not have deallocated the
11106 entire stack frame. We've delayed this until now because it may
11107 be possible to merge the local stack deallocation with the
11108 deallocation forced by ix86_static_chain_on_stack. */
11109 gcc_assert (m->fs.sp_valid);
11110 gcc_assert (!m->fs.fp_valid);
11111 gcc_assert (!m->fs.realigned);
11112 if (m->fs.sp_offset != UNITS_PER_WORD)
11113 {
11114 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11115 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11116 style, true);
11117 }
11118 else
11119 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11120
11121 /* Sibcall epilogues don't want a return instruction. */
11122 if (style == 0)
11123 {
11124 m->fs = frame_state_save;
11125 return;
11126 }
11127
11128 /* Emit vzeroupper if needed. */
11129 ix86_maybe_emit_epilogue_vzeroupper ();
11130
11131 if (crtl->args.pops_args && crtl->args.size)
11132 {
11133 rtx popc = GEN_INT (crtl->args.pops_args);
11134
11135 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
11136 address, do an explicit add, and jump indirectly to the caller. */
11137
11138 if (crtl->args.pops_args >= 65536)
11139 {
11140 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11141 rtx insn;
11142
11143 /* There is no "pascal" calling convention in any 64bit ABI. */
11144 gcc_assert (!TARGET_64BIT);
11145
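	  /* Pop the return address into %ecx; the notes below record the
	     stack adjustment for unwinding and the fact that %ecx now
	     holds the return address.  */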
11146 insn = emit_insn (gen_pop (ecx));
11147 m->fs.cfa_offset -= UNITS_PER_WORD;
11148 m->fs.sp_offset -= UNITS_PER_WORD;
11149
11150 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11151 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11152 add_reg_note (insn, REG_CFA_REGISTER,
11153 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11154 RTX_FRAME_RELATED_P (insn) = 1;
11155
11156 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11157 popc, -1, true);
11158 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11159 }
11160 else
11161 emit_jump_insn (gen_simple_return_pop_internal (popc));
11162 }
11163 else
11164 emit_jump_insn (gen_simple_return_internal ());
11165
11166 /* Restore the state back to the state from the prologue,
11167 so that it's correct for the next epilogue. */
11168 m->fs = frame_state_save;
11169 }
11170
11171 /* Reset state that code generation for this function may have modified. */
11172
11173 static void
11174 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11175 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11176 {
11177 if (pic_offset_table_rtx)
11178 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11179 #if TARGET_MACHO
11180 /* Mach-O doesn't support labels at the end of objects, so if
11181 it looks like we might want one, insert a NOP. */
11182 {
11183 rtx insn = get_last_insn ();
11184 rtx deleted_debug_label = NULL_RTX;
11185 while (insn
11186 && NOTE_P (insn)
11187 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11188 {
11189 /* For NOTE_INSN_DELETED_DEBUG_LABEL notes only, don't insert
11190 a nop; instead set their CODE_LABEL_NUMBER to -1. Otherwise
11191 there would be code generation differences
11192 between -g and -g0. */
11193 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11194 deleted_debug_label = insn;
11195 insn = PREV_INSN (insn);
11196 }
11197 if (insn
11198 && (LABEL_P (insn)
11199 || (NOTE_P (insn)
11200 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11201 fputs ("\tnop\n", file);
11202 else if (deleted_debug_label)
11203 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11204 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11205 CODE_LABEL_NUMBER (insn) = -1;
11206 }
11207 #endif
11208
11209 }
11210
11211 /* Return a scratch register to use in the split stack prologue. The
11212 split stack prologue is used for -fsplit-stack. It consists of the
11213 first instructions in the function, even before the regular prologue.
11214 The scratch register can be any caller-saved register which is not
11215 used for parameters or for the static chain. */
11216
11217 static unsigned int
11218 split_stack_prologue_scratch_regno (void)
11219 {
11220 if (TARGET_64BIT)
11221 return R11_REG;
11222 else
11223 {
11224 bool is_fastcall;
11225 int regparm;
11226
11227 is_fastcall = (lookup_attribute ("fastcall",
11228 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11229 != NULL);
11230 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11231
11232 if (is_fastcall)
11233 {
11234 if (DECL_STATIC_CHAIN (cfun->decl))
11235 {
11236 sorry ("-fsplit-stack does not support fastcall with "
11237 "nested function");
11238 return INVALID_REGNUM;
11239 }
11240 return AX_REG;
11241 }
11242 else if (regparm < 3)
11243 {
11244 if (!DECL_STATIC_CHAIN (cfun->decl))
11245 return CX_REG;
11246 else
11247 {
11248 if (regparm >= 2)
11249 {
11250 sorry ("-fsplit-stack does not support 2 register "
11251 "parameters for a nested function");
11252 return INVALID_REGNUM;
11253 }
11254 return DX_REG;
11255 }
11256 }
11257 else
11258 {
11259 /* FIXME: We could make this work by pushing a register
11260 around the addition and comparison. */
11261 sorry ("-fsplit-stack does not support 3 register parameters");
11262 return INVALID_REGNUM;
11263 }
11264 }
11265 }
11266
11267 /* A SYMBOL_REF for the function which allocates new stack space for
11268 -fsplit-stack. */
11269
11270 static GTY(()) rtx split_stack_fn;
11271
11272 /* A SYMBOL_REF for the function that allocates more stack when using
11273 the large model. */
11274
11275 static GTY(()) rtx split_stack_fn_large;
11276
11277 /* Handle -fsplit-stack. These are the first instructions in the
11278 function, even before the regular prologue. */
11279
11280 void
11281 ix86_expand_split_stack_prologue (void)
11282 {
11283 struct ix86_frame frame;
11284 HOST_WIDE_INT allocate;
11285 unsigned HOST_WIDE_INT args_size;
11286 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11287 rtx scratch_reg = NULL_RTX;
11288 rtx varargs_label = NULL_RTX;
11289 rtx fn;
11290
11291 gcc_assert (flag_split_stack && reload_completed);
11292
11293 ix86_finalize_stack_realign_flags ();
11294 ix86_compute_frame_layout (&frame);
11295 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11296
11297 /* This is the label we will branch to if we have enough stack
11298 space. We expect the basic block reordering pass to reverse this
11299 branch if optimizing, so that we branch in the unlikely case. */
11300 label = gen_label_rtx ();
11301
11302 /* We need to compare the stack pointer minus the frame size with
11303 the stack boundary in the TCB. The stack boundary always gives
11304 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11305 can compare directly. Otherwise we need to do an addition. */
11306
11307 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11308 UNSPEC_STACK_CHECK);
11309 limit = gen_rtx_CONST (Pmode, limit);
11310 limit = gen_rtx_MEM (Pmode, limit);
11311 if (allocate < SPLIT_STACK_AVAILABLE)
11312 current = stack_pointer_rtx;
11313 else
11314 {
11315 unsigned int scratch_regno;
11316 rtx offset;
11317
11318 /* We need a scratch register to hold the stack pointer minus
11319 the required frame size. Since this is the very start of the
11320 function, the scratch register can be any caller-saved
11321 register which is not used for parameters. */
11322 offset = GEN_INT (- allocate);
11323 scratch_regno = split_stack_prologue_scratch_regno ();
11324 if (scratch_regno == INVALID_REGNUM)
11325 return;
11326 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11327 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11328 {
11329 /* We don't use ix86_gen_add3 in this case because it will
11330 want to split to lea, but when not optimizing the insn
11331 will not be split after this point. */
11332 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11333 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11334 offset)));
11335 }
11336 else
11337 {
11338 emit_move_insn (scratch_reg, offset);
11339 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11340 stack_pointer_rtx));
11341 }
11342 current = scratch_reg;
11343 }
11344
11345 ix86_expand_branch (GEU, current, limit, label);
11346 jump_insn = get_last_insn ();
11347 JUMP_LABEL (jump_insn) = label;
11348
11349 /* Mark the jump as very likely to be taken. */
11350 add_reg_note (jump_insn, REG_BR_PROB,
11351 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11352
11353 if (split_stack_fn == NULL_RTX)
11354 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11355 fn = split_stack_fn;
11356
11357 /* Get more stack space. We pass in the desired stack space and the
11358 size of the arguments to copy to the new stack. In 32-bit mode
11359 we push the parameters; __morestack will return on a new stack
11360 anyhow. In 64-bit mode we pass the parameters in r10 and
11361 r11. */
11362 allocate_rtx = GEN_INT (allocate);
11363 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11364 call_fusage = NULL_RTX;
11365 if (TARGET_64BIT)
11366 {
11367 rtx reg10, reg11;
11368
11369 reg10 = gen_rtx_REG (Pmode, R10_REG);
11370 reg11 = gen_rtx_REG (Pmode, R11_REG);
11371
11372 /* If this function uses a static chain, it will be in %r10.
11373 Preserve it across the call to __morestack. */
11374 if (DECL_STATIC_CHAIN (cfun->decl))
11375 {
11376 rtx rax;
11377
11378 rax = gen_rtx_REG (word_mode, AX_REG);
11379 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11380 use_reg (&call_fusage, rax);
11381 }
11382
11383 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11384 {
11385 HOST_WIDE_INT argval;
11386
11387 gcc_assert (Pmode == DImode);
11388 /* When using the large model we need to load the address
11389 into a register, and we've run out of registers. So we
11390 switch to a different calling convention, and we call a
11391 different function: __morestack_large. We pass the
11392 argument size in the upper 32 bits of r10 and pass the
11393 frame size in the lower 32 bits. */
11394 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11395 gcc_assert ((args_size & 0xffffffff) == args_size);
11396
11397 if (split_stack_fn_large == NULL_RTX)
11398 split_stack_fn_large =
11399 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11400
11401 if (ix86_cmodel == CM_LARGE_PIC)
11402 {
11403 rtx label, x;
11404
11405 label = gen_label_rtx ();
11406 emit_label (label);
11407 LABEL_PRESERVE_P (label) = 1;
11408 emit_insn (gen_set_rip_rex64 (reg10, label));
11409 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11410 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11411 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11412 UNSPEC_GOT);
11413 x = gen_rtx_CONST (Pmode, x);
11414 emit_move_insn (reg11, x);
11415 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11416 x = gen_const_mem (Pmode, x);
11417 emit_move_insn (reg11, x);
11418 }
11419 else
11420 emit_move_insn (reg11, split_stack_fn_large);
11421
11422 fn = reg11;
11423
11424 argval = ((args_size << 16) << 16) + allocate;
11425 emit_move_insn (reg10, GEN_INT (argval));
11426 }
11427 else
11428 {
11429 emit_move_insn (reg10, allocate_rtx);
11430 emit_move_insn (reg11, GEN_INT (args_size));
11431 use_reg (&call_fusage, reg11);
11432 }
11433
11434 use_reg (&call_fusage, reg10);
11435 }
11436 else
11437 {
11438 emit_insn (gen_push (GEN_INT (args_size)));
11439 emit_insn (gen_push (allocate_rtx));
11440 }
11441 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11442 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11443 NULL_RTX, false);
11444 add_function_usage_to (call_insn, call_fusage);
11445
11446 /* In order to make call/return prediction work right, we now need
11447 to execute a return instruction. See
11448 libgcc/config/i386/morestack.S for the details on how this works.
11449
11450 For flow purposes gcc must not see this as a return
11451 instruction--we need control flow to continue at the subsequent
11452 label. Therefore, we use an unspec. */
11453 gcc_assert (crtl->args.pops_args < 65536);
11454 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11455
11456 /* If we are in 64-bit mode and this function uses a static chain,
11457 we saved %r10 in %rax before calling __morestack. */
11458 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11459 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11460 gen_rtx_REG (word_mode, AX_REG));
11461
11462 /* If this function calls va_start, we need to store a pointer to
11463 the arguments on the old stack, because they may not have been
11464 all copied to the new stack. At this point the old stack can be
11465 found at the frame pointer value used by __morestack, because
11466 __morestack has set that up before calling back to us. Here we
11467 store that pointer in a scratch register, and in
11468 ix86_expand_prologue we store the scratch register in a stack
11469 slot. */
11470 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11471 {
11472 unsigned int scratch_regno;
11473 rtx frame_reg;
11474 int words;
11475
11476 scratch_regno = split_stack_prologue_scratch_regno ();
11477 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11478 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11479
11480 /* 64-bit:
11481 fp -> old fp value
11482 return address within this function
11483 return address of caller of this function
11484 stack arguments
11485 So we add three words to get to the stack arguments.
11486
11487 32-bit:
11488 fp -> old fp value
11489 return address within this function
11490 first argument to __morestack
11491 second argument to __morestack
11492 return address of caller of this function
11493 stack arguments
11494 So we add five words to get to the stack arguments.
11495 */
11496 words = TARGET_64BIT ? 3 : 5;
11497 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11498 gen_rtx_PLUS (Pmode, frame_reg,
11499 GEN_INT (words * UNITS_PER_WORD))));
11500
11501 varargs_label = gen_label_rtx ();
11502 emit_jump_insn (gen_jump (varargs_label));
11503 JUMP_LABEL (get_last_insn ()) = varargs_label;
11504
11505 emit_barrier ();
11506 }
11507
11508 emit_label (label);
11509 LABEL_NUSES (label) = 1;
11510
11511 /* If this function calls va_start, we now have to set the scratch
11512 register for the case where we do not call __morestack. In this
11513 case we need to set it based on the stack pointer. */
11514 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11515 {
11516 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11517 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11518 GEN_INT (UNITS_PER_WORD))));
11519
11520 emit_label (varargs_label);
11521 LABEL_NUSES (varargs_label) = 1;
11522 }
11523 }
11524
11525 /* We may have to tell the dataflow pass that the split stack prologue
11526 is initializing a scratch register. */
11527
11528 static void
11529 ix86_live_on_entry (bitmap regs)
11530 {
11531 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11532 {
11533 gcc_assert (flag_split_stack);
11534 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11535 }
11536 }
11537 \f
11538 /* Determine if OP is a suitable SUBREG RTX for an address. */
11539
11540 static bool
11541 ix86_address_subreg_operand (rtx op)
11542 {
11543 enum machine_mode mode;
11544
11545 if (!REG_P (op))
11546 return false;
11547
11548 mode = GET_MODE (op);
11549
11550 if (GET_MODE_CLASS (mode) != MODE_INT)
11551 return false;
11552
11553 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11554 failures when the register is one word out of a two word structure. */
11555 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11556 return false;
11557
11558 /* Allow only SUBREGs of non-eliminable hard registers. */
11559 return register_no_elim_operand (op, mode);
11560 }
11561
11562 /* Extract the parts of an RTL expression that is a valid memory address
11563 for an instruction. Return 0 if the structure of the address is
11564 grossly off. Return -1 if the address contains ASHIFT, so it is not
11565 strictly valid, but still used for computing the length of an lea instruction. */
11566
11567 int
11568 ix86_decompose_address (rtx addr, struct ix86_address *out)
11569 {
11570 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11571 rtx base_reg, index_reg;
11572 HOST_WIDE_INT scale = 1;
11573 rtx scale_rtx = NULL_RTX;
11574 rtx tmp;
11575 int retval = 1;
11576 enum ix86_address_seg seg = SEG_DEFAULT;
11577
11578 /* Allow zero-extended SImode addresses;
11579 they will be emitted with the addr32 prefix. */
11580 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11581 {
11582 if (GET_CODE (addr) == ZERO_EXTEND
11583 && GET_MODE (XEXP (addr, 0)) == SImode)
11584 addr = XEXP (addr, 0);
11585 else if (GET_CODE (addr) == AND
11586 && const_32bit_mask (XEXP (addr, 1), DImode))
11587 {
11588 addr = XEXP (addr, 0);
11589
11590 /* Adjust SUBREGs. */
11591 if (GET_CODE (addr) == SUBREG
11592 && GET_MODE (SUBREG_REG (addr)) == SImode)
11593 addr = SUBREG_REG (addr);
11594 else if (GET_MODE (addr) == DImode)
11595 addr = gen_rtx_SUBREG (SImode, addr, 0);
11596 else if (GET_MODE (addr) != VOIDmode)
11597 return 0;
11598 }
11599 }
11600
11601 if (REG_P (addr))
11602 base = addr;
11603 else if (GET_CODE (addr) == SUBREG)
11604 {
11605 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11606 base = addr;
11607 else
11608 return 0;
11609 }
11610 else if (GET_CODE (addr) == PLUS)
11611 {
11612 rtx addends[4], op;
11613 int n = 0, i;
11614
11615 op = addr;
11616 do
11617 {
11618 if (n >= 4)
11619 return 0;
11620 addends[n++] = XEXP (op, 1);
11621 op = XEXP (op, 0);
11622 }
11623 while (GET_CODE (op) == PLUS);
11624 if (n >= 4)
11625 return 0;
11626 addends[n] = op;
11627
11628 for (i = n; i >= 0; --i)
11629 {
11630 op = addends[i];
11631 switch (GET_CODE (op))
11632 {
11633 case MULT:
11634 if (index)
11635 return 0;
11636 index = XEXP (op, 0);
11637 scale_rtx = XEXP (op, 1);
11638 break;
11639
11640 case ASHIFT:
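	  /* A shift count of 0..3 corresponds to a scale factor of
	     1, 2, 4 or 8.  */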
11641 if (index)
11642 return 0;
11643 index = XEXP (op, 0);
11644 tmp = XEXP (op, 1);
11645 if (!CONST_INT_P (tmp))
11646 return 0;
11647 scale = INTVAL (tmp);
11648 if ((unsigned HOST_WIDE_INT) scale > 3)
11649 return 0;
11650 scale = 1 << scale;
11651 break;
11652
11653 case ZERO_EXTEND:
11654 op = XEXP (op, 0);
11655 if (GET_CODE (op) != UNSPEC)
11656 return 0;
11657 /* FALLTHRU */
11658
11659 case UNSPEC:
11660 if (XINT (op, 1) == UNSPEC_TP
11661 && TARGET_TLS_DIRECT_SEG_REFS
11662 && seg == SEG_DEFAULT)
11663 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11664 else
11665 return 0;
11666 break;
11667
11668 case SUBREG:
11669 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11670 return 0;
11671 /* FALLTHRU */
11672
11673 case REG:
11674 if (!base)
11675 base = op;
11676 else if (!index)
11677 index = op;
11678 else
11679 return 0;
11680 break;
11681
11682 case CONST:
11683 case CONST_INT:
11684 case SYMBOL_REF:
11685 case LABEL_REF:
11686 if (disp)
11687 return 0;
11688 disp = op;
11689 break;
11690
11691 default:
11692 return 0;
11693 }
11694 }
11695 }
11696 else if (GET_CODE (addr) == MULT)
11697 {
11698 index = XEXP (addr, 0); /* index*scale */
11699 scale_rtx = XEXP (addr, 1);
11700 }
11701 else if (GET_CODE (addr) == ASHIFT)
11702 {
11703 /* We're called for lea too, which implements ashift on occasion. */
11704 index = XEXP (addr, 0);
11705 tmp = XEXP (addr, 1);
11706 if (!CONST_INT_P (tmp))
11707 return 0;
11708 scale = INTVAL (tmp);
11709 if ((unsigned HOST_WIDE_INT) scale > 3)
11710 return 0;
11711 scale = 1 << scale;
11712 retval = -1;
11713 }
11714 else
11715 disp = addr; /* displacement */
11716
11717 if (index)
11718 {
11719 if (REG_P (index))
11720 ;
11721 else if (GET_CODE (index) == SUBREG
11722 && ix86_address_subreg_operand (SUBREG_REG (index)))
11723 ;
11724 else
11725 return 0;
11726 }
11727
11728 /* Address override works only on the (%reg) part of %fs:(%reg). */
11729 if (seg != SEG_DEFAULT
11730 && ((base && GET_MODE (base) != word_mode)
11731 || (index && GET_MODE (index) != word_mode)))
11732 return 0;
11733
11734 /* Extract the integral value of scale. */
11735 if (scale_rtx)
11736 {
11737 if (!CONST_INT_P (scale_rtx))
11738 return 0;
11739 scale = INTVAL (scale_rtx);
11740 }
11741
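  /* Look through SUBREGs so the checks below see the underlying
     hard registers.  */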
11742 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11743 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11744
11745 /* Avoid useless 0 displacement. */
11746 if (disp == const0_rtx && (base || index))
11747 disp = NULL_RTX;
11748
11749 /* Allow arg pointer and stack pointer as index if there is no scaling. */
11750 if (base_reg && index_reg && scale == 1
11751 && (index_reg == arg_pointer_rtx
11752 || index_reg == frame_pointer_rtx
11753 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11754 {
11755 rtx tmp;
11756 tmp = base, base = index, index = tmp;
11757 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11758 }
11759
11760 /* Special case: %ebp cannot be encoded as a base without a displacement.
11761 Similarly %r13. */
11762 if (!disp
11763 && base_reg
11764 && (base_reg == hard_frame_pointer_rtx
11765 || base_reg == frame_pointer_rtx
11766 || base_reg == arg_pointer_rtx
11767 || (REG_P (base_reg)
11768 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11769 || REGNO (base_reg) == R13_REG))))
11770 disp = const0_rtx;
11771
11772 /* Special case: on K6, [%esi] forces the instruction to be vector decoded.
11773 Avoid this by transforming to [%esi+0].
11774 Reload calls address legitimization without cfun defined, so we need
11775 to test cfun for being non-NULL. */
11776 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11777 && base_reg && !index_reg && !disp
11778 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11779 disp = const0_rtx;
11780
11781 /* Special case: encode reg+reg instead of reg*2. */
11782 if (!base && index && scale == 2)
11783 base = index, base_reg = index_reg, scale = 1;
11784
11785 /* Special case: scaling cannot be encoded without base or displacement. */
11786 if (!base && !disp && index && scale != 1)
11787 disp = const0_rtx;
11788
11789 out->base = base;
11790 out->index = index;
11791 out->disp = disp;
11792 out->scale = scale;
11793 out->seg = seg;
11794
11795 return retval;
11796 }
11797 \f
11798 /* Return the cost of the memory address X.
11799 For i386, it is better to use a complex address than let gcc copy
11800 the address into a reg and make a new pseudo. But not if the address
11801 requires two regs - that would mean more pseudos with longer
11802 lifetimes. */
11803 static int
11804 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
11805 {
11806 struct ix86_address parts;
11807 int cost = 1;
11808 int ok = ix86_decompose_address (x, &parts);
11809
11810 gcc_assert (ok);
11811
11812 if (parts.base && GET_CODE (parts.base) == SUBREG)
11813 parts.base = SUBREG_REG (parts.base);
11814 if (parts.index && GET_CODE (parts.index) == SUBREG)
11815 parts.index = SUBREG_REG (parts.index);
11816
11817 /* Attempt to minimize number of registers in the address. */
11818 if ((parts.base
11819 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11820 || (parts.index
11821 && (!REG_P (parts.index)
11822 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11823 cost++;
11824
11825 if (parts.base
11826 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11827 && parts.index
11828 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11829 && parts.base != parts.index)
11830 cost++;
11831
11832 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11833 since its predecode logic can't detect the length of such instructions
11834 and they degenerate to vector decoding. Increase the cost of such
11835 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
11836 to split such addresses or even refuse them altogether.
11837
11838 The following addressing modes are affected:
11839 [base+scale*index]
11840 [scale*index+disp]
11841 [base+index]
11842
11843 The first and last cases may be avoidable by explicitly coding the zero
11844 into the memory address, but I don't have an AMD-K6 machine handy to
11845 check this theory. */
11846
11847 if (TARGET_K6
11848 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11849 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11850 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11851 cost += 10;
11852
11853 return cost;
11854 }
11855 \f
11856 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
11857 this is used to form addresses to local data when -fPIC is in
11858 use. */
11859
11860 static bool
11861 darwin_local_data_pic (rtx disp)
11862 {
11863 return (GET_CODE (disp) == UNSPEC
11864 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11865 }
11866
11867 /* Determine if a given RTX is a valid constant. We already know this
11868 satisfies CONSTANT_P. */
11869
11870 static bool
11871 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11872 {
11873 switch (GET_CODE (x))
11874 {
11875 case CONST:
11876 x = XEXP (x, 0);
11877
11878 if (GET_CODE (x) == PLUS)
11879 {
11880 if (!CONST_INT_P (XEXP (x, 1)))
11881 return false;
11882 x = XEXP (x, 0);
11883 }
11884
11885 if (TARGET_MACHO && darwin_local_data_pic (x))
11886 return true;
11887
11888 /* Only some unspecs are valid as "constants". */
11889 if (GET_CODE (x) == UNSPEC)
11890 switch (XINT (x, 1))
11891 {
11892 case UNSPEC_GOT:
11893 case UNSPEC_GOTOFF:
11894 case UNSPEC_PLTOFF:
11895 return TARGET_64BIT;
11896 case UNSPEC_TPOFF:
11897 case UNSPEC_NTPOFF:
11898 x = XVECEXP (x, 0, 0);
11899 return (GET_CODE (x) == SYMBOL_REF
11900 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11901 case UNSPEC_DTPOFF:
11902 x = XVECEXP (x, 0, 0);
11903 return (GET_CODE (x) == SYMBOL_REF
11904 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11905 default:
11906 return false;
11907 }
11908
11909 /* We must have drilled down to a symbol. */
11910 if (GET_CODE (x) == LABEL_REF)
11911 return true;
11912 if (GET_CODE (x) != SYMBOL_REF)
11913 return false;
11914 /* FALLTHRU */
11915
11916 case SYMBOL_REF:
11917 /* TLS symbols are never valid. */
11918 if (SYMBOL_REF_TLS_MODEL (x))
11919 return false;
11920
11921 /* DLLIMPORT symbols are never valid. */
11922 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11923 && SYMBOL_REF_DLLIMPORT_P (x))
11924 return false;
11925
11926 #if TARGET_MACHO
11927 /* mdynamic-no-pic */
11928 if (MACHO_DYNAMIC_NO_PIC_P)
11929 return machopic_symbol_defined_p (x);
11930 #endif
11931 break;
11932
11933 case CONST_DOUBLE:
11934 if (GET_MODE (x) == TImode
11935 && x != CONST0_RTX (TImode)
11936 && !TARGET_64BIT)
11937 return false;
11938 break;
11939
11940 case CONST_VECTOR:
11941 if (!standard_sse_constant_p (x))
11942 return false;
11943
11944 default:
11945 break;
11946 }
11947
11948 /* Otherwise we handle everything else in the move patterns. */
11949 return true;
11950 }
11951
11952 /* Determine if it's legal to put X into the constant pool. This
11953 is not possible for the address of thread-local symbols, which
11954 is checked above. */
11955
11956 static bool
11957 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11958 {
11959 /* We can always put integral constants and vectors in memory. */
11960 switch (GET_CODE (x))
11961 {
11962 case CONST_INT:
11963 case CONST_DOUBLE:
11964 case CONST_VECTOR:
11965 return false;
11966
11967 default:
11968 break;
11969 }
11970 return !ix86_legitimate_constant_p (mode, x);
11971 }
11972
11973
11974 /* Nonzero if the constant value X is a legitimate general operand
11975 when generating PIC code. It is given that flag_pic is on and
11976 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11977
11978 bool
11979 legitimate_pic_operand_p (rtx x)
11980 {
11981 rtx inner;
11982
11983 switch (GET_CODE (x))
11984 {
11985 case CONST:
11986 inner = XEXP (x, 0);
11987 if (GET_CODE (inner) == PLUS
11988 && CONST_INT_P (XEXP (inner, 1)))
11989 inner = XEXP (inner, 0);
11990
11991 /* Only some unspecs are valid as "constants". */
11992 if (GET_CODE (inner) == UNSPEC)
11993 switch (XINT (inner, 1))
11994 {
11995 case UNSPEC_GOT:
11996 case UNSPEC_GOTOFF:
11997 case UNSPEC_PLTOFF:
11998 return TARGET_64BIT;
11999 case UNSPEC_TPOFF:
12000 x = XVECEXP (inner, 0, 0);
12001 return (GET_CODE (x) == SYMBOL_REF
12002 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12003 case UNSPEC_MACHOPIC_OFFSET:
12004 return legitimate_pic_address_disp_p (x);
12005 default:
12006 return false;
12007 }
12008 /* FALLTHRU */
12009
12010 case SYMBOL_REF:
12011 case LABEL_REF:
12012 return legitimate_pic_address_disp_p (x);
12013
12014 default:
12015 return true;
12016 }
12017 }
12018
12019 /* Determine if a given CONST RTX is a valid memory displacement
12020 in PIC mode. */
12021
12022 bool
12023 legitimate_pic_address_disp_p (rtx disp)
12024 {
12025 bool saw_plus;
12026
12027 /* In 64bit mode we can allow direct addresses of symbols and labels
12028 when they are not dynamic symbols. */
12029 if (TARGET_64BIT)
12030 {
12031 rtx op0 = disp, op1;
12032
12033 switch (GET_CODE (disp))
12034 {
12035 case LABEL_REF:
12036 return true;
12037
12038 case CONST:
12039 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12040 break;
12041 op0 = XEXP (XEXP (disp, 0), 0);
12042 op1 = XEXP (XEXP (disp, 0), 1);
12043 if (!CONST_INT_P (op1)
12044 || INTVAL (op1) >= 16*1024*1024
12045 || INTVAL (op1) < -16*1024*1024)
12046 break;
12047 if (GET_CODE (op0) == LABEL_REF)
12048 return true;
12049 if (GET_CODE (op0) == CONST
12050 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12051 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12052 return true;
12053 if (GET_CODE (op0) == UNSPEC
12054 && XINT (op0, 1) == UNSPEC_PCREL)
12055 return true;
12056 if (GET_CODE (op0) != SYMBOL_REF)
12057 break;
12058 /* FALLTHRU */
12059
12060 case SYMBOL_REF:
12061 /* TLS references should always be enclosed in UNSPEC. */
12062 if (SYMBOL_REF_TLS_MODEL (op0))
12063 return false;
12064 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
12065 && ix86_cmodel != CM_LARGE_PIC)
12066 return true;
12067 break;
12068
12069 default:
12070 break;
12071 }
12072 }
12073 if (GET_CODE (disp) != CONST)
12074 return false;
12075 disp = XEXP (disp, 0);
12076
12077 if (TARGET_64BIT)
12078 {
12079 /* It is unsafe to allow PLUS expressions; this limits the allowed
12080 distance of GOT table references. We should not need these anyway. */
12081 if (GET_CODE (disp) != UNSPEC
12082 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12083 && XINT (disp, 1) != UNSPEC_GOTOFF
12084 && XINT (disp, 1) != UNSPEC_PCREL
12085 && XINT (disp, 1) != UNSPEC_PLTOFF))
12086 return false;
12087
12088 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12089 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12090 return false;
12091 return true;
12092 }
12093
12094 saw_plus = false;
12095 if (GET_CODE (disp) == PLUS)
12096 {
12097 if (!CONST_INT_P (XEXP (disp, 1)))
12098 return false;
12099 disp = XEXP (disp, 0);
12100 saw_plus = true;
12101 }
12102
12103 if (TARGET_MACHO && darwin_local_data_pic (disp))
12104 return true;
12105
12106 if (GET_CODE (disp) != UNSPEC)
12107 return false;
12108
12109 switch (XINT (disp, 1))
12110 {
12111 case UNSPEC_GOT:
12112 if (saw_plus)
12113 return false;
12114 /* We need to check for both symbols and labels because VxWorks loads
12115 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12116 details. */
12117 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12118 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12119 case UNSPEC_GOTOFF:
12120 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12121 While the ABI also specifies a 32bit relocation, we don't produce
12122 it in the small PIC model at all. */
12123 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12124 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12125 && !TARGET_64BIT)
12126 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12127 return false;
12128 case UNSPEC_GOTTPOFF:
12129 case UNSPEC_GOTNTPOFF:
12130 case UNSPEC_INDNTPOFF:
12131 if (saw_plus)
12132 return false;
12133 disp = XVECEXP (disp, 0, 0);
12134 return (GET_CODE (disp) == SYMBOL_REF
12135 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12136 case UNSPEC_NTPOFF:
12137 disp = XVECEXP (disp, 0, 0);
12138 return (GET_CODE (disp) == SYMBOL_REF
12139 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12140 case UNSPEC_DTPOFF:
12141 disp = XVECEXP (disp, 0, 0);
12142 return (GET_CODE (disp) == SYMBOL_REF
12143 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12144 }
12145
12146 return false;
12147 }
12148
12149 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns true if
12150 reloads were pushed for the invalid parts of the address X, in which
12151 case the calling macro should goto WIN; returns false if X was left
12152 unchanged. */
12153
12154 bool
12155 ix86_legitimize_reload_address (rtx x,
12156 enum machine_mode mode ATTRIBUTE_UNUSED,
12157 int opnum, int type,
12158 int ind_levels ATTRIBUTE_UNUSED)
12159 {
12160 /* Reload can generate:
12161
12162 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12163 (reg:DI 97))
12164 (reg:DI 2 cx))
12165
12166 This RTX is rejected by ix86_legitimate_address_p due to
12167 non-strictness of base register 97. Following this rejection,
12168 reload pushes all three components into separate registers,
12169 creating an invalid memory address RTX.
12170
12171 The code below reloads only the invalid part of the
12172 memory address RTX. */
12173
12174 if (GET_CODE (x) == PLUS
12175 && REG_P (XEXP (x, 1))
12176 && GET_CODE (XEXP (x, 0)) == PLUS
12177 && REG_P (XEXP (XEXP (x, 0), 1)))
12178 {
12179 rtx base, index;
12180 bool something_reloaded = false;
12181
12182 base = XEXP (XEXP (x, 0), 1);
12183 if (!REG_OK_FOR_BASE_STRICT_P (base))
12184 {
12185 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12186 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12187 opnum, (enum reload_type) type);
12188 something_reloaded = true;
12189 }
12190
12191 index = XEXP (x, 1);
12192 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12193 {
12194 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12195 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12196 opnum, (enum reload_type) type);
12197 something_reloaded = true;
12198 }
12199
12200 gcc_assert (something_reloaded);
12201 return true;
12202 }
12203
12204 return false;
12205 }
12206
12207 /* Recognizes RTL expressions that are valid memory addresses for an
12208 instruction. The MODE argument is the machine mode for the MEM
12209 expression that wants to use this address.
12210
12211 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12212 convert common non-canonical forms to canonical form so that they will
12213 be recognized. */
12214
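/* Illustrative example (simplified, not exhaustive): a canonical ix86
   address has the shape base + index*scale + displacement, e.g.

     (plus:SI (plus:SI (mult:SI (reg:SI 2 cx) (const_int 4))
                       (reg:SI 3 bx))
              (const_int 8))

   which corresponds to 8(%ebx,%ecx,4) in AT&T syntax.  Non-canonical
   forms such as (plus (reg) (ashift (reg) (const_int 2))) are expected
   to be rewritten by LEGITIMIZE_ADDRESS before they get here.  */
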
12215 static bool
12216 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12217 rtx addr, bool strict)
12218 {
12219 struct ix86_address parts;
12220 rtx base, index, disp;
12221 HOST_WIDE_INT scale;
12222
12223 /* Since a constant address in x32 is sign-extended to 64bit,
12224 we have to reject addresses from 0x80000000 to 0xffffffff. */
12225 if (TARGET_X32
12226 && CONST_INT_P (addr)
12227 && INTVAL (addr) < 0)
12228 return false;
12229
12230 if (ix86_decompose_address (addr, &parts) <= 0)
12231 /* Decomposition failed. */
12232 return false;
12233
12234 base = parts.base;
12235 index = parts.index;
12236 disp = parts.disp;
12237 scale = parts.scale;
12238
12239 /* Validate base register. */
12240 if (base)
12241 {
12242 rtx reg;
12243
12244 if (REG_P (base))
12245 reg = base;
12246 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12247 reg = SUBREG_REG (base);
12248 else
12249 /* Base is not a register. */
12250 return false;
12251
12252 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12253 return false;
12254
12255 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12256 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12257 /* Base is not valid. */
12258 return false;
12259 }
12260
12261 /* Validate index register. */
12262 if (index)
12263 {
12264 rtx reg;
12265
12266 if (REG_P (index))
12267 reg = index;
12268 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12269 reg = SUBREG_REG (index);
12270 else
12271 /* Index is not a register. */
12272 return false;
12273
12274 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12275 return false;
12276
12277 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12278 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12279 /* Index is not valid. */
12280 return false;
12281 }
12282
12283 /* Index and base should have the same mode. */
12284 if (base && index
12285 && GET_MODE (base) != GET_MODE (index))
12286 return false;
12287
12288 /* Validate scale factor. */
12289 if (scale != 1)
12290 {
12291 if (!index)
12292 /* Scale without index. */
12293 return false;
12294
12295 if (scale != 2 && scale != 4 && scale != 8)
12296 /* Scale is not a valid multiplier. */
12297 return false;
12298 }
12299
12300 /* Validate displacement. */
12301 if (disp)
12302 {
12303 if (GET_CODE (disp) == CONST
12304 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12305 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12306 switch (XINT (XEXP (disp, 0), 1))
12307 {
12308 /* Refuse GOTOFF and GOT in 64bit mode since they are always 64bit when
12309 used. While the ABI also specifies 32bit relocations, we don't produce
12310 them at all and use IP-relative addressing instead. */
12311 case UNSPEC_GOT:
12312 case UNSPEC_GOTOFF:
12313 gcc_assert (flag_pic);
12314 if (!TARGET_64BIT)
12315 goto is_legitimate_pic;
12316
12317 /* 64bit address unspec. */
12318 return false;
12319
12320 case UNSPEC_GOTPCREL:
12321 case UNSPEC_PCREL:
12322 gcc_assert (flag_pic);
12323 goto is_legitimate_pic;
12324
12325 case UNSPEC_GOTTPOFF:
12326 case UNSPEC_GOTNTPOFF:
12327 case UNSPEC_INDNTPOFF:
12328 case UNSPEC_NTPOFF:
12329 case UNSPEC_DTPOFF:
12330 break;
12331
12332 case UNSPEC_STACK_CHECK:
12333 gcc_assert (flag_split_stack);
12334 break;
12335
12336 default:
12337 /* Invalid address unspec. */
12338 return false;
12339 }
12340
12341 else if (SYMBOLIC_CONST (disp)
12342 && (flag_pic
12343 || (TARGET_MACHO
12344 #if TARGET_MACHO
12345 && MACHOPIC_INDIRECT
12346 && !machopic_operand_p (disp)
12347 #endif
12348 )))
12349 {
12350
12351 is_legitimate_pic:
12352 if (TARGET_64BIT && (index || base))
12353 {
12354 /* foo@dtpoff(%rX) is ok. */
12355 if (GET_CODE (disp) != CONST
12356 || GET_CODE (XEXP (disp, 0)) != PLUS
12357 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12358 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12359 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12360 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12361 /* Non-constant pic memory reference. */
12362 return false;
12363 }
12364 else if ((!TARGET_MACHO || flag_pic)
12365 && ! legitimate_pic_address_disp_p (disp))
12366 /* Displacement is an invalid pic construct. */
12367 return false;
12368 #if TARGET_MACHO
12369 else if (MACHO_DYNAMIC_NO_PIC_P
12370 && !ix86_legitimate_constant_p (Pmode, disp))
12371 /* Displacement must be referenced via non_lazy_pointer. */
12372 return false;
12373 #endif
12374
12375 /* This code used to verify that a symbolic pic displacement
12376 includes the pic_offset_table_rtx register.
12377
12378 While this is a good idea, unfortunately these constructs may
12379 be created by the "adds using lea" optimization for incorrect
12380 code like:
12381
12382 int a;
12383 int foo(int i)
12384 {
12385 return *(&a+i);
12386 }
12387
12388 This code is nonsensical, but results in addressing the
12389 GOT table with a pic_offset_table_rtx base. We can't
12390 just refuse it easily, since it gets matched by the
12391 "addsi3" pattern, which later gets split to lea when the
12392 output register differs from the input. While this
12393 could be handled by a separate addsi pattern for this case
12394 that never results in lea, disabling this test seems to be
12395 the easier and correct fix for the crash. */
12396 }
12397 else if (GET_CODE (disp) != LABEL_REF
12398 && !CONST_INT_P (disp)
12399 && (GET_CODE (disp) != CONST
12400 || !ix86_legitimate_constant_p (Pmode, disp))
12401 && (GET_CODE (disp) != SYMBOL_REF
12402 || !ix86_legitimate_constant_p (Pmode, disp)))
12403 /* Displacement is not constant. */
12404 return false;
12405 else if (TARGET_64BIT
12406 && !x86_64_immediate_operand (disp, VOIDmode))
12407 /* Displacement is out of range. */
12408 return false;
12409 }
12410
12411 /* Everything looks valid. */
12412 return true;
12413 }
12414
12415 /* Determine if a given RTX is a valid constant address. */
12416
12417 bool
12418 constant_address_p (rtx x)
12419 {
12420 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12421 }
12422 \f
12423 /* Return a unique alias set for the GOT. */
12424
12425 static alias_set_type
12426 ix86_GOT_alias_set (void)
12427 {
12428 static alias_set_type set = -1;
12429 if (set == -1)
12430 set = new_alias_set ();
12431 return set;
12432 }
12433
12434 /* Return a legitimate reference for ORIG (an address) using the
12435 register REG. If REG is 0, a new pseudo is generated.
12436
12437 There are two types of references that must be handled:
12438
12439 1. Global data references must load the address from the GOT, via
12440 the PIC reg. An insn is emitted to do this load, and the reg is
12441 returned.
12442
12443 2. Static data references, constant pool addresses, and code labels
12444 compute the address as an offset from the GOT, whose base is in
12445 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12446 differentiate them from global data objects. The returned
12447 address is the PIC reg + an unspec constant.
12448
12449 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12450 reg also appears in the address. */
12451
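/* Illustrative example (simplified): with -fpic on 32-bit ELF, a global
   symbol "foo" is typically reached through its GOT slot, roughly

     movl  foo@GOT(%ebx), %eax      # load &foo from the GOT

   while a local symbol "bar" can be addressed relative to the GOT base:

     leal  bar@GOTOFF(%ebx), %eax   # &bar = GOT base + offset

   The RTX built below correspondingly wraps the symbol in UNSPEC_GOT or
   UNSPEC_GOTOFF and combines it with the PIC register.  */
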
12452 static rtx
12453 legitimize_pic_address (rtx orig, rtx reg)
12454 {
12455 rtx addr = orig;
12456 rtx new_rtx = orig;
12457 rtx base;
12458
12459 #if TARGET_MACHO
12460 if (TARGET_MACHO && !TARGET_64BIT)
12461 {
12462 if (reg == 0)
12463 reg = gen_reg_rtx (Pmode);
12464 /* Use the generic Mach-O PIC machinery. */
12465 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12466 }
12467 #endif
12468
12469 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12470 new_rtx = addr;
12471 else if (TARGET_64BIT
12472 && ix86_cmodel != CM_SMALL_PIC
12473 && gotoff_operand (addr, Pmode))
12474 {
12475 rtx tmpreg;
12476 /* This symbol may be referenced via a displacement from the PIC
12477 base address (@GOTOFF). */
12478
12479 if (reload_in_progress)
12480 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12481 if (GET_CODE (addr) == CONST)
12482 addr = XEXP (addr, 0);
12483 if (GET_CODE (addr) == PLUS)
12484 {
12485 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12486 UNSPEC_GOTOFF);
12487 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12488 }
12489 else
12490 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12491 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12492 if (!reg)
12493 tmpreg = gen_reg_rtx (Pmode);
12494 else
12495 tmpreg = reg;
12496 emit_move_insn (tmpreg, new_rtx);
12497
12498 if (reg != 0)
12499 {
12500 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12501 tmpreg, 1, OPTAB_DIRECT);
12502 new_rtx = reg;
12503 }
12504 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12505 }
12506 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12507 {
12508 /* This symbol may be referenced via a displacement from the PIC
12509 base address (@GOTOFF). */
12510
12511 if (reload_in_progress)
12512 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12513 if (GET_CODE (addr) == CONST)
12514 addr = XEXP (addr, 0);
12515 if (GET_CODE (addr) == PLUS)
12516 {
12517 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12518 UNSPEC_GOTOFF);
12519 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12520 }
12521 else
12522 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12523 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12524 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12525
12526 if (reg != 0)
12527 {
12528 emit_move_insn (reg, new_rtx);
12529 new_rtx = reg;
12530 }
12531 }
12532 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12533 /* We can't use @GOTOFF for text labels on VxWorks;
12534 see gotoff_operand. */
12535 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12536 {
12537 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12538 {
12539 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12540 return legitimize_dllimport_symbol (addr, true);
12541 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12542 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12543 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12544 {
12545 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12546 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12547 }
12548 }
12549
12550 /* For x64 PE-COFF there is no GOT table, so we use the address
12551 directly. */
12552 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12553 {
12554 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12555 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12556
12557 if (reg == 0)
12558 reg = gen_reg_rtx (Pmode);
12559 emit_move_insn (reg, new_rtx);
12560 new_rtx = reg;
12561 }
12562 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12563 {
12564 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12565 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12566 new_rtx = gen_const_mem (Pmode, new_rtx);
12567 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12568
12569 if (reg == 0)
12570 reg = gen_reg_rtx (Pmode);
12571 /* Use gen_movsi directly, otherwise the address is loaded
12572 into a register for CSE. We don't want to CSE these addresses,
12573 instead we CSE addresses from the GOT table, so skip this. */
12574 emit_insn (gen_movsi (reg, new_rtx));
12575 new_rtx = reg;
12576 }
12577 else
12578 {
12579 /* This symbol must be referenced via a load from the
12580 Global Offset Table (@GOT). */
12581
12582 if (reload_in_progress)
12583 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12584 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12585 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12586 if (TARGET_64BIT)
12587 new_rtx = force_reg (Pmode, new_rtx);
12588 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12589 new_rtx = gen_const_mem (Pmode, new_rtx);
12590 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12591
12592 if (reg == 0)
12593 reg = gen_reg_rtx (Pmode);
12594 emit_move_insn (reg, new_rtx);
12595 new_rtx = reg;
12596 }
12597 }
12598 else
12599 {
12600 if (CONST_INT_P (addr)
12601 && !x86_64_immediate_operand (addr, VOIDmode))
12602 {
12603 if (reg)
12604 {
12605 emit_move_insn (reg, addr);
12606 new_rtx = reg;
12607 }
12608 else
12609 new_rtx = force_reg (Pmode, addr);
12610 }
12611 else if (GET_CODE (addr) == CONST)
12612 {
12613 addr = XEXP (addr, 0);
12614
12615 /* We must match stuff we generated before. Assume the only
12616 unspecs that can get here are ours. Not that we could do
12617 anything with them anyway.... */
12618 if (GET_CODE (addr) == UNSPEC
12619 || (GET_CODE (addr) == PLUS
12620 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12621 return orig;
12622 gcc_assert (GET_CODE (addr) == PLUS);
12623 }
12624 if (GET_CODE (addr) == PLUS)
12625 {
12626 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12627
12628 /* Check first to see if this is a constant offset from a @GOTOFF
12629 symbol reference. */
12630 if (gotoff_operand (op0, Pmode)
12631 && CONST_INT_P (op1))
12632 {
12633 if (!TARGET_64BIT)
12634 {
12635 if (reload_in_progress)
12636 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12637 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12638 UNSPEC_GOTOFF);
12639 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12640 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12641 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12642
12643 if (reg != 0)
12644 {
12645 emit_move_insn (reg, new_rtx);
12646 new_rtx = reg;
12647 }
12648 }
12649 else
12650 {
12651 if (INTVAL (op1) < -16*1024*1024
12652 || INTVAL (op1) >= 16*1024*1024)
12653 {
12654 if (!x86_64_immediate_operand (op1, Pmode))
12655 op1 = force_reg (Pmode, op1);
12656 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12657 }
12658 }
12659 }
12660 else
12661 {
12662 base = legitimize_pic_address (XEXP (addr, 0), reg);
12663 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12664 base == reg ? NULL_RTX : reg);
12665
12666 if (CONST_INT_P (new_rtx))
12667 new_rtx = plus_constant (Pmode, base, INTVAL (new_rtx));
12668 else
12669 {
12670 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12671 {
12672 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12673 new_rtx = XEXP (new_rtx, 1);
12674 }
12675 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12676 }
12677 }
12678 }
12679 }
12680 return new_rtx;
12681 }
12682 \f
12683 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12684
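/* Illustrative note: the thread pointer is modeled as
   (unspec [(const_int 0)] UNSPEC_TP); on GNU/Linux targets this is
   typically the %gs segment base in 32-bit mode and the %fs segment
   base in 64-bit mode.  */
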
12685 static rtx
12686 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12687 {
12688 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12689
12690 if (GET_MODE (tp) != tp_mode)
12691 {
12692 gcc_assert (GET_MODE (tp) == SImode);
12693 gcc_assert (tp_mode == DImode);
12694
12695 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12696 }
12697
12698 if (to_reg)
12699 tp = copy_to_mode_reg (tp_mode, tp);
12700
12701 return tp;
12702 }
12703
12704 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12705
12706 static GTY(()) rtx ix86_tls_symbol;
12707
12708 static rtx
12709 ix86_tls_get_addr (void)
12710 {
12711 if (!ix86_tls_symbol)
12712 {
12713 const char *sym
12714 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12715 ? "___tls_get_addr" : "__tls_get_addr");
12716
12717 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12718 }
12719
12720 return ix86_tls_symbol;
12721 }
12722
12723 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12724
12725 static GTY(()) rtx ix86_tls_module_base_symbol;
12726
12727 rtx
12728 ix86_tls_module_base (void)
12729 {
12730 if (!ix86_tls_module_base_symbol)
12731 {
12732 ix86_tls_module_base_symbol
12733 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12734
12735 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12736 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12737 }
12738
12739 return ix86_tls_module_base_symbol;
12740 }
12741
12742 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12743 false if we expect this to be used for a memory address and true if
12744 we expect to load the address into a register. */
12745
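/* Illustrative example (simplified, GNU/Linux x86-64): the TLS models
   handled below roughly correspond to these access sequences for a
   hypothetical thread-local variable "x":

     global dynamic:  leaq  x@tlsgd(%rip), %rdi
                      call  __tls_get_addr@PLT

     local dynamic:   leaq  x@tlsld(%rip), %rdi
                      call  __tls_get_addr@PLT
                      leaq  x@dtpoff(%rax), %rdx

     initial exec:    movq  x@gottpoff(%rip), %rax
                      movq  %fs:(%rax), %rcx

     local exec:      movq  %fs:0, %rax
                      leaq  x@tpoff(%rax), %rax

   The exact instructions depend on the target, -fpic and the TLS
   dialect; see the individual cases below.  */
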
12746 static rtx
12747 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12748 {
12749 rtx dest, base, off;
12750 rtx pic = NULL_RTX, tp = NULL_RTX;
12751 enum machine_mode tp_mode = Pmode;
12752 int type;
12753
12754 switch (model)
12755 {
12756 case TLS_MODEL_GLOBAL_DYNAMIC:
12757 dest = gen_reg_rtx (Pmode);
12758
12759 if (!TARGET_64BIT)
12760 {
12761 if (flag_pic)
12762 pic = pic_offset_table_rtx;
12763 else
12764 {
12765 pic = gen_reg_rtx (Pmode);
12766 emit_insn (gen_set_got (pic));
12767 }
12768 }
12769
12770 if (TARGET_GNU2_TLS)
12771 {
12772 if (TARGET_64BIT)
12773 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12774 else
12775 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12776
12777 tp = get_thread_pointer (Pmode, true);
12778 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12779
12780 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12781 }
12782 else
12783 {
12784 rtx caddr = ix86_tls_get_addr ();
12785
12786 if (TARGET_64BIT)
12787 {
12788 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
12789
12790 start_sequence ();
12791 emit_call_insn (ix86_gen_tls_global_dynamic_64 (rax, x,
12792 caddr));
12793 insns = get_insns ();
12794 end_sequence ();
12795
12796 RTL_CONST_CALL_P (insns) = 1;
12797 emit_libcall_block (insns, dest, rax, x);
12798 }
12799 else
12800 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12801 }
12802 break;
12803
12804 case TLS_MODEL_LOCAL_DYNAMIC:
12805 base = gen_reg_rtx (Pmode);
12806
12807 if (!TARGET_64BIT)
12808 {
12809 if (flag_pic)
12810 pic = pic_offset_table_rtx;
12811 else
12812 {
12813 pic = gen_reg_rtx (Pmode);
12814 emit_insn (gen_set_got (pic));
12815 }
12816 }
12817
12818 if (TARGET_GNU2_TLS)
12819 {
12820 rtx tmp = ix86_tls_module_base ();
12821
12822 if (TARGET_64BIT)
12823 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12824 else
12825 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12826
12827 tp = get_thread_pointer (Pmode, true);
12828 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12829 gen_rtx_MINUS (Pmode, tmp, tp));
12830 }
12831 else
12832 {
12833 rtx caddr = ix86_tls_get_addr ();
12834
12835 if (TARGET_64BIT)
12836 {
12837 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
12838
12839 start_sequence ();
12840 emit_call_insn (ix86_gen_tls_local_dynamic_base_64 (rax,
12841 caddr));
12842 insns = get_insns ();
12843 end_sequence ();
12844
12845 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12846 share the LD_BASE result with other LD model accesses. */
12847 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12848 UNSPEC_TLS_LD_BASE);
12849
12850 RTL_CONST_CALL_P (insns) = 1;
12851 emit_libcall_block (insns, base, rax, eqv);
12852 }
12853 else
12854 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12855 }
12856
12857 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12858 off = gen_rtx_CONST (Pmode, off);
12859
12860 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12861
12862 if (TARGET_GNU2_TLS)
12863 {
12864 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12865
12866 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12867 }
12868 break;
12869
12870 case TLS_MODEL_INITIAL_EXEC:
12871 if (TARGET_64BIT)
12872 {
12873 if (TARGET_SUN_TLS && !TARGET_X32)
12874 {
12875 /* The Sun linker took the AMD64 TLS spec literally
12876 and can only handle %rax as destination of the
12877 initial executable code sequence. */
12878
12879 dest = gen_reg_rtx (DImode);
12880 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12881 return dest;
12882 }
12883
12884 /* Generate DImode references to avoid %fs:(%reg32)
12885 problems and the linker IE->LE relaxation bug. */
12886 tp_mode = DImode;
12887 pic = NULL;
12888 type = UNSPEC_GOTNTPOFF;
12889 }
12890 else if (flag_pic)
12891 {
12892 if (reload_in_progress)
12893 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12894 pic = pic_offset_table_rtx;
12895 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12896 }
12897 else if (!TARGET_ANY_GNU_TLS)
12898 {
12899 pic = gen_reg_rtx (Pmode);
12900 emit_insn (gen_set_got (pic));
12901 type = UNSPEC_GOTTPOFF;
12902 }
12903 else
12904 {
12905 pic = NULL;
12906 type = UNSPEC_INDNTPOFF;
12907 }
12908
12909 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
12910 off = gen_rtx_CONST (tp_mode, off);
12911 if (pic)
12912 off = gen_rtx_PLUS (tp_mode, pic, off);
12913 off = gen_const_mem (tp_mode, off);
12914 set_mem_alias_set (off, ix86_GOT_alias_set ());
12915
12916 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12917 {
12918 base = get_thread_pointer (tp_mode,
12919 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12920 off = force_reg (tp_mode, off);
12921 return gen_rtx_PLUS (tp_mode, base, off);
12922 }
12923 else
12924 {
12925 base = get_thread_pointer (Pmode, true);
12926 dest = gen_reg_rtx (Pmode);
12927 emit_insn (ix86_gen_sub3 (dest, base, off));
12928 }
12929 break;
12930
12931 case TLS_MODEL_LOCAL_EXEC:
12932 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12933 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12934 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12935 off = gen_rtx_CONST (Pmode, off);
12936
12937 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12938 {
12939 base = get_thread_pointer (Pmode,
12940 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12941 return gen_rtx_PLUS (Pmode, base, off);
12942 }
12943 else
12944 {
12945 base = get_thread_pointer (Pmode, true);
12946 dest = gen_reg_rtx (Pmode);
12947 emit_insn (ix86_gen_sub3 (dest, base, off));
12948 }
12949 break;
12950
12951 default:
12952 gcc_unreachable ();
12953 }
12954
12955 return dest;
12956 }
12957
12958 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12959 to symbol DECL. */
12960
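/* Illustrative example (hypothetical names): for a dllimport'ed
   declaration "foo" on a 32-bit Windows target whose assembler name is
   "_foo", the import reference is the pointer-sized variable
   "__imp__foo"; loading through it yields the address of foo in the
   providing DLL.  On targets with an empty user label prefix the
   symbol is spelled "__imp_foo" instead.  */
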
12961 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12962 htab_t dllimport_map;
12963
12964 static tree
12965 get_dllimport_decl (tree decl)
12966 {
12967 struct tree_map *h, in;
12968 void **loc;
12969 const char *name;
12970 const char *prefix;
12971 size_t namelen, prefixlen;
12972 char *imp_name;
12973 tree to;
12974 rtx rtl;
12975
12976 if (!dllimport_map)
12977 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12978
12979 in.hash = htab_hash_pointer (decl);
12980 in.base.from = decl;
12981 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12982 h = (struct tree_map *) *loc;
12983 if (h)
12984 return h->to;
12985
12986 *loc = h = ggc_alloc_tree_map ();
12987 h->hash = in.hash;
12988 h->base.from = decl;
12989 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
12990 VAR_DECL, NULL, ptr_type_node);
12991 DECL_ARTIFICIAL (to) = 1;
12992 DECL_IGNORED_P (to) = 1;
12993 DECL_EXTERNAL (to) = 1;
12994 TREE_READONLY (to) = 1;
12995
12996 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
12997 name = targetm.strip_name_encoding (name);
12998 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
12999 ? "*__imp_" : "*__imp__";
13000 namelen = strlen (name);
13001 prefixlen = strlen (prefix);
13002 imp_name = (char *) alloca (namelen + prefixlen + 1);
13003 memcpy (imp_name, prefix, prefixlen);
13004 memcpy (imp_name + prefixlen, name, namelen + 1);
13005
13006 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13007 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13008 SET_SYMBOL_REF_DECL (rtl, to);
13009 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
13010
13011 rtl = gen_const_mem (Pmode, rtl);
13012 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13013
13014 SET_DECL_RTL (to, rtl);
13015 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13016
13017 return to;
13018 }
13019
13020 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13021 true if we require the result be a register. */
13022
13023 static rtx
13024 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13025 {
13026 tree imp_decl;
13027 rtx x;
13028
13029 gcc_assert (SYMBOL_REF_DECL (symbol));
13030 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
13031
13032 x = DECL_RTL (imp_decl);
13033 if (want_reg)
13034 x = force_reg (Pmode, x);
13035 return x;
13036 }
13037
13038 /* Try machine-dependent ways of modifying an illegitimate address
13039 to be legitimate. If we find one, return the new, valid address.
13040 This macro is used in only one place: `memory_address' in explow.c.
13041
13042 OLDX is the address as it was before break_out_memory_refs was called.
13043 In some cases it is useful to look at this to decide what needs to be done.
13044
13045 It is always safe for this macro to do nothing. It exists to recognize
13046 opportunities to optimize the output.
13047
13048 For the 80386, we handle X+REG by loading X into a register R and
13049 using R+REG. R will go in a general reg and indexing will be used.
13050 However, if REG is a broken-out memory address or multiplication,
13051 nothing needs to be done because REG can certainly go in a general reg.
13052
13053 When -fpic is used, special handling is needed for symbolic references.
13054 See comments by legitimize_pic_address in i386.c for details. */
13055
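/* Illustrative example (simplified; pseudo registers 60 and 61 are made
   up): one canonicalization below turns a small left shift into a
   multiply so that it matches the hardware scaled-index form, e.g.

     (plus:SI (reg:SI 60) (ashift:SI (reg:SI 61) (const_int 2)))

   becomes

     (plus:SI (mult:SI (reg:SI 61) (const_int 4)) (reg:SI 60))

   which can then be emitted with a scale of 4 on the index register.  */
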
13056 static rtx
13057 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13058 enum machine_mode mode)
13059 {
13060 int changed = 0;
13061 unsigned log;
13062
13063 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13064 if (log)
13065 return legitimize_tls_address (x, (enum tls_model) log, false);
13066 if (GET_CODE (x) == CONST
13067 && GET_CODE (XEXP (x, 0)) == PLUS
13068 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13069 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13070 {
13071 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13072 (enum tls_model) log, false);
13073 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13074 }
13075
13076 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13077 {
13078 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
13079 return legitimize_dllimport_symbol (x, true);
13080 if (GET_CODE (x) == CONST
13081 && GET_CODE (XEXP (x, 0)) == PLUS
13082 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13083 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
13084 {
13085 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
13086 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13087 }
13088 }
13089
13090 if (flag_pic && SYMBOLIC_CONST (x))
13091 return legitimize_pic_address (x, 0);
13092
13093 #if TARGET_MACHO
13094 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13095 return machopic_indirect_data_reference (x, 0);
13096 #endif
13097
13098 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13099 if (GET_CODE (x) == ASHIFT
13100 && CONST_INT_P (XEXP (x, 1))
13101 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13102 {
13103 changed = 1;
13104 log = INTVAL (XEXP (x, 1));
13105 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13106 GEN_INT (1 << log));
13107 }
13108
13109 if (GET_CODE (x) == PLUS)
13110 {
13111 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13112
13113 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13114 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13115 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13116 {
13117 changed = 1;
13118 log = INTVAL (XEXP (XEXP (x, 0), 1));
13119 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13120 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13121 GEN_INT (1 << log));
13122 }
13123
13124 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13125 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13126 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13127 {
13128 changed = 1;
13129 log = INTVAL (XEXP (XEXP (x, 1), 1));
13130 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13131 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13132 GEN_INT (1 << log));
13133 }
13134
13135 /* Put multiply first if it isn't already. */
13136 if (GET_CODE (XEXP (x, 1)) == MULT)
13137 {
13138 rtx tmp = XEXP (x, 0);
13139 XEXP (x, 0) = XEXP (x, 1);
13140 XEXP (x, 1) = tmp;
13141 changed = 1;
13142 }
13143
13144 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13145 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13146 created by virtual register instantiation, register elimination, and
13147 similar optimizations. */
13148 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13149 {
13150 changed = 1;
13151 x = gen_rtx_PLUS (Pmode,
13152 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13153 XEXP (XEXP (x, 1), 0)),
13154 XEXP (XEXP (x, 1), 1));
13155 }
13156
13157 /* Canonicalize
13158 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13159 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13160 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13161 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13162 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13163 && CONSTANT_P (XEXP (x, 1)))
13164 {
13165 rtx constant;
13166 rtx other = NULL_RTX;
13167
13168 if (CONST_INT_P (XEXP (x, 1)))
13169 {
13170 constant = XEXP (x, 1);
13171 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13172 }
13173 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13174 {
13175 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13176 other = XEXP (x, 1);
13177 }
13178 else
13179 constant = 0;
13180
13181 if (constant)
13182 {
13183 changed = 1;
13184 x = gen_rtx_PLUS (Pmode,
13185 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13186 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13187 plus_constant (Pmode, other,
13188 INTVAL (constant)));
13189 }
13190 }
13191
13192 if (changed && ix86_legitimate_address_p (mode, x, false))
13193 return x;
13194
13195 if (GET_CODE (XEXP (x, 0)) == MULT)
13196 {
13197 changed = 1;
13198 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13199 }
13200
13201 if (GET_CODE (XEXP (x, 1)) == MULT)
13202 {
13203 changed = 1;
13204 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13205 }
13206
13207 if (changed
13208 && REG_P (XEXP (x, 1))
13209 && REG_P (XEXP (x, 0)))
13210 return x;
13211
13212 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13213 {
13214 changed = 1;
13215 x = legitimize_pic_address (x, 0);
13216 }
13217
13218 if (changed && ix86_legitimate_address_p (mode, x, false))
13219 return x;
13220
13221 if (REG_P (XEXP (x, 0)))
13222 {
13223 rtx temp = gen_reg_rtx (Pmode);
13224 rtx val = force_operand (XEXP (x, 1), temp);
13225 if (val != temp)
13226 {
13227 if (GET_MODE (val) != Pmode)
13228 val = convert_to_mode (Pmode, val, 1);
13229 emit_move_insn (temp, val);
13230 }
13231
13232 XEXP (x, 1) = temp;
13233 return x;
13234 }
13235
13236 else if (REG_P (XEXP (x, 1)))
13237 {
13238 rtx temp = gen_reg_rtx (Pmode);
13239 rtx val = force_operand (XEXP (x, 0), temp);
13240 if (val != temp)
13241 {
13242 if (GET_MODE (val) != Pmode)
13243 val = convert_to_mode (Pmode, val, 1);
13244 emit_move_insn (temp, val);
13245 }
13246
13247 XEXP (x, 0) = temp;
13248 return x;
13249 }
13250 }
13251
13252 return x;
13253 }
13254 \f
13255 /* Print an integer constant expression in assembler syntax. Addition
13256 and subtraction are the only arithmetic that may appear in these
13257 expressions. FILE is the stdio stream to write to, X is the rtx, and
13258 CODE is the operand print code from the output string. */
13259
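/* Illustrative example (simplified): given the constant

     (const:SI (plus:SI (unspec:SI [(symbol_ref:SI "foo")] UNSPEC_GOTOFF)
                        (const_int 4)))

   this routine prints "4+foo@GOTOFF": the integer constant is printed
   first (some assemblers require that), and the UNSPEC supplies the
   relocation suffix after the symbol name.  */
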
13260 static void
13261 output_pic_addr_const (FILE *file, rtx x, int code)
13262 {
13263 char buf[256];
13264
13265 switch (GET_CODE (x))
13266 {
13267 case PC:
13268 gcc_assert (flag_pic);
13269 putc ('.', file);
13270 break;
13271
13272 case SYMBOL_REF:
13273 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13274 output_addr_const (file, x);
13275 else
13276 {
13277 const char *name = XSTR (x, 0);
13278
13279 /* Mark the decl as referenced so that cgraph will
13280 output the function. */
13281 if (SYMBOL_REF_DECL (x))
13282 mark_decl_referenced (SYMBOL_REF_DECL (x));
13283
13284 #if TARGET_MACHO
13285 if (MACHOPIC_INDIRECT
13286 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13287 name = machopic_indirection_name (x, /*stub_p=*/true);
13288 #endif
13289 assemble_name (file, name);
13290 }
13291 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13292 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13293 fputs ("@PLT", file);
13294 break;
13295
13296 case LABEL_REF:
13297 x = XEXP (x, 0);
13298 /* FALLTHRU */
13299 case CODE_LABEL:
13300 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13301 assemble_name (asm_out_file, buf);
13302 break;
13303
13304 case CONST_INT:
13305 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13306 break;
13307
13308 case CONST:
13309 /* This used to output parentheses around the expression,
13310 but that does not work on the 386 (either ATT or BSD assembler). */
13311 output_pic_addr_const (file, XEXP (x, 0), code);
13312 break;
13313
13314 case CONST_DOUBLE:
13315 if (GET_MODE (x) == VOIDmode)
13316 {
13317 /* We can use %d if the number is <32 bits and positive. */
13318 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13319 fprintf (file, "0x%lx%08lx",
13320 (unsigned long) CONST_DOUBLE_HIGH (x),
13321 (unsigned long) CONST_DOUBLE_LOW (x));
13322 else
13323 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13324 }
13325 else
13326 /* We can't handle floating point constants;
13327 TARGET_PRINT_OPERAND must handle them. */
13328 output_operand_lossage ("floating constant misused");
13329 break;
13330
13331 case PLUS:
13332 /* Some assemblers need integer constants to appear first. */
13333 if (CONST_INT_P (XEXP (x, 0)))
13334 {
13335 output_pic_addr_const (file, XEXP (x, 0), code);
13336 putc ('+', file);
13337 output_pic_addr_const (file, XEXP (x, 1), code);
13338 }
13339 else
13340 {
13341 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13342 output_pic_addr_const (file, XEXP (x, 1), code);
13343 putc ('+', file);
13344 output_pic_addr_const (file, XEXP (x, 0), code);
13345 }
13346 break;
13347
13348 case MINUS:
13349 if (!TARGET_MACHO)
13350 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13351 output_pic_addr_const (file, XEXP (x, 0), code);
13352 putc ('-', file);
13353 output_pic_addr_const (file, XEXP (x, 1), code);
13354 if (!TARGET_MACHO)
13355 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13356 break;
13357
13358 case UNSPEC:
13359 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13360 {
13361 bool f = i386_asm_output_addr_const_extra (file, x);
13362 gcc_assert (f);
13363 break;
13364 }
13365
13366 gcc_assert (XVECLEN (x, 0) == 1);
13367 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13368 switch (XINT (x, 1))
13369 {
13370 case UNSPEC_GOT:
13371 fputs ("@GOT", file);
13372 break;
13373 case UNSPEC_GOTOFF:
13374 fputs ("@GOTOFF", file);
13375 break;
13376 case UNSPEC_PLTOFF:
13377 fputs ("@PLTOFF", file);
13378 break;
13379 case UNSPEC_PCREL:
13380 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13381 "(%rip)" : "[rip]", file);
13382 break;
13383 case UNSPEC_GOTPCREL:
13384 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13385 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13386 break;
13387 case UNSPEC_GOTTPOFF:
13388 /* FIXME: This might be @TPOFF in Sun ld too. */
13389 fputs ("@gottpoff", file);
13390 break;
13391 case UNSPEC_TPOFF:
13392 fputs ("@tpoff", file);
13393 break;
13394 case UNSPEC_NTPOFF:
13395 if (TARGET_64BIT)
13396 fputs ("@tpoff", file);
13397 else
13398 fputs ("@ntpoff", file);
13399 break;
13400 case UNSPEC_DTPOFF:
13401 fputs ("@dtpoff", file);
13402 break;
13403 case UNSPEC_GOTNTPOFF:
13404 if (TARGET_64BIT)
13405 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13406 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13407 else
13408 fputs ("@gotntpoff", file);
13409 break;
13410 case UNSPEC_INDNTPOFF:
13411 fputs ("@indntpoff", file);
13412 break;
13413 #if TARGET_MACHO
13414 case UNSPEC_MACHOPIC_OFFSET:
13415 putc ('-', file);
13416 machopic_output_function_base_name (file);
13417 break;
13418 #endif
13419 default:
13420 output_operand_lossage ("invalid UNSPEC as operand");
13421 break;
13422 }
13423 break;
13424
13425 default:
13426 output_operand_lossage ("invalid expression as operand");
13427 }
13428 }
13429
13430 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13431 We need to emit DTP-relative relocations. */
13432
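/* Illustrative example (assuming ASM_LONG expands to a ".long"
   directive): for a thread-local variable "foo" this emits roughly
   ".long foo@dtpoff" for size 4, and ".long foo@dtpoff, 0" for size 8
   (the upper 32 bits of the 8-byte value are zero).  */
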
13433 static void ATTRIBUTE_UNUSED
13434 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13435 {
13436 fputs (ASM_LONG, file);
13437 output_addr_const (file, x);
13438 fputs ("@dtpoff", file);
13439 switch (size)
13440 {
13441 case 4:
13442 break;
13443 case 8:
13444 fputs (", 0", file);
13445 break;
13446 default:
13447 gcc_unreachable ();
13448 }
13449 }
13450
13451 /* Return true if X is a representation of the PIC register. This copes
13452 with calls from ix86_find_base_term, where the register might have
13453 been replaced by a cselib value. */
13454
13455 static bool
13456 ix86_pic_register_p (rtx x)
13457 {
13458 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13459 return (pic_offset_table_rtx
13460 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13461 else
13462 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13463 }
13464
13465 /* Helper function for ix86_delegitimize_address.
13466 Attempt to delegitimize TLS local-exec accesses. */
13467
13468 static rtx
13469 ix86_delegitimize_tls_address (rtx orig_x)
13470 {
13471 rtx x = orig_x, unspec;
13472 struct ix86_address addr;
13473
13474 if (!TARGET_TLS_DIRECT_SEG_REFS)
13475 return orig_x;
13476 if (MEM_P (x))
13477 x = XEXP (x, 0);
13478 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13479 return orig_x;
13480 if (ix86_decompose_address (x, &addr) == 0
13481 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13482 || addr.disp == NULL_RTX
13483 || GET_CODE (addr.disp) != CONST)
13484 return orig_x;
13485 unspec = XEXP (addr.disp, 0);
13486 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13487 unspec = XEXP (unspec, 0);
13488 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13489 return orig_x;
13490 x = XVECEXP (unspec, 0, 0);
13491 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13492 if (unspec != XEXP (addr.disp, 0))
13493 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13494 if (addr.index)
13495 {
13496 rtx idx = addr.index;
13497 if (addr.scale != 1)
13498 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13499 x = gen_rtx_PLUS (Pmode, idx, x);
13500 }
13501 if (addr.base)
13502 x = gen_rtx_PLUS (Pmode, addr.base, x);
13503 if (MEM_P (orig_x))
13504 x = replace_equiv_address_nv (orig_x, x);
13505 return x;
13506 }
13507
13508 /* In the name of slightly smaller debug output, and to cater to
13509 general assembler lossage, recognize PIC+GOTOFF and turn it back
13510 into a direct symbol reference.
13511
13512 On Darwin, this is necessary to avoid a crash, because Darwin
13513 has a different PIC label for each routine but the DWARF debugging
13514 information is not associated with any particular routine, so it's
13515 necessary to remove references to the PIC label from RTL stored by
13516 the DWARF output code. */
13517
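/* Illustrative example (simplified): a legitimized 32-bit PIC reference
   such as

     (plus:SI (reg:SI 3 bx)
              (const:SI (unspec:SI [(symbol_ref:SI "foo")] UNSPEC_GOTOFF)))

   is turned back into plain (symbol_ref:SI "foo"), so debug output and
   base-term analysis see the symbol instead of the PIC arithmetic.  */
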
13518 static rtx
13519 ix86_delegitimize_address (rtx x)
13520 {
13521 rtx orig_x = delegitimize_mem_from_attrs (x);
13522 /* addend is NULL or some rtx if x is something+GOTOFF where
13523 something doesn't include the PIC register. */
13524 rtx addend = NULL_RTX;
13525 /* reg_addend is NULL or a multiple of some register. */
13526 rtx reg_addend = NULL_RTX;
13527 /* const_addend is NULL or a const_int. */
13528 rtx const_addend = NULL_RTX;
13529 /* This is the result, or NULL. */
13530 rtx result = NULL_RTX;
13531
13532 x = orig_x;
13533
13534 if (MEM_P (x))
13535 x = XEXP (x, 0);
13536
13537 if (TARGET_64BIT)
13538 {
13539 if (GET_CODE (x) == CONST
13540 && GET_CODE (XEXP (x, 0)) == PLUS
13541 && GET_MODE (XEXP (x, 0)) == Pmode
13542 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13543 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13544 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13545 {
13546 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13547 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13548 if (MEM_P (orig_x))
13549 x = replace_equiv_address_nv (orig_x, x);
13550 return x;
13551 }
13552 if (GET_CODE (x) != CONST
13553 || GET_CODE (XEXP (x, 0)) != UNSPEC
13554 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13555 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13556 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13557 return ix86_delegitimize_tls_address (orig_x);
13558 x = XVECEXP (XEXP (x, 0), 0, 0);
13559 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13560 {
13561 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13562 GET_MODE (x), 0);
13563 if (x == NULL_RTX)
13564 return orig_x;
13565 }
13566 return x;
13567 }
13568
13569 if (GET_CODE (x) != PLUS
13570 || GET_CODE (XEXP (x, 1)) != CONST)
13571 return ix86_delegitimize_tls_address (orig_x);
13572
13573 if (ix86_pic_register_p (XEXP (x, 0)))
13574 /* %ebx + GOT/GOTOFF */
13575 ;
13576 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13577 {
13578 /* %ebx + %reg * scale + GOT/GOTOFF */
13579 reg_addend = XEXP (x, 0);
13580 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13581 reg_addend = XEXP (reg_addend, 1);
13582 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13583 reg_addend = XEXP (reg_addend, 0);
13584 else
13585 {
13586 reg_addend = NULL_RTX;
13587 addend = XEXP (x, 0);
13588 }
13589 }
13590 else
13591 addend = XEXP (x, 0);
13592
13593 x = XEXP (XEXP (x, 1), 0);
13594 if (GET_CODE (x) == PLUS
13595 && CONST_INT_P (XEXP (x, 1)))
13596 {
13597 const_addend = XEXP (x, 1);
13598 x = XEXP (x, 0);
13599 }
13600
13601 if (GET_CODE (x) == UNSPEC
13602 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13603 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13604 result = XVECEXP (x, 0, 0);
13605
13606 if (TARGET_MACHO && darwin_local_data_pic (x)
13607 && !MEM_P (orig_x))
13608 result = XVECEXP (x, 0, 0);
13609
13610 if (! result)
13611 return ix86_delegitimize_tls_address (orig_x);
13612
13613 if (const_addend)
13614 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13615 if (reg_addend)
13616 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13617 if (addend)
13618 {
13619 /* If the rest of original X doesn't involve the PIC register, add
13620 addend and subtract pic_offset_table_rtx. This can happen e.g.
13621 for code like:
13622 leal (%ebx, %ecx, 4), %ecx
13623 ...
13624 movl foo@GOTOFF(%ecx), %edx
13625 in which case we return (%ecx - %ebx) + foo. */
13626 if (pic_offset_table_rtx)
13627 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13628 pic_offset_table_rtx),
13629 result);
13630 else
13631 return orig_x;
13632 }
13633 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13634 {
13635 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13636 if (result == NULL_RTX)
13637 return orig_x;
13638 }
13639 return result;
13640 }
13641
13642 /* If X is a machine specific address (i.e. a symbol or label being
13643 referenced as a displacement from the GOT implemented using an
13644 UNSPEC), then return the base term. Otherwise return X. */
13645
13646 rtx
13647 ix86_find_base_term (rtx x)
13648 {
13649 rtx term;
13650
13651 if (TARGET_64BIT)
13652 {
13653 if (GET_CODE (x) != CONST)
13654 return x;
13655 term = XEXP (x, 0);
13656 if (GET_CODE (term) == PLUS
13657 && (CONST_INT_P (XEXP (term, 1))
13658 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13659 term = XEXP (term, 0);
13660 if (GET_CODE (term) != UNSPEC
13661 || (XINT (term, 1) != UNSPEC_GOTPCREL
13662 && XINT (term, 1) != UNSPEC_PCREL))
13663 return x;
13664
13665 return XVECEXP (term, 0, 0);
13666 }
13667
13668 return ix86_delegitimize_address (x);
13669 }
13670 \f
13671 static void
13672 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
13673 bool fp, FILE *file)
13674 {
13675 const char *suffix;
13676
13677 if (mode == CCFPmode || mode == CCFPUmode)
13678 {
13679 code = ix86_fp_compare_code_to_integer (code);
13680 mode = CCmode;
13681 }
13682 if (reverse)
13683 code = reverse_condition (code);
13684
13685 switch (code)
13686 {
13687 case EQ:
13688 switch (mode)
13689 {
13690 case CCAmode:
13691 suffix = "a";
13692 break;
13693
13694 case CCCmode:
13695 suffix = "c";
13696 break;
13697
13698 case CCOmode:
13699 suffix = "o";
13700 break;
13701
13702 case CCSmode:
13703 suffix = "s";
13704 break;
13705
13706 default:
13707 suffix = "e";
13708 }
13709 break;
13710 case NE:
13711 switch (mode)
13712 {
13713 case CCAmode:
13714 suffix = "na";
13715 break;
13716
13717 case CCCmode:
13718 suffix = "nc";
13719 break;
13720
13721 case CCOmode:
13722 suffix = "no";
13723 break;
13724
13725 case CCSmode:
13726 suffix = "ns";
13727 break;
13728
13729 default:
13730 suffix = "ne";
13731 }
13732 break;
13733 case GT:
13734 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13735 suffix = "g";
13736 break;
13737 case GTU:
13738 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13739 Those same assemblers have the same but opposite lossage on cmov. */
13740 if (mode == CCmode)
13741 suffix = fp ? "nbe" : "a";
13742 else if (mode == CCCmode)
13743 suffix = "b";
13744 else
13745 gcc_unreachable ();
13746 break;
13747 case LT:
13748 switch (mode)
13749 {
13750 case CCNOmode:
13751 case CCGOCmode:
13752 suffix = "s";
13753 break;
13754
13755 case CCmode:
13756 case CCGCmode:
13757 suffix = "l";
13758 break;
13759
13760 default:
13761 gcc_unreachable ();
13762 }
13763 break;
13764 case LTU:
13765 gcc_assert (mode == CCmode || mode == CCCmode);
13766 suffix = "b";
13767 break;
13768 case GE:
13769 switch (mode)
13770 {
13771 case CCNOmode:
13772 case CCGOCmode:
13773 suffix = "ns";
13774 break;
13775
13776 case CCmode:
13777 case CCGCmode:
13778 suffix = "ge";
13779 break;
13780
13781 default:
13782 gcc_unreachable ();
13783 }
13784 break;
13785 case GEU:
13786 /* ??? As above. */
13787 gcc_assert (mode == CCmode || mode == CCCmode);
13788 suffix = fp ? "nb" : "ae";
13789 break;
13790 case LE:
13791 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13792 suffix = "le";
13793 break;
13794 case LEU:
13795 /* ??? As above. */
13796 if (mode == CCmode)
13797 suffix = "be";
13798 else if (mode == CCCmode)
13799 suffix = fp ? "nb" : "ae";
13800 else
13801 gcc_unreachable ();
13802 break;
13803 case UNORDERED:
13804 suffix = fp ? "u" : "p";
13805 break;
13806 case ORDERED:
13807 suffix = fp ? "nu" : "np";
13808 break;
13809 default:
13810 gcc_unreachable ();
13811 }
13812 fputs (suffix, file);
13813 }
13814
13815 /* Print the name of register X to FILE based on its machine mode and number.
13816 If CODE is 'w', pretend the mode is HImode.
13817 If CODE is 'b', pretend the mode is QImode.
13818 If CODE is 'k', pretend the mode is SImode.
13819 If CODE is 'q', pretend the mode is DImode.
13820 If CODE is 'x', pretend the mode is V4SFmode.
13821 If CODE is 't', pretend the mode is V8SFmode.
13822 If CODE is 'h', pretend the reg is the 'high' byte register.
13823 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
13824 If CODE is 'd', duplicate the operand for AVX instruction.
13825 */
13826
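/* Illustrative example: for the register operand ax (register number 0)
   the CODE modifiers select these spellings in AT&T syntax:
   'b' -> %al, 'h' -> %ah, 'w' -> %ax, 'k' -> %eax and, in 64-bit mode,
   'q' -> %rax.  */
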
13827 void
13828 print_reg (rtx x, int code, FILE *file)
13829 {
13830 const char *reg;
13831 bool duplicated = code == 'd' && TARGET_AVX;
13832
13833 gcc_assert (x == pc_rtx
13834 || (REGNO (x) != ARG_POINTER_REGNUM
13835 && REGNO (x) != FRAME_POINTER_REGNUM
13836 && REGNO (x) != FLAGS_REG
13837 && REGNO (x) != FPSR_REG
13838 && REGNO (x) != FPCR_REG));
13839
13840 if (ASSEMBLER_DIALECT == ASM_ATT)
13841 putc ('%', file);
13842
13843 if (x == pc_rtx)
13844 {
13845 gcc_assert (TARGET_64BIT);
13846 fputs ("rip", file);
13847 return;
13848 }
13849
13850 if (code == 'w' || MMX_REG_P (x))
13851 code = 2;
13852 else if (code == 'b')
13853 code = 1;
13854 else if (code == 'k')
13855 code = 4;
13856 else if (code == 'q')
13857 code = 8;
13858 else if (code == 'y')
13859 code = 3;
13860 else if (code == 'h')
13861 code = 0;
13862 else if (code == 'x')
13863 code = 16;
13864 else if (code == 't')
13865 code = 32;
13866 else
13867 code = GET_MODE_SIZE (GET_MODE (x));
13868
13869 /* Irritatingly, AMD extended registers use a different naming convention
13870 from the normal registers: "r%d[bwd]". */
13871 if (REX_INT_REG_P (x))
13872 {
13873 gcc_assert (TARGET_64BIT);
13874 putc ('r', file);
13875 fprint_ul (file, REGNO (x) - FIRST_REX_INT_REG + 8);
13876 switch (code)
13877 {
13878 case 0:
13879 error ("extended registers have no high halves");
13880 break;
13881 case 1:
13882 putc ('b', file);
13883 break;
13884 case 2:
13885 putc ('w', file);
13886 break;
13887 case 4:
13888 putc ('d', file);
13889 break;
13890 case 8:
13891 /* no suffix */
13892 break;
13893 default:
13894 error ("unsupported operand size for extended register");
13895 break;
13896 }
13897 return;
13898 }
13899
13900 reg = NULL;
13901 switch (code)
13902 {
13903 case 3:
13904 if (STACK_TOP_P (x))
13905 {
13906 reg = "st(0)";
13907 break;
13908 }
13909 /* FALLTHRU */
13910 case 8:
13911 case 4:
13912 case 12:
13913 if (! ANY_FP_REG_P (x))
13914 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13915 /* FALLTHRU */
13916 case 16:
13917 case 2:
13918 normal:
13919 reg = hi_reg_name[REGNO (x)];
13920 break;
13921 case 1:
13922 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
13923 goto normal;
13924 reg = qi_reg_name[REGNO (x)];
13925 break;
13926 case 0:
13927 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
13928 goto normal;
13929 reg = qi_high_reg_name[REGNO (x)];
13930 break;
13931 case 32:
13932 if (SSE_REG_P (x))
13933 {
13934 gcc_assert (!duplicated);
13935 putc ('y', file);
13936 fputs (hi_reg_name[REGNO (x)] + 1, file);
13937 return;
13938 }
13939 break;
13940 default:
13941 gcc_unreachable ();
13942 }
13943
13944 fputs (reg, file);
13945 if (duplicated)
13946 {
13947 if (ASSEMBLER_DIALECT == ASM_ATT)
13948 fprintf (file, ", %%%s", reg);
13949 else
13950 fprintf (file, ", %s", reg);
13951 }
13952 }
13953
13954 /* Locate some local-dynamic symbol still in use by this function
13955 so that we can print its name in some tls_local_dynamic_base
13956 pattern. */
13957
13958 static int
13959 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13960 {
13961 rtx x = *px;
13962
13963 if (GET_CODE (x) == SYMBOL_REF
13964 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13965 {
13966 cfun->machine->some_ld_name = XSTR (x, 0);
13967 return 1;
13968 }
13969
13970 return 0;
13971 }
13972
13973 static const char *
13974 get_some_local_dynamic_name (void)
13975 {
13976 rtx insn;
13977
13978 if (cfun->machine->some_ld_name)
13979 return cfun->machine->some_ld_name;
13980
13981 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13982 if (NONDEBUG_INSN_P (insn)
13983 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13984 return cfun->machine->some_ld_name;
13985
13986 return NULL;
13987 }
13988
13989 /* Meaning of CODE:
13990 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
13991 C -- print opcode suffix for set/cmov insn.
13992 c -- like C, but print reversed condition
13993 F,f -- likewise, but for floating-point.
13994 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
13995 otherwise nothing
13996 R -- print the prefix for register names.
13997 z -- print the opcode suffix for the size of the current operand.
13998 Z -- likewise, with special suffixes for x87 instructions.
13999 * -- print a star (in certain assembler syntax)
14000 A -- print an absolute memory reference.
14001 E -- print address with DImode register names if TARGET_64BIT.
14002 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14003 s -- print a shift double count, followed by the assembler's argument
14004 delimiter.
14005 b -- print the QImode name of the register for the indicated operand.
14006 %b0 would print %al if operands[0] is reg 0.
14007 w -- likewise, print the HImode name of the register.
14008 k -- likewise, print the SImode name of the register.
14009 q -- likewise, print the DImode name of the register.
14010 x -- likewise, print the V4SFmode name of the register.
14011 t -- likewise, print the V8SFmode name of the register.
14012 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14013 y -- print "st(0)" instead of "st" as a register.
14014 d -- print duplicated register operand for AVX instruction.
14015 D -- print condition for SSE cmp instruction.
14016 P -- if PIC, print an @PLT suffix.
14017 p -- print raw symbol name.
14018 X -- don't print any sort of PIC '@' suffix for a symbol.
14019 & -- print some in-use local-dynamic symbol name.
14020 H -- print a memory address offset by 8; used for sse high-parts
14021 Y -- print condition for XOP pcom* instruction.
14022 + -- print a branch hint as 'cs' or 'ds' prefix
14023 ; -- print a semicolon (after prefixes due to bug in older gas).
14024 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14025 @ -- print a segment register of thread base pointer load
14026 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
14027 */
14028
14029 void
14030 ix86_print_operand (FILE *file, rtx x, int code)
14031 {
14032 if (code)
14033 {
14034 switch (code)
14035 {
14036 case 'A':
14037 switch (ASSEMBLER_DIALECT)
14038 {
14039 case ASM_ATT:
14040 putc ('*', file);
14041 break;
14042
14043 case ASM_INTEL:
14044 /* Intel syntax. For absolute addresses, registers should not
14045 be surrounded by brackets. */
14046 if (!REG_P (x))
14047 {
14048 putc ('[', file);
14049 ix86_print_operand (file, x, 0);
14050 putc (']', file);
14051 return;
14052 }
14053 break;
14054
14055 default:
14056 gcc_unreachable ();
14057 }
14058
14059 ix86_print_operand (file, x, 0);
14060 return;
14061
14062 case 'E':
14063 /* Wrap address in an UNSPEC to declare special handling. */
14064 if (TARGET_64BIT)
14065 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14066
14067 output_address (x);
14068 return;
14069
14070 case 'L':
14071 if (ASSEMBLER_DIALECT == ASM_ATT)
14072 putc ('l', file);
14073 return;
14074
14075 case 'W':
14076 if (ASSEMBLER_DIALECT == ASM_ATT)
14077 putc ('w', file);
14078 return;
14079
14080 case 'B':
14081 if (ASSEMBLER_DIALECT == ASM_ATT)
14082 putc ('b', file);
14083 return;
14084
14085 case 'Q':
14086 if (ASSEMBLER_DIALECT == ASM_ATT)
14087 putc ('l', file);
14088 return;
14089
14090 case 'S':
14091 if (ASSEMBLER_DIALECT == ASM_ATT)
14092 putc ('s', file);
14093 return;
14094
14095 case 'T':
14096 if (ASSEMBLER_DIALECT == ASM_ATT)
14097 putc ('t', file);
14098 return;
14099
14100 case 'O':
14101 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14102 if (ASSEMBLER_DIALECT != ASM_ATT)
14103 return;
14104
14105 switch (GET_MODE_SIZE (GET_MODE (x)))
14106 {
14107 case 2:
14108 putc ('w', file);
14109 break;
14110
14111 case 4:
14112 putc ('l', file);
14113 break;
14114
14115 case 8:
14116 putc ('q', file);
14117 break;
14118
14119 default:
14120 output_operand_lossage
14121 ("invalid operand size for operand code 'O'");
14122 return;
14123 }
14124
14125 putc ('.', file);
14126 #endif
14127 return;
14128
14129 case 'z':
14130 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14131 {
14132 /* Opcodes don't get size suffixes if using Intel opcodes. */
14133 if (ASSEMBLER_DIALECT == ASM_INTEL)
14134 return;
14135
14136 switch (GET_MODE_SIZE (GET_MODE (x)))
14137 {
14138 case 1:
14139 putc ('b', file);
14140 return;
14141
14142 case 2:
14143 putc ('w', file);
14144 return;
14145
14146 case 4:
14147 putc ('l', file);
14148 return;
14149
14150 case 8:
14151 putc ('q', file);
14152 return;
14153
14154 default:
14155 output_operand_lossage
14156 ("invalid operand size for operand code 'z'");
14157 return;
14158 }
14159 }
14160
14161 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14162 warning
14163 (0, "non-integer operand used with operand code 'z'");
14164 /* FALLTHRU */
14165
14166 case 'Z':
14167 	  /* 387 opcodes don't get size suffixes when using Intel syntax. */
14168 if (ASSEMBLER_DIALECT == ASM_INTEL)
14169 return;
14170
14171 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14172 {
14173 switch (GET_MODE_SIZE (GET_MODE (x)))
14174 {
14175 case 2:
14176 #ifdef HAVE_AS_IX86_FILDS
14177 putc ('s', file);
14178 #endif
14179 return;
14180
14181 case 4:
14182 putc ('l', file);
14183 return;
14184
14185 case 8:
14186 #ifdef HAVE_AS_IX86_FILDQ
14187 putc ('q', file);
14188 #else
14189 fputs ("ll", file);
14190 #endif
14191 return;
14192
14193 default:
14194 break;
14195 }
14196 }
14197 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14198 {
14199 /* 387 opcodes don't get size suffixes
14200 if the operands are registers. */
14201 if (STACK_REG_P (x))
14202 return;
14203
14204 switch (GET_MODE_SIZE (GET_MODE (x)))
14205 {
14206 case 4:
14207 putc ('s', file);
14208 return;
14209
14210 case 8:
14211 putc ('l', file);
14212 return;
14213
14214 case 12:
14215 case 16:
14216 putc ('t', file);
14217 return;
14218
14219 default:
14220 break;
14221 }
14222 }
14223 else
14224 {
14225 output_operand_lossage
14226 ("invalid operand type used with operand code 'Z'");
14227 return;
14228 }
14229
14230 output_operand_lossage
14231 ("invalid operand size for operand code 'Z'");
14232 return;
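
	  /* A sketch of how the 'z'/'Z' suffixes come out in practice
	     (the examples are illustrative, not taken from the original
	     sources): an SImode integer operand yields 'l' ("addl"), a
	     DFmode x87 memory operand yields 'l' ("faddl"), an XFmode
	     operand yields 't' ("fldt"), and a HImode integer operand
	     under 'Z' yields 's' ("filds") when the assembler supports
	     it.  */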
14233
14234 case 'd':
14235 case 'b':
14236 case 'w':
14237 case 'k':
14238 case 'q':
14239 case 'h':
14240 case 't':
14241 case 'y':
14242 case 'x':
14243 case 'X':
14244 case 'P':
14245 case 'p':
14246 break;
14247
14248 case 's':
14249 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14250 {
14251 ix86_print_operand (file, x, 0);
14252 fputs (", ", file);
14253 }
14254 return;
14255
14256 case 'Y':
14257 switch (GET_CODE (x))
14258 {
14259 case NE:
14260 fputs ("neq", file);
14261 break;
14262 case EQ:
14263 fputs ("eq", file);
14264 break;
14265 case GE:
14266 case GEU:
14267 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14268 break;
14269 case GT:
14270 case GTU:
14271 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14272 break;
14273 case LE:
14274 case LEU:
14275 fputs ("le", file);
14276 break;
14277 case LT:
14278 case LTU:
14279 fputs ("lt", file);
14280 break;
14281 case UNORDERED:
14282 fputs ("unord", file);
14283 break;
14284 case ORDERED:
14285 fputs ("ord", file);
14286 break;
14287 case UNEQ:
14288 fputs ("ueq", file);
14289 break;
14290 case UNGE:
14291 fputs ("nlt", file);
14292 break;
14293 case UNGT:
14294 fputs ("nle", file);
14295 break;
14296 case UNLE:
14297 fputs ("ule", file);
14298 break;
14299 case UNLT:
14300 fputs ("ult", file);
14301 break;
14302 case LTGT:
14303 fputs ("une", file);
14304 break;
14305 default:
14306 output_operand_lossage ("operand is not a condition code, "
14307 "invalid operand code 'Y'");
14308 return;
14309 }
14310 return;
14311
14312 case 'D':
14313 	  /* Little bit of braindamage here. The SSE compare instructions
14314 	     use completely different names for the comparisons than the
14315 	     fp conditional moves do. */
14316 switch (GET_CODE (x))
14317 {
14318 case UNEQ:
14319 if (TARGET_AVX)
14320 {
14321 fputs ("eq_us", file);
14322 break;
14323 }
14324 case EQ:
14325 fputs ("eq", file);
14326 break;
14327 case UNLT:
14328 if (TARGET_AVX)
14329 {
14330 fputs ("nge", file);
14331 break;
14332 }
14333 case LT:
14334 fputs ("lt", file);
14335 break;
14336 case UNLE:
14337 if (TARGET_AVX)
14338 {
14339 fputs ("ngt", file);
14340 break;
14341 }
14342 case LE:
14343 fputs ("le", file);
14344 break;
14345 case UNORDERED:
14346 fputs ("unord", file);
14347 break;
14348 case LTGT:
14349 if (TARGET_AVX)
14350 {
14351 fputs ("neq_oq", file);
14352 break;
14353 }
14354 case NE:
14355 fputs ("neq", file);
14356 break;
14357 case GE:
14358 if (TARGET_AVX)
14359 {
14360 fputs ("ge", file);
14361 break;
14362 }
14363 case UNGE:
14364 fputs ("nlt", file);
14365 break;
14366 case GT:
14367 if (TARGET_AVX)
14368 {
14369 fputs ("gt", file);
14370 break;
14371 }
14372 case UNGT:
14373 fputs ("nle", file);
14374 break;
14375 case ORDERED:
14376 fputs ("ord", file);
14377 break;
14378 default:
14379 output_operand_lossage ("operand is not a condition code, "
14380 "invalid operand code 'D'");
14381 return;
14382 }
14383 return;
14384
14385 case 'F':
14386 case 'f':
14387 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14388 if (ASSEMBLER_DIALECT == ASM_ATT)
14389 putc ('.', file);
14390 #endif
14391
14392 case 'C':
14393 case 'c':
14394 if (!COMPARISON_P (x))
14395 {
14396 output_operand_lossage ("operand is not a condition code, "
14397 "invalid operand code '%c'", code);
14398 return;
14399 }
14400 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14401 code == 'c' || code == 'f',
14402 code == 'F' || code == 'f',
14403 file);
14404 return;
14405
14406 case 'H':
14407 if (!offsettable_memref_p (x))
14408 {
14409 output_operand_lossage ("operand is not an offsettable memory "
14410 "reference, invalid operand code 'H'");
14411 return;
14412 }
14413 /* It doesn't actually matter what mode we use here, as we're
14414 only going to use this for printing. */
14415 x = adjust_address_nv (x, DImode, 8);
14416 break;
14417
14418 case 'K':
14419 gcc_assert (CONST_INT_P (x));
14420
14421 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14422 #ifdef HAVE_AS_IX86_HLE
14423 fputs ("xacquire ", file);
14424 #else
14425 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14426 #endif
14427 else if (INTVAL (x) & IX86_HLE_RELEASE)
14428 #ifdef HAVE_AS_IX86_HLE
14429 fputs ("xrelease ", file);
14430 #else
14431 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14432 #endif
14433 	  /* We do not want to print the value of the operand. */
14434 return;
14435
14436 case '*':
14437 if (ASSEMBLER_DIALECT == ASM_ATT)
14438 putc ('*', file);
14439 return;
14440
14441 case '&':
14442 {
14443 const char *name = get_some_local_dynamic_name ();
14444 if (name == NULL)
14445 output_operand_lossage ("'%%&' used without any "
14446 "local dynamic TLS references");
14447 else
14448 assemble_name (file, name);
14449 return;
14450 }
14451
14452 case '+':
14453 {
14454 rtx x;
14455
14456 if (!optimize
14457 || optimize_function_for_size_p (cfun)
14458 || !TARGET_BRANCH_PREDICTION_HINTS)
14459 return;
14460
14461 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14462 if (x)
14463 {
14464 int pred_val = INTVAL (XEXP (x, 0));
14465
14466 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14467 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14468 {
14469 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14470 bool cputaken
14471 = final_forward_branch_p (current_output_insn) == 0;
14472
14473 		    /* Emit hints only when the default branch prediction
14474 		       heuristics would fail. */
14475 if (taken != cputaken)
14476 {
14477 /* We use 3e (DS) prefix for taken branches and
14478 2e (CS) prefix for not taken branches. */
14479 if (taken)
14480 fputs ("ds ; ", file);
14481 else
14482 fputs ("cs ; ", file);
14483 }
14484 }
14485 }
14486 return;
14487 }
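
	  /* A hedged example of the '+' code: for a conditional jump whose
	     REG_BR_PROB note says the branch is very likely taken, but which
	     the static forward/backward heuristic would predict as not taken,
	     the output becomes e.g. "ds ; jne .L3" (the label is for
	     illustration only); an unlikely branch that the heuristic would
	     predict as taken gets the "cs ; " prefix instead.  */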
14488
14489 case ';':
14490 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14491 putc (';', file);
14492 #endif
14493 return;
14494
14495 case '@':
14496 if (ASSEMBLER_DIALECT == ASM_ATT)
14497 putc ('%', file);
14498
14499 /* The kernel uses a different segment register for performance
14500 reasons; a system call would not have to trash the userspace
14501 segment register, which would be expensive. */
14502 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14503 fputs ("fs", file);
14504 else
14505 fputs ("gs", file);
14506 return;
14507
14508 case '~':
14509 putc (TARGET_AVX2 ? 'i' : 'f', file);
14510 return;
14511
14512 case '^':
14513 if (TARGET_64BIT && Pmode != word_mode)
14514 fputs ("addr32 ", file);
14515 return;
14516
14517 default:
14518 output_operand_lossage ("invalid operand code '%c'", code);
14519 }
14520 }
14521
14522 if (REG_P (x))
14523 print_reg (x, code, file);
14524
14525 else if (MEM_P (x))
14526 {
14527 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14528 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14529 && GET_MODE (x) != BLKmode)
14530 {
14531 const char * size;
14532 switch (GET_MODE_SIZE (GET_MODE (x)))
14533 {
14534 case 1: size = "BYTE"; break;
14535 case 2: size = "WORD"; break;
14536 case 4: size = "DWORD"; break;
14537 case 8: size = "QWORD"; break;
14538 case 12: size = "TBYTE"; break;
14539 case 16:
14540 if (GET_MODE (x) == XFmode)
14541 size = "TBYTE";
14542 else
14543 size = "XMMWORD";
14544 break;
14545 case 32: size = "YMMWORD"; break;
14546 default:
14547 gcc_unreachable ();
14548 }
14549
14550 /* Check for explicit size override (codes 'b', 'w', 'k',
14551 'q' and 'x') */
14552 if (code == 'b')
14553 size = "BYTE";
14554 else if (code == 'w')
14555 size = "WORD";
14556 else if (code == 'k')
14557 size = "DWORD";
14558 else if (code == 'q')
14559 size = "QWORD";
14560 else if (code == 'x')
14561 size = "XMMWORD";
14562
14563 fputs (size, file);
14564 fputs (" PTR ", file);
14565 }
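
	  /* For example (illustrative operands): a 4-byte memory operand
	     prints as "DWORD PTR [esp+4]" in Intel syntax, while the same
	     operand in AT&T syntax prints as "4(%esp)" with no size
	     prefix.  */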
14566
14567 x = XEXP (x, 0);
14568 /* Avoid (%rip) for call operands. */
14569 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14570 && !CONST_INT_P (x))
14571 output_addr_const (file, x);
14572 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14573 output_operand_lossage ("invalid constraints for operand");
14574 else
14575 output_address (x);
14576 }
14577
14578 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14579 {
14580 REAL_VALUE_TYPE r;
14581 long l;
14582
14583 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14584 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14585
14586 if (ASSEMBLER_DIALECT == ASM_ATT)
14587 putc ('$', file);
14588 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14589 if (code == 'q')
14590 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14591 else
14592 fprintf (file, "0x%08x", (unsigned int) l);
14593 }
14594
14595 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14596 {
14597 REAL_VALUE_TYPE r;
14598 long l[2];
14599
14600 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14601 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14602
14603 if (ASSEMBLER_DIALECT == ASM_ATT)
14604 putc ('$', file);
14605 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14606 }
14607
14608 /* These float cases don't actually occur as immediate operands. */
14609 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14610 {
14611 char dstr[30];
14612
14613 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14614 fputs (dstr, file);
14615 }
14616
14617 else
14618 {
14619 /* We have patterns that allow zero sets of memory, for instance.
14620 In 64-bit mode, we should probably support all 8-byte vectors,
14621 since we can in fact encode that into an immediate. */
14622 if (GET_CODE (x) == CONST_VECTOR)
14623 {
14624 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14625 x = const0_rtx;
14626 }
14627
14628 if (code != 'P' && code != 'p')
14629 {
14630 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14631 {
14632 if (ASSEMBLER_DIALECT == ASM_ATT)
14633 putc ('$', file);
14634 }
14635 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14636 || GET_CODE (x) == LABEL_REF)
14637 {
14638 if (ASSEMBLER_DIALECT == ASM_ATT)
14639 putc ('$', file);
14640 else
14641 fputs ("OFFSET FLAT:", file);
14642 }
14643 }
14644 if (CONST_INT_P (x))
14645 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14646 else if (flag_pic || MACHOPIC_INDIRECT)
14647 output_pic_addr_const (file, x, code);
14648 else
14649 output_addr_const (file, x);
14650 }
14651 }
14652
14653 static bool
14654 ix86_print_operand_punct_valid_p (unsigned char code)
14655 {
14656 return (code == '@' || code == '*' || code == '+' || code == '&'
14657 || code == ';' || code == '~' || code == '^');
14658 }
14659 \f
14660 /* Print a memory operand whose address is ADDR. */
14661
14662 static void
14663 ix86_print_operand_address (FILE *file, rtx addr)
14664 {
14665 struct ix86_address parts;
14666 rtx base, index, disp;
14667 int scale;
14668 int ok;
14669 bool vsib = false;
14670 int code = 0;
14671
14672 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14673 {
14674 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14675 gcc_assert (parts.index == NULL_RTX);
14676 parts.index = XVECEXP (addr, 0, 1);
14677 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14678 addr = XVECEXP (addr, 0, 0);
14679 vsib = true;
14680 }
14681 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14682 {
14683 gcc_assert (TARGET_64BIT);
14684 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14685 code = 'q';
14686 }
14687 else
14688 ok = ix86_decompose_address (addr, &parts);
14689
14690 gcc_assert (ok);
14691
14692 if (parts.base && GET_CODE (parts.base) == SUBREG)
14693 {
14694 rtx tmp = SUBREG_REG (parts.base);
14695 parts.base = simplify_subreg (GET_MODE (parts.base),
14696 tmp, GET_MODE (tmp), 0);
14697 }
14698
14699 if (parts.index && GET_CODE (parts.index) == SUBREG)
14700 {
14701 rtx tmp = SUBREG_REG (parts.index);
14702 parts.index = simplify_subreg (GET_MODE (parts.index),
14703 tmp, GET_MODE (tmp), 0);
14704 }
14705
14706 base = parts.base;
14707 index = parts.index;
14708 disp = parts.disp;
14709 scale = parts.scale;
14710
14711 switch (parts.seg)
14712 {
14713 case SEG_DEFAULT:
14714 break;
14715 case SEG_FS:
14716 case SEG_GS:
14717 if (ASSEMBLER_DIALECT == ASM_ATT)
14718 putc ('%', file);
14719 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14720 break;
14721 default:
14722 gcc_unreachable ();
14723 }
14724
14725   /* Use the one byte shorter RIP-relative addressing for 64-bit mode. */
14726 if (TARGET_64BIT && !base && !index)
14727 {
14728 rtx symbol = disp;
14729
14730 if (GET_CODE (disp) == CONST
14731 && GET_CODE (XEXP (disp, 0)) == PLUS
14732 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14733 symbol = XEXP (XEXP (disp, 0), 0);
14734
14735 if (GET_CODE (symbol) == LABEL_REF
14736 || (GET_CODE (symbol) == SYMBOL_REF
14737 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14738 base = pc_rtx;
14739 }
14740 if (!base && !index)
14741 {
14742       /* A displacement-only address requires special attention. */
14743
14744 if (CONST_INT_P (disp))
14745 {
14746 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14747 fputs ("ds:", file);
14748 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14749 }
14750 else if (flag_pic)
14751 output_pic_addr_const (file, disp, 0);
14752 else
14753 output_addr_const (file, disp);
14754 }
14755 else
14756 {
14757 /* Print SImode register names for zero-extended
14758 addresses to force addr32 prefix. */
14759 if (TARGET_64BIT
14760 && (GET_CODE (addr) == ZERO_EXTEND
14761 || GET_CODE (addr) == AND))
14762 {
14763 gcc_assert (!code);
14764 code = 'l';
14765 }
14766
14767 if (ASSEMBLER_DIALECT == ASM_ATT)
14768 {
14769 if (disp)
14770 {
14771 if (flag_pic)
14772 output_pic_addr_const (file, disp, 0);
14773 else if (GET_CODE (disp) == LABEL_REF)
14774 output_asm_label (disp);
14775 else
14776 output_addr_const (file, disp);
14777 }
14778
14779 putc ('(', file);
14780 if (base)
14781 print_reg (base, code, file);
14782 if (index)
14783 {
14784 putc (',', file);
14785 print_reg (index, vsib ? 0 : code, file);
14786 if (scale != 1 || vsib)
14787 fprintf (file, ",%d", scale);
14788 }
14789 putc (')', file);
14790 }
14791 else
14792 {
14793 rtx offset = NULL_RTX;
14794
14795 if (disp)
14796 {
14797 /* Pull out the offset of a symbol; print any symbol itself. */
14798 if (GET_CODE (disp) == CONST
14799 && GET_CODE (XEXP (disp, 0)) == PLUS
14800 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14801 {
14802 offset = XEXP (XEXP (disp, 0), 1);
14803 disp = gen_rtx_CONST (VOIDmode,
14804 XEXP (XEXP (disp, 0), 0));
14805 }
14806
14807 if (flag_pic)
14808 output_pic_addr_const (file, disp, 0);
14809 else if (GET_CODE (disp) == LABEL_REF)
14810 output_asm_label (disp);
14811 else if (CONST_INT_P (disp))
14812 offset = disp;
14813 else
14814 output_addr_const (file, disp);
14815 }
14816
14817 putc ('[', file);
14818 if (base)
14819 {
14820 print_reg (base, code, file);
14821 if (offset)
14822 {
14823 if (INTVAL (offset) >= 0)
14824 putc ('+', file);
14825 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14826 }
14827 }
14828 else if (offset)
14829 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14830 else
14831 putc ('0', file);
14832
14833 if (index)
14834 {
14835 putc ('+', file);
14836 print_reg (index, vsib ? 0 : code, file);
14837 if (scale != 1 || vsib)
14838 fprintf (file, "*%d", scale);
14839 }
14840 putc (']', file);
14841 }
14842 }
14843 }
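
/* A brief, hedged illustration of the two dialects handled above (the
   concrete registers and displacements are made up): the address
   (plus (plus (reg rbp) (mult (reg rax) (const_int 4))) (const_int -8))
   prints as "-8(%rbp,%rax,4)" in AT&T syntax and as "[rbp-8+rax*4]" in
   Intel syntax, while a plain symbolic displacement in 64-bit code is
   printed RIP-relative, e.g. "foo(%rip)".  */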
14844
14845 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14846
14847 static bool
14848 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14849 {
14850 rtx op;
14851
14852 if (GET_CODE (x) != UNSPEC)
14853 return false;
14854
14855 op = XVECEXP (x, 0, 0);
14856 switch (XINT (x, 1))
14857 {
14858 case UNSPEC_GOTTPOFF:
14859 output_addr_const (file, op);
14860 /* FIXME: This might be @TPOFF in Sun ld. */
14861 fputs ("@gottpoff", file);
14862 break;
14863 case UNSPEC_TPOFF:
14864 output_addr_const (file, op);
14865 fputs ("@tpoff", file);
14866 break;
14867 case UNSPEC_NTPOFF:
14868 output_addr_const (file, op);
14869 if (TARGET_64BIT)
14870 fputs ("@tpoff", file);
14871 else
14872 fputs ("@ntpoff", file);
14873 break;
14874 case UNSPEC_DTPOFF:
14875 output_addr_const (file, op);
14876 fputs ("@dtpoff", file);
14877 break;
14878 case UNSPEC_GOTNTPOFF:
14879 output_addr_const (file, op);
14880 if (TARGET_64BIT)
14881 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14882 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14883 else
14884 fputs ("@gotntpoff", file);
14885 break;
14886 case UNSPEC_INDNTPOFF:
14887 output_addr_const (file, op);
14888 fputs ("@indntpoff", file);
14889 break;
14890 #if TARGET_MACHO
14891 case UNSPEC_MACHOPIC_OFFSET:
14892 output_addr_const (file, op);
14893 putc ('-', file);
14894 machopic_output_function_base_name (file);
14895 break;
14896 #endif
14897
14898 case UNSPEC_STACK_CHECK:
14899 {
14900 int offset;
14901
14902 gcc_assert (flag_split_stack);
14903
14904 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14905 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14906 #else
14907 gcc_unreachable ();
14908 #endif
14909
14910 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14911 }
14912 break;
14913
14914 default:
14915 return false;
14916 }
14917
14918 return true;
14919 }
14920 \f
14921 /* Split one or more double-mode RTL references into pairs of half-mode
14922 references. The RTL can be REG, offsettable MEM, integer constant, or
14923 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14924 split and "num" is its length. lo_half and hi_half are output arrays
14925 that parallel "operands". */
14926
14927 void
14928 split_double_mode (enum machine_mode mode, rtx operands[],
14929 int num, rtx lo_half[], rtx hi_half[])
14930 {
14931 enum machine_mode half_mode;
14932 unsigned int byte;
14933
14934 switch (mode)
14935 {
14936 case TImode:
14937 half_mode = DImode;
14938 break;
14939 case DImode:
14940 half_mode = SImode;
14941 break;
14942 default:
14943 gcc_unreachable ();
14944 }
14945
14946 byte = GET_MODE_SIZE (half_mode);
14947
14948 while (num--)
14949 {
14950 rtx op = operands[num];
14951
14952       /* simplify_subreg refuses to split volatile memory references,
14953 	 but we still have to handle them. */
14954 if (MEM_P (op))
14955 {
14956 lo_half[num] = adjust_address (op, half_mode, 0);
14957 hi_half[num] = adjust_address (op, half_mode, byte);
14958 }
14959 else
14960 {
14961 lo_half[num] = simplify_gen_subreg (half_mode, op,
14962 GET_MODE (op) == VOIDmode
14963 ? mode : GET_MODE (op), 0);
14964 hi_half[num] = simplify_gen_subreg (half_mode, op,
14965 GET_MODE (op) == VOIDmode
14966 ? mode : GET_MODE (op), byte);
14967 }
14968 }
14969 }
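
/* A hedged usage sketch (the operand is hypothetical): splitting a DImode
   memory operand produces two SImode memory references at offsets 0 and 4,
   while splitting a TImode operand produces DImode halves at offsets 0 and
   8; register and constant operands are split with simplify_gen_subreg
   instead.  */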
14970 \f
14971 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
14972 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
14973 is the expression of the binary operation. The output may either be
14974 emitted here, or returned to the caller, like all output_* functions.
14975
14976 There is no guarantee that the operands are the same mode, as they
14977 might be within FLOAT or FLOAT_EXTEND expressions. */
14978
14979 #ifndef SYSV386_COMPAT
14980 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
14981 wants to fix the assemblers because that causes incompatibility
14982 with gcc. No-one wants to fix gcc because that causes
14983 incompatibility with assemblers... You can use the option of
14984 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
14985 #define SYSV386_COMPAT 1
14986 #endif
14987
14988 const char *
14989 output_387_binary_op (rtx insn, rtx *operands)
14990 {
14991 static char buf[40];
14992 const char *p;
14993 const char *ssep;
14994 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
14995
14996 #ifdef ENABLE_CHECKING
14997 /* Even if we do not want to check the inputs, this documents input
14998 constraints. Which helps in understanding the following code. */
14999 if (STACK_REG_P (operands[0])
15000 && ((REG_P (operands[1])
15001 && REGNO (operands[0]) == REGNO (operands[1])
15002 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15003 || (REG_P (operands[2])
15004 && REGNO (operands[0]) == REGNO (operands[2])
15005 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15006 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15007 ; /* ok */
15008 else
15009 gcc_assert (is_sse);
15010 #endif
15011
15012 switch (GET_CODE (operands[3]))
15013 {
15014 case PLUS:
15015 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15016 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15017 p = "fiadd";
15018 else
15019 p = "fadd";
15020 ssep = "vadd";
15021 break;
15022
15023 case MINUS:
15024 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15025 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15026 p = "fisub";
15027 else
15028 p = "fsub";
15029 ssep = "vsub";
15030 break;
15031
15032 case MULT:
15033 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15034 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15035 p = "fimul";
15036 else
15037 p = "fmul";
15038 ssep = "vmul";
15039 break;
15040
15041 case DIV:
15042 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15043 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15044 p = "fidiv";
15045 else
15046 p = "fdiv";
15047 ssep = "vdiv";
15048 break;
15049
15050 default:
15051 gcc_unreachable ();
15052 }
15053
15054 if (is_sse)
15055 {
15056 if (TARGET_AVX)
15057 {
15058 strcpy (buf, ssep);
15059 if (GET_MODE (operands[0]) == SFmode)
15060 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15061 else
15062 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15063 }
15064 else
15065 {
15066 strcpy (buf, ssep + 1);
15067 if (GET_MODE (operands[0]) == SFmode)
15068 strcat (buf, "ss\t{%2, %0|%0, %2}");
15069 else
15070 strcat (buf, "sd\t{%2, %0|%0, %2}");
15071 }
15072 return buf;
15073 }
15074 strcpy (buf, p);
15075
15076 switch (GET_CODE (operands[3]))
15077 {
15078 case MULT:
15079 case PLUS:
15080 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15081 {
15082 rtx temp = operands[2];
15083 operands[2] = operands[1];
15084 operands[1] = temp;
15085 }
15086
15087       /* We now know operands[0] == operands[1]. */
15088
15089 if (MEM_P (operands[2]))
15090 {
15091 p = "%Z2\t%2";
15092 break;
15093 }
15094
15095 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15096 {
15097 if (STACK_TOP_P (operands[0]))
15098 /* How is it that we are storing to a dead operand[2]?
15099 Well, presumably operands[1] is dead too. We can't
15100 store the result to st(0) as st(0) gets popped on this
15101 instruction. Instead store to operands[2] (which I
15102 think has to be st(1)). st(1) will be popped later.
15103 gcc <= 2.8.1 didn't have this check and generated
15104 assembly code that the Unixware assembler rejected. */
15105 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15106 else
15107 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15108 break;
15109 }
15110
15111 if (STACK_TOP_P (operands[0]))
15112 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15113 else
15114 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15115 break;
15116
15117 case MINUS:
15118 case DIV:
15119 if (MEM_P (operands[1]))
15120 {
15121 p = "r%Z1\t%1";
15122 break;
15123 }
15124
15125 if (MEM_P (operands[2]))
15126 {
15127 p = "%Z2\t%2";
15128 break;
15129 }
15130
15131 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15132 {
15133 #if SYSV386_COMPAT
15134 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15135 derived assemblers, confusingly reverse the direction of
15136 the operation for fsub{r} and fdiv{r} when the
15137 destination register is not st(0). The Intel assembler
15138 doesn't have this brain damage. Read !SYSV386_COMPAT to
15139 figure out what the hardware really does. */
15140 if (STACK_TOP_P (operands[0]))
15141 p = "{p\t%0, %2|rp\t%2, %0}";
15142 else
15143 p = "{rp\t%2, %0|p\t%0, %2}";
15144 #else
15145 if (STACK_TOP_P (operands[0]))
15146 /* As above for fmul/fadd, we can't store to st(0). */
15147 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15148 else
15149 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15150 #endif
15151 break;
15152 }
15153
15154 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15155 {
15156 #if SYSV386_COMPAT
15157 if (STACK_TOP_P (operands[0]))
15158 p = "{rp\t%0, %1|p\t%1, %0}";
15159 else
15160 p = "{p\t%1, %0|rp\t%0, %1}";
15161 #else
15162 if (STACK_TOP_P (operands[0]))
15163 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15164 else
15165 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15166 #endif
15167 break;
15168 }
15169
15170 if (STACK_TOP_P (operands[0]))
15171 {
15172 if (STACK_TOP_P (operands[1]))
15173 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15174 else
15175 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15176 break;
15177 }
15178 else if (STACK_TOP_P (operands[1]))
15179 {
15180 #if SYSV386_COMPAT
15181 p = "{\t%1, %0|r\t%0, %1}";
15182 #else
15183 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15184 #endif
15185 }
15186 else
15187 {
15188 #if SYSV386_COMPAT
15189 p = "{r\t%2, %0|\t%0, %2}";
15190 #else
15191 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15192 #endif
15193 }
15194 break;
15195
15196 default:
15197 gcc_unreachable ();
15198 }
15199
15200 strcat (buf, p);
15201 return buf;
15202 }
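
/* Hedged examples of the templates built above (operand numbers refer to
   this function's OPERANDS array): an SFmode add with AVX yields
   "vaddss\t{%2, %1, %0|%0, %1, %2}", the SSE form is
   "addss\t{%2, %0|%0, %2}", and an x87 add whose operands[2] is a DFmode
   memory reference yields "fadd%Z2\t%2", which prints as e.g.
   "faddl\t8(%ebp)" in AT&T syntax (the address is made up).  */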
15203
15204 /* Return needed mode for entity in optimize_mode_switching pass. */
15205
15206 int
15207 ix86_mode_needed (int entity, rtx insn)
15208 {
15209 enum attr_i387_cw mode;
15210
15211   /* The mode UNINITIALIZED is used to store the control word after a
15212      function call or ASM pattern. The mode ANY specifies that the function
15213      has no requirements on the control word and makes no changes in the
15214      bits we are interested in. */
15215
15216 if (CALL_P (insn)
15217 || (NONJUMP_INSN_P (insn)
15218 && (asm_noperands (PATTERN (insn)) >= 0
15219 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15220 return I387_CW_UNINITIALIZED;
15221
15222 if (recog_memoized (insn) < 0)
15223 return I387_CW_ANY;
15224
15225 mode = get_attr_i387_cw (insn);
15226
15227 switch (entity)
15228 {
15229 case I387_TRUNC:
15230 if (mode == I387_CW_TRUNC)
15231 return mode;
15232 break;
15233
15234 case I387_FLOOR:
15235 if (mode == I387_CW_FLOOR)
15236 return mode;
15237 break;
15238
15239 case I387_CEIL:
15240 if (mode == I387_CW_CEIL)
15241 return mode;
15242 break;
15243
15244 case I387_MASK_PM:
15245 if (mode == I387_CW_MASK_PM)
15246 return mode;
15247 break;
15248
15249 default:
15250 gcc_unreachable ();
15251 }
15252
15253 return I387_CW_ANY;
15254 }
15255
15256 /* Output code to initialize control word copies used by trunc?f?i and
15257    rounding patterns. CURRENT_MODE is set to the current control word,
15258    while NEW_MODE is set to the new control word. */
15259
15260 void
15261 emit_i387_cw_initialization (int mode)
15262 {
15263 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15264 rtx new_mode;
15265
15266 enum ix86_stack_slot slot;
15267
15268 rtx reg = gen_reg_rtx (HImode);
15269
15270 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15271 emit_move_insn (reg, copy_rtx (stored_mode));
15272
15273 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15274 || optimize_function_for_size_p (cfun))
15275 {
15276 switch (mode)
15277 {
15278 case I387_CW_TRUNC:
15279 /* round toward zero (truncate) */
15280 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15281 slot = SLOT_CW_TRUNC;
15282 break;
15283
15284 case I387_CW_FLOOR:
15285 /* round down toward -oo */
15286 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15287 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15288 slot = SLOT_CW_FLOOR;
15289 break;
15290
15291 case I387_CW_CEIL:
15292 /* round up toward +oo */
15293 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15294 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15295 slot = SLOT_CW_CEIL;
15296 break;
15297
15298 case I387_CW_MASK_PM:
15299 /* mask precision exception for nearbyint() */
15300 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15301 slot = SLOT_CW_MASK_PM;
15302 break;
15303
15304 default:
15305 gcc_unreachable ();
15306 }
15307 }
15308 else
15309 {
15310 switch (mode)
15311 {
15312 case I387_CW_TRUNC:
15313 /* round toward zero (truncate) */
15314 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15315 slot = SLOT_CW_TRUNC;
15316 break;
15317
15318 case I387_CW_FLOOR:
15319 /* round down toward -oo */
15320 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15321 slot = SLOT_CW_FLOOR;
15322 break;
15323
15324 case I387_CW_CEIL:
15325 /* round up toward +oo */
15326 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15327 slot = SLOT_CW_CEIL;
15328 break;
15329
15330 case I387_CW_MASK_PM:
15331 /* mask precision exception for nearbyint() */
15332 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15333 slot = SLOT_CW_MASK_PM;
15334 break;
15335
15336 default:
15337 gcc_unreachable ();
15338 }
15339 }
15340
15341 gcc_assert (slot < MAX_386_STACK_LOCALS);
15342
15343 new_mode = assign_386_stack_local (HImode, slot);
15344 emit_move_insn (new_mode, reg);
15345 }
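
/* For reference (standard x87 facts, not specific to this file): bits 10-11
   of the control word select the rounding mode, so the masks used above mean
   0x0c00 = round toward zero, 0x0400 = round down, 0x0800 = round up, and
   clearing both bits rounds to nearest; bit 5 (0x0020) masks the precision
   exception, as used for nearbyint().  */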
15346
15347 /* Output code for INSN to convert a float to a signed int. OPERANDS
15348 are the insn operands. The output may be [HSD]Imode and the input
15349 operand may be [SDX]Fmode. */
15350
15351 const char *
15352 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15353 {
15354 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15355 int dimode_p = GET_MODE (operands[0]) == DImode;
15356 int round_mode = get_attr_i387_cw (insn);
15357
15358 /* Jump through a hoop or two for DImode, since the hardware has no
15359 non-popping instruction. We used to do this a different way, but
15360 that was somewhat fragile and broke with post-reload splitters. */
15361 if ((dimode_p || fisttp) && !stack_top_dies)
15362 output_asm_insn ("fld\t%y1", operands);
15363
15364 gcc_assert (STACK_TOP_P (operands[1]));
15365 gcc_assert (MEM_P (operands[0]));
15366 gcc_assert (GET_MODE (operands[1]) != TFmode);
15367
15368 if (fisttp)
15369 output_asm_insn ("fisttp%Z0\t%0", operands);
15370 else
15371 {
15372 if (round_mode != I387_CW_ANY)
15373 output_asm_insn ("fldcw\t%3", operands);
15374 if (stack_top_dies || dimode_p)
15375 output_asm_insn ("fistp%Z0\t%0", operands);
15376 else
15377 output_asm_insn ("fist%Z0\t%0", operands);
15378 if (round_mode != I387_CW_ANY)
15379 output_asm_insn ("fldcw\t%2", operands);
15380 }
15381
15382 return "";
15383 }
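
/* A hedged example of the emitted sequence: converting an x87 value to a
   DImode integer without SSE3 typically becomes
       fld    %st(0)        (only when the stack top does not die)
       fldcw  new_cw
       fistpq dst           (or "fistpll dst" with older assemblers)
       fldcw  old_cw
   where new_cw/old_cw stand for the stack slots passed as operands 3
   and 2.  */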
15384
15385 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15386 have the values zero or one, indicates the ffreep insn's operand
15387 from the OPERANDS array. */
15388
15389 static const char *
15390 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15391 {
15392 if (TARGET_USE_FFREEP)
15393 #ifdef HAVE_AS_IX86_FFREEP
15394 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15395 #else
15396 {
15397 static char retval[32];
15398 int regno = REGNO (operands[opno]);
15399
15400 gcc_assert (FP_REGNO_P (regno));
15401
15402 regno -= FIRST_STACK_REG;
15403
15404 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15405 return retval;
15406 }
15407 #endif
15408
15409 return opno ? "fstp\t%y1" : "fstp\t%y0";
15410 }
15411
15412
15413 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15414 should be used. UNORDERED_P is true when fucom should be used. */
15415
15416 const char *
15417 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15418 {
15419 int stack_top_dies;
15420 rtx cmp_op0, cmp_op1;
15421 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15422
15423 if (eflags_p)
15424 {
15425 cmp_op0 = operands[0];
15426 cmp_op1 = operands[1];
15427 }
15428 else
15429 {
15430 cmp_op0 = operands[1];
15431 cmp_op1 = operands[2];
15432 }
15433
15434 if (is_sse)
15435 {
15436 if (GET_MODE (operands[0]) == SFmode)
15437 if (unordered_p)
15438 return "%vucomiss\t{%1, %0|%0, %1}";
15439 else
15440 return "%vcomiss\t{%1, %0|%0, %1}";
15441 else
15442 if (unordered_p)
15443 return "%vucomisd\t{%1, %0|%0, %1}";
15444 else
15445 return "%vcomisd\t{%1, %0|%0, %1}";
15446 }
15447
15448 gcc_assert (STACK_TOP_P (cmp_op0));
15449
15450 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15451
15452 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15453 {
15454 if (stack_top_dies)
15455 {
15456 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15457 return output_387_ffreep (operands, 1);
15458 }
15459 else
15460 return "ftst\n\tfnstsw\t%0";
15461 }
15462
15463 if (STACK_REG_P (cmp_op1)
15464 && stack_top_dies
15465 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15466 && REGNO (cmp_op1) != FIRST_STACK_REG)
15467 {
15468       /* If both the top of the 387 stack and the other operand (also a
15469 	 stack register) die, then this must be an
15470 	 `fcompp' float compare. */
15471
15472 if (eflags_p)
15473 {
15474 /* There is no double popping fcomi variant. Fortunately,
15475 eflags is immune from the fstp's cc clobbering. */
15476 if (unordered_p)
15477 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15478 else
15479 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15480 return output_387_ffreep (operands, 0);
15481 }
15482 else
15483 {
15484 if (unordered_p)
15485 return "fucompp\n\tfnstsw\t%0";
15486 else
15487 return "fcompp\n\tfnstsw\t%0";
15488 }
15489 }
15490 else
15491 {
15492 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15493
15494 static const char * const alt[16] =
15495 {
15496 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15497 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15498 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15499 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15500
15501 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15502 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15503 NULL,
15504 NULL,
15505
15506 "fcomi\t{%y1, %0|%0, %y1}",
15507 "fcomip\t{%y1, %0|%0, %y1}",
15508 "fucomi\t{%y1, %0|%0, %y1}",
15509 "fucomip\t{%y1, %0|%0, %y1}",
15510
15511 NULL,
15512 NULL,
15513 NULL,
15514 NULL
15515 };
15516
15517 int mask;
15518 const char *ret;
15519
15520 mask = eflags_p << 3;
15521 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15522 mask |= unordered_p << 1;
15523 mask |= stack_top_dies;
15524
15525 gcc_assert (mask < 16);
15526 ret = alt[mask];
15527 gcc_assert (ret);
15528
15529 return ret;
15530 }
15531 }
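
/* A worked example of the mask encoding used above (illustrative):
   eflags_p = 1, a floating-point cmp_op1, unordered_p = 1 and a dying
   stack top give mask = (1 << 3) | (0 << 2) | (1 << 1) | 1 = 11,
   selecting "fucomip\t{%y1, %0|%0, %y1}" from the alt[] table.  */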
15532
15533 void
15534 ix86_output_addr_vec_elt (FILE *file, int value)
15535 {
15536 const char *directive = ASM_LONG;
15537
15538 #ifdef ASM_QUAD
15539 if (TARGET_LP64)
15540 directive = ASM_QUAD;
15541 #else
15542 gcc_assert (!TARGET_64BIT);
15543 #endif
15544
15545 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15546 }
15547
15548 void
15549 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15550 {
15551 const char *directive = ASM_LONG;
15552
15553 #ifdef ASM_QUAD
15554 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15555 directive = ASM_QUAD;
15556 #else
15557 gcc_assert (!TARGET_64BIT);
15558 #endif
15559 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15560 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15561 fprintf (file, "%s%s%d-%s%d\n",
15562 directive, LPREFIX, value, LPREFIX, rel);
15563 else if (HAVE_AS_GOTOFF_IN_DATA)
15564 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15565 #if TARGET_MACHO
15566 else if (TARGET_MACHO)
15567 {
15568 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15569 machopic_output_function_base_name (file);
15570 putc ('\n', file);
15571 }
15572 #endif
15573 else
15574 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15575 GOT_SYMBOL_NAME, LPREFIX, value);
15576 }
15577 \f
15578 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15579 for the target. */
15580
15581 void
15582 ix86_expand_clear (rtx dest)
15583 {
15584 rtx tmp;
15585
15586 /* We play register width games, which are only valid after reload. */
15587 gcc_assert (reload_completed);
15588
15589 /* Avoid HImode and its attendant prefix byte. */
15590 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15591 dest = gen_rtx_REG (SImode, REGNO (dest));
15592 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15593
15594 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15595 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15596 {
15597 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15598 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15599 }
15600
15601 emit_insn (tmp);
15602 }
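
/* For illustration (register choice hypothetical): when optimizing for speed,
   or when TARGET_USE_MOV0 is not set, the set is wrapped in a parallel with a
   FLAGS_REG clobber so it can be emitted as "xorl %eax, %eax"; otherwise,
   with TARGET_USE_MOV0 and size optimization, a plain "movl $0, %eax" style
   move remains possible.  */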
15603
15604 /* X is an unchanging MEM. If it is a constant pool reference, return
15605 the constant pool rtx, else NULL. */
15606
15607 rtx
15608 maybe_get_pool_constant (rtx x)
15609 {
15610 x = ix86_delegitimize_address (XEXP (x, 0));
15611
15612 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15613 return get_pool_constant (x);
15614
15615 return NULL_RTX;
15616 }
15617
15618 void
15619 ix86_expand_move (enum machine_mode mode, rtx operands[])
15620 {
15621 rtx op0, op1;
15622 enum tls_model model;
15623
15624 op0 = operands[0];
15625 op1 = operands[1];
15626
15627 if (GET_CODE (op1) == SYMBOL_REF)
15628 {
15629 model = SYMBOL_REF_TLS_MODEL (op1);
15630 if (model)
15631 {
15632 op1 = legitimize_tls_address (op1, model, true);
15633 op1 = force_operand (op1, op0);
15634 if (op1 == op0)
15635 return;
15636 if (GET_MODE (op1) != mode)
15637 op1 = convert_to_mode (mode, op1, 1);
15638 }
15639 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15640 && SYMBOL_REF_DLLIMPORT_P (op1))
15641 op1 = legitimize_dllimport_symbol (op1, false);
15642 }
15643 else if (GET_CODE (op1) == CONST
15644 && GET_CODE (XEXP (op1, 0)) == PLUS
15645 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15646 {
15647 rtx addend = XEXP (XEXP (op1, 0), 1);
15648 rtx symbol = XEXP (XEXP (op1, 0), 0);
15649 rtx tmp = NULL;
15650
15651 model = SYMBOL_REF_TLS_MODEL (symbol);
15652 if (model)
15653 tmp = legitimize_tls_address (symbol, model, true);
15654 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15655 && SYMBOL_REF_DLLIMPORT_P (symbol))
15656 tmp = legitimize_dllimport_symbol (symbol, true);
15657
15658 if (tmp)
15659 {
15660 tmp = force_operand (tmp, NULL);
15661 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15662 op0, 1, OPTAB_DIRECT);
15663 if (tmp == op0)
15664 return;
15665 if (GET_MODE (tmp) != mode)
15666 op1 = convert_to_mode (mode, tmp, 1);
15667 }
15668 }
15669
15670 if ((flag_pic || MACHOPIC_INDIRECT)
15671 && symbolic_operand (op1, mode))
15672 {
15673 if (TARGET_MACHO && !TARGET_64BIT)
15674 {
15675 #if TARGET_MACHO
15676 /* dynamic-no-pic */
15677 if (MACHOPIC_INDIRECT)
15678 {
15679 rtx temp = ((reload_in_progress
15680 || ((op0 && REG_P (op0))
15681 && mode == Pmode))
15682 ? op0 : gen_reg_rtx (Pmode));
15683 op1 = machopic_indirect_data_reference (op1, temp);
15684 if (MACHOPIC_PURE)
15685 op1 = machopic_legitimize_pic_address (op1, mode,
15686 temp == op1 ? 0 : temp);
15687 }
15688 if (op0 != op1 && GET_CODE (op0) != MEM)
15689 {
15690 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15691 emit_insn (insn);
15692 return;
15693 }
15694 if (GET_CODE (op0) == MEM)
15695 op1 = force_reg (Pmode, op1);
15696 else
15697 {
15698 rtx temp = op0;
15699 if (GET_CODE (temp) != REG)
15700 temp = gen_reg_rtx (Pmode);
15701 temp = legitimize_pic_address (op1, temp);
15702 if (temp == op0)
15703 return;
15704 op1 = temp;
15705 }
15706 /* dynamic-no-pic */
15707 #endif
15708 }
15709 else
15710 {
15711 if (MEM_P (op0))
15712 op1 = force_reg (mode, op1);
15713 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15714 {
15715 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15716 op1 = legitimize_pic_address (op1, reg);
15717 if (op0 == op1)
15718 return;
15719 if (GET_MODE (op1) != mode)
15720 op1 = convert_to_mode (mode, op1, 1);
15721 }
15722 }
15723 }
15724 else
15725 {
15726 if (MEM_P (op0)
15727 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15728 || !push_operand (op0, mode))
15729 && MEM_P (op1))
15730 op1 = force_reg (mode, op1);
15731
15732 if (push_operand (op0, mode)
15733 && ! general_no_elim_operand (op1, mode))
15734 op1 = copy_to_mode_reg (mode, op1);
15735
15736       /* Force large constants in 64-bit compilation into a register
15737 	 to get them CSEed. */
15738 if (can_create_pseudo_p ()
15739 && (mode == DImode) && TARGET_64BIT
15740 && immediate_operand (op1, mode)
15741 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15742 && !register_operand (op0, mode)
15743 && optimize)
15744 op1 = copy_to_mode_reg (mode, op1);
15745
15746 if (can_create_pseudo_p ()
15747 && FLOAT_MODE_P (mode)
15748 && GET_CODE (op1) == CONST_DOUBLE)
15749 {
15750 /* If we are loading a floating point constant to a register,
15751 force the value to memory now, since we'll get better code
15752 	     out of the back end. */
15753
15754 op1 = validize_mem (force_const_mem (mode, op1));
15755 if (!register_operand (op0, mode))
15756 {
15757 rtx temp = gen_reg_rtx (mode);
15758 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15759 emit_move_insn (op0, temp);
15760 return;
15761 }
15762 }
15763 }
15764
15765 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15766 }
15767
15768 void
15769 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15770 {
15771 rtx op0 = operands[0], op1 = operands[1];
15772 unsigned int align = GET_MODE_ALIGNMENT (mode);
15773
15774 /* Force constants other than zero into memory. We do not know how
15775 the instructions used to build constants modify the upper 64 bits
15776      of the register; once we have that information we may be able
15777 to handle some of them more efficiently. */
15778 if (can_create_pseudo_p ()
15779 && register_operand (op0, mode)
15780 && (CONSTANT_P (op1)
15781 || (GET_CODE (op1) == SUBREG
15782 && CONSTANT_P (SUBREG_REG (op1))))
15783 && !standard_sse_constant_p (op1))
15784 op1 = validize_mem (force_const_mem (mode, op1));
15785
15786   /* We need to check memory alignment for SSE modes since attributes
15787      can make operands unaligned. */
15788 if (can_create_pseudo_p ()
15789 && SSE_REG_MODE_P (mode)
15790 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15791 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15792 {
15793 rtx tmp[2];
15794
15795 /* ix86_expand_vector_move_misalign() does not like constants ... */
15796 if (CONSTANT_P (op1)
15797 || (GET_CODE (op1) == SUBREG
15798 && CONSTANT_P (SUBREG_REG (op1))))
15799 op1 = validize_mem (force_const_mem (mode, op1));
15800
15801 /* ... nor both arguments in memory. */
15802 if (!register_operand (op0, mode)
15803 && !register_operand (op1, mode))
15804 op1 = force_reg (mode, op1);
15805
15806 tmp[0] = op0; tmp[1] = op1;
15807 ix86_expand_vector_move_misalign (mode, tmp);
15808 return;
15809 }
15810
15811 /* Make operand1 a register if it isn't already. */
15812 if (can_create_pseudo_p ()
15813 && !register_operand (op0, mode)
15814 && !register_operand (op1, mode))
15815 {
15816 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15817 return;
15818 }
15819
15820 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15821 }
15822
15823 /* Split 32-byte AVX unaligned load and store if needed. */
15824
15825 static void
15826 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15827 {
15828 rtx m;
15829 rtx (*extract) (rtx, rtx, rtx);
15830 rtx (*move_unaligned) (rtx, rtx);
15831 enum machine_mode mode;
15832
15833 switch (GET_MODE (op0))
15834 {
15835 default:
15836 gcc_unreachable ();
15837 case V32QImode:
15838 extract = gen_avx_vextractf128v32qi;
15839 move_unaligned = gen_avx_movdqu256;
15840 mode = V16QImode;
15841 break;
15842 case V8SFmode:
15843 extract = gen_avx_vextractf128v8sf;
15844 move_unaligned = gen_avx_movups256;
15845 mode = V4SFmode;
15846 break;
15847 case V4DFmode:
15848 extract = gen_avx_vextractf128v4df;
15849 move_unaligned = gen_avx_movupd256;
15850 mode = V2DFmode;
15851 break;
15852 }
15853
15854 if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
15855 {
15856 rtx r = gen_reg_rtx (mode);
15857 m = adjust_address (op1, mode, 0);
15858 emit_move_insn (r, m);
15859 m = adjust_address (op1, mode, 16);
15860 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
15861 emit_move_insn (op0, r);
15862 }
15863 else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
15864 {
15865 m = adjust_address (op0, mode, 0);
15866 emit_insn (extract (m, op1, const0_rtx));
15867 m = adjust_address (op0, mode, 16);
15868 emit_insn (extract (m, op1, const1_rtx));
15869 }
15870 else
15871 emit_insn (move_unaligned (op0, op1));
15872 }
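
/* An illustrative expansion (modes picked for the example): an unaligned
   32-byte V8SFmode load, when TARGET_AVX256_SPLIT_UNALIGNED_LOAD is set,
   becomes two V4SFmode loads from offsets 0 and 16 combined with a
   VEC_CONCAT, and the corresponding store case uses two vextractf128
   halves; otherwise a single unaligned 256-bit move such as vmovups is
   emitted.  */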
15873
15874 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
15875 straight to ix86_expand_vector_move. */
15876 /* Code generation for scalar reg-reg moves of single and double precision data:
15877    if (x86_sse_partial_reg_dependency == true || x86_sse_split_regs == true)
15878 movaps reg, reg
15879 else
15880 movss reg, reg
15881 if (x86_sse_partial_reg_dependency == true)
15882 movapd reg, reg
15883 else
15884 movsd reg, reg
15885
15886 Code generation for scalar loads of double precision data:
15887 if (x86_sse_split_regs == true)
15888 movlpd mem, reg (gas syntax)
15889 else
15890 movsd mem, reg
15891
15892 Code generation for unaligned packed loads of single precision data
15893 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
15894 if (x86_sse_unaligned_move_optimal)
15895 movups mem, reg
15896
15897 if (x86_sse_partial_reg_dependency == true)
15898 {
15899 xorps reg, reg
15900 movlps mem, reg
15901 movhps mem+8, reg
15902 }
15903 else
15904 {
15905 movlps mem, reg
15906 movhps mem+8, reg
15907 }
15908
15909 Code generation for unaligned packed loads of double precision data
15910 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
15911 if (x86_sse_unaligned_move_optimal)
15912 movupd mem, reg
15913
15914 if (x86_sse_split_regs == true)
15915 {
15916 movlpd mem, reg
15917 movhpd mem+8, reg
15918 }
15919 else
15920 {
15921 movsd mem, reg
15922 movhpd mem+8, reg
15923 }
15924 */
15925
15926 void
15927 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
15928 {
15929 rtx op0, op1, m;
15930
15931 op0 = operands[0];
15932 op1 = operands[1];
15933
15934 if (TARGET_AVX
15935 && GET_MODE_SIZE (mode) == 32)
15936 {
15937 switch (GET_MODE_CLASS (mode))
15938 {
15939 case MODE_VECTOR_INT:
15940 case MODE_INT:
15941 op0 = gen_lowpart (V32QImode, op0);
15942 op1 = gen_lowpart (V32QImode, op1);
15943 /* FALLTHRU */
15944
15945 case MODE_VECTOR_FLOAT:
15946 ix86_avx256_split_vector_move_misalign (op0, op1);
15947 break;
15948
15949 default:
15950 gcc_unreachable ();
15951 }
15952
15953 return;
15954 }
15955
15956 if (MEM_P (op1))
15957 {
15958 /* ??? If we have typed data, then it would appear that using
15959 movdqu is the only way to get unaligned data loaded with
15960 integer type. */
15961 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15962 {
15963 op0 = gen_lowpart (V16QImode, op0);
15964 op1 = gen_lowpart (V16QImode, op1);
15965 /* We will eventually emit movups based on insn attributes. */
15966 emit_insn (gen_sse2_movdqu (op0, op1));
15967 }
15968 else if (TARGET_SSE2 && mode == V2DFmode)
15969 {
15970 rtx zero;
15971
15972 if (TARGET_AVX
15973 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
15974 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
15975 || optimize_function_for_size_p (cfun))
15976 {
15977 /* We will eventually emit movups based on insn attributes. */
15978 emit_insn (gen_sse2_movupd (op0, op1));
15979 return;
15980 }
15981
15982 /* When SSE registers are split into halves, we can avoid
15983 writing to the top half twice. */
15984 if (TARGET_SSE_SPLIT_REGS)
15985 {
15986 emit_clobber (op0);
15987 zero = op0;
15988 }
15989 else
15990 {
15991 /* ??? Not sure about the best option for the Intel chips.
15992 The following would seem to satisfy; the register is
15993 entirely cleared, breaking the dependency chain. We
15994 then store to the upper half, with a dependency depth
15995 of one. A rumor has it that Intel recommends two movsd
15996 followed by an unpacklpd, but this is unconfirmed. And
15997 given that the dependency depth of the unpacklpd would
15998 still be one, I'm not sure why this would be better. */
15999 zero = CONST0_RTX (V2DFmode);
16000 }
16001
16002 m = adjust_address (op1, DFmode, 0);
16003 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16004 m = adjust_address (op1, DFmode, 8);
16005 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16006 }
16007 else
16008 {
16009 if (TARGET_AVX
16010 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16011 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16012 || optimize_function_for_size_p (cfun))
16013 {
16014 op0 = gen_lowpart (V4SFmode, op0);
16015 op1 = gen_lowpart (V4SFmode, op1);
16016 emit_insn (gen_sse_movups (op0, op1));
16017 return;
16018 }
16019
16020 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16021 emit_move_insn (op0, CONST0_RTX (mode));
16022 else
16023 emit_clobber (op0);
16024
16025 if (mode != V4SFmode)
16026 op0 = gen_lowpart (V4SFmode, op0);
16027
16028 m = adjust_address (op1, V2SFmode, 0);
16029 emit_insn (gen_sse_loadlps (op0, op0, m));
16030 m = adjust_address (op1, V2SFmode, 8);
16031 emit_insn (gen_sse_loadhps (op0, op0, m));
16032 }
16033 }
16034 else if (MEM_P (op0))
16035 {
16036 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16037 {
16038 op0 = gen_lowpart (V16QImode, op0);
16039 op1 = gen_lowpart (V16QImode, op1);
16040 /* We will eventually emit movups based on insn attributes. */
16041 emit_insn (gen_sse2_movdqu (op0, op1));
16042 }
16043 else if (TARGET_SSE2 && mode == V2DFmode)
16044 {
16045 if (TARGET_AVX
16046 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16047 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16048 || optimize_function_for_size_p (cfun))
16049 /* We will eventually emit movups based on insn attributes. */
16050 emit_insn (gen_sse2_movupd (op0, op1));
16051 else
16052 {
16053 m = adjust_address (op0, DFmode, 0);
16054 emit_insn (gen_sse2_storelpd (m, op1));
16055 m = adjust_address (op0, DFmode, 8);
16056 emit_insn (gen_sse2_storehpd (m, op1));
16057 }
16058 }
16059 else
16060 {
16061 if (mode != V4SFmode)
16062 op1 = gen_lowpart (V4SFmode, op1);
16063
16064 if (TARGET_AVX
16065 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16066 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16067 || optimize_function_for_size_p (cfun))
16068 {
16069 op0 = gen_lowpart (V4SFmode, op0);
16070 emit_insn (gen_sse_movups (op0, op1));
16071 }
16072 else
16073 {
16074 m = adjust_address (op0, V2SFmode, 0);
16075 emit_insn (gen_sse_storelps (m, op1));
16076 m = adjust_address (op0, V2SFmode, 8);
16077 emit_insn (gen_sse_storehps (m, op1));
16078 }
16079 }
16080 }
16081 else
16082 gcc_unreachable ();
16083 }
16084
16085 /* Expand a push in MODE. This is some mode for which we do not support
16086 proper push instructions, at least from the registers that we expect
16087 the value to live in. */
16088
16089 void
16090 ix86_expand_push (enum machine_mode mode, rtx x)
16091 {
16092 rtx tmp;
16093
16094 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16095 GEN_INT (-GET_MODE_SIZE (mode)),
16096 stack_pointer_rtx, 1, OPTAB_DIRECT);
16097 if (tmp != stack_pointer_rtx)
16098 emit_move_insn (stack_pointer_rtx, tmp);
16099
16100 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16101
16102   /* When we push an operand onto the stack, it has to be aligned at least
16103      at the function argument boundary. However, since we don't have
16104      the argument type, we can't determine the actual argument
16105      boundary. */
16106 emit_move_insn (tmp, x);
16107 }
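
/* A hedged sketch of the resulting code: pushing, say, an XFmode value on a
   32-bit target becomes an explicit stack adjustment followed by a store
   into the freshly allocated slot, roughly
       subl  $12, %esp
       ...store the value to (%esp)...
   rather than a real push instruction.  */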
16108
16109 /* Helper function of ix86_fixup_binary_operands to canonicalize
16110 operand order. Returns true if the operands should be swapped. */
16111
16112 static bool
16113 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16114 rtx operands[])
16115 {
16116 rtx dst = operands[0];
16117 rtx src1 = operands[1];
16118 rtx src2 = operands[2];
16119
16120 /* If the operation is not commutative, we can't do anything. */
16121 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16122 return false;
16123
16124 /* Highest priority is that src1 should match dst. */
16125 if (rtx_equal_p (dst, src1))
16126 return false;
16127 if (rtx_equal_p (dst, src2))
16128 return true;
16129
16130 /* Next highest priority is that immediate constants come second. */
16131 if (immediate_operand (src2, mode))
16132 return false;
16133 if (immediate_operand (src1, mode))
16134 return true;
16135
16136 /* Lowest priority is that memory references should come second. */
16137 if (MEM_P (src2))
16138 return false;
16139 if (MEM_P (src1))
16140 return true;
16141
16142 return false;
16143 }
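
/* Hedged examples of the priorities above: for a commutative PLUS where dst
   equals src2 (e.g. "a = b + a") the operands are swapped so that src1
   matches dst; an immediate or memory src1 is likewise moved into the second
   position when src2 is neither.  */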
16144
16145
16146 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16147 destination to use for the operation. If different from the true
16148 destination in operands[0], a copy operation will be required. */
16149
16150 rtx
16151 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16152 rtx operands[])
16153 {
16154 rtx dst = operands[0];
16155 rtx src1 = operands[1];
16156 rtx src2 = operands[2];
16157
16158 /* Canonicalize operand order. */
16159 if (ix86_swap_binary_operands_p (code, mode, operands))
16160 {
16161 rtx temp;
16162
16163 /* It is invalid to swap operands of different modes. */
16164 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16165
16166 temp = src1;
16167 src1 = src2;
16168 src2 = temp;
16169 }
16170
16171 /* Both source operands cannot be in memory. */
16172 if (MEM_P (src1) && MEM_P (src2))
16173 {
16174 /* Optimization: Only read from memory once. */
16175 if (rtx_equal_p (src1, src2))
16176 {
16177 src2 = force_reg (mode, src2);
16178 src1 = src2;
16179 }
16180 else
16181 src2 = force_reg (mode, src2);
16182 }
16183
16184 /* If the destination is memory, and we do not have matching source
16185 operands, do things in registers. */
16186 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16187 dst = gen_reg_rtx (mode);
16188
16189 /* Source 1 cannot be a constant. */
16190 if (CONSTANT_P (src1))
16191 src1 = force_reg (mode, src1);
16192
16193 /* Source 1 cannot be a non-matching memory. */
16194 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16195 src1 = force_reg (mode, src1);
16196
16197 /* Improve address combine. */
16198 if (code == PLUS
16199 && GET_MODE_CLASS (mode) == MODE_INT
16200 && MEM_P (src2))
16201 src2 = force_reg (mode, src2);
16202
16203 operands[1] = src1;
16204 operands[2] = src2;
16205 return dst;
16206 }
16207
16208 /* Similarly, but assume that the destination has already been
16209 set up properly. */
16210
16211 void
16212 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16213 enum machine_mode mode, rtx operands[])
16214 {
16215 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16216 gcc_assert (dst == operands[0]);
16217 }
16218
16219 /* Attempt to expand a binary operator. Make the expansion closer to the
16220    actual machine than just general_operand, which would allow 3 separate
16221    memory references (one output, two input) in a single insn. */
16222
16223 void
16224 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16225 rtx operands[])
16226 {
16227 rtx src1, src2, dst, op, clob;
16228
16229 dst = ix86_fixup_binary_operands (code, mode, operands);
16230 src1 = operands[1];
16231 src2 = operands[2];
16232
16233 /* Emit the instruction. */
16234
16235 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16236 if (reload_in_progress)
16237 {
16238 /* Reload doesn't know about the flags register, and doesn't know that
16239 it doesn't want to clobber it. We can only do this with PLUS. */
16240 gcc_assert (code == PLUS);
16241 emit_insn (op);
16242 }
16243 else if (reload_completed
16244 && code == PLUS
16245 && !rtx_equal_p (dst, src1))
16246 {
16247 /* This is going to be an LEA; avoid splitting it later. */
16248 emit_insn (op);
16249 }
16250 else
16251 {
16252 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16253 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16254 }
16255
16256 /* Fix up the destination if needed. */
16257 if (dst != operands[0])
16258 emit_move_insn (operands[0], dst);
16259 }
16260
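/* Sketch of the RTL normally emitted above (illustrative operand choices;
   register numbers are i386 hard regs, 17 being the flags register):

     (parallel [(set (reg:SI 0 ax)
                     (plus:SI (reg:SI 0 ax) (reg:SI 1 dx)))
                (clobber (reg:CC 17 flags))])

   The FLAGS_REG clobber is omitted only in the reload_in_progress and
   post-reload LEA cases handled above.  */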
16261 /* Return TRUE or FALSE depending on whether the binary operator meets the
16262 appropriate constraints. */
16263
16264 bool
16265 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16266 rtx operands[3])
16267 {
16268 rtx dst = operands[0];
16269 rtx src1 = operands[1];
16270 rtx src2 = operands[2];
16271
16272 /* Both source operands cannot be in memory. */
16273 if (MEM_P (src1) && MEM_P (src2))
16274 return false;
16275
16276 /* Canonicalize operand order for commutative operators. */
16277 if (ix86_swap_binary_operands_p (code, mode, operands))
16278 {
16279 rtx temp = src1;
16280 src1 = src2;
16281 src2 = temp;
16282 }
16283
16284 /* If the destination is memory, we must have a matching source operand. */
16285 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16286 return false;
16287
16288 /* Source 1 cannot be a constant. */
16289 if (CONSTANT_P (src1))
16290 return false;
16291
16292 /* Source 1 cannot be a non-matching memory. */
16293 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16294 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16295 return (code == AND
16296 && (mode == HImode
16297 || mode == SImode
16298 || (TARGET_64BIT && mode == DImode))
16299 && satisfies_constraint_L (src2));
16300
16301 return true;
16302 }
16303
16304 /* Attempt to expand a unary operator. Make the expansion closer to the
16305 actual machine than just general_operand, which will allow 2 separate
16306 memory references (one output, one input) in a single insn. */
16307
16308 void
16309 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16310 rtx operands[])
16311 {
16312 int matching_memory;
16313 rtx src, dst, op, clob;
16314
16315 dst = operands[0];
16316 src = operands[1];
16317
16318 /* If the destination is memory, and we do not have matching source
16319 operands, do things in registers. */
16320 matching_memory = 0;
16321 if (MEM_P (dst))
16322 {
16323 if (rtx_equal_p (dst, src))
16324 matching_memory = 1;
16325 else
16326 dst = gen_reg_rtx (mode);
16327 }
16328
16329 /* When source operand is memory, destination must match. */
16330 if (MEM_P (src) && !matching_memory)
16331 src = force_reg (mode, src);
16332
16333 /* Emit the instruction. */
16334
16335 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16336 if (reload_in_progress || code == NOT)
16337 {
16338 /* Reload doesn't know about the flags register, and doesn't know that
16339 it doesn't want to clobber it. */
16340 gcc_assert (code == NOT);
16341 emit_insn (op);
16342 }
16343 else
16344 {
16345 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16346 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16347 }
16348
16349 /* Fix up the destination if needed. */
16350 if (dst != operands[0])
16351 emit_move_insn (operands[0], dst);
16352 }
16353
16354 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16355 divisor are within the range [0-255]. */
16356
16357 void
16358 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16359 bool signed_p)
16360 {
16361 rtx end_label, qimode_label;
16362 rtx insn, div, mod;
16363 rtx scratch, tmp0, tmp1, tmp2;
16364 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16365 rtx (*gen_zero_extend) (rtx, rtx);
16366 rtx (*gen_test_ccno_1) (rtx, rtx);
16367
16368 switch (mode)
16369 {
16370 case SImode:
16371 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16372 gen_test_ccno_1 = gen_testsi_ccno_1;
16373 gen_zero_extend = gen_zero_extendqisi2;
16374 break;
16375 case DImode:
16376 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16377 gen_test_ccno_1 = gen_testdi_ccno_1;
16378 gen_zero_extend = gen_zero_extendqidi2;
16379 break;
16380 default:
16381 gcc_unreachable ();
16382 }
16383
16384 end_label = gen_label_rtx ();
16385 qimode_label = gen_label_rtx ();
16386
16387 scratch = gen_reg_rtx (mode);
16388
16389 /* Use 8bit unsigned divmod if dividend and divisor are within
16390 the range [0-255]. */
16391 emit_move_insn (scratch, operands[2]);
16392 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16393 scratch, 1, OPTAB_DIRECT);
16394 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16395 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16396 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16397 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16398 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16399 pc_rtx);
16400 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16401 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16402 JUMP_LABEL (insn) = qimode_label;
16403
16404 /* Generate original signed/unsigned divmod. */
16405 div = gen_divmod4_1 (operands[0], operands[1],
16406 operands[2], operands[3]);
16407 emit_insn (div);
16408
16409 /* Branch to the end. */
16410 emit_jump_insn (gen_jump (end_label));
16411 emit_barrier ();
16412
16413 /* Generate 8bit unsigned divide. */
16414 emit_label (qimode_label);
16415 /* Don't use operands[0] for result of 8bit divide since not all
16416 registers support QImode ZERO_EXTRACT. */
16417 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16418 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16419 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16420 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16421
16422 if (signed_p)
16423 {
16424 div = gen_rtx_DIV (mode, operands[2], operands[3]);
16425 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
16426 }
16427 else
16428 {
16429 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
16430 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
16431 }
16432
16433 /* Extract remainder from AH. */
16434 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16435 if (REG_P (operands[1]))
16436 insn = emit_move_insn (operands[1], tmp1);
16437 else
16438 {
16439 /* Need a new scratch register since the old one has result
16440 of 8bit divide. */
16441 scratch = gen_reg_rtx (mode);
16442 emit_move_insn (scratch, tmp1);
16443 insn = emit_move_insn (operands[1], scratch);
16444 }
16445 set_unique_reg_note (insn, REG_EQUAL, mod);
16446
16447 /* Zero extend quotient from AL. */
16448 tmp1 = gen_lowpart (QImode, tmp0);
16449 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16450 set_unique_reg_note (insn, REG_EQUAL, div);
16451
16452 emit_label (end_label);
16453 }
16454
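/* Worked example (illustrative): for dividend 200 and divisor 7,
   (200 | 7) & -0x100 == 0, so the conditional jump above is taken to
   qimode_label; the 8-bit divide then leaves the quotient 28 in the low
   byte (AL) and the remainder 4 in the high byte (AH) of the HImode
   scratch, which the zero_extract/zero_extend code copies into the
   result operands.  */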
16455 #define LEA_MAX_STALL (3)
16456 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16457
16458 /* Increase given DISTANCE in half-cycles according to
16459 dependencies between PREV and NEXT instructions.
16460 Add 1 half-cycle if there is no dependency and
16461 go to the next cycle if there is some dependency. */
16462
16463 static unsigned int
16464 increase_distance (rtx prev, rtx next, unsigned int distance)
16465 {
16466 df_ref *use_rec;
16467 df_ref *def_rec;
16468
16469 if (!prev || !next)
16470 return distance + (distance & 1) + 2;
16471
16472 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16473 return distance + 1;
16474
16475 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16476 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16477 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16478 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16479 return distance + (distance & 1) + 2;
16480
16481 return distance + 1;
16482 }
16483
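/* Illustrative accounting: with no dependency between PREV and NEXT the
   distance grows by one half-cycle; when NEXT reads a register that PREV
   defines, "distance + (distance & 1) + 2" rounds the distance up to the
   next full-cycle boundary and then adds one more cycle.  */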
16484 /* Function checks if instruction INSN defines register number
16485 REGNO1 or REGNO2. */
16486
16487 static bool
16488 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16489 rtx insn)
16490 {
16491 df_ref *def_rec;
16492
16493 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16494 if (DF_REF_REG_DEF_P (*def_rec)
16495 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16496 && (regno1 == DF_REF_REGNO (*def_rec)
16497 || regno2 == DF_REF_REGNO (*def_rec)))
16498 {
16499 return true;
16500 }
16501
16502 return false;
16503 }
16504
16505 /* Function checks if instruction INSN uses register number
16506 REGNO as a part of address expression. */
16507
16508 static bool
16509 insn_uses_reg_mem (unsigned int regno, rtx insn)
16510 {
16511 df_ref *use_rec;
16512
16513 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16514 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16515 return true;
16516
16517 return false;
16518 }
16519
16520 /* Search backward for non-agu definition of register number REGNO1
16521 or register number REGNO2 in basic block starting from instruction
16522 START up to head of basic block or instruction INSN.
16523
16524 Function puts true value into *FOUND var if definition was found
16525 and false otherwise.
16526
16527 Distance in half-cycles between START and found instruction or head
16528 of BB is added to DISTANCE and returned. */
16529
16530 static int
16531 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16532 rtx insn, int distance,
16533 rtx start, bool *found)
16534 {
16535 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16536 rtx prev = start;
16537 rtx next = NULL;
16538
16539 *found = false;
16540
16541 while (prev
16542 && prev != insn
16543 && distance < LEA_SEARCH_THRESHOLD)
16544 {
16545 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16546 {
16547 distance = increase_distance (prev, next, distance);
16548 if (insn_defines_reg (regno1, regno2, prev))
16549 {
16550 if (recog_memoized (prev) < 0
16551 || get_attr_type (prev) != TYPE_LEA)
16552 {
16553 *found = true;
16554 return distance;
16555 }
16556 }
16557
16558 next = prev;
16559 }
16560 if (prev == BB_HEAD (bb))
16561 break;
16562
16563 prev = PREV_INSN (prev);
16564 }
16565
16566 return distance;
16567 }
16568
16569 /* Search backward for non-agu definition of register number REGNO1
16570 or register number REGNO2 in INSN's basic block until
16571 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16572 2. Reach neighbour BBs boundary, or
16573 3. Reach agu definition.
16574 Returns the distance between the non-agu definition point and INSN.
16575 If no definition point, returns -1. */
16576
16577 static int
16578 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16579 rtx insn)
16580 {
16581 basic_block bb = BLOCK_FOR_INSN (insn);
16582 int distance = 0;
16583 bool found = false;
16584
16585 if (insn != BB_HEAD (bb))
16586 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16587 distance, PREV_INSN (insn),
16588 &found);
16589
16590 if (!found && distance < LEA_SEARCH_THRESHOLD)
16591 {
16592 edge e;
16593 edge_iterator ei;
16594 bool simple_loop = false;
16595
16596 FOR_EACH_EDGE (e, ei, bb->preds)
16597 if (e->src == bb)
16598 {
16599 simple_loop = true;
16600 break;
16601 }
16602
16603 if (simple_loop)
16604 distance = distance_non_agu_define_in_bb (regno1, regno2,
16605 insn, distance,
16606 BB_END (bb), &found);
16607 else
16608 {
16609 int shortest_dist = -1;
16610 bool found_in_bb = false;
16611
16612 FOR_EACH_EDGE (e, ei, bb->preds)
16613 {
16614 int bb_dist
16615 = distance_non_agu_define_in_bb (regno1, regno2,
16616 insn, distance,
16617 BB_END (e->src),
16618 &found_in_bb);
16619 if (found_in_bb)
16620 {
16621 if (shortest_dist < 0)
16622 shortest_dist = bb_dist;
16623 else if (bb_dist > 0)
16624 shortest_dist = MIN (bb_dist, shortest_dist);
16625
16626 found = true;
16627 }
16628 }
16629
16630 distance = shortest_dist;
16631 }
16632 }
16633
16634 /* get_attr_type may modify recog data. We want to make sure
16635 that recog data is valid for instruction INSN, on which
16636 distance_non_agu_define is called. INSN is unchanged here. */
16637 extract_insn_cached (insn);
16638
16639 if (!found)
16640 return -1;
16641
16642 return distance >> 1;
16643 }
16644
16645 /* Return the distance in half-cycles between INSN and the next
16646 insn that uses register number REGNO in memory address added
16647 to DISTANCE. Return -1 if REGNO is set.
16648
16649 Put true value into *FOUND if register usage was found and
16650 false otherwise.
16651 Put true value into *REDEFINED if register redefinition was
16652 found and false otherwise. */
16653
16654 static int
16655 distance_agu_use_in_bb (unsigned int regno,
16656 rtx insn, int distance, rtx start,
16657 bool *found, bool *redefined)
16658 {
16659 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16660 rtx next = start;
16661 rtx prev = NULL;
16662
16663 *found = false;
16664 *redefined = false;
16665
16666 while (next
16667 && next != insn
16668 && distance < LEA_SEARCH_THRESHOLD)
16669 {
16670 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16671 {
16672 distance = increase_distance (prev, next, distance);
16673 if (insn_uses_reg_mem (regno, next))
16674 {
16675 /* Return DISTANCE if OP0 is used in memory
16676 address in NEXT. */
16677 *found = true;
16678 return distance;
16679 }
16680
16681 if (insn_defines_reg (regno, INVALID_REGNUM, next))
16682 {
16683 /* Return -1 if OP0 is set in NEXT. */
16684 *redefined = true;
16685 return -1;
16686 }
16687
16688 prev = next;
16689 }
16690
16691 if (next == BB_END (bb))
16692 break;
16693
16694 next = NEXT_INSN (next);
16695 }
16696
16697 return distance;
16698 }
16699
16700 /* Return the distance between INSN and the next insn that uses
16701 register number REGNO0 in memory address. Return -1 if no such
16702 a use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
16703
16704 static int
16705 distance_agu_use (unsigned int regno0, rtx insn)
16706 {
16707 basic_block bb = BLOCK_FOR_INSN (insn);
16708 int distance = 0;
16709 bool found = false;
16710 bool redefined = false;
16711
16712 if (insn != BB_END (bb))
16713 distance = distance_agu_use_in_bb (regno0, insn, distance,
16714 NEXT_INSN (insn),
16715 &found, &redefined);
16716
16717 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16718 {
16719 edge e;
16720 edge_iterator ei;
16721 bool simple_loop = false;
16722
16723 FOR_EACH_EDGE (e, ei, bb->succs)
16724 if (e->dest == bb)
16725 {
16726 simple_loop = true;
16727 break;
16728 }
16729
16730 if (simple_loop)
16731 distance = distance_agu_use_in_bb (regno0, insn,
16732 distance, BB_HEAD (bb),
16733 &found, &redefined);
16734 else
16735 {
16736 int shortest_dist = -1;
16737 bool found_in_bb = false;
16738 bool redefined_in_bb = false;
16739
16740 FOR_EACH_EDGE (e, ei, bb->succs)
16741 {
16742 int bb_dist
16743 = distance_agu_use_in_bb (regno0, insn,
16744 distance, BB_HEAD (e->dest),
16745 &found_in_bb, &redefined_in_bb);
16746 if (found_in_bb)
16747 {
16748 if (shortest_dist < 0)
16749 shortest_dist = bb_dist;
16750 else if (bb_dist > 0)
16751 shortest_dist = MIN (bb_dist, shortest_dist);
16752
16753 found = true;
16754 }
16755 }
16756
16757 distance = shortest_dist;
16758 }
16759 }
16760
16761 if (!found || redefined)
16762 return -1;
16763
16764 return distance >> 1;
16765 }
16766
16767 /* Define this macro to tune LEA priority vs ADD; it takes effect when
16768 there is a dilemma of choosing LEA or ADD.
16769 Negative value: ADD is preferred over LEA
16770 Zero: Neutral
16771 Positive value: LEA is preferred over ADD. */
16772 #define IX86_LEA_PRIORITY 0
16773
16774 /* Return true if usage of lea INSN has performance advantage
16775 over a sequence of instructions. The instruction sequence has
16776 SPLIT_COST cycles higher latency than the lea. */
16777
16778 bool
16779 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
16780 unsigned int regno2, unsigned int split_cost)
16781 {
16782 int dist_define, dist_use;
16783
16784 dist_define = distance_non_agu_define (regno1, regno2, insn);
16785 dist_use = distance_agu_use (regno0, insn);
16786
16787 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
16788 {
16789 /* If there is no non-AGU operand definition, no AGU
16790 operand usage and the split cost is 0, then both the lea
16791 and non-lea variants have the same priority. Currently
16792 we prefer lea for 64-bit code and non-lea for 32-bit
16793 code. */
16794 if (dist_use < 0 && split_cost == 0)
16795 return TARGET_64BIT || IX86_LEA_PRIORITY;
16796 else
16797 return true;
16798 }
16799
16800 /* With a longer definition distance, lea becomes preferable.
16801 Here we adjust it to take into account the splitting cost and
16802 lea priority. */
16803 dist_define += split_cost + IX86_LEA_PRIORITY;
16804
16805 /* If there is no use in a memory address then we just check
16806 that split cost does not exceed AGU stall. */
16807 if (dist_use < 0)
16808 return dist_define >= LEA_MAX_STALL;
16809
16810 /* If this insn has both backward non-agu dependence and forward
16811 agu dependence, the one with short distance takes effect. */
16812 return dist_define >= dist_use;
16813 }
16814
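/* Illustrative example of the trade-off (assumed Atom-style AGU/ALU
   split): if the address register was produced by an ALU instruction
   only a cycle or two earlier (small dist_define) and the lea result is
   not consumed by a memory address soon (dist_use < 0), the AGU stall
   outweighs SPLIT_COST and this function returns false, so the caller
   splits the lea into mov/add/shift instructions instead.  */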
16815 /* Return true if it is legal to clobber flags by INSN and
16816 false otherwise. */
16817
16818 static bool
16819 ix86_ok_to_clobber_flags (rtx insn)
16820 {
16821 basic_block bb = BLOCK_FOR_INSN (insn);
16822 df_ref *use;
16823 bitmap live;
16824
16825 while (insn)
16826 {
16827 if (NONDEBUG_INSN_P (insn))
16828 {
16829 for (use = DF_INSN_USES (insn); *use; use++)
16830 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
16831 return false;
16832
16833 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
16834 return true;
16835 }
16836
16837 if (insn == BB_END (bb))
16838 break;
16839
16840 insn = NEXT_INSN (insn);
16841 }
16842
16843 live = df_get_live_out (bb);
16844 return !REGNO_REG_SET_P (live, FLAGS_REG);
16845 }
16846
16847 /* Return true if we need to split op0 = op1 + op2 into a sequence of
16848 move and add to avoid AGU stalls. */
16849
16850 bool
16851 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
16852 {
16853 unsigned int regno0 = true_regnum (operands[0]);
16854 unsigned int regno1 = true_regnum (operands[1]);
16855 unsigned int regno2 = true_regnum (operands[2]);
16856
16857 /* Check if we need to optimize. */
16858 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16859 return false;
16860
16861 /* Check it is correct to split here. */
16862 if (!ix86_ok_to_clobber_flags (insn))
16863 return false;
16864
16865 /* We need to split only adds with a non-destructive
16866 destination operand. */
16867 if (regno0 == regno1 || regno0 == regno2)
16868 return false;
16869 else
16870 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
16871 }
16872
16873 /* Return true if we should emit lea instruction instead of mov
16874 instruction. */
16875
16876 bool
16877 ix86_use_lea_for_mov (rtx insn, rtx operands[])
16878 {
16879 unsigned int regno0;
16880 unsigned int regno1;
16881
16882 /* Check if we need to optimize. */
16883 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16884 return false;
16885
16886 /* Use lea for reg to reg moves only. */
16887 if (!REG_P (operands[0]) || !REG_P (operands[1]))
16888 return false;
16889
16890 regno0 = true_regnum (operands[0]);
16891 regno1 = true_regnum (operands[1]);
16892
16893 return ix86_lea_outperforms (insn, regno0, regno1, -1, 0);
16894 }
16895
16896 /* Return true if we need to split lea into a sequence of
16897 instructions to avoid AGU stalls. */
16898
16899 bool
16900 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
16901 {
16902 unsigned int regno0 = true_regnum (operands[0]);
16903 unsigned int regno1 = -1;
16904 unsigned int regno2 = -1;
16905 unsigned int split_cost = 0;
16906 struct ix86_address parts;
16907 int ok;
16908
16909 /* Check we need to optimize. */
16910 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16911 return false;
16912
16913 /* Check it is correct to split here. */
16914 if (!ix86_ok_to_clobber_flags (insn))
16915 return false;
16916
16917 ok = ix86_decompose_address (operands[1], &parts);
16918 gcc_assert (ok);
16919
16920 /* We should not split into adds if a non-legitimate PIC
16921 operand is used as the displacement. */
16922 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
16923 return false;
16924
16925 if (parts.base)
16926 regno1 = true_regnum (parts.base);
16927 if (parts.index)
16928 regno2 = true_regnum (parts.index);
16929
16930 /* Compute how many cycles we will add to execution time
16931 if we split the lea into a sequence of instructions. */
16932 if (parts.base || parts.index)
16933 {
16934 /* Have to use a mov instruction if the non-destructive
16935 destination form is used. */
16936 if (regno1 != regno0 && regno2 != regno0)
16937 split_cost += 1;
16938
16939 /* Have to add index to base if both exist. */
16940 if (parts.base && parts.index)
16941 split_cost += 1;
16942
16943 /* Have to use shift and adds if scale is 2 or greater. */
16944 if (parts.scale > 1)
16945 {
16946 if (regno0 != regno1)
16947 split_cost += 1;
16948 else if (regno2 == regno0)
16949 split_cost += 4;
16950 else
16951 split_cost += parts.scale;
16952 }
16953
16954 /* Have to use an add instruction with an immediate if
16955 disp is nonzero. */
16956 if (parts.disp && parts.disp != const0_rtx)
16957 split_cost += 1;
16958
16959 /* Subtract the price of lea. */
16960 split_cost -= 1;
16961 }
16962
16963 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
16964 }
16965
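/* Worked cost example (illustrative): for "lea 4(%ebx,%ecx,2), %eax"
   the split needs a mov of the index (+1), an add of the base (+1),
   a shift for the scale (+1) and an add of the displacement (+1),
   minus the lea being replaced (-1), giving split_cost = 3; the lea
   is kept unless the expected AGU stall exceeds those three extra
   cycles.  */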
16966 /* Emit x86 binary operand CODE in mode MODE, where the first operand
16967 matches destination. RTX includes clobber of FLAGS_REG. */
16968
16969 static void
16970 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
16971 rtx dst, rtx src)
16972 {
16973 rtx op, clob;
16974
16975 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
16976 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16977
16978 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16979 }
16980
16981 /* Split lea instructions into a sequence of instructions
16982 which are executed on ALU to avoid AGU stalls.
16983 It is assumed that it is allowed to clobber flags register
16984 at lea position. */
16985
16986 void
16987 ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
16988 {
16989 unsigned int regno0 = true_regnum (operands[0]);
16990 unsigned int regno1 = INVALID_REGNUM;
16991 unsigned int regno2 = INVALID_REGNUM;
16992 struct ix86_address parts;
16993 rtx tmp;
16994 int ok, adds;
16995
16996 ok = ix86_decompose_address (operands[1], &parts);
16997 gcc_assert (ok);
16998
16999 if (parts.base)
17000 {
17001 if (GET_MODE (parts.base) != mode)
17002 parts.base = gen_rtx_SUBREG (mode, parts.base, 0);
17003 regno1 = true_regnum (parts.base);
17004 }
17005
17006 if (parts.index)
17007 {
17008 if (GET_MODE (parts.index) != mode)
17009 parts.index = gen_rtx_SUBREG (mode, parts.index, 0);
17010 regno2 = true_regnum (parts.index);
17011 }
17012
17013 if (parts.scale > 1)
17014 {
17015 /* Case r1 = r1 + ... */
17016 if (regno1 == regno0)
17017 {
17018 /* If we have a case r1 = r1 + C * r1 then we
17019 should use multiplication which is very
17020 expensive. Assume the cost model is wrong if we
17021 hit such a case here. */
17022 gcc_assert (regno2 != regno0);
17023
17024 for (adds = parts.scale; adds > 0; adds--)
17025 ix86_emit_binop (PLUS, mode, operands[0], parts.index);
17026 }
17027 else
17028 {
17029 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17030 if (regno0 != regno2)
17031 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
17032
17033 /* Use shift for scaling. */
17034 ix86_emit_binop (ASHIFT, mode, operands[0],
17035 GEN_INT (exact_log2 (parts.scale)));
17036
17037 if (parts.base)
17038 ix86_emit_binop (PLUS, mode, operands[0], parts.base);
17039
17040 if (parts.disp && parts.disp != const0_rtx)
17041 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
17042 }
17043 }
17044 else if (!parts.base && !parts.index)
17045 {
17046 gcc_assert (parts.disp);
17047 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.disp));
17048 }
17049 else
17050 {
17051 if (!parts.base)
17052 {
17053 if (regno0 != regno2)
17054 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
17055 }
17056 else if (!parts.index)
17057 {
17058 if (regno0 != regno1)
17059 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
17060 }
17061 else
17062 {
17063 if (regno0 == regno1)
17064 tmp = parts.index;
17065 else if (regno0 == regno2)
17066 tmp = parts.base;
17067 else
17068 {
17069 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
17070 tmp = parts.index;
17071 }
17072
17073 ix86_emit_binop (PLUS, mode, operands[0], tmp);
17074 }
17075
17076 if (parts.disp && parts.disp != const0_rtx)
17077 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
17078 }
17079 }
17080
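/* Illustrative result of the split (assumed operands, sketch only):
   "lea 4(%ebx,%ecx,2), %eax" becomes roughly

       movl  %ecx, %eax        # copy the index
       sall  $1, %eax          # apply the scale, exact_log2 (2) = 1
       addl  %ebx, %eax        # add the base
       addl  $4, %eax          # add the displacement

   where each add and the shift carry the FLAGS_REG clobber emitted by
   ix86_emit_binop above.  */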
17081 /* Return true if it is ok to optimize an ADD operation to LEA
17082 operation to avoid flag register consumption. For most processors,
17083 ADD is faster than LEA. For processors like Atom, if the
17084 destination register of the LEA holds an actual address which will be
17085 used soon, LEA is better; otherwise ADD is better. */
17086
17087 bool
17088 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17089 {
17090 unsigned int regno0 = true_regnum (operands[0]);
17091 unsigned int regno1 = true_regnum (operands[1]);
17092 unsigned int regno2 = true_regnum (operands[2]);
17093
17094 /* If a = b + c, (a!=b && a!=c), must use lea form. */
17095 if (regno0 != regno1 && regno0 != regno2)
17096 return true;
17097
17098 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17099 return false;
17100
17101 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17102 }
17103
17104 /* Return true if destination reg of SET_BODY is shift count of
17105 USE_BODY. */
17106
17107 static bool
17108 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17109 {
17110 rtx set_dest;
17111 rtx shift_rtx;
17112 int i;
17113
17114 /* Retrieve destination of SET_BODY. */
17115 switch (GET_CODE (set_body))
17116 {
17117 case SET:
17118 set_dest = SET_DEST (set_body);
17119 if (!set_dest || !REG_P (set_dest))
17120 return false;
17121 break;
17122 case PARALLEL:
17123 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17124 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17125 use_body))
17126 return true;
17127 default:
17128 return false;
17129 break;
17130 }
17131
17132 /* Retrieve shift count of USE_BODY. */
17133 switch (GET_CODE (use_body))
17134 {
17135 case SET:
17136 shift_rtx = XEXP (use_body, 1);
17137 break;
17138 case PARALLEL:
17139 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17140 if (ix86_dep_by_shift_count_body (set_body,
17141 XVECEXP (use_body, 0, i)))
17142 return true;
17143 default:
17144 return false;
17145 break;
17146 }
17147
17148 if (shift_rtx
17149 && (GET_CODE (shift_rtx) == ASHIFT
17150 || GET_CODE (shift_rtx) == LSHIFTRT
17151 || GET_CODE (shift_rtx) == ASHIFTRT
17152 || GET_CODE (shift_rtx) == ROTATE
17153 || GET_CODE (shift_rtx) == ROTATERT))
17154 {
17155 rtx shift_count = XEXP (shift_rtx, 1);
17156
17157 /* Return true if shift count is dest of SET_BODY. */
17158 if (REG_P (shift_count)
17159 && true_regnum (set_dest) == true_regnum (shift_count))
17160 return true;
17161 }
17162
17163 return false;
17164 }
17165
17166 /* Return true if destination reg of SET_INSN is shift count of
17167 USE_INSN. */
17168
17169 bool
17170 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17171 {
17172 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17173 PATTERN (use_insn));
17174 }
17175
17176 /* Return TRUE or FALSE depending on whether the unary operator meets the
17177 appropriate constraints. */
17178
17179 bool
17180 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17181 enum machine_mode mode ATTRIBUTE_UNUSED,
17182 rtx operands[2] ATTRIBUTE_UNUSED)
17183 {
17184 /* If one of operands is memory, source and destination must match. */
17185 if ((MEM_P (operands[0])
17186 || MEM_P (operands[1]))
17187 && ! rtx_equal_p (operands[0], operands[1]))
17188 return false;
17189 return true;
17190 }
17191
17192 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17193 are ok, keeping in mind the possible movddup alternative. */
17194
17195 bool
17196 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17197 {
17198 if (MEM_P (operands[0]))
17199 return rtx_equal_p (operands[0], operands[1 + high]);
17200 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17201 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17202 return true;
17203 }
17204
17205 /* Post-reload splitter for converting an SF or DFmode value in an
17206 SSE register into an unsigned SImode. */
17207
17208 void
17209 ix86_split_convert_uns_si_sse (rtx operands[])
17210 {
17211 enum machine_mode vecmode;
17212 rtx value, large, zero_or_two31, input, two31, x;
17213
17214 large = operands[1];
17215 zero_or_two31 = operands[2];
17216 input = operands[3];
17217 two31 = operands[4];
17218 vecmode = GET_MODE (large);
17219 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17220
17221 /* Load up the value into the low element. We must ensure that the other
17222 elements are valid floats -- zero is the easiest such value. */
17223 if (MEM_P (input))
17224 {
17225 if (vecmode == V4SFmode)
17226 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17227 else
17228 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17229 }
17230 else
17231 {
17232 input = gen_rtx_REG (vecmode, REGNO (input));
17233 emit_move_insn (value, CONST0_RTX (vecmode));
17234 if (vecmode == V4SFmode)
17235 emit_insn (gen_sse_movss (value, value, input));
17236 else
17237 emit_insn (gen_sse2_movsd (value, value, input));
17238 }
17239
17240 emit_move_insn (large, two31);
17241 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17242
17243 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17244 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17245
17246 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17247 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17248
17249 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17250 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17251
17252 large = gen_rtx_REG (V4SImode, REGNO (large));
17253 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17254
17255 x = gen_rtx_REG (V4SImode, REGNO (value));
17256 if (vecmode == V4SFmode)
17257 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17258 else
17259 emit_insn (gen_sse2_cvttpd2dq (x, value));
17260 value = x;
17261
17262 emit_insn (gen_xorv4si3 (value, value, large));
17263 }
17264
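/* Worked example (illustrative): converting 3000000000.0 (>= 0x1p31)
   with the sequence above first subtracts 0x1p31, giving 852516352.0,
   truncates that with the signed cvtt* instruction, and finally xors in
   the 0x80000000 produced by shifting the comparison mask left by 31,
   yielding the correct unsigned result 3000000000.  */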
17265 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17266 Expects the 64-bit DImode to be supplied in a pair of integral
17267 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17268 -mfpmath=sse, !optimize_size only. */
17269
17270 void
17271 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17272 {
17273 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17274 rtx int_xmm, fp_xmm;
17275 rtx biases, exponents;
17276 rtx x;
17277
17278 int_xmm = gen_reg_rtx (V4SImode);
17279 if (TARGET_INTER_UNIT_MOVES)
17280 emit_insn (gen_movdi_to_sse (int_xmm, input));
17281 else if (TARGET_SSE_SPLIT_REGS)
17282 {
17283 emit_clobber (int_xmm);
17284 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17285 }
17286 else
17287 {
17288 x = gen_reg_rtx (V2DImode);
17289 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17290 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17291 }
17292
17293 x = gen_rtx_CONST_VECTOR (V4SImode,
17294 gen_rtvec (4, GEN_INT (0x43300000UL),
17295 GEN_INT (0x45300000UL),
17296 const0_rtx, const0_rtx));
17297 exponents = validize_mem (force_const_mem (V4SImode, x));
17298
17299 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17300 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17301
17302 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17303 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17304 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17305 (0x1.0p84 + double(fp_value_hi_xmm)).
17306 Note these exponents differ by 32. */
17307
17308 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17309
17310 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17311 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17312 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17313 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17314 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17315 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17316 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17317 biases = validize_mem (force_const_mem (V2DFmode, biases));
17318 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17319
17320 /* Add the upper and lower DFmode values together. */
17321 if (TARGET_SSE3)
17322 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17323 else
17324 {
17325 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17326 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17327 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17328 }
17329
17330 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17331 }
17332
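/* Worked example (illustrative): for the input 0x100000005 the low word
   5 is packed under exponent word 0x43300000, giving the double
   0x1.0p52 + 5, and the high word 1 under 0x45300000, giving
   0x1.0p84 + 1 * 0x1.0p32; after the two biases are subtracted and the
   halves added, the result is 5.0 + 4294967296.0 = 4294967301.0, the
   exact value of the 64-bit input.  */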
17333 /* Not used, but eases macroization of patterns. */
17334 void
17335 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17336 rtx input ATTRIBUTE_UNUSED)
17337 {
17338 gcc_unreachable ();
17339 }
17340
17341 /* Convert an unsigned SImode value into a DFmode. Only currently used
17342 for SSE, but applicable anywhere. */
17343
17344 void
17345 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17346 {
17347 REAL_VALUE_TYPE TWO31r;
17348 rtx x, fp;
17349
17350 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17351 NULL, 1, OPTAB_DIRECT);
17352
17353 fp = gen_reg_rtx (DFmode);
17354 emit_insn (gen_floatsidf2 (fp, x));
17355
17356 real_ldexp (&TWO31r, &dconst1, 31);
17357 x = const_double_from_real_value (TWO31r, DFmode);
17358
17359 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17360 if (x != target)
17361 emit_move_insn (target, x);
17362 }
17363
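/* Worked example (illustrative): for the input 4000000000 the PLUS with
   -2147483648 wraps to the signed value 1852516352, which floatsidf2
   converts exactly; adding back 0x1.0p31 as a double then yields
   4000000000.0.  Inputs below 0x1p31 go negative instead and are fixed
   up by the same final addition.  */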
17364 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17365 32-bit mode; otherwise we have a direct convert instruction. */
17366
17367 void
17368 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17369 {
17370 REAL_VALUE_TYPE TWO32r;
17371 rtx fp_lo, fp_hi, x;
17372
17373 fp_lo = gen_reg_rtx (DFmode);
17374 fp_hi = gen_reg_rtx (DFmode);
17375
17376 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17377
17378 real_ldexp (&TWO32r, &dconst1, 32);
17379 x = const_double_from_real_value (TWO32r, DFmode);
17380 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17381
17382 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17383
17384 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17385 0, OPTAB_DIRECT);
17386 if (x != target)
17387 emit_move_insn (target, x);
17388 }
17389
17390 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17391 For x86_32, -mfpmath=sse, !optimize_size only. */
17392 void
17393 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17394 {
17395 REAL_VALUE_TYPE ONE16r;
17396 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17397
17398 real_ldexp (&ONE16r, &dconst1, 16);
17399 x = const_double_from_real_value (ONE16r, SFmode);
17400 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17401 NULL, 0, OPTAB_DIRECT);
17402 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17403 NULL, 0, OPTAB_DIRECT);
17404 fp_hi = gen_reg_rtx (SFmode);
17405 fp_lo = gen_reg_rtx (SFmode);
17406 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17407 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17408 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17409 0, OPTAB_DIRECT);
17410 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17411 0, OPTAB_DIRECT);
17412 if (!rtx_equal_p (target, fp_hi))
17413 emit_move_insn (target, fp_hi);
17414 }
17415
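/* Illustrative sketch of the computation: the input is split as
   hi * 0x1.0p16 + lo with hi = input >> 16 and lo = input & 0xffff.
   Both halves fit in 16 bits, so the two signed int->float conversions
   and the multiply by a power of two are exact; only the final addition
   can round, which gives the correctly rounded result for inputs that
   SFmode cannot represent exactly.  */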
17416 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17417 a vector of unsigned ints VAL to vector of floats TARGET. */
17418
17419 void
17420 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17421 {
17422 rtx tmp[8];
17423 REAL_VALUE_TYPE TWO16r;
17424 enum machine_mode intmode = GET_MODE (val);
17425 enum machine_mode fltmode = GET_MODE (target);
17426 rtx (*cvt) (rtx, rtx);
17427
17428 if (intmode == V4SImode)
17429 cvt = gen_floatv4siv4sf2;
17430 else
17431 cvt = gen_floatv8siv8sf2;
17432 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17433 tmp[0] = force_reg (intmode, tmp[0]);
17434 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17435 OPTAB_DIRECT);
17436 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17437 NULL_RTX, 1, OPTAB_DIRECT);
17438 tmp[3] = gen_reg_rtx (fltmode);
17439 emit_insn (cvt (tmp[3], tmp[1]));
17440 tmp[4] = gen_reg_rtx (fltmode);
17441 emit_insn (cvt (tmp[4], tmp[2]));
17442 real_ldexp (&TWO16r, &dconst1, 16);
17443 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17444 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17445 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17446 OPTAB_DIRECT);
17447 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17448 OPTAB_DIRECT);
17449 if (tmp[7] != target)
17450 emit_move_insn (target, tmp[7]);
17451 }
17452
17453 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17454 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17455 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17456 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
17457
17458 rtx
17459 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17460 {
17461 REAL_VALUE_TYPE TWO31r;
17462 rtx two31r, tmp[4];
17463 enum machine_mode mode = GET_MODE (val);
17464 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17465 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17466 rtx (*cmp) (rtx, rtx, rtx, rtx);
17467 int i;
17468
17469 for (i = 0; i < 3; i++)
17470 tmp[i] = gen_reg_rtx (mode);
17471 real_ldexp (&TWO31r, &dconst1, 31);
17472 two31r = const_double_from_real_value (TWO31r, scalarmode);
17473 two31r = ix86_build_const_vector (mode, 1, two31r);
17474 two31r = force_reg (mode, two31r);
17475 switch (mode)
17476 {
17477 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17478 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17479 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17480 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17481 default: gcc_unreachable ();
17482 }
17483 tmp[3] = gen_rtx_LE (mode, two31r, val);
17484 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17485 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17486 0, OPTAB_DIRECT);
17487 if (intmode == V4SImode || TARGET_AVX2)
17488 *xorp = expand_simple_binop (intmode, ASHIFT,
17489 gen_lowpart (intmode, tmp[0]),
17490 GEN_INT (31), NULL_RTX, 0,
17491 OPTAB_DIRECT);
17492 else
17493 {
17494 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17495 two31 = ix86_build_const_vector (intmode, 1, two31);
17496 *xorp = expand_simple_binop (intmode, AND,
17497 gen_lowpart (intmode, tmp[0]),
17498 two31, NULL_RTX, 0,
17499 OPTAB_DIRECT);
17500 }
17501 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17502 0, OPTAB_DIRECT);
17503 }
17504
17505 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17506 then replicate the value for all elements of the vector
17507 register. */
17508
17509 rtx
17510 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17511 {
17512 int i, n_elt;
17513 rtvec v;
17514 enum machine_mode scalar_mode;
17515
17516 switch (mode)
17517 {
17518 case V32QImode:
17519 case V16QImode:
17520 case V16HImode:
17521 case V8HImode:
17522 case V8SImode:
17523 case V4SImode:
17524 case V4DImode:
17525 case V2DImode:
17526 gcc_assert (vect);
17527 case V8SFmode:
17528 case V4SFmode:
17529 case V4DFmode:
17530 case V2DFmode:
17531 n_elt = GET_MODE_NUNITS (mode);
17532 v = rtvec_alloc (n_elt);
17533 scalar_mode = GET_MODE_INNER (mode);
17534
17535 RTVEC_ELT (v, 0) = value;
17536
17537 for (i = 1; i < n_elt; ++i)
17538 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17539
17540 return gen_rtx_CONST_VECTOR (mode, v);
17541
17542 default:
17543 gcc_unreachable ();
17544 }
17545 }
17546
17547 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17548 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17549 for an SSE register. If VECT is true, then replicate the mask for
17550 all elements of the vector register. If INVERT is true, then create
17551 a mask excluding the sign bit. */
17552
17553 rtx
17554 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17555 {
17556 enum machine_mode vec_mode, imode;
17557 HOST_WIDE_INT hi, lo;
17558 int shift = 63;
17559 rtx v;
17560 rtx mask;
17561
17562 /* Find the sign bit, sign extended to 2*HWI. */
17563 switch (mode)
17564 {
17565 case V8SImode:
17566 case V4SImode:
17567 case V8SFmode:
17568 case V4SFmode:
17569 vec_mode = mode;
17570 mode = GET_MODE_INNER (mode);
17571 imode = SImode;
17572 lo = 0x80000000, hi = lo < 0;
17573 break;
17574
17575 case V4DImode:
17576 case V2DImode:
17577 case V4DFmode:
17578 case V2DFmode:
17579 vec_mode = mode;
17580 mode = GET_MODE_INNER (mode);
17581 imode = DImode;
17582 if (HOST_BITS_PER_WIDE_INT >= 64)
17583 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17584 else
17585 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17586 break;
17587
17588 case TImode:
17589 case TFmode:
17590 vec_mode = VOIDmode;
17591 if (HOST_BITS_PER_WIDE_INT >= 64)
17592 {
17593 imode = TImode;
17594 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17595 }
17596 else
17597 {
17598 rtvec vec;
17599
17600 imode = DImode;
17601 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17602
17603 if (invert)
17604 {
17605 lo = ~lo, hi = ~hi;
17606 v = constm1_rtx;
17607 }
17608 else
17609 v = const0_rtx;
17610
17611 mask = immed_double_const (lo, hi, imode);
17612
17613 vec = gen_rtvec (2, v, mask);
17614 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17615 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17616
17617 return v;
17618 }
17619 break;
17620
17621 default:
17622 gcc_unreachable ();
17623 }
17624
17625 if (invert)
17626 lo = ~lo, hi = ~hi;
17627
17628 /* Force this value into the low part of a fp vector constant. */
17629 mask = immed_double_const (lo, hi, imode);
17630 mask = gen_lowpart (mode, mask);
17631
17632 if (vec_mode == VOIDmode)
17633 return force_reg (mode, mask);
17634
17635 v = ix86_build_const_vector (vec_mode, vect, mask);
17636 return force_reg (vec_mode, v);
17637 }
17638
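/* Example masks produced here (illustrative): for V4SFmode each element
   is 0x80000000, or 0x7fffffff when INVERT is true; for V2DFmode it is
   0x8000000000000000 resp. 0x7fffffffffffffff.  The fp abs/neg expander
   below uses the inverted mask for ABS (clear the sign bit) and the
   plain mask for NEG (flip it).  */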
17639 /* Generate code for floating point ABS or NEG. */
17640
17641 void
17642 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17643 rtx operands[])
17644 {
17645 rtx mask, set, dst, src;
17646 bool use_sse = false;
17647 bool vector_mode = VECTOR_MODE_P (mode);
17648 enum machine_mode vmode = mode;
17649
17650 if (vector_mode)
17651 use_sse = true;
17652 else if (mode == TFmode)
17653 use_sse = true;
17654 else if (TARGET_SSE_MATH)
17655 {
17656 use_sse = SSE_FLOAT_MODE_P (mode);
17657 if (mode == SFmode)
17658 vmode = V4SFmode;
17659 else if (mode == DFmode)
17660 vmode = V2DFmode;
17661 }
17662
17663 /* NEG and ABS performed with SSE use bitwise mask operations.
17664 Create the appropriate mask now. */
17665 if (use_sse)
17666 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17667 else
17668 mask = NULL_RTX;
17669
17670 dst = operands[0];
17671 src = operands[1];
17672
17673 set = gen_rtx_fmt_e (code, mode, src);
17674 set = gen_rtx_SET (VOIDmode, dst, set);
17675
17676 if (mask)
17677 {
17678 rtx use, clob;
17679 rtvec par;
17680
17681 use = gen_rtx_USE (VOIDmode, mask);
17682 if (vector_mode)
17683 par = gen_rtvec (2, set, use);
17684 else
17685 {
17686 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17687 par = gen_rtvec (3, set, use, clob);
17688 }
17689 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
17690 }
17691 else
17692 emit_insn (set);
17693 }
17694
17695 /* Expand a copysign operation. Special case operand 0 being a constant. */
17696
17697 void
17698 ix86_expand_copysign (rtx operands[])
17699 {
17700 enum machine_mode mode, vmode;
17701 rtx dest, op0, op1, mask, nmask;
17702
17703 dest = operands[0];
17704 op0 = operands[1];
17705 op1 = operands[2];
17706
17707 mode = GET_MODE (dest);
17708
17709 if (mode == SFmode)
17710 vmode = V4SFmode;
17711 else if (mode == DFmode)
17712 vmode = V2DFmode;
17713 else
17714 vmode = mode;
17715
17716 if (GET_CODE (op0) == CONST_DOUBLE)
17717 {
17718 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
17719
17720 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
17721 op0 = simplify_unary_operation (ABS, mode, op0, mode);
17722
17723 if (mode == SFmode || mode == DFmode)
17724 {
17725 if (op0 == CONST0_RTX (mode))
17726 op0 = CONST0_RTX (vmode);
17727 else
17728 {
17729 rtx v = ix86_build_const_vector (vmode, false, op0);
17730
17731 op0 = force_reg (vmode, v);
17732 }
17733 }
17734 else if (op0 != CONST0_RTX (mode))
17735 op0 = force_reg (mode, op0);
17736
17737 mask = ix86_build_signbit_mask (vmode, 0, 0);
17738
17739 if (mode == SFmode)
17740 copysign_insn = gen_copysignsf3_const;
17741 else if (mode == DFmode)
17742 copysign_insn = gen_copysigndf3_const;
17743 else
17744 copysign_insn = gen_copysigntf3_const;
17745
17746 emit_insn (copysign_insn (dest, op0, op1, mask));
17747 }
17748 else
17749 {
17750 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
17751
17752 nmask = ix86_build_signbit_mask (vmode, 0, 1);
17753 mask = ix86_build_signbit_mask (vmode, 0, 0);
17754
17755 if (mode == SFmode)
17756 copysign_insn = gen_copysignsf3_var;
17757 else if (mode == DFmode)
17758 copysign_insn = gen_copysigndf3_var;
17759 else
17760 copysign_insn = gen_copysigntf3_var;
17761
17762 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
17763 }
17764 }
17765
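/* Illustrative identity behind the expander above and the two splitters
   below: with sign-bit mask m, copysign (x, y) computes
   (x & ~m) | (y & m).  In the constant case |x| is folded at expand
   time, so only the AND of the sign operand with m and the final IOR
   remain; the variable case also needs the inverted mask nmask.  */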
17766 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
17767 be a constant, and so has already been expanded into a vector constant. */
17768
17769 void
17770 ix86_split_copysign_const (rtx operands[])
17771 {
17772 enum machine_mode mode, vmode;
17773 rtx dest, op0, mask, x;
17774
17775 dest = operands[0];
17776 op0 = operands[1];
17777 mask = operands[3];
17778
17779 mode = GET_MODE (dest);
17780 vmode = GET_MODE (mask);
17781
17782 dest = simplify_gen_subreg (vmode, dest, mode, 0);
17783 x = gen_rtx_AND (vmode, dest, mask);
17784 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17785
17786 if (op0 != CONST0_RTX (vmode))
17787 {
17788 x = gen_rtx_IOR (vmode, dest, op0);
17789 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17790 }
17791 }
17792
17793 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
17794 so we have to do two masks. */
17795
17796 void
17797 ix86_split_copysign_var (rtx operands[])
17798 {
17799 enum machine_mode mode, vmode;
17800 rtx dest, scratch, op0, op1, mask, nmask, x;
17801
17802 dest = operands[0];
17803 scratch = operands[1];
17804 op0 = operands[2];
17805 op1 = operands[3];
17806 nmask = operands[4];
17807 mask = operands[5];
17808
17809 mode = GET_MODE (dest);
17810 vmode = GET_MODE (mask);
17811
17812 if (rtx_equal_p (op0, op1))
17813 {
17814 /* Shouldn't happen often (it's useless, obviously), but when it does
17815 we'd generate incorrect code if we continue below. */
17816 emit_move_insn (dest, op0);
17817 return;
17818 }
17819
17820 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
17821 {
17822 gcc_assert (REGNO (op1) == REGNO (scratch));
17823
17824 x = gen_rtx_AND (vmode, scratch, mask);
17825 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17826
17827 dest = mask;
17828 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17829 x = gen_rtx_NOT (vmode, dest);
17830 x = gen_rtx_AND (vmode, x, op0);
17831 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17832 }
17833 else
17834 {
17835 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
17836 {
17837 x = gen_rtx_AND (vmode, scratch, mask);
17838 }
17839 else /* alternative 2,4 */
17840 {
17841 gcc_assert (REGNO (mask) == REGNO (scratch));
17842 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
17843 x = gen_rtx_AND (vmode, scratch, op1);
17844 }
17845 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17846
17847 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
17848 {
17849 dest = simplify_gen_subreg (vmode, op0, mode, 0);
17850 x = gen_rtx_AND (vmode, dest, nmask);
17851 }
17852 else /* alternative 3,4 */
17853 {
17854 gcc_assert (REGNO (nmask) == REGNO (dest));
17855 dest = nmask;
17856 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17857 x = gen_rtx_AND (vmode, dest, op0);
17858 }
17859 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17860 }
17861
17862 x = gen_rtx_IOR (vmode, dest, scratch);
17863 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17864 }
17865
17866 /* Return TRUE or FALSE depending on whether the first SET in INSN
17867 has source and destination with matching CC modes, and that the
17868 CC mode is at least as constrained as REQ_MODE. */
17869
17870 bool
17871 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
17872 {
17873 rtx set;
17874 enum machine_mode set_mode;
17875
17876 set = PATTERN (insn);
17877 if (GET_CODE (set) == PARALLEL)
17878 set = XVECEXP (set, 0, 0);
17879 gcc_assert (GET_CODE (set) == SET);
17880 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
17881
17882 set_mode = GET_MODE (SET_DEST (set));
17883 switch (set_mode)
17884 {
17885 case CCNOmode:
17886 if (req_mode != CCNOmode
17887 && (req_mode != CCmode
17888 || XEXP (SET_SRC (set), 1) != const0_rtx))
17889 return false;
17890 break;
17891 case CCmode:
17892 if (req_mode == CCGCmode)
17893 return false;
17894 /* FALLTHRU */
17895 case CCGCmode:
17896 if (req_mode == CCGOCmode || req_mode == CCNOmode)
17897 return false;
17898 /* FALLTHRU */
17899 case CCGOCmode:
17900 if (req_mode == CCZmode)
17901 return false;
17902 /* FALLTHRU */
17903 case CCZmode:
17904 break;
17905
17906 case CCAmode:
17907 case CCCmode:
17908 case CCOmode:
17909 case CCSmode:
17910 if (set_mode != req_mode)
17911 return false;
17912 break;
17913
17914 default:
17915 gcc_unreachable ();
17916 }
17917
17918 return GET_MODE (SET_SRC (set)) == set_mode;
17919 }
17920
17921 /* Generate insn patterns to do an integer compare of OPERANDS. */
17922
17923 static rtx
17924 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
17925 {
17926 enum machine_mode cmpmode;
17927 rtx tmp, flags;
17928
17929 cmpmode = SELECT_CC_MODE (code, op0, op1);
17930 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
17931
17932 /* This is very simple, but making the interface the same as in the
17933 FP case makes the rest of the code easier. */
17934 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
17935 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
17936
17937 /* Return the test that should be put into the flags user, i.e.
17938 the bcc, scc, or cmov instruction. */
17939 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
17940 }
17941
17942 /* Figure out whether to use ordered or unordered fp comparisons.
17943 Return the appropriate mode to use. */
17944
17945 enum machine_mode
17946 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
17947 {
17948 /* ??? In order to make all comparisons reversible, we do all comparisons
17949 non-trapping when compiling for IEEE. Once gcc is able to distinguish
17950 all forms of trapping and nontrapping comparisons, we can make inequality
17951 comparisons trapping again, since it results in better code when using
17952 FCOM based compares. */
17953 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
17954 }
17955
17956 enum machine_mode
17957 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
17958 {
17959 enum machine_mode mode = GET_MODE (op0);
17960
17961 if (SCALAR_FLOAT_MODE_P (mode))
17962 {
17963 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
17964 return ix86_fp_compare_mode (code);
17965 }
17966
17967 switch (code)
17968 {
17969 /* Only zero flag is needed. */
17970 case EQ: /* ZF=0 */
17971 case NE: /* ZF!=0 */
17972 return CCZmode;
17973 /* Codes needing carry flag. */
17974 case GEU: /* CF=0 */
17975 case LTU: /* CF=1 */
17976 /* Detect overflow checks. They need just the carry flag. */
17977 if (GET_CODE (op0) == PLUS
17978 && rtx_equal_p (op1, XEXP (op0, 0)))
17979 return CCCmode;
17980 else
17981 return CCmode;
17982 case GTU: /* CF=0 & ZF=0 */
17983 case LEU: /* CF=1 | ZF=1 */
17984 /* Detect overflow checks. They need just the carry flag. */
17985 if (GET_CODE (op0) == MINUS
17986 && rtx_equal_p (op1, XEXP (op0, 0)))
17987 return CCCmode;
17988 else
17989 return CCmode;
17990 /* Codes possibly doable only with sign flag when
17991 comparing against zero. */
17992 case GE: /* SF=OF or SF=0 */
17993 case LT: /* SF<>OF or SF=1 */
17994 if (op1 == const0_rtx)
17995 return CCGOCmode;
17996 else
17997 /* For other cases Carry flag is not required. */
17998 return CCGCmode;
17999 /* Codes doable only with sign flag when comparing
18000 against zero, but we miss jump instruction for it
18001 so we need to use relational tests against overflow
18002 that thus needs to be zero. */
18003 case GT: /* ZF=0 & SF=OF */
18004 case LE: /* ZF=1 | SF<>OF */
18005 if (op1 == const0_rtx)
18006 return CCNOmode;
18007 else
18008 return CCGCmode;
18009 /* The strcmp pattern does (use flags) and combine may ask us for the proper
18010 mode. */
18011 case USE:
18012 return CCmode;
18013 default:
18014 gcc_unreachable ();
18015 }
18016 }
18017
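/* Example (illustrative): for a comparison like (gt (reg:SI x)
   (const_int 0)) the function returns CCNOmode, since only ZF and SF
   are needed once OF is known to be zero, which lets the compare be
   combined with a preceding flag-setting logic instruction; a general
   (gt a b) gets CCGCmode instead.  */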
18018 /* Return the fixed registers used for condition codes. */
18019
18020 static bool
18021 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18022 {
18023 *p1 = FLAGS_REG;
18024 *p2 = FPSR_REG;
18025 return true;
18026 }
18027
18028 /* If two condition code modes are compatible, return a condition code
18029 mode which is compatible with both. Otherwise, return
18030 VOIDmode. */
18031
18032 static enum machine_mode
18033 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18034 {
18035 if (m1 == m2)
18036 return m1;
18037
18038 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18039 return VOIDmode;
18040
18041 if ((m1 == CCGCmode && m2 == CCGOCmode)
18042 || (m1 == CCGOCmode && m2 == CCGCmode))
18043 return CCGCmode;
18044
18045 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
18046 return m2;
18047 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
18048 return m1;
18049
18050 switch (m1)
18051 {
18052 default:
18053 gcc_unreachable ();
18054
18055 case CCmode:
18056 case CCGCmode:
18057 case CCGOCmode:
18058 case CCNOmode:
18059 case CCAmode:
18060 case CCCmode:
18061 case CCOmode:
18062 case CCSmode:
18063 case CCZmode:
18064 switch (m2)
18065 {
18066 default:
18067 return VOIDmode;
18068
18069 case CCmode:
18070 case CCGCmode:
18071 case CCGOCmode:
18072 case CCNOmode:
18073 case CCAmode:
18074 case CCCmode:
18075 case CCOmode:
18076 case CCSmode:
18077 case CCZmode:
18078 return CCmode;
18079 }
18080
18081 case CCFPmode:
18082 case CCFPUmode:
18083 /* These are only compatible with themselves, which we already
18084 checked above. */
18085 return VOIDmode;
18086 }
18087 }
18088
18089
18090 /* Return a comparison we can do and that it is equivalent to
18091 swap_condition (code) apart possibly from orderedness.
18092 But, never change orderedness if TARGET_IEEE_FP, returning
18093 UNKNOWN in that case if necessary. */
18094
18095 static enum rtx_code
18096 ix86_fp_swap_condition (enum rtx_code code)
18097 {
18098 switch (code)
18099 {
18100 case GT: /* GTU - CF=0 & ZF=0 */
18101 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18102 case GE: /* GEU - CF=0 */
18103 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18104 case UNLT: /* LTU - CF=1 */
18105 return TARGET_IEEE_FP ? UNKNOWN : GT;
18106 case UNLE: /* LEU - CF=1 | ZF=1 */
18107 return TARGET_IEEE_FP ? UNKNOWN : GE;
18108 default:
18109 return swap_condition (code);
18110 }
18111 }
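/* Editor's example: for "a > b" the plain swapped condition is LT, but the
   directly testable swapped form here is UNLT (compare the GTU/LTU
   annotations above).  UNLT additionally holds when the operands are
   unordered, so the swap is only offered when !TARGET_IEEE_FP, i.e. when we
   may ignore NaNs; otherwise UNKNOWN is returned and the caller keeps the
   original operand order.  */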
18112
18113 /* Return the cost of comparison CODE using the best strategy for performance.
18114 All of the following functions use the number of instructions as the cost metric.
18115 In the future this should be tweaked to compute bytes for optimize_size and
18116 to take into account the performance of various instructions on various CPUs.
18117
18118 static int
18119 ix86_fp_comparison_cost (enum rtx_code code)
18120 {
18121 int arith_cost;
18122
18123 /* The cost of code using bit-twiddling on %ah. */
18124 switch (code)
18125 {
18126 case UNLE:
18127 case UNLT:
18128 case LTGT:
18129 case GT:
18130 case GE:
18131 case UNORDERED:
18132 case ORDERED:
18133 case UNEQ:
18134 arith_cost = 4;
18135 break;
18136 case LT:
18137 case NE:
18138 case EQ:
18139 case UNGE:
18140 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18141 break;
18142 case LE:
18143 case UNGT:
18144 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18145 break;
18146 default:
18147 gcc_unreachable ();
18148 }
18149
18150 switch (ix86_fp_comparison_strategy (code))
18151 {
18152 case IX86_FPCMP_COMI:
18153 return arith_cost > 4 ? 3 : 2;
18154 case IX86_FPCMP_SAHF:
18155 return arith_cost > 4 ? 4 : 3;
18156 default:
18157 return arith_cost;
18158 }
18159 }
18160
18161 /* Return the strategy to use for a floating-point comparison. We assume that
18162 fcomi is always preferable where available, since that is also true when looking at size
18163 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
18164
18165 enum ix86_fpcmp_strategy
18166 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18167 {
18168 /* Do fcomi/sahf based test when profitable. */
18169
18170 if (TARGET_CMOVE)
18171 return IX86_FPCMP_COMI;
18172
18173 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18174 return IX86_FPCMP_SAHF;
18175
18176 return IX86_FPCMP_ARITH;
18177 }
18178
18179 /* Swap, force into registers, or otherwise massage the two operands
18180 to a fp comparison. The operands are updated in place; the new
18181 comparison code is returned. */
18182
18183 static enum rtx_code
18184 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18185 {
18186 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18187 rtx op0 = *pop0, op1 = *pop1;
18188 enum machine_mode op_mode = GET_MODE (op0);
18189 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18190
18191 /* All of the unordered compare instructions only work on registers.
18192 The same is true of the fcomi compare instructions. The XFmode
18193 compare instructions require registers except when comparing
18194 against zero or when converting operand 1 from fixed point to
18195 floating point. */
18196
18197 if (!is_sse
18198 && (fpcmp_mode == CCFPUmode
18199 || (op_mode == XFmode
18200 && ! (standard_80387_constant_p (op0) == 1
18201 || standard_80387_constant_p (op1) == 1)
18202 && GET_CODE (op1) != FLOAT)
18203 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18204 {
18205 op0 = force_reg (op_mode, op0);
18206 op1 = force_reg (op_mode, op1);
18207 }
18208 else
18209 {
18210 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18211 things around if they appear profitable, otherwise force op0
18212 into a register. */
18213
18214 if (standard_80387_constant_p (op0) == 0
18215 || (MEM_P (op0)
18216 && ! (standard_80387_constant_p (op1) == 0
18217 || MEM_P (op1))))
18218 {
18219 enum rtx_code new_code = ix86_fp_swap_condition (code);
18220 if (new_code != UNKNOWN)
18221 {
18222 rtx tmp;
18223 tmp = op0, op0 = op1, op1 = tmp;
18224 code = new_code;
18225 }
18226 }
18227
18228 if (!REG_P (op0))
18229 op0 = force_reg (op_mode, op0);
18230
18231 if (CONSTANT_P (op1))
18232 {
18233 int tmp = standard_80387_constant_p (op1);
18234 if (tmp == 0)
18235 op1 = validize_mem (force_const_mem (op_mode, op1));
18236 else if (tmp == 1)
18237 {
18238 if (TARGET_CMOVE)
18239 op1 = force_reg (op_mode, op1);
18240 }
18241 else
18242 op1 = force_reg (op_mode, op1);
18243 }
18244 }
18245
18246 /* Try to rearrange the comparison to make it cheaper. */
18247 if (ix86_fp_comparison_cost (code)
18248 > ix86_fp_comparison_cost (swap_condition (code))
18249 && (REG_P (op1) || can_create_pseudo_p ()))
18250 {
18251 rtx tmp;
18252 tmp = op0, op0 = op1, op1 = tmp;
18253 code = swap_condition (code);
18254 if (!REG_P (op0))
18255 op0 = force_reg (op_mode, op0);
18256 }
18257
18258 *pop0 = op0;
18259 *pop1 = op1;
18260 return code;
18261 }
18262
18263 /* Convert comparison codes we use to represent FP comparison to integer
18264 code that will result in proper branch. Return UNKNOWN if no such code
18265 is available. */
18266
18267 enum rtx_code
18268 ix86_fp_compare_code_to_integer (enum rtx_code code)
18269 {
18270 switch (code)
18271 {
18272 case GT:
18273 return GTU;
18274 case GE:
18275 return GEU;
18276 case ORDERED:
18277 case UNORDERED:
18278 return code;
18280 case UNEQ:
18281 return EQ;
18283 case UNLT:
18284 return LTU;
18286 case UNLE:
18287 return LEU;
18289 case LTGT:
18290 return NE;
18292 default:
18293 return UNKNOWN;
18294 }
18295 }
18296
18297 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18298
18299 static rtx
18300 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18301 {
18302 enum machine_mode fpcmp_mode, intcmp_mode;
18303 rtx tmp, tmp2;
18304
18305 fpcmp_mode = ix86_fp_compare_mode (code);
18306 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18307
18308 /* Do fcomi/sahf based test when profitable. */
18309 switch (ix86_fp_comparison_strategy (code))
18310 {
18311 case IX86_FPCMP_COMI:
18312 intcmp_mode = fpcmp_mode;
18313 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18314 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18315 tmp);
18316 emit_insn (tmp);
18317 break;
18318
18319 case IX86_FPCMP_SAHF:
18320 intcmp_mode = fpcmp_mode;
18321 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18322 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18323 tmp);
18324
18325 if (!scratch)
18326 scratch = gen_reg_rtx (HImode);
18327 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18328 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18329 break;
18330
18331 case IX86_FPCMP_ARITH:
18332 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18333 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18334 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18335 if (!scratch)
18336 scratch = gen_reg_rtx (HImode);
18337 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18338
18339 /* In the unordered case, we have to check C2 for NaN's, which
18340 doesn't happen to work out to anything nice combination-wise.
18341 So do some bit twiddling on the value we've got in AH to come
18342 up with an appropriate set of condition codes. */
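/* For reference (standard x87 facts, added by the editor): after fnstsw the
   AH register holds status-word bits 8..15, so the condition bits land at
   C0 = 0x01, C2 = 0x04 and C3 = 0x40 in AH.  An fcom sets C3 C2 C0 = 000 for
   ">", 001 for "<", 100 for "=" and 111 for unordered, which is why the
   masks used below are 0x45 (C3|C2|C0), 0x40 (C3), 0x05 (C2|C0),
   0x01 (C0) and 0x04 (C2).  */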
18343
18344 intcmp_mode = CCNOmode;
18345 switch (code)
18346 {
18347 case GT:
18348 case UNGT:
18349 if (code == GT || !TARGET_IEEE_FP)
18350 {
18351 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18352 code = EQ;
18353 }
18354 else
18355 {
18356 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18357 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18358 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18359 intcmp_mode = CCmode;
18360 code = GEU;
18361 }
18362 break;
18363 case LT:
18364 case UNLT:
18365 if (code == LT && TARGET_IEEE_FP)
18366 {
18367 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18368 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18369 intcmp_mode = CCmode;
18370 code = EQ;
18371 }
18372 else
18373 {
18374 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18375 code = NE;
18376 }
18377 break;
18378 case GE:
18379 case UNGE:
18380 if (code == GE || !TARGET_IEEE_FP)
18381 {
18382 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18383 code = EQ;
18384 }
18385 else
18386 {
18387 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18388 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18389 code = NE;
18390 }
18391 break;
18392 case LE:
18393 case UNLE:
18394 if (code == LE && TARGET_IEEE_FP)
18395 {
18396 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18397 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18398 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18399 intcmp_mode = CCmode;
18400 code = LTU;
18401 }
18402 else
18403 {
18404 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18405 code = NE;
18406 }
18407 break;
18408 case EQ:
18409 case UNEQ:
18410 if (code == EQ && TARGET_IEEE_FP)
18411 {
18412 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18413 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18414 intcmp_mode = CCmode;
18415 code = EQ;
18416 }
18417 else
18418 {
18419 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18420 code = NE;
18421 }
18422 break;
18423 case NE:
18424 case LTGT:
18425 if (code == NE && TARGET_IEEE_FP)
18426 {
18427 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18428 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18429 GEN_INT (0x40)));
18430 code = NE;
18431 }
18432 else
18433 {
18434 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18435 code = EQ;
18436 }
18437 break;
18438
18439 case UNORDERED:
18440 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18441 code = NE;
18442 break;
18443 case ORDERED:
18444 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18445 code = EQ;
18446 break;
18447
18448 default:
18449 gcc_unreachable ();
18450 }
18451 break;
18452
18453 default:
18454 gcc_unreachable ();
18455 }
18456
18457 /* Return the test that should be put into the flags user, i.e.
18458 the bcc, scc, or cmov instruction. */
18459 return gen_rtx_fmt_ee (code, VOIDmode,
18460 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18461 const0_rtx);
18462 }
18463
18464 static rtx
18465 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18466 {
18467 rtx ret;
18468
18469 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18470 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18471
18472 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18473 {
18474 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18475 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18476 }
18477 else
18478 ret = ix86_expand_int_compare (code, op0, op1);
18479
18480 return ret;
18481 }
18482
18483 void
18484 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18485 {
18486 enum machine_mode mode = GET_MODE (op0);
18487 rtx tmp;
18488
18489 switch (mode)
18490 {
18491 case SFmode:
18492 case DFmode:
18493 case XFmode:
18494 case QImode:
18495 case HImode:
18496 case SImode:
18497 simple:
18498 tmp = ix86_expand_compare (code, op0, op1);
18499 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18500 gen_rtx_LABEL_REF (VOIDmode, label),
18501 pc_rtx);
18502 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18503 return;
18504
18505 case DImode:
18506 if (TARGET_64BIT)
18507 goto simple;
18508 case TImode:
18509 /* Expand DImode branch into multiple compare+branch. */
18510 {
18511 rtx lo[2], hi[2], label2;
18512 enum rtx_code code1, code2, code3;
18513 enum machine_mode submode;
18514
18515 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18516 {
18517 tmp = op0, op0 = op1, op1 = tmp;
18518 code = swap_condition (code);
18519 }
18520
18521 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18522 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18523
18524 submode = mode == DImode ? SImode : DImode;
18525
18526 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18527 avoid two branches. This costs one extra insn, so disable when
18528 optimizing for size. */
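/* A concrete sketch of the transformation (editor's illustration): on a
   32-bit target "(long long) a == (long long) b" becomes roughly

     xorl  hi(b), hi(a)
     xorl  lo(b), lo(a)
     orl   hi(a), lo(a)     ; ZF set iff both halves were equal
     je    label

   which uses the single extra OR insn the comment above refers to.  */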
18529
18530 if ((code == EQ || code == NE)
18531 && (!optimize_insn_for_size_p ()
18532 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18533 {
18534 rtx xor0, xor1;
18535
18536 xor1 = hi[0];
18537 if (hi[1] != const0_rtx)
18538 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18539 NULL_RTX, 0, OPTAB_WIDEN);
18540
18541 xor0 = lo[0];
18542 if (lo[1] != const0_rtx)
18543 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18544 NULL_RTX, 0, OPTAB_WIDEN);
18545
18546 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18547 NULL_RTX, 0, OPTAB_WIDEN);
18548
18549 ix86_expand_branch (code, tmp, const0_rtx, label);
18550 return;
18551 }
18552
18553 /* Otherwise, if we are doing a less-than or greater-or-equal comparison,
18554 op1 is a constant, and its low word is zero, then we can just
18555 examine the high word. Similarly for a low word of -1 and
18556 less-or-equal or greater-than comparisons. */
18557
18558 if (CONST_INT_P (hi[1]))
18559 switch (code)
18560 {
18561 case LT: case LTU: case GE: case GEU:
18562 if (lo[1] == const0_rtx)
18563 {
18564 ix86_expand_branch (code, hi[0], hi[1], label);
18565 return;
18566 }
18567 break;
18568 case LE: case LEU: case GT: case GTU:
18569 if (lo[1] == constm1_rtx)
18570 {
18571 ix86_expand_branch (code, hi[0], hi[1], label);
18572 return;
18573 }
18574 break;
18575 default:
18576 break;
18577 }
18578
18579 /* Otherwise, we need two or three jumps. */
18580
18581 label2 = gen_label_rtx ();
18582
18583 code1 = code;
18584 code2 = swap_condition (code);
18585 code3 = unsigned_condition (code);
18586
18587 switch (code)
18588 {
18589 case LT: case GT: case LTU: case GTU:
18590 break;
18591
18592 case LE: code1 = LT; code2 = GT; break;
18593 case GE: code1 = GT; code2 = LT; break;
18594 case LEU: code1 = LTU; code2 = GTU; break;
18595 case GEU: code1 = GTU; code2 = LTU; break;
18596
18597 case EQ: code1 = UNKNOWN; code2 = NE; break;
18598 case NE: code2 = UNKNOWN; break;
18599
18600 default:
18601 gcc_unreachable ();
18602 }
18603
18604 /*
18605 * a < b =>
18606 * if (hi(a) < hi(b)) goto true;
18607 * if (hi(a) > hi(b)) goto false;
18608 * if (lo(a) < lo(b)) goto true;
18609 * false:
18610 */
18611
18612 if (code1 != UNKNOWN)
18613 ix86_expand_branch (code1, hi[0], hi[1], label);
18614 if (code2 != UNKNOWN)
18615 ix86_expand_branch (code2, hi[0], hi[1], label2);
18616
18617 ix86_expand_branch (code3, lo[0], lo[1], label);
18618
18619 if (code2 != UNKNOWN)
18620 emit_label (label2);
18621 return;
18622 }
18623
18624 default:
18625 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18626 goto simple;
18627 }
18628 }
18629
18630 /* Split branch based on floating point condition. */
18631 void
18632 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18633 rtx target1, rtx target2, rtx tmp, rtx pushed)
18634 {
18635 rtx condition;
18636 rtx i;
18637
18638 if (target2 != pc_rtx)
18639 {
18640 rtx tmp = target2;
18641 code = reverse_condition_maybe_unordered (code);
18642 target2 = target1;
18643 target1 = tmp;
18644 }
18645
18646 condition = ix86_expand_fp_compare (code, op1, op2,
18647 tmp);
18648
18649 /* Remove pushed operand from stack. */
18650 if (pushed)
18651 ix86_free_from_memory (GET_MODE (pushed));
18652
18653 i = emit_jump_insn (gen_rtx_SET
18654 (VOIDmode, pc_rtx,
18655 gen_rtx_IF_THEN_ELSE (VOIDmode,
18656 condition, target1, target2)));
18657 if (split_branch_probability >= 0)
18658 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18659 }
18660
18661 void
18662 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18663 {
18664 rtx ret;
18665
18666 gcc_assert (GET_MODE (dest) == QImode);
18667
18668 ret = ix86_expand_compare (code, op0, op1);
18669 PUT_MODE (ret, QImode);
18670 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
18671 }
18672
18673 /* Expand a comparison setting or clearing the carry flag. Return true when
18674 successful, and set *POP to the comparison operation. */
18675 static bool
18676 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
18677 {
18678 enum machine_mode mode =
18679 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
18680
18681 /* Do not handle double-mode compares that go through special path. */
18682 if (mode == (TARGET_64BIT ? TImode : DImode))
18683 return false;
18684
18685 if (SCALAR_FLOAT_MODE_P (mode))
18686 {
18687 rtx compare_op, compare_seq;
18688
18689 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18690
18691 /* Shortcut: following common codes never translate
18692 into carry flag compares. */
18693 if (code == EQ || code == NE || code == UNEQ || code == LTGT
18694 || code == ORDERED || code == UNORDERED)
18695 return false;
18696
18697 /* These comparisons require the zero flag; swap the operands so that they no longer do. */
18698 if ((code == GT || code == UNLE || code == LE || code == UNGT)
18699 && !TARGET_IEEE_FP)
18700 {
18701 rtx tmp = op0;
18702 op0 = op1;
18703 op1 = tmp;
18704 code = swap_condition (code);
18705 }
18706
18707 /* Try to expand the comparison and verify that we end up with
18708 a carry-flag-based comparison. This fails only when we decide
18709 to expand the comparison using arithmetic, which is not a
18710 common scenario. */
18711 start_sequence ();
18712 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18713 compare_seq = get_insns ();
18714 end_sequence ();
18715
18716 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
18717 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
18718 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
18719 else
18720 code = GET_CODE (compare_op);
18721
18722 if (code != LTU && code != GEU)
18723 return false;
18724
18725 emit_insn (compare_seq);
18726 *pop = compare_op;
18727 return true;
18728 }
18729
18730 if (!INTEGRAL_MODE_P (mode))
18731 return false;
18732
18733 switch (code)
18734 {
18735 case LTU:
18736 case GEU:
18737 break;
18738
18739 /* Convert a==0 into (unsigned)a<1. */
18740 case EQ:
18741 case NE:
18742 if (op1 != const0_rtx)
18743 return false;
18744 op1 = const1_rtx;
18745 code = (code == EQ ? LTU : GEU);
18746 break;
18747
18748 /* Convert a>b into b<a or a>=b+1. */
18749 case GTU:
18750 case LEU:
18751 if (CONST_INT_P (op1))
18752 {
18753 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
18754 /* Bail out on overflow. We could still swap the operands, but that
18755 would force loading the constant into a register. */
18756 if (op1 == const0_rtx
18757 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
18758 return false;
18759 code = (code == GTU ? GEU : LTU);
18760 }
18761 else
18762 {
18763 rtx tmp = op1;
18764 op1 = op0;
18765 op0 = tmp;
18766 code = (code == GTU ? LTU : GEU);
18767 }
18768 break;
18769
18770 /* Convert a>=0 into (unsigned)a<0x80000000. */
18771 case LT:
18772 case GE:
18773 if (mode == DImode || op1 != const0_rtx)
18774 return false;
18775 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18776 code = (code == LT ? GEU : LTU);
18777 break;
18778 case LE:
18779 case GT:
18780 if (mode == DImode || op1 != constm1_rtx)
18781 return false;
18782 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18783 code = (code == LE ? GEU : LTU);
18784 break;
18785
18786 default:
18787 return false;
18788 }
18789 /* Swapping operands may cause constant to appear as first operand. */
18790 if (!nonimmediate_operand (op0, VOIDmode))
18791 {
18792 if (!can_create_pseudo_p ())
18793 return false;
18794 op0 = force_reg (mode, op0);
18795 }
18796 *pop = ix86_expand_compare (code, op0, op1);
18797 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
18798 return true;
18799 }
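/* Worked examples of the rewrites above (editor's illustration, SImode,
   assuming the adjusted constants fit):

     a == 0   ->  (unsigned) a < 1
     a >  b   ->  b < a,  or  a >= b + 1  when b is a constant
     a >= 0   ->  (unsigned) a < 0x80000000
     a >  -1  ->  (unsigned) a < 0x80000000

   In every case the final test is LTU or GEU, i.e. a pure carry-flag
   comparison that sbb/adc based sequences can consume directly.  */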
18800
18801 bool
18802 ix86_expand_int_movcc (rtx operands[])
18803 {
18804 enum rtx_code code = GET_CODE (operands[1]), compare_code;
18805 rtx compare_seq, compare_op;
18806 enum machine_mode mode = GET_MODE (operands[0]);
18807 bool sign_bit_compare_p = false;
18808 rtx op0 = XEXP (operands[1], 0);
18809 rtx op1 = XEXP (operands[1], 1);
18810
18811 if (GET_MODE (op0) == TImode
18812 || (GET_MODE (op0) == DImode
18813 && !TARGET_64BIT))
18814 return false;
18815
18816 start_sequence ();
18817 compare_op = ix86_expand_compare (code, op0, op1);
18818 compare_seq = get_insns ();
18819 end_sequence ();
18820
18821 compare_code = GET_CODE (compare_op);
18822
18823 if ((op1 == const0_rtx && (code == GE || code == LT))
18824 || (op1 == constm1_rtx && (code == GT || code == LE)))
18825 sign_bit_compare_p = true;
18826
18827 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
18828 HImode insns, we'd be swallowed in word prefix ops. */
18829
18830 if ((mode != HImode || TARGET_FAST_PREFIX)
18831 && (mode != (TARGET_64BIT ? TImode : DImode))
18832 && CONST_INT_P (operands[2])
18833 && CONST_INT_P (operands[3]))
18834 {
18835 rtx out = operands[0];
18836 HOST_WIDE_INT ct = INTVAL (operands[2]);
18837 HOST_WIDE_INT cf = INTVAL (operands[3]);
18838 HOST_WIDE_INT diff;
18839
18840 diff = ct - cf;
18841 /* Sign-bit compares are better done using shifts than by using
18842 sbb. */
18843 if (sign_bit_compare_p
18844 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
18845 {
18846 /* Detect overlap between destination and compare sources. */
18847 rtx tmp = out;
18848
18849 if (!sign_bit_compare_p)
18850 {
18851 rtx flags;
18852 bool fpcmp = false;
18853
18854 compare_code = GET_CODE (compare_op);
18855
18856 flags = XEXP (compare_op, 0);
18857
18858 if (GET_MODE (flags) == CCFPmode
18859 || GET_MODE (flags) == CCFPUmode)
18860 {
18861 fpcmp = true;
18862 compare_code
18863 = ix86_fp_compare_code_to_integer (compare_code);
18864 }
18865
18866 /* To simplify rest of code, restrict to the GEU case. */
18867 if (compare_code == LTU)
18868 {
18869 HOST_WIDE_INT tmp = ct;
18870 ct = cf;
18871 cf = tmp;
18872 compare_code = reverse_condition (compare_code);
18873 code = reverse_condition (code);
18874 }
18875 else
18876 {
18877 if (fpcmp)
18878 PUT_CODE (compare_op,
18879 reverse_condition_maybe_unordered
18880 (GET_CODE (compare_op)));
18881 else
18882 PUT_CODE (compare_op,
18883 reverse_condition (GET_CODE (compare_op)));
18884 }
18885 diff = ct - cf;
18886
18887 if (reg_overlap_mentioned_p (out, op0)
18888 || reg_overlap_mentioned_p (out, op1))
18889 tmp = gen_reg_rtx (mode);
18890
18891 if (mode == DImode)
18892 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
18893 else
18894 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
18895 flags, compare_op));
18896 }
18897 else
18898 {
18899 if (code == GT || code == GE)
18900 code = reverse_condition (code);
18901 else
18902 {
18903 HOST_WIDE_INT tmp = ct;
18904 ct = cf;
18905 cf = tmp;
18906 diff = ct - cf;
18907 }
18908 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
18909 }
18910
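/* Editor's note on the idiom used below: after a carry-setting compare,
   "sbb %reg,%reg" computes reg - reg - CF, i.e. it leaves the register equal
   to -CF (all ones when the carry is set, zero otherwise).  At this point TMP
   holds that 0 / -1 mask, and the cases below turn it into ct / cf with a
   short add/or/not/and sequence.  */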
18911 if (diff == 1)
18912 {
18913 /*
18914 * cmpl op0,op1
18915 * sbbl dest,dest
18916 * [addl dest, ct]
18917 *
18918 * Size 5 - 8.
18919 */
18920 if (ct)
18921 tmp = expand_simple_binop (mode, PLUS,
18922 tmp, GEN_INT (ct),
18923 copy_rtx (tmp), 1, OPTAB_DIRECT);
18924 }
18925 else if (cf == -1)
18926 {
18927 /*
18928 * cmpl op0,op1
18929 * sbbl dest,dest
18930 * orl $ct, dest
18931 *
18932 * Size 8.
18933 */
18934 tmp = expand_simple_binop (mode, IOR,
18935 tmp, GEN_INT (ct),
18936 copy_rtx (tmp), 1, OPTAB_DIRECT);
18937 }
18938 else if (diff == -1 && ct)
18939 {
18940 /*
18941 * cmpl op0,op1
18942 * sbbl dest,dest
18943 * notl dest
18944 * [addl dest, cf]
18945 *
18946 * Size 8 - 11.
18947 */
18948 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18949 if (cf)
18950 tmp = expand_simple_binop (mode, PLUS,
18951 copy_rtx (tmp), GEN_INT (cf),
18952 copy_rtx (tmp), 1, OPTAB_DIRECT);
18953 }
18954 else
18955 {
18956 /*
18957 * cmpl op0,op1
18958 * sbbl dest,dest
18959 * [notl dest]
18960 * andl cf - ct, dest
18961 * [addl dest, ct]
18962 *
18963 * Size 8 - 11.
18964 */
18965
18966 if (cf == 0)
18967 {
18968 cf = ct;
18969 ct = 0;
18970 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18971 }
18972
18973 tmp = expand_simple_binop (mode, AND,
18974 copy_rtx (tmp),
18975 gen_int_mode (cf - ct, mode),
18976 copy_rtx (tmp), 1, OPTAB_DIRECT);
18977 if (ct)
18978 tmp = expand_simple_binop (mode, PLUS,
18979 copy_rtx (tmp), GEN_INT (ct),
18980 copy_rtx (tmp), 1, OPTAB_DIRECT);
18981 }
18982
18983 if (!rtx_equal_p (tmp, out))
18984 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
18985
18986 return true;
18987 }
18988
18989 if (diff < 0)
18990 {
18991 enum machine_mode cmp_mode = GET_MODE (op0);
18992
18993 HOST_WIDE_INT tmp;
18994 tmp = ct, ct = cf, cf = tmp;
18995 diff = -diff;
18996
18997 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18998 {
18999 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19000
19001 /* We may be reversing an unordered compare to a normal compare, which
19002 is not valid in general (we may convert a non-trapping condition
19003 into a trapping one); however, on i386 we currently emit all
19004 comparisons unordered. */
19005 compare_code = reverse_condition_maybe_unordered (compare_code);
19006 code = reverse_condition_maybe_unordered (code);
19007 }
19008 else
19009 {
19010 compare_code = reverse_condition (compare_code);
19011 code = reverse_condition (code);
19012 }
19013 }
19014
19015 compare_code = UNKNOWN;
19016 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19017 && CONST_INT_P (op1))
19018 {
19019 if (op1 == const0_rtx
19020 && (code == LT || code == GE))
19021 compare_code = code;
19022 else if (op1 == constm1_rtx)
19023 {
19024 if (code == LE)
19025 compare_code = LT;
19026 else if (code == GT)
19027 compare_code = GE;
19028 }
19029 }
19030
19031 /* Optimize dest = (op0 < 0) ? -1 : cf. */
19032 if (compare_code != UNKNOWN
19033 && GET_MODE (op0) == GET_MODE (out)
19034 && (cf == -1 || ct == -1))
19035 {
19036 /* If lea code below could be used, only optimize
19037 if it results in a 2 insn sequence. */
19038
19039 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19040 || diff == 3 || diff == 5 || diff == 9)
19041 || (compare_code == LT && ct == -1)
19042 || (compare_code == GE && cf == -1))
19043 {
19044 /*
19045 * notl op1 (if necessary)
19046 * sarl $31, op1
19047 * orl cf, op1
19048 */
19049 if (ct != -1)
19050 {
19051 cf = ct;
19052 ct = -1;
19053 code = reverse_condition (code);
19054 }
19055
19056 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19057
19058 out = expand_simple_binop (mode, IOR,
19059 out, GEN_INT (cf),
19060 out, 1, OPTAB_DIRECT);
19061 if (out != operands[0])
19062 emit_move_insn (operands[0], out);
19063
19064 return true;
19065 }
19066 }
19067
19068
19069 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19070 || diff == 3 || diff == 5 || diff == 9)
19071 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19072 && (mode != DImode
19073 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19074 {
19075 /*
19076 * xorl dest,dest
19077 * cmpl op1,op2
19078 * setcc dest
19079 * lea cf(dest*(ct-cf)),dest
19080 *
19081 * Size 14.
19082 *
19083 * This also catches the degenerate setcc-only case.
19084 */
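/* A concrete instance (editor's illustration): with ct = 5 and cf = 2 the
   difference is 3, so after something like "setcc %al; movzbl %al, %eax" the
   result is formed by a single "leal 2(%eax,%eax,2), %eax", i.e.
   dest = cf + dest*diff, giving 2 when the condition is false and 5 when it
   is true.  */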
19085
19086 rtx tmp;
19087 int nops;
19088
19089 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19090
19091 nops = 0;
19092 /* On x86_64 the lea instruction operates on Pmode, so we need
19093 to do the arithmetic in the proper mode to match. */
19094 if (diff == 1)
19095 tmp = copy_rtx (out);
19096 else
19097 {
19098 rtx out1;
19099 out1 = copy_rtx (out);
19100 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19101 nops++;
19102 if (diff & 1)
19103 {
19104 tmp = gen_rtx_PLUS (mode, tmp, out1);
19105 nops++;
19106 }
19107 }
19108 if (cf != 0)
19109 {
19110 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19111 nops++;
19112 }
19113 if (!rtx_equal_p (tmp, out))
19114 {
19115 if (nops == 1)
19116 out = force_operand (tmp, copy_rtx (out));
19117 else
19118 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19119 }
19120 if (!rtx_equal_p (out, operands[0]))
19121 emit_move_insn (operands[0], copy_rtx (out));
19122
19123 return true;
19124 }
19125
19126 /*
19127 * General case: Jumpful:
19128 * xorl dest,dest cmpl op1, op2
19129 * cmpl op1, op2 movl ct, dest
19130 * setcc dest jcc 1f
19131 * decl dest movl cf, dest
19132 * andl (cf-ct),dest 1:
19133 * addl ct,dest
19134 *
19135 * Size 20. Size 14.
19136 *
19137 * This is reasonably steep, but branch mispredict costs are
19138 * high on modern cpus, so consider failing only if optimizing
19139 * for space.
19140 */
19141
19142 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19143 && BRANCH_COST (optimize_insn_for_speed_p (),
19144 false) >= 2)
19145 {
19146 if (cf == 0)
19147 {
19148 enum machine_mode cmp_mode = GET_MODE (op0);
19149
19150 cf = ct;
19151 ct = 0;
19152
19153 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19154 {
19155 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19156
19157 /* We may be reversing an unordered compare to a normal compare,
19158 which is not valid in general (we may convert a non-trapping
19159 condition into a trapping one); however, on i386 we currently
19160 emit all comparisons unordered. */
19161 code = reverse_condition_maybe_unordered (code);
19162 }
19163 else
19164 {
19165 code = reverse_condition (code);
19166 if (compare_code != UNKNOWN)
19167 compare_code = reverse_condition (compare_code);
19168 }
19169 }
19170
19171 if (compare_code != UNKNOWN)
19172 {
19173 /* notl op1 (if needed)
19174 sarl $31, op1
19175 andl (cf-ct), op1
19176 addl ct, op1
19177
19178 For x < 0 (resp. x <= -1) there will be no notl,
19179 so if possible swap the constants to get rid of the
19180 complement.
19181 True/false will be -1/0 while code below (store flag
19182 followed by decrement) is 0/-1, so the constants need
19183 to be exchanged once more. */
19184
19185 if (compare_code == GE || !cf)
19186 {
19187 code = reverse_condition (code);
19188 compare_code = LT;
19189 }
19190 else
19191 {
19192 HOST_WIDE_INT tmp = cf;
19193 cf = ct;
19194 ct = tmp;
19195 }
19196
19197 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19198 }
19199 else
19200 {
19201 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19202
19203 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19204 constm1_rtx,
19205 copy_rtx (out), 1, OPTAB_DIRECT);
19206 }
19207
19208 out = expand_simple_binop (mode, AND, copy_rtx (out),
19209 gen_int_mode (cf - ct, mode),
19210 copy_rtx (out), 1, OPTAB_DIRECT);
19211 if (ct)
19212 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19213 copy_rtx (out), 1, OPTAB_DIRECT);
19214 if (!rtx_equal_p (out, operands[0]))
19215 emit_move_insn (operands[0], copy_rtx (out));
19216
19217 return true;
19218 }
19219 }
19220
19221 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19222 {
19223 /* Try a few things more with specific constants and a variable. */
19224
19225 optab op;
19226 rtx var, orig_out, out, tmp;
19227
19228 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19229 return false;
19230
19231 /* If one of the two operands is an interesting constant (0 or -1), recurse to
19232 load a 0/-1 mask and combine the variable with it via a logical operation. */
19233
19234 if (CONST_INT_P (operands[2]))
19235 {
19236 var = operands[3];
19237 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19238 operands[3] = constm1_rtx, op = and_optab;
19239 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19240 operands[3] = const0_rtx, op = ior_optab;
19241 else
19242 return false;
19243 }
19244 else if (CONST_INT_P (operands[3]))
19245 {
19246 var = operands[2];
19247 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19248 operands[2] = constm1_rtx, op = and_optab;
19249 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
19250 operands[2] = const0_rtx, op = ior_optab;
19251 else
19252 return false;
19253 }
19254 else
19255 return false;
19256
19257 orig_out = operands[0];
19258 tmp = gen_reg_rtx (mode);
19259 operands[0] = tmp;
19260
19261 /* Recurse to get the constant loaded. */
19262 if (ix86_expand_int_movcc (operands) == 0)
19263 return false;
19264
19265 /* Mask in the interesting variable. */
19266 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19267 OPTAB_WIDEN);
19268 if (!rtx_equal_p (out, orig_out))
19269 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19270
19271 return true;
19272 }
19273
19274 /*
19275 * For comparison with above,
19276 *
19277 * movl cf,dest
19278 * movl ct,tmp
19279 * cmpl op1,op2
19280 * cmovcc tmp,dest
19281 *
19282 * Size 15.
19283 */
19284
19285 if (! nonimmediate_operand (operands[2], mode))
19286 operands[2] = force_reg (mode, operands[2]);
19287 if (! nonimmediate_operand (operands[3], mode))
19288 operands[3] = force_reg (mode, operands[3]);
19289
19290 if (! register_operand (operands[2], VOIDmode)
19291 && (mode == QImode
19292 || ! register_operand (operands[3], VOIDmode)))
19293 operands[2] = force_reg (mode, operands[2]);
19294
19295 if (mode == QImode
19296 && ! register_operand (operands[3], VOIDmode))
19297 operands[3] = force_reg (mode, operands[3]);
19298
19299 emit_insn (compare_seq);
19300 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19301 gen_rtx_IF_THEN_ELSE (mode,
19302 compare_op, operands[2],
19303 operands[3])));
19304 return true;
19305 }
19306
19307 /* Swap, force into registers, or otherwise massage the two operands
19308 to an sse comparison with a mask result. Thus we differ a bit from
19309 ix86_prepare_fp_compare_args which expects to produce a flags result.
19310
19311 The DEST operand exists to help determine whether to commute commutative
19312 operators. The POP0/POP1 operands are updated in place. The new
19313 comparison code is returned, or UNKNOWN if not implementable. */
19314
19315 static enum rtx_code
19316 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19317 rtx *pop0, rtx *pop1)
19318 {
19319 rtx tmp;
19320
19321 switch (code)
19322 {
19323 case LTGT:
19324 case UNEQ:
19325 /* AVX supports all the needed comparisons. */
19326 if (TARGET_AVX)
19327 break;
19328 /* We have no LTGT as an operator. We could implement it with
19329 NE & ORDERED, but this requires an extra temporary. It's
19330 not clear that it's worth it. */
19331 return UNKNOWN;
19332
19333 case LT:
19334 case LE:
19335 case UNGT:
19336 case UNGE:
19337 /* These are supported directly. */
19338 break;
19339
19340 case EQ:
19341 case NE:
19342 case UNORDERED:
19343 case ORDERED:
19344 /* AVX has 3 operand comparisons, no need to swap anything. */
19345 if (TARGET_AVX)
19346 break;
19347 /* For commutative operators, try to canonicalize the destination
19348 operand to be first in the comparison - this helps reload to
19349 avoid extra moves. */
19350 if (!dest || !rtx_equal_p (dest, *pop1))
19351 break;
19352 /* FALLTHRU */
19353
19354 case GE:
19355 case GT:
19356 case UNLE:
19357 case UNLT:
19358 /* These are not supported directly before AVX, and furthermore
19359 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19360 comparison operands to transform into something that is
19361 supported. */
19362 tmp = *pop0;
19363 *pop0 = *pop1;
19364 *pop1 = tmp;
19365 code = swap_condition (code);
19366 break;
19367
19368 default:
19369 gcc_unreachable ();
19370 }
19371
19372 return code;
19373 }
19374
19375 /* Detect conditional moves that exactly match min/max operational
19376 semantics. Note that this is IEEE safe, as long as we don't
19377 interchange the operands.
19378
19379 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19380 and TRUE if the operation is successful and instructions are emitted. */
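/* Editor's example: "x < y ? x : y" arrives here as code == LT with
   IF_TRUE == CMP_OP0 and IF_FALSE == CMP_OP1, so IS_MIN is set.  Under
   -ffinite-math-only -funsafe-math-optimizations this becomes a plain SMIN
   (typically minss/minps); otherwise the UNSPEC_IEEE_MIN form is used, which
   keeps the operand order fixed so the hardware's NaN/signed-zero behaviour
   matches the conditional as written.  */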
19381
19382 static bool
19383 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19384 rtx cmp_op1, rtx if_true, rtx if_false)
19385 {
19386 enum machine_mode mode;
19387 bool is_min;
19388 rtx tmp;
19389
19390 if (code == LT)
19391 ;
19392 else if (code == UNGE)
19393 {
19394 tmp = if_true;
19395 if_true = if_false;
19396 if_false = tmp;
19397 }
19398 else
19399 return false;
19400
19401 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19402 is_min = true;
19403 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19404 is_min = false;
19405 else
19406 return false;
19407
19408 mode = GET_MODE (dest);
19409
19410 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19411 but MODE may be a vector mode and thus not appropriate. */
19412 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19413 {
19414 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19415 rtvec v;
19416
19417 if_true = force_reg (mode, if_true);
19418 v = gen_rtvec (2, if_true, if_false);
19419 tmp = gen_rtx_UNSPEC (mode, v, u);
19420 }
19421 else
19422 {
19423 code = is_min ? SMIN : SMAX;
19424 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19425 }
19426
19427 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19428 return true;
19429 }
19430
19431 /* Expand an sse vector comparison. Return the register with the result. */
19432
19433 static rtx
19434 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19435 rtx op_true, rtx op_false)
19436 {
19437 enum machine_mode mode = GET_MODE (dest);
19438 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19439 rtx x;
19440
19441 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19442 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19443 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19444
19445 if (optimize
19446 || reg_overlap_mentioned_p (dest, op_true)
19447 || reg_overlap_mentioned_p (dest, op_false))
19448 dest = gen_reg_rtx (mode);
19449
19450 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19451 if (cmp_mode != mode)
19452 {
19453 x = force_reg (cmp_mode, x);
19454 convert_move (dest, x, false);
19455 }
19456 else
19457 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19458
19459 return dest;
19460 }
19461
19462 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19463 operations. This is used for both scalar and vector conditional moves. */
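/* In the absence of blend instructions the function below falls back to the
   classic mask expansion; an equivalent C sketch (editor's illustration) is

     dest = (cmp & op_true) | (~cmp & op_false);

   where CMP is an all-ones / all-zeros element mask produced by the vector
   comparison.  The earlier branches simply special-case OP_TRUE/OP_FALSE
   being constant 0 or -1 so that one of the two terms can be dropped.  */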
19464
19465 static void
19466 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19467 {
19468 enum machine_mode mode = GET_MODE (dest);
19469 rtx t2, t3, x;
19470
19471 if (vector_all_ones_operand (op_true, mode)
19472 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19473 {
19474 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19475 }
19476 else if (op_false == CONST0_RTX (mode))
19477 {
19478 op_true = force_reg (mode, op_true);
19479 x = gen_rtx_AND (mode, cmp, op_true);
19480 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19481 }
19482 else if (op_true == CONST0_RTX (mode))
19483 {
19484 op_false = force_reg (mode, op_false);
19485 x = gen_rtx_NOT (mode, cmp);
19486 x = gen_rtx_AND (mode, x, op_false);
19487 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19488 }
19489 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19490 {
19491 op_false = force_reg (mode, op_false);
19492 x = gen_rtx_IOR (mode, cmp, op_false);
19493 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19494 }
19495 else if (TARGET_XOP)
19496 {
19497 op_true = force_reg (mode, op_true);
19498
19499 if (!nonimmediate_operand (op_false, mode))
19500 op_false = force_reg (mode, op_false);
19501
19502 emit_insn (gen_rtx_SET (mode, dest,
19503 gen_rtx_IF_THEN_ELSE (mode, cmp,
19504 op_true,
19505 op_false)));
19506 }
19507 else
19508 {
19509 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19510
19511 if (!nonimmediate_operand (op_true, mode))
19512 op_true = force_reg (mode, op_true);
19513
19514 op_false = force_reg (mode, op_false);
19515
19516 switch (mode)
19517 {
19518 case V4SFmode:
19519 if (TARGET_SSE4_1)
19520 gen = gen_sse4_1_blendvps;
19521 break;
19522 case V2DFmode:
19523 if (TARGET_SSE4_1)
19524 gen = gen_sse4_1_blendvpd;
19525 break;
19526 case V16QImode:
19527 case V8HImode:
19528 case V4SImode:
19529 case V2DImode:
19530 if (TARGET_SSE4_1)
19531 {
19532 gen = gen_sse4_1_pblendvb;
19533 dest = gen_lowpart (V16QImode, dest);
19534 op_false = gen_lowpart (V16QImode, op_false);
19535 op_true = gen_lowpart (V16QImode, op_true);
19536 cmp = gen_lowpart (V16QImode, cmp);
19537 }
19538 break;
19539 case V8SFmode:
19540 if (TARGET_AVX)
19541 gen = gen_avx_blendvps256;
19542 break;
19543 case V4DFmode:
19544 if (TARGET_AVX)
19545 gen = gen_avx_blendvpd256;
19546 break;
19547 case V32QImode:
19548 case V16HImode:
19549 case V8SImode:
19550 case V4DImode:
19551 if (TARGET_AVX2)
19552 {
19553 gen = gen_avx2_pblendvb;
19554 dest = gen_lowpart (V32QImode, dest);
19555 op_false = gen_lowpart (V32QImode, op_false);
19556 op_true = gen_lowpart (V32QImode, op_true);
19557 cmp = gen_lowpart (V32QImode, cmp);
19558 }
19559 break;
19560 default:
19561 break;
19562 }
19563
19564 if (gen != NULL)
19565 emit_insn (gen (dest, op_false, op_true, cmp));
19566 else
19567 {
19568 op_true = force_reg (mode, op_true);
19569
19570 t2 = gen_reg_rtx (mode);
19571 if (optimize)
19572 t3 = gen_reg_rtx (mode);
19573 else
19574 t3 = dest;
19575
19576 x = gen_rtx_AND (mode, op_true, cmp);
19577 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19578
19579 x = gen_rtx_NOT (mode, cmp);
19580 x = gen_rtx_AND (mode, x, op_false);
19581 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19582
19583 x = gen_rtx_IOR (mode, t3, t2);
19584 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19585 }
19586 }
19587 }
19588
19589 /* Expand a floating-point conditional move. Return true if successful. */
19590
19591 bool
19592 ix86_expand_fp_movcc (rtx operands[])
19593 {
19594 enum machine_mode mode = GET_MODE (operands[0]);
19595 enum rtx_code code = GET_CODE (operands[1]);
19596 rtx tmp, compare_op;
19597 rtx op0 = XEXP (operands[1], 0);
19598 rtx op1 = XEXP (operands[1], 1);
19599
19600 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19601 {
19602 enum machine_mode cmode;
19603
19604 /* Since we have no cmove for SSE registers, don't force bad register
19605 allocation just to gain access to it. Deny movcc when the
19606 comparison mode doesn't match the move mode. */
19607 cmode = GET_MODE (op0);
19608 if (cmode == VOIDmode)
19609 cmode = GET_MODE (op1);
19610 if (cmode != mode)
19611 return false;
19612
19613 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19614 if (code == UNKNOWN)
19615 return false;
19616
19617 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19618 operands[2], operands[3]))
19619 return true;
19620
19621 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19622 operands[2], operands[3]);
19623 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19624 return true;
19625 }
19626
19627 /* The floating point conditional move instructions don't directly
19628 support conditions resulting from a signed integer comparison. */
19629
19630 compare_op = ix86_expand_compare (code, op0, op1);
19631 if (!fcmov_comparison_operator (compare_op, VOIDmode))
19632 {
19633 tmp = gen_reg_rtx (QImode);
19634 ix86_expand_setcc (tmp, code, op0, op1);
19635
19636 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19637 }
19638
19639 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19640 gen_rtx_IF_THEN_ELSE (mode, compare_op,
19641 operands[2], operands[3])));
19642
19643 return true;
19644 }
19645
19646 /* Expand a floating-point vector conditional move; a vcond operation
19647 rather than a movcc operation. */
19648
19649 bool
19650 ix86_expand_fp_vcond (rtx operands[])
19651 {
19652 enum rtx_code code = GET_CODE (operands[3]);
19653 rtx cmp;
19654
19655 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19656 &operands[4], &operands[5]);
19657 if (code == UNKNOWN)
19658 {
19659 rtx temp;
19660 switch (GET_CODE (operands[3]))
19661 {
19662 case LTGT:
19663 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19664 operands[5], operands[0], operands[0]);
19665 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
19666 operands[5], operands[1], operands[2]);
19667 code = AND;
19668 break;
19669 case UNEQ:
19670 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
19671 operands[5], operands[0], operands[0]);
19672 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
19673 operands[5], operands[1], operands[2]);
19674 code = IOR;
19675 break;
19676 default:
19677 gcc_unreachable ();
19678 }
19679 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
19680 OPTAB_DIRECT);
19681 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19682 return true;
19683 }
19684
19685 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
19686 operands[5], operands[1], operands[2]))
19687 return true;
19688
19689 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
19690 operands[1], operands[2]);
19691 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19692 return true;
19693 }
19694
19695 /* Expand a signed/unsigned integral vector conditional move. */
19696
19697 bool
19698 ix86_expand_int_vcond (rtx operands[])
19699 {
19700 enum machine_mode data_mode = GET_MODE (operands[0]);
19701 enum machine_mode mode = GET_MODE (operands[4]);
19702 enum rtx_code code = GET_CODE (operands[3]);
19703 bool negate = false;
19704 rtx x, cop0, cop1;
19705
19706 cop0 = operands[4];
19707 cop1 = operands[5];
19708
19709 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
19710 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
19711 if ((code == LT || code == GE)
19712 && data_mode == mode
19713 && cop1 == CONST0_RTX (mode)
19714 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
19715 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
19716 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
19717 && (GET_MODE_SIZE (data_mode) == 16
19718 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
19719 {
19720 rtx negop = operands[2 - (code == LT)];
19721 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
19722 if (negop == CONST1_RTX (data_mode))
19723 {
19724 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
19725 operands[0], 1, OPTAB_DIRECT);
19726 if (res != operands[0])
19727 emit_move_insn (operands[0], res);
19728 return true;
19729 }
19730 else if (GET_MODE_INNER (data_mode) != DImode
19731 && vector_all_ones_operand (negop, data_mode))
19732 {
19733 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
19734 operands[0], 0, OPTAB_DIRECT);
19735 if (res != operands[0])
19736 emit_move_insn (operands[0], res);
19737 return true;
19738 }
19739 }
19740
19741 if (!nonimmediate_operand (cop1, mode))
19742 cop1 = force_reg (mode, cop1);
19743 if (!general_operand (operands[1], data_mode))
19744 operands[1] = force_reg (data_mode, operands[1]);
19745 if (!general_operand (operands[2], data_mode))
19746 operands[2] = force_reg (data_mode, operands[2]);
19747
19748 /* XOP supports all of the comparisons on all 128-bit vector int types. */
19749 if (TARGET_XOP
19750 && (mode == V16QImode || mode == V8HImode
19751 || mode == V4SImode || mode == V2DImode))
19752 ;
19753 else
19754 {
19755 /* Canonicalize the comparison to EQ, GT, GTU. */
19756 switch (code)
19757 {
19758 case EQ:
19759 case GT:
19760 case GTU:
19761 break;
19762
19763 case NE:
19764 case LE:
19765 case LEU:
19766 code = reverse_condition (code);
19767 negate = true;
19768 break;
19769
19770 case GE:
19771 case GEU:
19772 code = reverse_condition (code);
19773 negate = true;
19774 /* FALLTHRU */
19775
19776 case LT:
19777 case LTU:
19778 code = swap_condition (code);
19779 x = cop0, cop0 = cop1, cop1 = x;
19780 break;
19781
19782 default:
19783 gcc_unreachable ();
19784 }
19785
19786 /* Only SSE4.1/SSE4.2 supports V2DImode. */
19787 if (mode == V2DImode)
19788 {
19789 switch (code)
19790 {
19791 case EQ:
19792 /* SSE4.1 supports EQ. */
19793 if (!TARGET_SSE4_1)
19794 return false;
19795 break;
19796
19797 case GT:
19798 case GTU:
19799 /* SSE4.2 supports GT/GTU. */
19800 if (!TARGET_SSE4_2)
19801 return false;
19802 break;
19803
19804 default:
19805 gcc_unreachable ();
19806 }
19807 }
19808
19809 /* Unsigned parallel compare is not supported by the hardware.
19810 Play some tricks to turn this into a signed comparison
19811 against 0. */
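/* Editor's sketch of the two tricks used below.  For the dword/qword case,
   subtracting the per-element sign-bit constant (e.g. 0x80000000 for
   V4SImode) biases both operands, so

     x >u y   <==>   (x - 0x80000000) >s (y - 0x80000000)

   and the existing signed pcmpgt patterns can be used.  For the byte/word
   case a saturating subtract is used instead:

     x >u y   <==>   (x -us y) != 0

   which is expressed below as EQ against zero with the NEGATE flag
   flipped.  */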
19812 if (code == GTU)
19813 {
19814 cop0 = force_reg (mode, cop0);
19815
19816 switch (mode)
19817 {
19818 case V8SImode:
19819 case V4DImode:
19820 case V4SImode:
19821 case V2DImode:
19822 {
19823 rtx t1, t2, mask;
19824 rtx (*gen_sub3) (rtx, rtx, rtx);
19825
19826 switch (mode)
19827 {
19828 case V8SImode: gen_sub3 = gen_subv8si3; break;
19829 case V4DImode: gen_sub3 = gen_subv4di3; break;
19830 case V4SImode: gen_sub3 = gen_subv4si3; break;
19831 case V2DImode: gen_sub3 = gen_subv2di3; break;
19832 default:
19833 gcc_unreachable ();
19834 }
19835 /* Subtract (-(INT MAX) - 1) from both operands to make
19836 them signed. */
19837 mask = ix86_build_signbit_mask (mode, true, false);
19838 t1 = gen_reg_rtx (mode);
19839 emit_insn (gen_sub3 (t1, cop0, mask));
19840
19841 t2 = gen_reg_rtx (mode);
19842 emit_insn (gen_sub3 (t2, cop1, mask));
19843
19844 cop0 = t1;
19845 cop1 = t2;
19846 code = GT;
19847 }
19848 break;
19849
19850 case V32QImode:
19851 case V16HImode:
19852 case V16QImode:
19853 case V8HImode:
19854 /* Perform a parallel unsigned saturating subtraction. */
19855 x = gen_reg_rtx (mode);
19856 emit_insn (gen_rtx_SET (VOIDmode, x,
19857 gen_rtx_US_MINUS (mode, cop0, cop1)));
19858
19859 cop0 = x;
19860 cop1 = CONST0_RTX (mode);
19861 code = EQ;
19862 negate = !negate;
19863 break;
19864
19865 default:
19866 gcc_unreachable ();
19867 }
19868 }
19869 }
19870
19871 /* Allow the comparison to be done in one mode, but the movcc to
19872 happen in another mode. */
19873 if (data_mode == mode)
19874 {
19875 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
19876 operands[1+negate], operands[2-negate]);
19877 }
19878 else
19879 {
19880 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
19881 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
19882 code, cop0, cop1,
19883 operands[1+negate], operands[2-negate]);
19884 x = gen_lowpart (data_mode, x);
19885 }
19886
19887 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
19888 operands[2-negate]);
19889 return true;
19890 }
19891
19892 /* Expand a variable vector permutation. */
19893
19894 void
19895 ix86_expand_vec_perm (rtx operands[])
19896 {
19897 rtx target = operands[0];
19898 rtx op0 = operands[1];
19899 rtx op1 = operands[2];
19900 rtx mask = operands[3];
19901 rtx t1, t2, t3, t4, vt, vt2, vec[32];
19902 enum machine_mode mode = GET_MODE (op0);
19903 enum machine_mode maskmode = GET_MODE (mask);
19904 int w, e, i;
19905 bool one_operand_shuffle = rtx_equal_p (op0, op1);
19906
19907 /* Number of elements in the vector. */
19908 w = GET_MODE_NUNITS (mode);
19909 e = GET_MODE_UNIT_SIZE (mode);
19910 gcc_assert (w <= 32);
19911
19912 if (TARGET_AVX2)
19913 {
19914 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
19915 {
19916 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
19917 a constant shuffle operand. With a tiny bit of effort we can
19918 use VPERMD instead. A re-interpretation stall for V4DFmode is
19919 unfortunate but there's no avoiding it.
19920 Similarly, for V16HImode we don't have instructions for variable
19921 shuffling, while for V32QImode we can, after preparing suitable
19922 masks, use vpshufb; vpshufb; vpermq; vpor. */
19923
19924 if (mode == V16HImode)
19925 {
19926 maskmode = mode = V32QImode;
19927 w = 32;
19928 e = 1;
19929 }
19930 else
19931 {
19932 maskmode = mode = V8SImode;
19933 w = 8;
19934 e = 4;
19935 }
19936 t1 = gen_reg_rtx (maskmode);
19937
19938 /* Replicate the low bits of the V4DImode mask into V8SImode:
19939 mask = { A B C D }
19940 t1 = { A A B B C C D D }. */
19941 for (i = 0; i < w / 2; ++i)
19942 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
19943 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19944 vt = force_reg (maskmode, vt);
19945 mask = gen_lowpart (maskmode, mask);
19946 if (maskmode == V8SImode)
19947 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
19948 else
19949 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
19950
19951 /* Multiply the shuffle indices by two. */
19952 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
19953 OPTAB_DIRECT);
19954
19955 /* Add one to the odd shuffle indices:
19956 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
19957 for (i = 0; i < w / 2; ++i)
19958 {
19959 vec[i * 2] = const0_rtx;
19960 vec[i * 2 + 1] = const1_rtx;
19961 }
19962 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19963 vt = force_const_mem (maskmode, vt);
19964 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
19965 OPTAB_DIRECT);
19966
19967 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
19968 operands[3] = mask = t1;
19969 target = gen_lowpart (mode, target);
19970 op0 = gen_lowpart (mode, op0);
19971 op1 = gen_lowpart (mode, op1);
19972 }
19973
19974 switch (mode)
19975 {
19976 case V8SImode:
19977 /* The VPERMD and VPERMPS instructions already properly ignore
19978 the high bits of the shuffle elements. No need for us to
19979 perform an AND ourselves. */
19980 if (one_operand_shuffle)
19981 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
19982 else
19983 {
19984 t1 = gen_reg_rtx (V8SImode);
19985 t2 = gen_reg_rtx (V8SImode);
19986 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
19987 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
19988 goto merge_two;
19989 }
19990 return;
19991
19992 case V8SFmode:
19993 mask = gen_lowpart (V8SFmode, mask);
19994 if (one_operand_shuffle)
19995 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
19996 else
19997 {
19998 t1 = gen_reg_rtx (V8SFmode);
19999 t2 = gen_reg_rtx (V8SFmode);
20000 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20001 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20002 goto merge_two;
20003 }
20004 return;
20005
20006 case V4SImode:
20007 /* By combining the two 128-bit input vectors into one 256-bit
20008 input vector, we can use VPERMD and VPERMPS for the full
20009 two-operand shuffle. */
20010 t1 = gen_reg_rtx (V8SImode);
20011 t2 = gen_reg_rtx (V8SImode);
20012 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20013 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20014 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20015 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20016 return;
20017
20018 case V4SFmode:
20019 t1 = gen_reg_rtx (V8SFmode);
20020 t2 = gen_reg_rtx (V8SImode);
20021 mask = gen_lowpart (V4SImode, mask);
20022 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20023 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20024 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20025 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20026 return;
20027
20028 case V32QImode:
20029 t1 = gen_reg_rtx (V32QImode);
20030 t2 = gen_reg_rtx (V32QImode);
20031 t3 = gen_reg_rtx (V32QImode);
20032 vt2 = GEN_INT (128);
20033 for (i = 0; i < 32; i++)
20034 vec[i] = vt2;
20035 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20036 vt = force_reg (V32QImode, vt);
20037 for (i = 0; i < 32; i++)
20038 vec[i] = i < 16 ? vt2 : const0_rtx;
20039 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20040 vt2 = force_reg (V32QImode, vt2);
20041 /* From mask create two adjusted masks, which contain the same
20042 bits as mask in the low 7 bits of each vector element.
20043 The first mask will have the most significant bit clear
20044 if it requests element from the same 128-bit lane
20045 and MSB set if it requests element from the other 128-bit lane.
20046 The second mask will have the opposite values of the MSB,
20047 and additionally will have its 128-bit lanes swapped.
20048 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20049 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
20050 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20051 stands for other 12 bytes. */
20052 /* The bit that says whether an element is from the same lane or the other
20053 lane is bit 4, so shift it up by 3 to the MSB position. */
20054 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20055 gen_lowpart (V4DImode, mask),
20056 GEN_INT (3)));
20057 /* Clear MSB bits from the mask just in case it had them set. */
20058 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20059 /* After this t1 will have MSB set for elements from other lane. */
20060 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20061 /* Clear bits other than MSB. */
20062 emit_insn (gen_andv32qi3 (t1, t1, vt));
20063 /* Or in the lower bits from mask into t3. */
20064 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20065 /* And invert MSB bits in t1, so MSB is set for elements from the same
20066 lane. */
20067 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20068 /* Swap 128-bit lanes in t3. */
20069 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20070 gen_lowpart (V4DImode, t3),
20071 const2_rtx, GEN_INT (3),
20072 const0_rtx, const1_rtx));
20073 /* And or in the lower bits from mask into t1. */
20074 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20075 if (one_operand_shuffle)
20076 {
20077 /* Each of these shuffles will put 0s in places where
20078 element from the other 128-bit lane is needed, otherwise
20079 will shuffle in the requested value. */
20080 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20081 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20082 /* For t3 the 128-bit lanes are swapped again. */
20083 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20084 gen_lowpart (V4DImode, t3),
20085 const2_rtx, GEN_INT (3),
20086 const0_rtx, const1_rtx));
20087 /* And oring both together leads to the result. */
20088 emit_insn (gen_iorv32qi3 (target, t1, t3));
20089 return;
20090 }
20091
20092 t4 = gen_reg_rtx (V32QImode);
20093 /* Similarly to the above one_operand_shuffle code,
20094 just repeated twice for each operand.  The code following
20095 the merge_two label merges the two results together. */
20096 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20097 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20098 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20099 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20100 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20101 gen_lowpart (V4DImode, t4),
20102 const2_rtx, GEN_INT (3),
20103 const0_rtx, const1_rtx));
20104 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20105 gen_lowpart (V4DImode, t3),
20106 const2_rtx, GEN_INT (3),
20107 const0_rtx, const1_rtx));
20108 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20109 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20110 t1 = t4;
20111 t2 = t3;
20112 goto merge_two;
20113
20114 default:
20115 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20116 break;
20117 }
20118 }
20119
20120 if (TARGET_XOP)
20121 {
20122 /* The XOP VPPERM insn supports three inputs. By ignoring the
20123 one_operand_shuffle special case, we avoid creating another
20124 set of constant vectors in memory. */
20125 one_operand_shuffle = false;
20126
20127 /* mask = mask & {2*w-1, ...} */
20128 vt = GEN_INT (2*w - 1);
20129 }
20130 else
20131 {
20132 /* mask = mask & {w-1, ...} */
20133 vt = GEN_INT (w - 1);
20134 }
20135
20136 for (i = 0; i < w; i++)
20137 vec[i] = vt;
20138 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20139 mask = expand_simple_binop (maskmode, AND, mask, vt,
20140 NULL_RTX, 0, OPTAB_DIRECT);
20141
20142 /* For non-QImode operations, convert the word permutation control
20143 into a byte permutation control. */
20144 if (mode != V16QImode)
20145 {
20146 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20147 GEN_INT (exact_log2 (e)),
20148 NULL_RTX, 0, OPTAB_DIRECT);
20149
20150 /* Convert mask to vector of chars. */
20151 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20152
20153 /* Replicate each of the input bytes into byte positions:
20154 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20155 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20156 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20157 for (i = 0; i < 16; ++i)
20158 vec[i] = GEN_INT (i/e * e);
20159 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20160 vt = force_const_mem (V16QImode, vt);
20161 if (TARGET_XOP)
20162 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20163 else
20164 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20165
20166 /* Convert it into the byte positions by doing
20167 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
20168 for (i = 0; i < 16; ++i)
20169 vec[i] = GEN_INT (i % e);
20170 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20171 vt = force_const_mem (V16QImode, vt);
20172 emit_insn (gen_addv16qi3 (mask, mask, vt));
20173 }
20174
20175 /* The actual shuffle operations all operate on V16QImode. */
20176 op0 = gen_lowpart (V16QImode, op0);
20177 op1 = gen_lowpart (V16QImode, op1);
20178 target = gen_lowpart (V16QImode, target);
20179
20180 if (TARGET_XOP)
20181 {
20182 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20183 }
20184 else if (one_operand_shuffle)
20185 {
20186 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20187 }
20188 else
20189 {
20190 rtx xops[6];
20191 bool ok;
20192
20193 /* Shuffle the two input vectors independently. */
20194 t1 = gen_reg_rtx (V16QImode);
20195 t2 = gen_reg_rtx (V16QImode);
20196 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20197 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20198
20199 merge_two:
20200 /* Then merge them together. The key is whether any given control
20201 element contained a bit set that indicates the second word. */
20202 mask = operands[3];
20203 vt = GEN_INT (w);
20204 if (maskmode == V2DImode && !TARGET_SSE4_1)
20205 {
20206 /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
20207 more shuffle to convert the V2DI input mask into a V4SI
20208 input mask.  At that point the masking done by
20209 expand_int_vcond will work as desired. */
20210 rtx t3 = gen_reg_rtx (V4SImode);
20211 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20212 const0_rtx, const0_rtx,
20213 const2_rtx, const2_rtx));
20214 mask = t3;
20215 maskmode = V4SImode;
20216 e = w = 4;
20217 }
20218
20219 for (i = 0; i < w; i++)
20220 vec[i] = vt;
20221 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20222 vt = force_reg (maskmode, vt);
20223 mask = expand_simple_binop (maskmode, AND, mask, vt,
20224 NULL_RTX, 0, OPTAB_DIRECT);
20225
20226 xops[0] = gen_lowpart (mode, operands[0]);
20227 xops[1] = gen_lowpart (mode, t2);
20228 xops[2] = gen_lowpart (mode, t1);
20229 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20230 xops[4] = mask;
20231 xops[5] = vt;
20232 ok = ix86_expand_int_vcond (xops);
20233 gcc_assert (ok);
20234 }
20235 }
20236
20237 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
20238 true if we should do zero extension, else sign extension. HIGH_P is
20239 true if we want the N/2 high elements, else the low elements. */
20240
20241 void
20242 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
20243 {
20244 enum machine_mode imode = GET_MODE (src);
20245 rtx tmp;
20246
20247 if (TARGET_SSE4_1)
20248 {
20249 rtx (*unpack)(rtx, rtx);
20250 rtx (*extract)(rtx, rtx) = NULL;
20251 enum machine_mode halfmode = BLKmode;
20252
20253 switch (imode)
20254 {
20255 case V32QImode:
20256 if (unsigned_p)
20257 unpack = gen_avx2_zero_extendv16qiv16hi2;
20258 else
20259 unpack = gen_avx2_sign_extendv16qiv16hi2;
20260 halfmode = V16QImode;
20261 extract
20262 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20263 break;
20264 case V16HImode:
20265 if (unsigned_p)
20266 unpack = gen_avx2_zero_extendv8hiv8si2;
20267 else
20268 unpack = gen_avx2_sign_extendv8hiv8si2;
20269 halfmode = V8HImode;
20270 extract
20271 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20272 break;
20273 case V8SImode:
20274 if (unsigned_p)
20275 unpack = gen_avx2_zero_extendv4siv4di2;
20276 else
20277 unpack = gen_avx2_sign_extendv4siv4di2;
20278 halfmode = V4SImode;
20279 extract
20280 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20281 break;
20282 case V16QImode:
20283 if (unsigned_p)
20284 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20285 else
20286 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20287 break;
20288 case V8HImode:
20289 if (unsigned_p)
20290 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20291 else
20292 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20293 break;
20294 case V4SImode:
20295 if (unsigned_p)
20296 unpack = gen_sse4_1_zero_extendv2siv2di2;
20297 else
20298 unpack = gen_sse4_1_sign_extendv2siv2di2;
20299 break;
20300 default:
20301 gcc_unreachable ();
20302 }
20303
20304 if (GET_MODE_SIZE (imode) == 32)
20305 {
20306 tmp = gen_reg_rtx (halfmode);
20307 emit_insn (extract (tmp, src));
20308 }
20309 else if (high_p)
20310 {
20311 /* Shift higher 8 bytes to lower 8 bytes. */
20312 tmp = gen_reg_rtx (imode);
20313 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20314 gen_lowpart (V1TImode, src),
20315 GEN_INT (64)));
20316 }
20317 else
20318 tmp = src;
20319
20320 emit_insn (unpack (dest, tmp));
20321 }
20322 else
20323 {
20324 rtx (*unpack)(rtx, rtx, rtx);
20325
20326 switch (imode)
20327 {
20328 case V16QImode:
20329 if (high_p)
20330 unpack = gen_vec_interleave_highv16qi;
20331 else
20332 unpack = gen_vec_interleave_lowv16qi;
20333 break;
20334 case V8HImode:
20335 if (high_p)
20336 unpack = gen_vec_interleave_highv8hi;
20337 else
20338 unpack = gen_vec_interleave_lowv8hi;
20339 break;
20340 case V4SImode:
20341 if (high_p)
20342 unpack = gen_vec_interleave_highv4si;
20343 else
20344 unpack = gen_vec_interleave_lowv4si;
20345 break;
20346 default:
20347 gcc_unreachable ();
20348 }
20349
20350 if (unsigned_p)
20351 tmp = force_reg (imode, CONST0_RTX (imode));
20352 else
20353 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20354 src, pc_rtx, pc_rtx);
20355
20356 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
20357 }
20358 }
20359
20360 /* Expand conditional increment or decrement using adc/sbb instructions.
20361 The default case using setcc followed by the conditional move
20362 can be done by generic code. */
20363 bool
20364 ix86_expand_int_addcc (rtx operands[])
20365 {
20366 enum rtx_code code = GET_CODE (operands[1]);
20367 rtx flags;
20368 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20369 rtx compare_op;
20370 rtx val = const0_rtx;
20371 bool fpcmp = false;
20372 enum machine_mode mode;
20373 rtx op0 = XEXP (operands[1], 0);
20374 rtx op1 = XEXP (operands[1], 1);
20375
20376 if (operands[3] != const1_rtx
20377 && operands[3] != constm1_rtx)
20378 return false;
20379 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20380 return false;
20381 code = GET_CODE (compare_op);
20382
20383 flags = XEXP (compare_op, 0);
20384
20385 if (GET_MODE (flags) == CCFPmode
20386 || GET_MODE (flags) == CCFPUmode)
20387 {
20388 fpcmp = true;
20389 code = ix86_fp_compare_code_to_integer (code);
20390 }
20391
20392 if (code != LTU)
20393 {
20394 val = constm1_rtx;
20395 if (fpcmp)
20396 PUT_CODE (compare_op,
20397 reverse_condition_maybe_unordered
20398 (GET_CODE (compare_op)));
20399 else
20400 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20401 }
20402
20403 mode = GET_MODE (operands[0]);
20404
20405 /* Construct either adc or sbb insn. */
20406 if ((code == LTU) == (operands[3] == constm1_rtx))
20407 {
20408 switch (mode)
20409 {
20410 case QImode:
20411 insn = gen_subqi3_carry;
20412 break;
20413 case HImode:
20414 insn = gen_subhi3_carry;
20415 break;
20416 case SImode:
20417 insn = gen_subsi3_carry;
20418 break;
20419 case DImode:
20420 insn = gen_subdi3_carry;
20421 break;
20422 default:
20423 gcc_unreachable ();
20424 }
20425 }
20426 else
20427 {
20428 switch (mode)
20429 {
20430 case QImode:
20431 insn = gen_addqi3_carry;
20432 break;
20433 case HImode:
20434 insn = gen_addhi3_carry;
20435 break;
20436 case SImode:
20437 insn = gen_addsi3_carry;
20438 break;
20439 case DImode:
20440 insn = gen_adddi3_carry;
20441 break;
20442 default:
20443 gcc_unreachable ();
20444 }
20445 }
20446 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20447
20448 return true;
20449 }
20450
20451
20452 /* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
20453 but works for floating-point parameters and non-offsettable memories.
20454 For pushes, it returns just stack offsets; the values will be saved
20455 in the right order.  Maximally four parts are generated. */
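/* For instance (sketch of the cases handled below): on a 32-bit target a
   DFmode value splits into two SImode parts and an XFmode value into three,
   while on a 64-bit target an XFmode or TFmode value splits into a DImode
   part plus an SImode or DImode upper part.  */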
20456
20457 static int
20458 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20459 {
20460 int size;
20461
20462 if (!TARGET_64BIT)
20463 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20464 else
20465 size = (GET_MODE_SIZE (mode) + 4) / 8;
20466
20467 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20468 gcc_assert (size >= 2 && size <= 4);
20469
20470 /* Optimize constant pool references to immediates.  This is used by fp
20471 moves, which force all constants to memory to allow combining. */
20472 if (MEM_P (operand) && MEM_READONLY_P (operand))
20473 {
20474 rtx tmp = maybe_get_pool_constant (operand);
20475 if (tmp)
20476 operand = tmp;
20477 }
20478
20479 if (MEM_P (operand) && !offsettable_memref_p (operand))
20480 {
20481 /* The only non-offsettable memories we handle are pushes. */
20482 int ok = push_operand (operand, VOIDmode);
20483
20484 gcc_assert (ok);
20485
20486 operand = copy_rtx (operand);
20487 PUT_MODE (operand, word_mode);
20488 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20489 return size;
20490 }
20491
20492 if (GET_CODE (operand) == CONST_VECTOR)
20493 {
20494 enum machine_mode imode = int_mode_for_mode (mode);
20495 /* Caution: if we looked through a constant pool memory above,
20496 the operand may actually have a different mode now. That's
20497 ok, since we want to pun this all the way back to an integer. */
20498 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20499 gcc_assert (operand != NULL);
20500 mode = imode;
20501 }
20502
20503 if (!TARGET_64BIT)
20504 {
20505 if (mode == DImode)
20506 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20507 else
20508 {
20509 int i;
20510
20511 if (REG_P (operand))
20512 {
20513 gcc_assert (reload_completed);
20514 for (i = 0; i < size; i++)
20515 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20516 }
20517 else if (offsettable_memref_p (operand))
20518 {
20519 operand = adjust_address (operand, SImode, 0);
20520 parts[0] = operand;
20521 for (i = 1; i < size; i++)
20522 parts[i] = adjust_address (operand, SImode, 4 * i);
20523 }
20524 else if (GET_CODE (operand) == CONST_DOUBLE)
20525 {
20526 REAL_VALUE_TYPE r;
20527 long l[4];
20528
20529 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20530 switch (mode)
20531 {
20532 case TFmode:
20533 real_to_target (l, &r, mode);
20534 parts[3] = gen_int_mode (l[3], SImode);
20535 parts[2] = gen_int_mode (l[2], SImode);
20536 break;
20537 case XFmode:
20538 REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
20539 parts[2] = gen_int_mode (l[2], SImode);
20540 break;
20541 case DFmode:
20542 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20543 break;
20544 default:
20545 gcc_unreachable ();
20546 }
20547 parts[1] = gen_int_mode (l[1], SImode);
20548 parts[0] = gen_int_mode (l[0], SImode);
20549 }
20550 else
20551 gcc_unreachable ();
20552 }
20553 }
20554 else
20555 {
20556 if (mode == TImode)
20557 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20558 if (mode == XFmode || mode == TFmode)
20559 {
20560 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
20561 if (REG_P (operand))
20562 {
20563 gcc_assert (reload_completed);
20564 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20565 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20566 }
20567 else if (offsettable_memref_p (operand))
20568 {
20569 operand = adjust_address (operand, DImode, 0);
20570 parts[0] = operand;
20571 parts[1] = adjust_address (operand, upper_mode, 8);
20572 }
20573 else if (GET_CODE (operand) == CONST_DOUBLE)
20574 {
20575 REAL_VALUE_TYPE r;
20576 long l[4];
20577
20578 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20579 real_to_target (l, &r, mode);
20580
20581 /* Do not use shift by 32 to avoid warning on 32bit systems. */
20582 if (HOST_BITS_PER_WIDE_INT >= 64)
20583 parts[0]
20584 = gen_int_mode
20585 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20586 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20587 DImode);
20588 else
20589 parts[0] = immed_double_const (l[0], l[1], DImode);
20590
20591 if (upper_mode == SImode)
20592 parts[1] = gen_int_mode (l[2], SImode);
20593 else if (HOST_BITS_PER_WIDE_INT >= 64)
20594 parts[1]
20595 = gen_int_mode
20596 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20597 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20598 DImode);
20599 else
20600 parts[1] = immed_double_const (l[2], l[3], DImode);
20601 }
20602 else
20603 gcc_unreachable ();
20604 }
20605 }
20606
20607 return size;
20608 }
20609
20610 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
20611 The value is split into up to four parts; operands 2-5 receive the
20612 destination parts and operands 6-9 the source parts, in the correct
20613 order, and the required moves are emitted here. */
20614
20615 void
20616 ix86_split_long_move (rtx operands[])
20617 {
20618 rtx part[2][4];
20619 int nparts, i, j;
20620 int push = 0;
20621 int collisions = 0;
20622 enum machine_mode mode = GET_MODE (operands[0]);
20623 bool collisionparts[4];
20624
20625 /* The DFmode expanders may ask us to move a double.
20626 For a 64-bit target this is a single move.  By hiding the fact
20627 here we simplify i386.md splitters. */
20628 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20629 {
20630 /* Optimize constant pool references to immediates.  This is used by
20631 fp moves, which force all constants to memory to allow combining. */
20632
20633 if (MEM_P (operands[1])
20634 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20635 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20636 operands[1] = get_pool_constant (XEXP (operands[1], 0));
20637 if (push_operand (operands[0], VOIDmode))
20638 {
20639 operands[0] = copy_rtx (operands[0]);
20640 PUT_MODE (operands[0], word_mode);
20641 }
20642 else
20643 operands[0] = gen_lowpart (DImode, operands[0]);
20644 operands[1] = gen_lowpart (DImode, operands[1]);
20645 emit_move_insn (operands[0], operands[1]);
20646 return;
20647 }
20648
20649 /* The only non-offsettable memory we handle is push. */
20650 if (push_operand (operands[0], VOIDmode))
20651 push = 1;
20652 else
20653 gcc_assert (!MEM_P (operands[0])
20654 || offsettable_memref_p (operands[0]));
20655
20656 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20657 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20658
20659 /* When emitting a push, take care of source operands on the stack. */
20660 if (push && MEM_P (operands[1])
20661 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20662 {
20663 rtx src_base = XEXP (part[1][nparts - 1], 0);
20664
20665 /* Compensate for the stack decrement by 4. */
20666 if (!TARGET_64BIT && nparts == 3
20667 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
20668 src_base = plus_constant (Pmode, src_base, 4);
20669
20670 /* src_base refers to the stack pointer and is
20671 automatically decreased by the emitted pushes. */
20672 for (i = 0; i < nparts; i++)
20673 part[1][i] = change_address (part[1][i],
20674 GET_MODE (part[1][i]), src_base);
20675 }
20676
20677 /* We need to do the copy in the right order in case an address register
20678 of the source overlaps the destination. */
20679 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
20680 {
20681 rtx tmp;
20682
20683 for (i = 0; i < nparts; i++)
20684 {
20685 collisionparts[i]
20686 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
20687 if (collisionparts[i])
20688 collisions++;
20689 }
20690
20691 /* Collision in the middle part can be handled by reordering. */
20692 if (collisions == 1 && nparts == 3 && collisionparts [1])
20693 {
20694 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20695 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20696 }
20697 else if (collisions == 1
20698 && nparts == 4
20699 && (collisionparts [1] || collisionparts [2]))
20700 {
20701 if (collisionparts [1])
20702 {
20703 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20704 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20705 }
20706 else
20707 {
20708 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
20709 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
20710 }
20711 }
20712
20713 /* If there are more collisions, we can't handle them by reordering.
20714 Do an lea to the last part and use only one colliding move. */
20715 else if (collisions > 1)
20716 {
20717 rtx base;
20718
20719 collisions = 1;
20720
20721 base = part[0][nparts - 1];
20722
20723 /* Handle the case when the last part isn't valid for lea.
20724 Happens in 64-bit mode storing the 12-byte XFmode. */
20725 if (GET_MODE (base) != Pmode)
20726 base = gen_rtx_REG (Pmode, REGNO (base));
20727
20728 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
20729 part[1][0] = replace_equiv_address (part[1][0], base);
20730 for (i = 1; i < nparts; i++)
20731 {
20732 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
20733 part[1][i] = replace_equiv_address (part[1][i], tmp);
20734 }
20735 }
20736 }
20737
20738 if (push)
20739 {
20740 if (!TARGET_64BIT)
20741 {
20742 if (nparts == 3)
20743 {
20744 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
20745 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
20746 stack_pointer_rtx, GEN_INT (-4)));
20747 emit_move_insn (part[0][2], part[1][2]);
20748 }
20749 else if (nparts == 4)
20750 {
20751 emit_move_insn (part[0][3], part[1][3]);
20752 emit_move_insn (part[0][2], part[1][2]);
20753 }
20754 }
20755 else
20756 {
20757 /* In 64bit mode we don't have 32bit push available.  In case this is a
20758 register, it is OK - we will just use the larger counterpart.  We also
20759 retype memory - this comes from an attempt to avoid a REX prefix on
20760 moving the second half of a TFmode value. */
20761 if (GET_MODE (part[1][1]) == SImode)
20762 {
20763 switch (GET_CODE (part[1][1]))
20764 {
20765 case MEM:
20766 part[1][1] = adjust_address (part[1][1], DImode, 0);
20767 break;
20768
20769 case REG:
20770 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
20771 break;
20772
20773 default:
20774 gcc_unreachable ();
20775 }
20776
20777 if (GET_MODE (part[1][0]) == SImode)
20778 part[1][0] = part[1][1];
20779 }
20780 }
20781 emit_move_insn (part[0][1], part[1][1]);
20782 emit_move_insn (part[0][0], part[1][0]);
20783 return;
20784 }
20785
20786 /* Choose the correct order so as not to overwrite the source before it is copied. */
20787 if ((REG_P (part[0][0])
20788 && REG_P (part[1][1])
20789 && (REGNO (part[0][0]) == REGNO (part[1][1])
20790 || (nparts == 3
20791 && REGNO (part[0][0]) == REGNO (part[1][2]))
20792 || (nparts == 4
20793 && REGNO (part[0][0]) == REGNO (part[1][3]))))
20794 || (collisions > 0
20795 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
20796 {
20797 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
20798 {
20799 operands[2 + i] = part[0][j];
20800 operands[6 + i] = part[1][j];
20801 }
20802 }
20803 else
20804 {
20805 for (i = 0; i < nparts; i++)
20806 {
20807 operands[2 + i] = part[0][i];
20808 operands[6 + i] = part[1][i];
20809 }
20810 }
20811
20812 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
20813 if (optimize_insn_for_size_p ())
20814 {
20815 for (j = 0; j < nparts - 1; j++)
20816 if (CONST_INT_P (operands[6 + j])
20817 && operands[6 + j] != const0_rtx
20818 && REG_P (operands[2 + j]))
20819 for (i = j; i < nparts - 1; i++)
20820 if (CONST_INT_P (operands[7 + i])
20821 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
20822 operands[7 + i] = operands[2 + j];
20823 }
20824
20825 for (i = 0; i < nparts; i++)
20826 emit_move_insn (operands[2 + i], operands[6 + i]);
20827
20828 return;
20829 }
20830
20831 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
20832 left shift by a constant, either using a single shift or
20833 a sequence of add instructions. */
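/* For example (rough sketch, register name illustrative): a shift left by 2
   is emitted as
	addl	%eax, %eax
	addl	%eax, %eax
   when two adds are no more expensive than one constant shift and we are not
   optimizing for size; otherwise a single "shll $2, %eax" is used.  */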
20834
20835 static void
20836 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
20837 {
20838 rtx (*insn)(rtx, rtx, rtx);
20839
20840 if (count == 1
20841 || (count * ix86_cost->add <= ix86_cost->shift_const
20842 && !optimize_insn_for_size_p ()))
20843 {
20844 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
20845 while (count-- > 0)
20846 emit_insn (insn (operand, operand, operand));
20847 }
20848 else
20849 {
20850 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20851 emit_insn (insn (operand, operand, GEN_INT (count)));
20852 }
20853 }
20854
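/* Split a double-word left shift (DImode on 32-bit targets, TImode on
   64-bit targets) into operations on the two half-word parts.  SCRATCH,
   if available together with cmov, is used for the variable-count
   adjustment.  */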
20855 void
20856 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
20857 {
20858 rtx (*gen_ashl3)(rtx, rtx, rtx);
20859 rtx (*gen_shld)(rtx, rtx, rtx);
20860 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20861
20862 rtx low[2], high[2];
20863 int count;
20864
20865 if (CONST_INT_P (operands[2]))
20866 {
20867 split_double_mode (mode, operands, 2, low, high);
20868 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20869
20870 if (count >= half_width)
20871 {
20872 emit_move_insn (high[0], low[1]);
20873 emit_move_insn (low[0], const0_rtx);
20874
20875 if (count > half_width)
20876 ix86_expand_ashl_const (high[0], count - half_width, mode);
20877 }
20878 else
20879 {
20880 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20881
20882 if (!rtx_equal_p (operands[0], operands[1]))
20883 emit_move_insn (operands[0], operands[1]);
20884
20885 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
20886 ix86_expand_ashl_const (low[0], count, mode);
20887 }
20888 return;
20889 }
20890
20891 split_double_mode (mode, operands, 1, low, high);
20892
20893 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20894
20895 if (operands[1] == const1_rtx)
20896 {
20897 /* Assuming we've chosen QImode-capable registers, then 1 << N
20898 can be done with two 32/64-bit shifts, no branches, no cmoves. */
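/* Rough shape of the code emitted below for 32-bit DImode, with the
   count in %cl (register names illustrative, sketch only):
	xorl	%eax, %eax	# low  = 0
	xorl	%edx, %edx	# high = 0
	testb	$32, %cl
	sete	%al		# low  = (count < 32)
	setne	%dl		# high = (count >= 32)
	shll	%cl, %eax
	shll	%cl, %edx  */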
20899 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
20900 {
20901 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
20902
20903 ix86_expand_clear (low[0]);
20904 ix86_expand_clear (high[0]);
20905 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
20906
20907 d = gen_lowpart (QImode, low[0]);
20908 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20909 s = gen_rtx_EQ (QImode, flags, const0_rtx);
20910 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20911
20912 d = gen_lowpart (QImode, high[0]);
20913 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20914 s = gen_rtx_NE (QImode, flags, const0_rtx);
20915 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20916 }
20917
20918 /* Otherwise, we can get the same results by manually performing
20919 a bit extract operation on bit 5/6, and then performing the two
20920 shifts. The two methods of getting 0/1 into low/high are exactly
20921 the same size. Avoiding the shift in the bit extract case helps
20922 pentium4 a bit; no one else seems to care much either way. */
20923 else
20924 {
20925 enum machine_mode half_mode;
20926 rtx (*gen_lshr3)(rtx, rtx, rtx);
20927 rtx (*gen_and3)(rtx, rtx, rtx);
20928 rtx (*gen_xor3)(rtx, rtx, rtx);
20929 HOST_WIDE_INT bits;
20930 rtx x;
20931
20932 if (mode == DImode)
20933 {
20934 half_mode = SImode;
20935 gen_lshr3 = gen_lshrsi3;
20936 gen_and3 = gen_andsi3;
20937 gen_xor3 = gen_xorsi3;
20938 bits = 5;
20939 }
20940 else
20941 {
20942 half_mode = DImode;
20943 gen_lshr3 = gen_lshrdi3;
20944 gen_and3 = gen_anddi3;
20945 gen_xor3 = gen_xordi3;
20946 bits = 6;
20947 }
20948
20949 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
20950 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
20951 else
20952 x = gen_lowpart (half_mode, operands[2]);
20953 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
20954
20955 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
20956 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
20957 emit_move_insn (low[0], high[0]);
20958 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
20959 }
20960
20961 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20962 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
20963 return;
20964 }
20965
20966 if (operands[1] == constm1_rtx)
20967 {
20968 /* For -1 << N, we can avoid the shld instruction, because we
20969 know that we're shifting 0...31/63 ones into a -1. */
20970 emit_move_insn (low[0], constm1_rtx);
20971 if (optimize_insn_for_size_p ())
20972 emit_move_insn (high[0], low[0]);
20973 else
20974 emit_move_insn (high[0], constm1_rtx);
20975 }
20976 else
20977 {
20978 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20979
20980 if (!rtx_equal_p (operands[0], operands[1]))
20981 emit_move_insn (operands[0], operands[1]);
20982
20983 split_double_mode (mode, operands, 1, low, high);
20984 emit_insn (gen_shld (high[0], low[0], operands[2]));
20985 }
20986
20987 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20988
20989 if (TARGET_CMOVE && scratch)
20990 {
20991 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20992 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20993
20994 ix86_expand_clear (scratch);
20995 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
20996 }
20997 else
20998 {
20999 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21000 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21001
21002 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
21003 }
21004 }
21005
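/* Split a double-word arithmetic right shift into operations on the two
   half-word parts, using shrd/sar plus an adjustment for counts of
   half_width or more.  */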
21006 void
21007 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21008 {
21009 rtx (*gen_ashr3)(rtx, rtx, rtx)
21010 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21011 rtx (*gen_shrd)(rtx, rtx, rtx);
21012 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21013
21014 rtx low[2], high[2];
21015 int count;
21016
21017 if (CONST_INT_P (operands[2]))
21018 {
21019 split_double_mode (mode, operands, 2, low, high);
21020 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21021
21022 if (count == GET_MODE_BITSIZE (mode) - 1)
21023 {
21024 emit_move_insn (high[0], high[1]);
21025 emit_insn (gen_ashr3 (high[0], high[0],
21026 GEN_INT (half_width - 1)));
21027 emit_move_insn (low[0], high[0]);
21028
21029 }
21030 else if (count >= half_width)
21031 {
21032 emit_move_insn (low[0], high[1]);
21033 emit_move_insn (high[0], low[0]);
21034 emit_insn (gen_ashr3 (high[0], high[0],
21035 GEN_INT (half_width - 1)));
21036
21037 if (count > half_width)
21038 emit_insn (gen_ashr3 (low[0], low[0],
21039 GEN_INT (count - half_width)));
21040 }
21041 else
21042 {
21043 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21044
21045 if (!rtx_equal_p (operands[0], operands[1]))
21046 emit_move_insn (operands[0], operands[1]);
21047
21048 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21049 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21050 }
21051 }
21052 else
21053 {
21054 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21055
21056 if (!rtx_equal_p (operands[0], operands[1]))
21057 emit_move_insn (operands[0], operands[1]);
21058
21059 split_double_mode (mode, operands, 1, low, high);
21060
21061 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21062 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21063
21064 if (TARGET_CMOVE && scratch)
21065 {
21066 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21067 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21068
21069 emit_move_insn (scratch, high[0]);
21070 emit_insn (gen_ashr3 (scratch, scratch,
21071 GEN_INT (half_width - 1)));
21072 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21073 scratch));
21074 }
21075 else
21076 {
21077 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21078 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21079
21080 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
21081 }
21082 }
21083 }
21084
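/* Split a double-word logical right shift into operations on the two
   half-word parts, analogously to ix86_split_ashr above but shifting in
   zeros instead of sign bits.  */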
21085 void
21086 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21087 {
21088 rtx (*gen_lshr3)(rtx, rtx, rtx)
21089 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21090 rtx (*gen_shrd)(rtx, rtx, rtx);
21091 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21092
21093 rtx low[2], high[2];
21094 int count;
21095
21096 if (CONST_INT_P (operands[2]))
21097 {
21098 split_double_mode (mode, operands, 2, low, high);
21099 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21100
21101 if (count >= half_width)
21102 {
21103 emit_move_insn (low[0], high[1]);
21104 ix86_expand_clear (high[0]);
21105
21106 if (count > half_width)
21107 emit_insn (gen_lshr3 (low[0], low[0],
21108 GEN_INT (count - half_width)));
21109 }
21110 else
21111 {
21112 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21113
21114 if (!rtx_equal_p (operands[0], operands[1]))
21115 emit_move_insn (operands[0], operands[1]);
21116
21117 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21118 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21119 }
21120 }
21121 else
21122 {
21123 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21124
21125 if (!rtx_equal_p (operands[0], operands[1]))
21126 emit_move_insn (operands[0], operands[1]);
21127
21128 split_double_mode (mode, operands, 1, low, high);
21129
21130 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21131 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21132
21133 if (TARGET_CMOVE && scratch)
21134 {
21135 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21136 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21137
21138 ix86_expand_clear (scratch);
21139 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21140 scratch));
21141 }
21142 else
21143 {
21144 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21145 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21146
21147 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21148 }
21149 }
21150 }
21151
21152 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
21153 static void
21154 predict_jump (int prob)
21155 {
21156 rtx insn = get_last_insn ();
21157 gcc_assert (JUMP_P (insn));
21158 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21159 }
21160
21161 /* Helper function for the string operations below.  Test whether VARIABLE
21162 is aligned to VALUE bytes.  If so, jump to the returned label. */
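/* For example (illustrative sketch): ix86_expand_aligntest (ptr, 4, false)
   emits roughly
	testl	$4, %edi
	je	.Lskip
   so the caller can handle a 4-byte chunk on the fall-through path and then
   emit the returned label.  Register and label names are illustrative.  */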
21163 static rtx
21164 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21165 {
21166 rtx label = gen_label_rtx ();
21167 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21168 if (GET_MODE (variable) == DImode)
21169 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21170 else
21171 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21172 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21173 1, label);
21174 if (epilogue)
21175 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21176 else
21177 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21178 return label;
21179 }
21180
21181 /* Decrease COUNTREG by VALUE. */
21182 static void
21183 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21184 {
21185 rtx (*gen_add)(rtx, rtx, rtx)
21186 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21187
21188 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21189 }
21190
21191 /* Zero-extend EXP (which may be SImode) into a Pmode register. */
21192 rtx
21193 ix86_zero_extend_to_Pmode (rtx exp)
21194 {
21195 if (GET_MODE (exp) != Pmode)
21196 exp = convert_to_mode (Pmode, exp, 1);
21197 return force_reg (Pmode, exp);
21198 }
21199
21200 /* Divide COUNTREG by SCALE. */
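/* Sketch of the behaviour below: SCALE is expected to be a power of two;
   a constant count is divided directly, otherwise a logical right shift
   by log2 (SCALE) is emitted.  */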
21201 static rtx
21202 scale_counter (rtx countreg, int scale)
21203 {
21204 rtx sc;
21205
21206 if (scale == 1)
21207 return countreg;
21208 if (CONST_INT_P (countreg))
21209 return GEN_INT (INTVAL (countreg) / scale);
21210 gcc_assert (REG_P (countreg));
21211
21212 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21213 GEN_INT (exact_log2 (scale)),
21214 NULL, 1, OPTAB_DIRECT);
21215 return sc;
21216 }
21217
21218 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21219 DImode for constant loop counts. */
21220
21221 static enum machine_mode
21222 counter_mode (rtx count_exp)
21223 {
21224 if (GET_MODE (count_exp) != VOIDmode)
21225 return GET_MODE (count_exp);
21226 if (!CONST_INT_P (count_exp))
21227 return Pmode;
21228 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21229 return DImode;
21230 return SImode;
21231 }
21232
21233 /* When SRCPTR is non-NULL, output a simple loop to move memory pointed
21234 to by SRCPTR to DESTPTR via chunks of MODE, unrolled UNROLL times;
21235 the overall size is COUNT, specified in bytes.  When SRCPTR is NULL,
21236 output the equivalent loop to set memory to VALUE (assumed to be in MODE).
21237
21238 The size is rounded down to a whole number of chunks moved at once.
21239 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
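/* Rough shape of the emitted loop (sketch only, not the literal RTL):

	size = count & -(GET_MODE_SIZE (mode) * unroll);
	iter = 0;
     top:
	copy or set UNROLL chunks of MODE at destptr + iter (and srcptr + iter);
	iter += GET_MODE_SIZE (mode) * unroll;
	if (iter < size) goto top;
	destptr += iter;  srcptr += iter;
     out:  */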
21240
21241
21242 static void
21243 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21244 rtx destptr, rtx srcptr, rtx value,
21245 rtx count, enum machine_mode mode, int unroll,
21246 int expected_size)
21247 {
21248 rtx out_label, top_label, iter, tmp;
21249 enum machine_mode iter_mode = counter_mode (count);
21250 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21251 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21252 rtx size;
21253 rtx x_addr;
21254 rtx y_addr;
21255 int i;
21256
21257 top_label = gen_label_rtx ();
21258 out_label = gen_label_rtx ();
21259 iter = gen_reg_rtx (iter_mode);
21260
21261 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21262 NULL, 1, OPTAB_DIRECT);
21263 /* Those two should combine. */
21264 if (piece_size == const1_rtx)
21265 {
21266 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21267 true, out_label);
21268 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21269 }
21270 emit_move_insn (iter, const0_rtx);
21271
21272 emit_label (top_label);
21273
21274 tmp = convert_modes (Pmode, iter_mode, iter, true);
21275 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21276 destmem = change_address (destmem, mode, x_addr);
21277
21278 if (srcmem)
21279 {
21280 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21281 srcmem = change_address (srcmem, mode, y_addr);
21282
21283 /* When unrolling for chips that reorder memory reads and writes,
21284 we can save registers by using a single temporary.
21285 Also, using 4 temporaries is overkill in 32-bit mode. */
21286 if (!TARGET_64BIT && 0)
21287 {
21288 for (i = 0; i < unroll; i++)
21289 {
21290 if (i)
21291 {
21292 destmem =
21293 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21294 srcmem =
21295 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21296 }
21297 emit_move_insn (destmem, srcmem);
21298 }
21299 }
21300 else
21301 {
21302 rtx tmpreg[4];
21303 gcc_assert (unroll <= 4);
21304 for (i = 0; i < unroll; i++)
21305 {
21306 tmpreg[i] = gen_reg_rtx (mode);
21307 if (i)
21308 {
21309 srcmem =
21310 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21311 }
21312 emit_move_insn (tmpreg[i], srcmem);
21313 }
21314 for (i = 0; i < unroll; i++)
21315 {
21316 if (i)
21317 {
21318 destmem =
21319 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21320 }
21321 emit_move_insn (destmem, tmpreg[i]);
21322 }
21323 }
21324 }
21325 else
21326 for (i = 0; i < unroll; i++)
21327 {
21328 if (i)
21329 destmem =
21330 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21331 emit_move_insn (destmem, value);
21332 }
21333
21334 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21335 true, OPTAB_LIB_WIDEN);
21336 if (tmp != iter)
21337 emit_move_insn (iter, tmp);
21338
21339 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21340 true, top_label);
21341 if (expected_size != -1)
21342 {
21343 expected_size /= GET_MODE_SIZE (mode) * unroll;
21344 if (expected_size == 0)
21345 predict_jump (0);
21346 else if (expected_size > REG_BR_PROB_BASE)
21347 predict_jump (REG_BR_PROB_BASE - 1);
21348 else
21349 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21350 }
21351 else
21352 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21353 iter = ix86_zero_extend_to_Pmode (iter);
21354 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21355 true, OPTAB_LIB_WIDEN);
21356 if (tmp != destptr)
21357 emit_move_insn (destptr, tmp);
21358 if (srcptr)
21359 {
21360 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21361 true, OPTAB_LIB_WIDEN);
21362 if (tmp != srcptr)
21363 emit_move_insn (srcptr, tmp);
21364 }
21365 emit_label (out_label);
21366 }
21367
21368 /* Output a "rep; mov" instruction.
21369 Arguments have the same meaning as for the previous function. */
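/* For SImode this amounts to something like (illustrative sketch):
	shrl	$2, %ecx
	rep movsl
   with the pointer and count registers fixed by the rep_mov pattern.  */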
21370 static void
21371 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21372 rtx destptr, rtx srcptr,
21373 rtx count,
21374 enum machine_mode mode)
21375 {
21376 rtx destexp;
21377 rtx srcexp;
21378 rtx countreg;
21379 HOST_WIDE_INT rounded_count;
21380
21381 /* If the size is known, it is shorter to use rep movs. */
21382 if (mode == QImode && CONST_INT_P (count)
21383 && !(INTVAL (count) & 3))
21384 mode = SImode;
21385
21386 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21387 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21388 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21389 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21390 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21391 if (mode != QImode)
21392 {
21393 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21394 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21395 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21396 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21397 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21398 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21399 }
21400 else
21401 {
21402 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21403 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21404 }
21405 if (CONST_INT_P (count))
21406 {
21407 rounded_count = (INTVAL (count)
21408 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21409 destmem = shallow_copy_rtx (destmem);
21410 srcmem = shallow_copy_rtx (srcmem);
21411 set_mem_size (destmem, rounded_count);
21412 set_mem_size (srcmem, rounded_count);
21413 }
21414 else
21415 {
21416 if (MEM_SIZE_KNOWN_P (destmem))
21417 clear_mem_size (destmem);
21418 if (MEM_SIZE_KNOWN_P (srcmem))
21419 clear_mem_size (srcmem);
21420 }
21421 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21422 destexp, srcexp));
21423 }
21424
21425 /* Output a "rep; stos" instruction.
21426 Arguments have the same meaning as for the previous function. */
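/* For SImode this is roughly "shrl $2, %ecx; rep stosl" with the store
   value placed in %eax beforehand (illustrative sketch only).  */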
21427 static void
21428 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21429 rtx count, enum machine_mode mode,
21430 rtx orig_value)
21431 {
21432 rtx destexp;
21433 rtx countreg;
21434 HOST_WIDE_INT rounded_count;
21435
21436 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21437 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21438 value = force_reg (mode, gen_lowpart (mode, value));
21439 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21440 if (mode != QImode)
21441 {
21442 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21443 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21444 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21445 }
21446 else
21447 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21448 if (orig_value == const0_rtx && CONST_INT_P (count))
21449 {
21450 rounded_count = (INTVAL (count)
21451 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21452 destmem = shallow_copy_rtx (destmem);
21453 set_mem_size (destmem, rounded_count);
21454 }
21455 else if (MEM_SIZE_KNOWN_P (destmem))
21456 clear_mem_size (destmem);
21457 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21458 }
21459
21460 static void
21461 emit_strmov (rtx destmem, rtx srcmem,
21462 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21463 {
21464 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21465 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21466 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21467 }
21468
21469 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
21470 static void
21471 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21472 rtx destptr, rtx srcptr, rtx count, int max_size)
21473 {
21474 rtx src, dest;
21475 if (CONST_INT_P (count))
21476 {
21477 HOST_WIDE_INT countval = INTVAL (count);
21478 int offset = 0;
21479
21480 if ((countval & 0x10) && max_size > 16)
21481 {
21482 if (TARGET_64BIT)
21483 {
21484 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21485 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21486 }
21487 else
21488 gcc_unreachable ();
21489 offset += 16;
21490 }
21491 if ((countval & 0x08) && max_size > 8)
21492 {
21493 if (TARGET_64BIT)
21494 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21495 else
21496 {
21497 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21498 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21499 }
21500 offset += 8;
21501 }
21502 if ((countval & 0x04) && max_size > 4)
21503 {
21504 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21505 offset += 4;
21506 }
21507 if ((countval & 0x02) && max_size > 2)
21508 {
21509 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21510 offset += 2;
21511 }
21512 if ((countval & 0x01) && max_size > 1)
21513 {
21514 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21515 offset += 1;
21516 }
21517 return;
21518 }
21519 if (max_size > 8)
21520 {
21521 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21522 count, 1, OPTAB_DIRECT);
21523 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21524 count, QImode, 1, 4);
21525 return;
21526 }
21527
21528 /* When there are stringops, we can cheaply increase dest and src pointers.
21529 Otherwise we save code size by maintaining offset (zero is readily
21530 available from preceding rep operation) and using x86 addressing modes.
21531 */
21532 if (TARGET_SINGLE_STRINGOP)
21533 {
21534 if (max_size > 4)
21535 {
21536 rtx label = ix86_expand_aligntest (count, 4, true);
21537 src = change_address (srcmem, SImode, srcptr);
21538 dest = change_address (destmem, SImode, destptr);
21539 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21540 emit_label (label);
21541 LABEL_NUSES (label) = 1;
21542 }
21543 if (max_size > 2)
21544 {
21545 rtx label = ix86_expand_aligntest (count, 2, true);
21546 src = change_address (srcmem, HImode, srcptr);
21547 dest = change_address (destmem, HImode, destptr);
21548 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21549 emit_label (label);
21550 LABEL_NUSES (label) = 1;
21551 }
21552 if (max_size > 1)
21553 {
21554 rtx label = ix86_expand_aligntest (count, 1, true);
21555 src = change_address (srcmem, QImode, srcptr);
21556 dest = change_address (destmem, QImode, destptr);
21557 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21558 emit_label (label);
21559 LABEL_NUSES (label) = 1;
21560 }
21561 }
21562 else
21563 {
21564 rtx offset = force_reg (Pmode, const0_rtx);
21565 rtx tmp;
21566
21567 if (max_size > 4)
21568 {
21569 rtx label = ix86_expand_aligntest (count, 4, true);
21570 src = change_address (srcmem, SImode, srcptr);
21571 dest = change_address (destmem, SImode, destptr);
21572 emit_move_insn (dest, src);
21573 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21574 true, OPTAB_LIB_WIDEN);
21575 if (tmp != offset)
21576 emit_move_insn (offset, tmp);
21577 emit_label (label);
21578 LABEL_NUSES (label) = 1;
21579 }
21580 if (max_size > 2)
21581 {
21582 rtx label = ix86_expand_aligntest (count, 2, true);
21583 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21584 src = change_address (srcmem, HImode, tmp);
21585 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21586 dest = change_address (destmem, HImode, tmp);
21587 emit_move_insn (dest, src);
21588 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21589 true, OPTAB_LIB_WIDEN);
21590 if (tmp != offset)
21591 emit_move_insn (offset, tmp);
21592 emit_label (label);
21593 LABEL_NUSES (label) = 1;
21594 }
21595 if (max_size > 1)
21596 {
21597 rtx label = ix86_expand_aligntest (count, 1, true);
21598 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21599 src = change_address (srcmem, QImode, tmp);
21600 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21601 dest = change_address (destmem, QImode, tmp);
21602 emit_move_insn (dest, src);
21603 emit_label (label);
21604 LABEL_NUSES (label) = 1;
21605 }
21606 }
21607 }
21608
21609 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21610 static void
21611 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21612 rtx count, int max_size)
21613 {
21614 count =
21615 expand_simple_binop (counter_mode (count), AND, count,
21616 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21617 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21618 gen_lowpart (QImode, value), count, QImode,
21619 1, max_size / 2);
21620 }
21621
21622 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21623 static void
21624 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
21625 {
21626 rtx dest;
21627
21628 if (CONST_INT_P (count))
21629 {
21630 HOST_WIDE_INT countval = INTVAL (count);
21631 int offset = 0;
21632
21633 if ((countval & 0x10) && max_size > 16)
21634 {
21635 if (TARGET_64BIT)
21636 {
21637 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21638 emit_insn (gen_strset (destptr, dest, value));
21639 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
21640 emit_insn (gen_strset (destptr, dest, value));
21641 }
21642 else
21643 gcc_unreachable ();
21644 offset += 16;
21645 }
21646 if ((countval & 0x08) && max_size > 8)
21647 {
21648 if (TARGET_64BIT)
21649 {
21650 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21651 emit_insn (gen_strset (destptr, dest, value));
21652 }
21653 else
21654 {
21655 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21656 emit_insn (gen_strset (destptr, dest, value));
21657 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
21658 emit_insn (gen_strset (destptr, dest, value));
21659 }
21660 offset += 8;
21661 }
21662 if ((countval & 0x04) && max_size > 4)
21663 {
21664 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21665 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21666 offset += 4;
21667 }
21668 if ((countval & 0x02) && max_size > 2)
21669 {
21670 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
21671 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21672 offset += 2;
21673 }
21674 if ((countval & 0x01) && max_size > 1)
21675 {
21676 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
21677 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21678 offset += 1;
21679 }
21680 return;
21681 }
21682 if (max_size > 32)
21683 {
21684 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
21685 return;
21686 }
21687 if (max_size > 16)
21688 {
21689 rtx label = ix86_expand_aligntest (count, 16, true);
21690 if (TARGET_64BIT)
21691 {
21692 dest = change_address (destmem, DImode, destptr);
21693 emit_insn (gen_strset (destptr, dest, value));
21694 emit_insn (gen_strset (destptr, dest, value));
21695 }
21696 else
21697 {
21698 dest = change_address (destmem, SImode, destptr);
21699 emit_insn (gen_strset (destptr, dest, value));
21700 emit_insn (gen_strset (destptr, dest, value));
21701 emit_insn (gen_strset (destptr, dest, value));
21702 emit_insn (gen_strset (destptr, dest, value));
21703 }
21704 emit_label (label);
21705 LABEL_NUSES (label) = 1;
21706 }
21707 if (max_size > 8)
21708 {
21709 rtx label = ix86_expand_aligntest (count, 8, true);
21710 if (TARGET_64BIT)
21711 {
21712 dest = change_address (destmem, DImode, destptr);
21713 emit_insn (gen_strset (destptr, dest, value));
21714 }
21715 else
21716 {
21717 dest = change_address (destmem, SImode, destptr);
21718 emit_insn (gen_strset (destptr, dest, value));
21719 emit_insn (gen_strset (destptr, dest, value));
21720 }
21721 emit_label (label);
21722 LABEL_NUSES (label) = 1;
21723 }
21724 if (max_size > 4)
21725 {
21726 rtx label = ix86_expand_aligntest (count, 4, true);
21727 dest = change_address (destmem, SImode, destptr);
21728 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21729 emit_label (label);
21730 LABEL_NUSES (label) = 1;
21731 }
21732 if (max_size > 2)
21733 {
21734 rtx label = ix86_expand_aligntest (count, 2, true);
21735 dest = change_address (destmem, HImode, destptr);
21736 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21737 emit_label (label);
21738 LABEL_NUSES (label) = 1;
21739 }
21740 if (max_size > 1)
21741 {
21742 rtx label = ix86_expand_aligntest (count, 1, true);
21743 dest = change_address (destmem, QImode, destptr);
21744 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21745 emit_label (label);
21746 LABEL_NUSES (label) = 1;
21747 }
21748 }
21749
21750 /* Copy enough bytes from SRC to DEST to align DEST, known to be aligned
21751 by ALIGN, to DESIRED_ALIGNMENT. */
21752 static void
21753 expand_movmem_prologue (rtx destmem, rtx srcmem,
21754 rtx destptr, rtx srcptr, rtx count,
21755 int align, int desired_alignment)
21756 {
21757 if (align <= 1 && desired_alignment > 1)
21758 {
21759 rtx label = ix86_expand_aligntest (destptr, 1, false);
21760 srcmem = change_address (srcmem, QImode, srcptr);
21761 destmem = change_address (destmem, QImode, destptr);
21762 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21763 ix86_adjust_counter (count, 1);
21764 emit_label (label);
21765 LABEL_NUSES (label) = 1;
21766 }
21767 if (align <= 2 && desired_alignment > 2)
21768 {
21769 rtx label = ix86_expand_aligntest (destptr, 2, false);
21770 srcmem = change_address (srcmem, HImode, srcptr);
21771 destmem = change_address (destmem, HImode, destptr);
21772 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21773 ix86_adjust_counter (count, 2);
21774 emit_label (label);
21775 LABEL_NUSES (label) = 1;
21776 }
21777 if (align <= 4 && desired_alignment > 4)
21778 {
21779 rtx label = ix86_expand_aligntest (destptr, 4, false);
21780 srcmem = change_address (srcmem, SImode, srcptr);
21781 destmem = change_address (destmem, SImode, destptr);
21782 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21783 ix86_adjust_counter (count, 4);
21784 emit_label (label);
21785 LABEL_NUSES (label) = 1;
21786 }
21787 gcc_assert (desired_alignment <= 8);
21788 }
21789
21790 /* Copy enough bytes from SRC to DST to align DST to DESIRED_ALIGN.
21791 ALIGN_BYTES is how many bytes need to be copied. */
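/* E.g. with desired_align == 8 and align_bytes == 7, a byte, a halfword and
   a word are copied (1 + 2 + 4 bytes), leaving DST 8-byte aligned
   (sketch of the cases handled below).  */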
21792 static rtx
21793 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
21794 int desired_align, int align_bytes)
21795 {
21796 rtx src = *srcp;
21797 rtx orig_dst = dst;
21798 rtx orig_src = src;
21799 int off = 0;
21800 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
21801 if (src_align_bytes >= 0)
21802 src_align_bytes = desired_align - src_align_bytes;
21803 if (align_bytes & 1)
21804 {
21805 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21806 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
21807 off = 1;
21808 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21809 }
21810 if (align_bytes & 2)
21811 {
21812 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21813 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
21814 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21815 set_mem_align (dst, 2 * BITS_PER_UNIT);
21816 if (src_align_bytes >= 0
21817 && (src_align_bytes & 1) == (align_bytes & 1)
21818 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
21819 set_mem_align (src, 2 * BITS_PER_UNIT);
21820 off = 2;
21821 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21822 }
21823 if (align_bytes & 4)
21824 {
21825 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21826 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
21827 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21828 set_mem_align (dst, 4 * BITS_PER_UNIT);
21829 if (src_align_bytes >= 0)
21830 {
21831 unsigned int src_align = 0;
21832 if ((src_align_bytes & 3) == (align_bytes & 3))
21833 src_align = 4;
21834 else if ((src_align_bytes & 1) == (align_bytes & 1))
21835 src_align = 2;
21836 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21837 set_mem_align (src, src_align * BITS_PER_UNIT);
21838 }
21839 off = 4;
21840 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21841 }
21842 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21843 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
21844 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21845 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21846 if (src_align_bytes >= 0)
21847 {
21848 unsigned int src_align = 0;
21849 if ((src_align_bytes & 7) == (align_bytes & 7))
21850 src_align = 8;
21851 else if ((src_align_bytes & 3) == (align_bytes & 3))
21852 src_align = 4;
21853 else if ((src_align_bytes & 1) == (align_bytes & 1))
21854 src_align = 2;
21855 if (src_align > (unsigned int) desired_align)
21856 src_align = desired_align;
21857 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21858 set_mem_align (src, src_align * BITS_PER_UNIT);
21859 }
21860 if (MEM_SIZE_KNOWN_P (orig_dst))
21861 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21862 if (MEM_SIZE_KNOWN_P (orig_src))
21863 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
21864 *srcp = src;
21865 return dst;
21866 }
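/* Worked example for expand_constant_movmem_prologue above (numbers are
   illustrative): with DESIRED_ALIGN == 8 and DST starting 3 bytes past an
   8-byte boundary, ALIGN_BYTES is 8 - 3 = 5 = 0b101, so one QImode move and
   one SImode move are emitted, 5 bytes are copied, and DST + 5 is 8-byte
   aligned.  */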
21867
21868 /* Set enough bytes of DEST to align DEST, known to be aligned by ALIGN,
21869 to DESIRED_ALIGNMENT. */
21870 static void
21871 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
21872 int align, int desired_alignment)
21873 {
21874 if (align <= 1 && desired_alignment > 1)
21875 {
21876 rtx label = ix86_expand_aligntest (destptr, 1, false);
21877 destmem = change_address (destmem, QImode, destptr);
21878 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
21879 ix86_adjust_counter (count, 1);
21880 emit_label (label);
21881 LABEL_NUSES (label) = 1;
21882 }
21883 if (align <= 2 && desired_alignment > 2)
21884 {
21885 rtx label = ix86_expand_aligntest (destptr, 2, false);
21886 destmem = change_address (destmem, HImode, destptr);
21887 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
21888 ix86_adjust_counter (count, 2);
21889 emit_label (label);
21890 LABEL_NUSES (label) = 1;
21891 }
21892 if (align <= 4 && desired_alignment > 4)
21893 {
21894 rtx label = ix86_expand_aligntest (destptr, 4, false);
21895 destmem = change_address (destmem, SImode, destptr);
21896 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
21897 ix86_adjust_counter (count, 4);
21898 emit_label (label);
21899 LABEL_NUSES (label) = 1;
21900 }
21901 gcc_assert (desired_alignment <= 8);
21902 }
21903
21904 /* Set enough bytes of DST to align DST, known to be aligned by ALIGN,
21905 to DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
21906 static rtx
21907 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
21908 int desired_align, int align_bytes)
21909 {
21910 int off = 0;
21911 rtx orig_dst = dst;
21912 if (align_bytes & 1)
21913 {
21914 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21915 off = 1;
21916 emit_insn (gen_strset (destreg, dst,
21917 gen_lowpart (QImode, value)));
21918 }
21919 if (align_bytes & 2)
21920 {
21921 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21922 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21923 set_mem_align (dst, 2 * BITS_PER_UNIT);
21924 off = 2;
21925 emit_insn (gen_strset (destreg, dst,
21926 gen_lowpart (HImode, value)));
21927 }
21928 if (align_bytes & 4)
21929 {
21930 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21931 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21932 set_mem_align (dst, 4 * BITS_PER_UNIT);
21933 off = 4;
21934 emit_insn (gen_strset (destreg, dst,
21935 gen_lowpart (SImode, value)));
21936 }
21937 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21938 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21939 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21940 if (MEM_SIZE_KNOWN_P (orig_dst))
21941 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21942 return dst;
21943 }
21944
21945 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
21946 static enum stringop_alg
21947 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
21948 int *dynamic_check)
21949 {
21950 const struct stringop_algs * algs;
21951 bool optimize_for_speed;
21952 /* Algorithms using the rep prefix want at least edi and ecx;
21953 additionally, memset wants eax and memcpy wants esi. Don't
21954 consider such algorithms if the user has appropriated those
21955 registers for their own purposes. */
21956 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
21957 || (memset
21958 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
21959
21960 #define ALG_USABLE_P(alg) (rep_prefix_usable \
21961 || (alg != rep_prefix_1_byte \
21962 && alg != rep_prefix_4_byte \
21963 && alg != rep_prefix_8_byte))
21964 const struct processor_costs *cost;
21965
21966 /* Even if the string operation call is cold, we still might spend a lot
21967 of time processing large blocks. */
21968 if (optimize_function_for_size_p (cfun)
21969 || (optimize_insn_for_size_p ()
21970 && expected_size != -1 && expected_size < 256))
21971 optimize_for_speed = false;
21972 else
21973 optimize_for_speed = true;
21974
21975 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
21976
21977 *dynamic_check = -1;
21978 if (memset)
21979 algs = &cost->memset[TARGET_64BIT != 0];
21980 else
21981 algs = &cost->memcpy[TARGET_64BIT != 0];
21982 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
21983 return ix86_stringop_alg;
21984 /* rep; movq or rep; movl is the smallest variant. */
21985 else if (!optimize_for_speed)
21986 {
21987 if (!count || (count & 3))
21988 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
21989 else
21990 return rep_prefix_usable ? rep_prefix_4_byte : loop;
21991 }
21992 /* Very tiny blocks are best handled via the loop; REP is expensive to
21993 set up. */
21994 else if (expected_size != -1 && expected_size < 4)
21995 return loop_1_byte;
21996 else if (expected_size != -1)
21997 {
21998 unsigned int i;
21999 enum stringop_alg alg = libcall;
22000 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22001 {
22002 /* We get here if the algorithms that were not libcall-based
22003 were rep-prefix based and we are unable to use rep prefixes
22004 based on global register usage. Break out of the loop and
22005 use the heuristic below. */
22006 if (algs->size[i].max == 0)
22007 break;
22008 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22009 {
22010 enum stringop_alg candidate = algs->size[i].alg;
22011
22012 if (candidate != libcall && ALG_USABLE_P (candidate))
22013 alg = candidate;
22014 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22015 last non-libcall inline algorithm. */
22016 if (TARGET_INLINE_ALL_STRINGOPS)
22017 {
22018 /* When the current size is best copied by a libcall but we are
22019 still forced to inline, run the heuristic below that will pick
22020 code for medium-sized blocks. */
22021 if (alg != libcall)
22022 return alg;
22023 break;
22024 }
22025 else if (ALG_USABLE_P (candidate))
22026 return candidate;
22027 }
22028 }
22029 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22030 }
22031 /* When asked to inline the call anyway, try to pick a meaningful choice.
22032 We look for the maximal size of block that is faster to copy by hand
22033 and take blocks of at most that size, guessing that the average size
22034 will be roughly half of the block.
22035
22036 If this turns out to be bad, we might simply specify the preferred
22037 choice in ix86_costs. */
22038 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22039 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22040 {
22041 int max = -1;
22042 enum stringop_alg alg;
22043 int i;
22044 bool any_alg_usable_p = true;
22045
22046 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22047 {
22048 enum stringop_alg candidate = algs->size[i].alg;
22049 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22050
22051 if (candidate != libcall && candidate
22052 && ALG_USABLE_P (candidate))
22053 max = algs->size[i].max;
22054 }
22055 /* If there aren't any usable algorithms, then recursing on
22056 smaller sizes isn't going to find anything. Just return the
22057 simple byte-at-a-time copy loop. */
22058 if (!any_alg_usable_p)
22059 {
22060 /* Pick something reasonable. */
22061 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22062 *dynamic_check = 128;
22063 return loop_1_byte;
22064 }
22065 if (max == -1)
22066 max = 4096;
22067 alg = decide_alg (count, max / 2, memset, dynamic_check);
22068 gcc_assert (*dynamic_check == -1);
22069 gcc_assert (alg != libcall);
22070 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22071 *dynamic_check = max;
22072 return alg;
22073 }
22074 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22075 #undef ALG_USABLE_P
22076 }
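/* Purely illustrative example of the table walk in decide_alg: if the
   selected cost table held {{24, loop}, {128, rep_prefix_4_byte},
   {-1, rep_prefix_1_byte}} (made-up numbers, not a real tuning), a known
   count of 100 would skip the first entry (24 < 100) and return
   rep_prefix_4_byte, while an unknown count would fall back to
   ALGS->unknown_size.  */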
22077
22078 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22079 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22080 static int
22081 decide_alignment (int align,
22082 enum stringop_alg alg,
22083 int expected_size)
22084 {
22085 int desired_align = 0;
22086 switch (alg)
22087 {
22088 case no_stringop:
22089 gcc_unreachable ();
22090 case loop:
22091 case unrolled_loop:
22092 desired_align = GET_MODE_SIZE (Pmode);
22093 break;
22094 case rep_prefix_8_byte:
22095 desired_align = 8;
22096 break;
22097 case rep_prefix_4_byte:
22098 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
22099 copying a whole cache line at once. */
22100 if (TARGET_PENTIUMPRO)
22101 desired_align = 8;
22102 else
22103 desired_align = 4;
22104 break;
22105 case rep_prefix_1_byte:
22106 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
22107 copying a whole cache line at once. */
22108 if (TARGET_PENTIUMPRO)
22109 desired_align = 8;
22110 else
22111 desired_align = 1;
22112 break;
22113 case loop_1_byte:
22114 desired_align = 1;
22115 break;
22116 case libcall:
22117 return 0;
22118 }
22119
22120 if (optimize_size)
22121 desired_align = 1;
22122 if (desired_align < align)
22123 desired_align = align;
22124 if (expected_size != -1 && expected_size < 4)
22125 desired_align = align;
22126 return desired_align;
22127 }
22128
22129 /* Return the smallest power of 2 greater than VAL. */
22130 static int
22131 smallest_pow2_greater_than (int val)
22132 {
22133 int ret = 1;
22134 while (ret <= val)
22135 ret <<= 1;
22136 return ret;
22137 }
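/* E.g. smallest_pow2_greater_than (0) == 1, (7) == 8, (8) == 16; callers
   therefore pass values like SIZE_NEEDED - 1 when they want SIZE_NEEDED
   itself rounded up to a power of 2.  */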
22138
22139 /* Expand string move (memcpy) operation. Use i386 string operations
22140 when profitable. expand_setmem contains similar code. The code
22141 depends upon architecture, block size and alignment, but always has
22142 the same overall structure:
22143
22144 1) Prologue guard: Conditional that jumps to the epilogue for small
22145 blocks that can be handled by the epilogue alone. This is faster
22146 but also needed for correctness, since the prologue assumes the block
22147 is larger than the desired alignment.
22148
22149 Optional dynamic check for size and libcall for large
22150 blocks is emitted here too, with -minline-stringops-dynamically.
22151
22152 2) Prologue: copy first few bytes in order to get destination
22153 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22154 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22155 copied. We emit either a jump tree on power of two sized
22156 blocks, or a byte loop.
22157
22158 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22159 with specified algorithm.
22160
22161 4) Epilogue: code copying tail of the block that is too small to be
22162 handled by main body (or up to size guarded by prologue guard). */
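/* Schematically, the expansion below behaves like this illustrative C
   (the real code emits RTL and several variants of each step):

     if (count < epilogue_size_needed)         -- 1) prologue guard
       goto epilogue;
     while ((uintptr_t) dst % desired_align)   -- 2) alignment prologue
       { *dst++ = *src++; count--; }
     copy count / size_needed chunks           -- 3) main body
       of size_needed bytes each;
   epilogue:
     copy the remaining                        -- 4) epilogue
       count % size_needed bytes.  */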
22163
22164 bool
22165 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22166 rtx expected_align_exp, rtx expected_size_exp)
22167 {
22168 rtx destreg;
22169 rtx srcreg;
22170 rtx label = NULL;
22171 rtx tmp;
22172 rtx jump_around_label = NULL;
22173 HOST_WIDE_INT align = 1;
22174 unsigned HOST_WIDE_INT count = 0;
22175 HOST_WIDE_INT expected_size = -1;
22176 int size_needed = 0, epilogue_size_needed;
22177 int desired_align = 0, align_bytes = 0;
22178 enum stringop_alg alg;
22179 int dynamic_check;
22180 bool need_zero_guard = false;
22181
22182 if (CONST_INT_P (align_exp))
22183 align = INTVAL (align_exp);
22184 /* i386 can do misaligned access at reasonably increased cost. */
22185 if (CONST_INT_P (expected_align_exp)
22186 && INTVAL (expected_align_exp) > align)
22187 align = INTVAL (expected_align_exp);
22188 /* ALIGN is the minimum of destination and source alignment, but we care here
22189 just about destination alignment. */
22190 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22191 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22192
22193 if (CONST_INT_P (count_exp))
22194 count = expected_size = INTVAL (count_exp);
22195 if (CONST_INT_P (expected_size_exp) && count == 0)
22196 expected_size = INTVAL (expected_size_exp);
22197
22198 /* Make sure we don't need to care about overflow later on. */
22199 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22200 return false;
22201
22202 /* Step 0: Decide on preferred algorithm, desired alignment and
22203 size of chunks to be copied by main loop. */
22204
22205 alg = decide_alg (count, expected_size, false, &dynamic_check);
22206 desired_align = decide_alignment (align, alg, expected_size);
22207
22208 if (!TARGET_ALIGN_STRINGOPS)
22209 align = desired_align;
22210
22211 if (alg == libcall)
22212 return false;
22213 gcc_assert (alg != no_stringop);
22214 if (!count)
22215 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22216 destreg = copy_addr_to_reg (XEXP (dst, 0));
22217 srcreg = copy_addr_to_reg (XEXP (src, 0));
22218 switch (alg)
22219 {
22220 case libcall:
22221 case no_stringop:
22222 gcc_unreachable ();
22223 case loop:
22224 need_zero_guard = true;
22225 size_needed = GET_MODE_SIZE (word_mode);
22226 break;
22227 case unrolled_loop:
22228 need_zero_guard = true;
22229 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22230 break;
22231 case rep_prefix_8_byte:
22232 size_needed = 8;
22233 break;
22234 case rep_prefix_4_byte:
22235 size_needed = 4;
22236 break;
22237 case rep_prefix_1_byte:
22238 size_needed = 1;
22239 break;
22240 case loop_1_byte:
22241 need_zero_guard = true;
22242 size_needed = 1;
22243 break;
22244 }
22245
22246 epilogue_size_needed = size_needed;
22247
22248 /* Step 1: Prologue guard. */
22249
22250 /* Alignment code needs count to be in register. */
22251 if (CONST_INT_P (count_exp) && desired_align > align)
22252 {
22253 if (INTVAL (count_exp) > desired_align
22254 && INTVAL (count_exp) > size_needed)
22255 {
22256 align_bytes
22257 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22258 if (align_bytes <= 0)
22259 align_bytes = 0;
22260 else
22261 align_bytes = desired_align - align_bytes;
22262 }
22263 if (align_bytes == 0)
22264 count_exp = force_reg (counter_mode (count_exp), count_exp);
22265 }
22266 gcc_assert (desired_align >= 1 && align >= 1);
22267
22268 /* Ensure that alignment prologue won't copy past end of block. */
22269 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22270 {
22271 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22272 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22273 Make sure it is a power of 2. */
22274 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22275
22276 if (count)
22277 {
22278 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22279 {
22280 /* If main algorithm works on QImode, no epilogue is needed.
22281 For small sizes just don't align anything. */
22282 if (size_needed == 1)
22283 desired_align = align;
22284 else
22285 goto epilogue;
22286 }
22287 }
22288 else
22289 {
22290 label = gen_label_rtx ();
22291 emit_cmp_and_jump_insns (count_exp,
22292 GEN_INT (epilogue_size_needed),
22293 LTU, 0, counter_mode (count_exp), 1, label);
22294 if (expected_size == -1 || expected_size < epilogue_size_needed)
22295 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22296 else
22297 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22298 }
22299 }
22300
22301 /* Emit code to decide at runtime whether a library call or inline code
22302 should be used. */
22303 if (dynamic_check != -1)
22304 {
22305 if (CONST_INT_P (count_exp))
22306 {
22307 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22308 {
22309 emit_block_move_via_libcall (dst, src, count_exp, false);
22310 count_exp = const0_rtx;
22311 goto epilogue;
22312 }
22313 }
22314 else
22315 {
22316 rtx hot_label = gen_label_rtx ();
22317 jump_around_label = gen_label_rtx ();
22318 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22319 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22320 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22321 emit_block_move_via_libcall (dst, src, count_exp, false);
22322 emit_jump (jump_around_label);
22323 emit_label (hot_label);
22324 }
22325 }
22326
22327 /* Step 2: Alignment prologue. */
22328
22329 if (desired_align > align)
22330 {
22331 if (align_bytes == 0)
22332 {
22333 /* Except for the first move in the epilogue, we no longer know
22334 the constant offset in aliasing info. It doesn't seem worth
22335 the pain to maintain it for the first move, so throw away
22336 the info early. */
22337 src = change_address (src, BLKmode, srcreg);
22338 dst = change_address (dst, BLKmode, destreg);
22339 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22340 desired_align);
22341 }
22342 else
22343 {
22344 /* If we know how many bytes need to be stored before dst is
22345 sufficiently aligned, maintain aliasing info accurately. */
22346 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22347 desired_align, align_bytes);
22348 count_exp = plus_constant (counter_mode (count_exp),
22349 count_exp, -align_bytes);
22350 count -= align_bytes;
22351 }
22352 if (need_zero_guard
22353 && (count < (unsigned HOST_WIDE_INT) size_needed
22354 || (align_bytes == 0
22355 && count < ((unsigned HOST_WIDE_INT) size_needed
22356 + desired_align - align))))
22357 {
22358 /* It is possible that we copied enough so the main loop will not
22359 execute. */
22360 gcc_assert (size_needed > 1);
22361 if (label == NULL_RTX)
22362 label = gen_label_rtx ();
22363 emit_cmp_and_jump_insns (count_exp,
22364 GEN_INT (size_needed),
22365 LTU, 0, counter_mode (count_exp), 1, label);
22366 if (expected_size == -1
22367 || expected_size < (desired_align - align) / 2 + size_needed)
22368 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22369 else
22370 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22371 }
22372 }
22373 if (label && size_needed == 1)
22374 {
22375 emit_label (label);
22376 LABEL_NUSES (label) = 1;
22377 label = NULL;
22378 epilogue_size_needed = 1;
22379 }
22380 else if (label == NULL_RTX)
22381 epilogue_size_needed = size_needed;
22382
22383 /* Step 3: Main loop. */
22384
22385 switch (alg)
22386 {
22387 case libcall:
22388 case no_stringop:
22389 gcc_unreachable ();
22390 case loop_1_byte:
22391 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22392 count_exp, QImode, 1, expected_size);
22393 break;
22394 case loop:
22395 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22396 count_exp, word_mode, 1, expected_size);
22397 break;
22398 case unrolled_loop:
22399 /* Unroll only by a factor of 2 in 32-bit mode, since we don't have enough
22400 registers for 4 temporaries anyway. */
22401 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22402 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22403 expected_size);
22404 break;
22405 case rep_prefix_8_byte:
22406 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22407 DImode);
22408 break;
22409 case rep_prefix_4_byte:
22410 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22411 SImode);
22412 break;
22413 case rep_prefix_1_byte:
22414 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22415 QImode);
22416 break;
22417 }
22418 /* Properly adjust the offsets of the src and dest memory for aliasing. */
22419 if (CONST_INT_P (count_exp))
22420 {
22421 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22422 (count / size_needed) * size_needed);
22423 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22424 (count / size_needed) * size_needed);
22425 }
22426 else
22427 {
22428 src = change_address (src, BLKmode, srcreg);
22429 dst = change_address (dst, BLKmode, destreg);
22430 }
22431
22432 /* Step 4: Epilogue to copy the remaining bytes. */
22433 epilogue:
22434 if (label)
22435 {
22436 /* When the main loop is done, COUNT_EXP might hold the original count,
22437 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22438 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22439 bytes. Compensate if needed. */
22440
22441 if (size_needed < epilogue_size_needed)
22442 {
22443 tmp =
22444 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22445 GEN_INT (size_needed - 1), count_exp, 1,
22446 OPTAB_DIRECT);
22447 if (tmp != count_exp)
22448 emit_move_insn (count_exp, tmp);
22449 }
22450 emit_label (label);
22451 LABEL_NUSES (label) = 1;
22452 }
22453
22454 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22455 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22456 epilogue_size_needed);
22457 if (jump_around_label)
22458 emit_label (jump_around_label);
22459 return true;
22460 }
22461
22462 /* Helper function for memset. For the QImode value 0xXY produce
22463 0xXYXYXYXY of the width specified by MODE. This is essentially
22464 a * 0x01010101, but we can do slightly better than
22465 synth_mult by unwinding the sequence by hand on CPUs with
22466 slow multiply. */
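/* For example, duplicating the constant 0xAB into SImode goes through the
   constant path below: v = 0xAB; v |= v << 8; v |= v << 16 gives 0xABABABAB.
   A non-constant byte is widened and then either multiplied by 0x01010101
   (0x0101010101010101 for DImode) or built up with the shift/IOR sequence,
   whichever the cost check below considers cheaper.  */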
22467 static rtx
22468 promote_duplicated_reg (enum machine_mode mode, rtx val)
22469 {
22470 enum machine_mode valmode = GET_MODE (val);
22471 rtx tmp;
22472 int nops = mode == DImode ? 3 : 2;
22473
22474 gcc_assert (mode == SImode || mode == DImode);
22475 if (val == const0_rtx)
22476 return copy_to_mode_reg (mode, const0_rtx);
22477 if (CONST_INT_P (val))
22478 {
22479 HOST_WIDE_INT v = INTVAL (val) & 255;
22480
22481 v |= v << 8;
22482 v |= v << 16;
22483 if (mode == DImode)
22484 v |= (v << 16) << 16;
22485 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22486 }
22487
22488 if (valmode == VOIDmode)
22489 valmode = QImode;
22490 if (valmode != QImode)
22491 val = gen_lowpart (QImode, val);
22492 if (mode == QImode)
22493 return val;
22494 if (!TARGET_PARTIAL_REG_STALL)
22495 nops--;
22496 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22497 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22498 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22499 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22500 {
22501 rtx reg = convert_modes (mode, QImode, val, true);
22502 tmp = promote_duplicated_reg (mode, const1_rtx);
22503 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22504 OPTAB_DIRECT);
22505 }
22506 else
22507 {
22508 rtx reg = convert_modes (mode, QImode, val, true);
22509
22510 if (!TARGET_PARTIAL_REG_STALL)
22511 if (mode == SImode)
22512 emit_insn (gen_movsi_insv_1 (reg, reg));
22513 else
22514 emit_insn (gen_movdi_insv_1 (reg, reg));
22515 else
22516 {
22517 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22518 NULL, 1, OPTAB_DIRECT);
22519 reg =
22520 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22521 }
22522 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22523 NULL, 1, OPTAB_DIRECT);
22524 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22525 if (mode == SImode)
22526 return reg;
22527 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22528 NULL, 1, OPTAB_DIRECT);
22529 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22530 return reg;
22531 }
22532 }
22533
22534 /* Duplicate value VAL using promote_duplicated_reg into the maximal size
22535 needed by the main loop copying SIZE_NEEDED chunks and by the prologue
22536 raising the alignment from ALIGN to DESIRED_ALIGN. */
22537 static rtx
22538 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22539 {
22540 rtx promoted_val;
22541
22542 if (TARGET_64BIT
22543 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22544 promoted_val = promote_duplicated_reg (DImode, val);
22545 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22546 promoted_val = promote_duplicated_reg (SImode, val);
22547 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22548 promoted_val = promote_duplicated_reg (HImode, val);
22549 else
22550 promoted_val = val;
22551
22552 return promoted_val;
22553 }
22554
22555 /* Expand string set operation (memset). Use i386 string operations when
22556 profitable. See the expand_movmem comment for an explanation of the
22557 individual steps performed. */
22558 bool
22559 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22560 rtx expected_align_exp, rtx expected_size_exp)
22561 {
22562 rtx destreg;
22563 rtx label = NULL;
22564 rtx tmp;
22565 rtx jump_around_label = NULL;
22566 HOST_WIDE_INT align = 1;
22567 unsigned HOST_WIDE_INT count = 0;
22568 HOST_WIDE_INT expected_size = -1;
22569 int size_needed = 0, epilogue_size_needed;
22570 int desired_align = 0, align_bytes = 0;
22571 enum stringop_alg alg;
22572 rtx promoted_val = NULL;
22573 bool force_loopy_epilogue = false;
22574 int dynamic_check;
22575 bool need_zero_guard = false;
22576
22577 if (CONST_INT_P (align_exp))
22578 align = INTVAL (align_exp);
22579 /* i386 can do misaligned access at reasonably increased cost. */
22580 if (CONST_INT_P (expected_align_exp)
22581 && INTVAL (expected_align_exp) > align)
22582 align = INTVAL (expected_align_exp);
22583 if (CONST_INT_P (count_exp))
22584 count = expected_size = INTVAL (count_exp);
22585 if (CONST_INT_P (expected_size_exp) && count == 0)
22586 expected_size = INTVAL (expected_size_exp);
22587
22588 /* Make sure we don't need to care about overflow later on. */
22589 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22590 return false;
22591
22592 /* Step 0: Decide on preferred algorithm, desired alignment and
22593 size of chunks to be copied by main loop. */
22594
22595 alg = decide_alg (count, expected_size, true, &dynamic_check);
22596 desired_align = decide_alignment (align, alg, expected_size);
22597
22598 if (!TARGET_ALIGN_STRINGOPS)
22599 align = desired_align;
22600
22601 if (alg == libcall)
22602 return false;
22603 gcc_assert (alg != no_stringop);
22604 if (!count)
22605 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22606 destreg = copy_addr_to_reg (XEXP (dst, 0));
22607 switch (alg)
22608 {
22609 case libcall:
22610 case no_stringop:
22611 gcc_unreachable ();
22612 case loop:
22613 need_zero_guard = true;
22614 size_needed = GET_MODE_SIZE (word_mode);
22615 break;
22616 case unrolled_loop:
22617 need_zero_guard = true;
22618 size_needed = GET_MODE_SIZE (word_mode) * 4;
22619 break;
22620 case rep_prefix_8_byte:
22621 size_needed = 8;
22622 break;
22623 case rep_prefix_4_byte:
22624 size_needed = 4;
22625 break;
22626 case rep_prefix_1_byte:
22627 size_needed = 1;
22628 break;
22629 case loop_1_byte:
22630 need_zero_guard = true;
22631 size_needed = 1;
22632 break;
22633 }
22634 epilogue_size_needed = size_needed;
22635
22636 /* Step 1: Prologue guard. */
22637
22638 /* Alignment code needs count to be in register. */
22639 if (CONST_INT_P (count_exp) && desired_align > align)
22640 {
22641 if (INTVAL (count_exp) > desired_align
22642 && INTVAL (count_exp) > size_needed)
22643 {
22644 align_bytes
22645 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22646 if (align_bytes <= 0)
22647 align_bytes = 0;
22648 else
22649 align_bytes = desired_align - align_bytes;
22650 }
22651 if (align_bytes == 0)
22652 {
22653 enum machine_mode mode = SImode;
22654 if (TARGET_64BIT && (count & ~0xffffffff))
22655 mode = DImode;
22656 count_exp = force_reg (mode, count_exp);
22657 }
22658 }
22659 /* Do the cheap promotion to allow better CSE across the
22660 main loop and epilogue (i.e. one load of the big constant in
22661 front of all code). */
22662 if (CONST_INT_P (val_exp))
22663 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22664 desired_align, align);
22665 /* Ensure that alignment prologue won't copy past end of block. */
22666 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22667 {
22668 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22669 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22670 Make sure it is a power of 2. */
22671 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22672
22673 /* To improve performance of small blocks, we jump around the VAL
22674 promoting code. This means that if the promoted VAL is not constant,
22675 we might not use it in the epilogue and have to use the byte
22676 loop variant. */
22677 if (epilogue_size_needed > 2 && !promoted_val)
22678 force_loopy_epilogue = true;
22679 if (count)
22680 {
22681 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22682 {
22683 /* If main algorithm works on QImode, no epilogue is needed.
22684 For small sizes just don't align anything. */
22685 if (size_needed == 1)
22686 desired_align = align;
22687 else
22688 goto epilogue;
22689 }
22690 }
22691 else
22692 {
22693 label = gen_label_rtx ();
22694 emit_cmp_and_jump_insns (count_exp,
22695 GEN_INT (epilogue_size_needed),
22696 LTU, 0, counter_mode (count_exp), 1, label);
22697 if (expected_size == -1 || expected_size <= epilogue_size_needed)
22698 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22699 else
22700 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22701 }
22702 }
22703 if (dynamic_check != -1)
22704 {
22705 rtx hot_label = gen_label_rtx ();
22706 jump_around_label = gen_label_rtx ();
22707 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22708 LEU, 0, counter_mode (count_exp), 1, hot_label);
22709 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22710 set_storage_via_libcall (dst, count_exp, val_exp, false);
22711 emit_jump (jump_around_label);
22712 emit_label (hot_label);
22713 }
22714
22715 /* Step 2: Alignment prologue. */
22716
22717 /* Do the expensive promotion once we branched off the small blocks. */
22718 if (!promoted_val)
22719 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22720 desired_align, align);
22721 gcc_assert (desired_align >= 1 && align >= 1);
22722
22723 if (desired_align > align)
22724 {
22725 if (align_bytes == 0)
22726 {
22727 /* Except for the first move in the epilogue, we no longer know
22728 the constant offset in aliasing info. It doesn't seem worth
22729 the pain to maintain it for the first move, so throw away
22730 the info early. */
22731 dst = change_address (dst, BLKmode, destreg);
22732 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
22733 desired_align);
22734 }
22735 else
22736 {
22737 /* If we know how many bytes need to be stored before dst is
22738 sufficiently aligned, maintain aliasing info accurately. */
22739 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
22740 desired_align, align_bytes);
22741 count_exp = plus_constant (counter_mode (count_exp),
22742 count_exp, -align_bytes);
22743 count -= align_bytes;
22744 }
22745 if (need_zero_guard
22746 && (count < (unsigned HOST_WIDE_INT) size_needed
22747 || (align_bytes == 0
22748 && count < ((unsigned HOST_WIDE_INT) size_needed
22749 + desired_align - align))))
22750 {
22751 /* It is possible that we copied enough so the main loop will not
22752 execute. */
22753 gcc_assert (size_needed > 1);
22754 if (label == NULL_RTX)
22755 label = gen_label_rtx ();
22756 emit_cmp_and_jump_insns (count_exp,
22757 GEN_INT (size_needed),
22758 LTU, 0, counter_mode (count_exp), 1, label);
22759 if (expected_size == -1
22760 || expected_size < (desired_align - align) / 2 + size_needed)
22761 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22762 else
22763 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22764 }
22765 }
22766 if (label && size_needed == 1)
22767 {
22768 emit_label (label);
22769 LABEL_NUSES (label) = 1;
22770 label = NULL;
22771 promoted_val = val_exp;
22772 epilogue_size_needed = 1;
22773 }
22774 else if (label == NULL_RTX)
22775 epilogue_size_needed = size_needed;
22776
22777 /* Step 3: Main loop. */
22778
22779 switch (alg)
22780 {
22781 case libcall:
22782 case no_stringop:
22783 gcc_unreachable ();
22784 case loop_1_byte:
22785 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22786 count_exp, QImode, 1, expected_size);
22787 break;
22788 case loop:
22789 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22790 count_exp, word_mode, 1, expected_size);
22791 break;
22792 case unrolled_loop:
22793 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22794 count_exp, word_mode, 4, expected_size);
22795 break;
22796 case rep_prefix_8_byte:
22797 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22798 DImode, val_exp);
22799 break;
22800 case rep_prefix_4_byte:
22801 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22802 SImode, val_exp);
22803 break;
22804 case rep_prefix_1_byte:
22805 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22806 QImode, val_exp);
22807 break;
22808 }
22809 /* Properly adjust the offset of the dest memory for aliasing. */
22810 if (CONST_INT_P (count_exp))
22811 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22812 (count / size_needed) * size_needed);
22813 else
22814 dst = change_address (dst, BLKmode, destreg);
22815
22816 /* Step 4: Epilogue to copy the remaining bytes. */
22817
22818 if (label)
22819 {
22820 /* When the main loop is done, COUNT_EXP might hold the original count,
22821 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22822 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22823 bytes. Compensate if needed. */
22824
22825 if (size_needed < epilogue_size_needed)
22826 {
22827 tmp =
22828 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22829 GEN_INT (size_needed - 1), count_exp, 1,
22830 OPTAB_DIRECT);
22831 if (tmp != count_exp)
22832 emit_move_insn (count_exp, tmp);
22833 }
22834 emit_label (label);
22835 LABEL_NUSES (label) = 1;
22836 }
22837 epilogue:
22838 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22839 {
22840 if (force_loopy_epilogue)
22841 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
22842 epilogue_size_needed);
22843 else
22844 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
22845 epilogue_size_needed);
22846 }
22847 if (jump_around_label)
22848 emit_label (jump_around_label);
22849 return true;
22850 }
22851
22852 /* Expand the appropriate insns for doing strlen if not just doing
22853 repnz; scasb
22854
22855 out = result, initialized with the start address
22856 align_rtx = alignment of the address.
22857 scratch = scratch register, initialized with the start address when
22858 not aligned, otherwise undefined
22859
22860 This is just the body. It needs the initializations mentioned above and
22861 some address computing at the end. These things are done in i386.md. */
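/* Roughly, the body expanded below does (an illustrative C sketch, not the
   emitted RTL):

     while ((uintptr_t) out & 3)           -- check up to 3 bytes to align
       { if (*(char *) out == 0) goto done; out++; }
     do                                    -- then scan 4 bytes at a time
       { w = *(unsigned int *) out; out += 4; }
     while (((w - 0x01010101) & ~w & 0x80808080) == 0);
     back OUT up so it points at the zero byte;
   done: ;  */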
22862
22863 static void
22864 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
22865 {
22866 int align;
22867 rtx tmp;
22868 rtx align_2_label = NULL_RTX;
22869 rtx align_3_label = NULL_RTX;
22870 rtx align_4_label = gen_label_rtx ();
22871 rtx end_0_label = gen_label_rtx ();
22872 rtx mem;
22873 rtx tmpreg = gen_reg_rtx (SImode);
22874 rtx scratch = gen_reg_rtx (SImode);
22875 rtx cmp;
22876
22877 align = 0;
22878 if (CONST_INT_P (align_rtx))
22879 align = INTVAL (align_rtx);
22880
22881 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
22882
22883 /* Is there a known alignment and is it less than 4? */
22884 if (align < 4)
22885 {
22886 rtx scratch1 = gen_reg_rtx (Pmode);
22887 emit_move_insn (scratch1, out);
22888 /* Is there a known alignment and is it not 2? */
22889 if (align != 2)
22890 {
22891 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
22892 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
22893
22894 /* Leave just the 3 lower bits. */
22895 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
22896 NULL_RTX, 0, OPTAB_WIDEN);
22897
22898 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22899 Pmode, 1, align_4_label);
22900 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
22901 Pmode, 1, align_2_label);
22902 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
22903 Pmode, 1, align_3_label);
22904 }
22905 else
22906 {
22907 /* Since the alignment is 2, we have to check 2 or 0 bytes;
22908 check if it is aligned to 4 bytes. */
22909
22910 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
22911 NULL_RTX, 0, OPTAB_WIDEN);
22912
22913 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22914 Pmode, 1, align_4_label);
22915 }
22916
22917 mem = change_address (src, QImode, out);
22918
22919 /* Now compare the bytes. */
22920
22921 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
22922 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
22923 QImode, 1, end_0_label);
22924
22925 /* Increment the address. */
22926 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22927
22928 /* Not needed with an alignment of 2 */
22929 if (align != 2)
22930 {
22931 emit_label (align_2_label);
22932
22933 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22934 end_0_label);
22935
22936 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22937
22938 emit_label (align_3_label);
22939 }
22940
22941 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22942 end_0_label);
22943
22944 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22945 }
22946
22947 /* Generate a loop to check 4 bytes at a time. It is not a good idea
22948 to align this loop; it only makes the program larger and does not
22949 help it run faster. */
22950 emit_label (align_4_label);
22951
22952 mem = change_address (src, SImode, out);
22953 emit_move_insn (scratch, mem);
22954 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
22955
22956 /* This formula yields a nonzero result iff one of the bytes is zero.
22957 This saves three branches inside the loop and many cycles. */
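/* The insns below compute TMPREG = (w - 0x01010101) & ~w & 0x80808080.
   Worked example: for w = 0x12003456 (one zero byte) this is
   0x10FF3355 & 0xEDFFCBA9 & 0x80808080 = 0x00800000, nonzero with the 0x80
   flag sitting in the byte that was zero; for w = 0x12345678 (no zero byte)
   the result is 0.  */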
22958
22959 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
22960 emit_insn (gen_one_cmplsi2 (scratch, scratch));
22961 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
22962 emit_insn (gen_andsi3 (tmpreg, tmpreg,
22963 gen_int_mode (0x80808080, SImode)));
22964 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
22965 align_4_label);
22966
22967 if (TARGET_CMOVE)
22968 {
22969 rtx reg = gen_reg_rtx (SImode);
22970 rtx reg2 = gen_reg_rtx (Pmode);
22971 emit_move_insn (reg, tmpreg);
22972 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
22973
22974 /* If zero is not in the first two bytes, move two bytes forward. */
22975 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22976 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22977 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22978 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
22979 gen_rtx_IF_THEN_ELSE (SImode, tmp,
22980 reg,
22981 tmpreg)));
22982 /* Emit lea manually to avoid clobbering of flags. */
22983 emit_insn (gen_rtx_SET (SImode, reg2,
22984 gen_rtx_PLUS (Pmode, out, const2_rtx)));
22985
22986 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22987 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22988 emit_insn (gen_rtx_SET (VOIDmode, out,
22989 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
22990 reg2,
22991 out)));
22992 }
22993 else
22994 {
22995 rtx end_2_label = gen_label_rtx ();
22996 /* Is zero in the first two bytes? */
22997
22998 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22999 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23000 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23001 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23002 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23003 pc_rtx);
23004 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23005 JUMP_LABEL (tmp) = end_2_label;
23006
23007 /* Not in the first two. Move two bytes forward. */
23008 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23009 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23010
23011 emit_label (end_2_label);
23012
23013 }
23014
23015 /* Avoid branch in fixing the byte. */
23016 tmpreg = gen_lowpart (QImode, tmpreg);
23017 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23018 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23019 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23020 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23021
23022 emit_label (end_0_label);
23023 }
23024
23025 /* Expand strlen. */
23026
23027 bool
23028 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23029 {
23030 rtx addr, scratch1, scratch2, scratch3, scratch4;
23031
23032 /* The generic case of the strlen expander is long. Avoid expanding
23033 it unless TARGET_INLINE_ALL_STRINGOPS. */
23034
23035 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23036 && !TARGET_INLINE_ALL_STRINGOPS
23037 && !optimize_insn_for_size_p ()
23038 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23039 return false;
23040
23041 addr = force_reg (Pmode, XEXP (src, 0));
23042 scratch1 = gen_reg_rtx (Pmode);
23043
23044 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23045 && !optimize_insn_for_size_p ())
23046 {
23047 /* Well, it seems that some optimizer does not combine a call like
23048 foo(strlen(bar), strlen(bar));
23049 when the move and the subtraction are done here. It does calculate
23050 the length just once when these instructions are done inside of
23051 output_strlen_unroll(). But I think that since &bar[strlen(bar)] is
23052 often used and I use one fewer register for the lifetime of
23053 output_strlen_unroll(), this is better. */
23054
23055 emit_move_insn (out, addr);
23056
23057 ix86_expand_strlensi_unroll_1 (out, src, align);
23058
23059 /* strlensi_unroll_1 returns the address of the zero at the end of
23060 the string, like memchr(), so compute the length by subtracting
23061 the start address. */
23062 emit_insn (ix86_gen_sub3 (out, out, addr));
23063 }
23064 else
23065 {
23066 rtx unspec;
23067
23068 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23069 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23070 return false;
23071
23072 scratch2 = gen_reg_rtx (Pmode);
23073 scratch3 = gen_reg_rtx (Pmode);
23074 scratch4 = force_reg (Pmode, constm1_rtx);
23075
23076 emit_move_insn (scratch3, addr);
23077 eoschar = force_reg (QImode, eoschar);
23078
23079 src = replace_equiv_address_nv (src, scratch3);
23080
23081 /* If .md starts supporting :P, this can be done in .md. */
23082 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23083 scratch4), UNSPEC_SCAS);
23084 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23085 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23086 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23087 }
23088 return true;
23089 }
23090
23091 /* For a given symbol (function), construct code to compute the address of
23092 its PLT entry in the large x86-64 PIC model. */
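/* The emitted sequence is essentially (illustrative AT&T syntax; the actual
   register choice happens later):

     movabs $symbol@PLTOFF, %reg
     add    %<pic register>, %reg

   i.e. the PLT entry address is the 64-bit PLTOFF constant plus the value
   held in pic_offset_table_rtx.  */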
23093 rtx
23094 construct_plt_address (rtx symbol)
23095 {
23096 rtx tmp, unspec;
23097
23098 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23099 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23100 gcc_assert (Pmode == DImode);
23101
23102 tmp = gen_reg_rtx (Pmode);
23103 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23104
23105 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23106 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23107 return tmp;
23108 }
23109
23110 rtx
23111 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23112 rtx callarg2,
23113 rtx pop, bool sibcall)
23114 {
23115 /* We need to represent that SI and DI registers are clobbered
23116 by SYSV calls. */
23117 static int clobbered_registers[] = {
23118 XMM6_REG, XMM7_REG, XMM8_REG,
23119 XMM9_REG, XMM10_REG, XMM11_REG,
23120 XMM12_REG, XMM13_REG, XMM14_REG,
23121 XMM15_REG, SI_REG, DI_REG
23122 };
23123 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23124 rtx use = NULL, call;
23125 unsigned int vec_len;
23126
23127 if (pop == const0_rtx)
23128 pop = NULL;
23129 gcc_assert (!TARGET_64BIT || !pop);
23130
23131 if (TARGET_MACHO && !TARGET_64BIT)
23132 {
23133 #if TARGET_MACHO
23134 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23135 fnaddr = machopic_indirect_call_target (fnaddr);
23136 #endif
23137 }
23138 else
23139 {
23140 /* Static functions and indirect calls don't need the pic register. */
23141 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23142 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23143 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23144 use_reg (&use, pic_offset_table_rtx);
23145 }
23146
23147 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23148 {
23149 rtx al = gen_rtx_REG (QImode, AX_REG);
23150 emit_move_insn (al, callarg2);
23151 use_reg (&use, al);
23152 }
23153
23154 if (ix86_cmodel == CM_LARGE_PIC
23155 && MEM_P (fnaddr)
23156 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23157 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23158 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23159 else if (sibcall
23160 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23161 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23162 {
23163 fnaddr = XEXP (fnaddr, 0);
23164 if (GET_MODE (fnaddr) != word_mode)
23165 fnaddr = convert_to_mode (word_mode, fnaddr, 1);
23166 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23167 }
23168
23169 vec_len = 0;
23170 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23171 if (retval)
23172 call = gen_rtx_SET (VOIDmode, retval, call);
23173 vec[vec_len++] = call;
23174
23175 if (pop)
23176 {
23177 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23178 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23179 vec[vec_len++] = pop;
23180 }
23181
23182 if (TARGET_64BIT_MS_ABI
23183 && (!callarg2 || INTVAL (callarg2) != -2))
23184 {
23185 unsigned i;
23186
23187 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23188 UNSPEC_MS_TO_SYSV_CALL);
23189
23190 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23191 vec[vec_len++]
23192 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
23193 ? TImode : DImode,
23194 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23195 ? TImode : DImode,
23196 clobbered_registers[i]));
23197 }
23198
23199 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
23200 if (TARGET_VZEROUPPER)
23201 {
23202 int avx256;
23203 if (cfun->machine->callee_pass_avx256_p)
23204 {
23205 if (cfun->machine->callee_return_avx256_p)
23206 avx256 = callee_return_pass_avx256;
23207 else
23208 avx256 = callee_pass_avx256;
23209 }
23210 else if (cfun->machine->callee_return_avx256_p)
23211 avx256 = callee_return_avx256;
23212 else
23213 avx256 = call_no_avx256;
23214
23215 if (reload_completed)
23216 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
23217 else
23218 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
23219 gen_rtvec (1, GEN_INT (avx256)),
23220 UNSPEC_CALL_NEEDS_VZEROUPPER);
23221 }
23222
23223 if (vec_len > 1)
23224 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23225 call = emit_call_insn (call);
23226 if (use)
23227 CALL_INSN_FUNCTION_USAGE (call) = use;
23228
23229 return call;
23230 }
23231
23232 void
23233 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
23234 {
23235 rtx pat = PATTERN (insn);
23236 rtvec vec = XVEC (pat, 0);
23237 int len = GET_NUM_ELEM (vec) - 1;
23238
23239 /* Strip off the last entry of the parallel. */
23240 gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
23241 gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
23242 if (len == 1)
23243 pat = RTVEC_ELT (vec, 0);
23244 else
23245 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
23246
23247 emit_insn (gen_avx_vzeroupper (vzeroupper));
23248 emit_call_insn (pat);
23249 }
23250
23251 /* Output the assembly for a call instruction. */
23252
23253 const char *
23254 ix86_output_call_insn (rtx insn, rtx call_op)
23255 {
23256 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23257 bool seh_nop_p = false;
23258 const char *xasm;
23259
23260 if (SIBLING_CALL_P (insn))
23261 {
23262 if (direct_p)
23263 xasm = "jmp\t%P0";
23264 /* SEH epilogue detection requires the indirect branch case
23265 to include REX.W. */
23266 else if (TARGET_SEH)
23267 xasm = "rex.W jmp %A0";
23268 else
23269 xasm = "jmp\t%A0";
23270
23271 output_asm_insn (xasm, &call_op);
23272 return "";
23273 }
23274
23275 /* SEH unwinding can require an extra nop to be emitted in several
23276 circumstances. Determine if we have one of those. */
23277 if (TARGET_SEH)
23278 {
23279 rtx i;
23280
23281 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23282 {
23283 /* If we get to another real insn, we don't need the nop. */
23284 if (INSN_P (i))
23285 break;
23286
23287 /* If we get to the epilogue note, prevent a catch region from
23288 being adjacent to the standard epilogue sequence. If non-call
23289 exceptions are enabled, we'll have done this during epilogue emission. */
23290 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23291 && !flag_non_call_exceptions
23292 && !can_throw_internal (insn))
23293 {
23294 seh_nop_p = true;
23295 break;
23296 }
23297 }
23298
23299 /* If we didn't find a real insn following the call, prevent the
23300 unwinder from looking into the next function. */
23301 if (i == NULL)
23302 seh_nop_p = true;
23303 }
23304
23305 if (direct_p)
23306 xasm = "call\t%P0";
23307 else
23308 xasm = "call\t%A0";
23309
23310 output_asm_insn (xasm, &call_op);
23311
23312 if (seh_nop_p)
23313 return "nop";
23314
23315 return "";
23316 }
23317 \f
23318 /* Clear stack slot assignments remembered from previous functions.
23319 This is called from INIT_EXPANDERS once before RTL is emitted for each
23320 function. */
23321
23322 static struct machine_function *
23323 ix86_init_machine_status (void)
23324 {
23325 struct machine_function *f;
23326
23327 f = ggc_alloc_cleared_machine_function ();
23328 f->use_fast_prologue_epilogue_nregs = -1;
23329 f->tls_descriptor_call_expanded_p = 0;
23330 f->call_abi = ix86_abi;
23331
23332 return f;
23333 }
23334
23335 /* Return a MEM corresponding to a stack slot with mode MODE.
23336 Allocate a new slot if necessary.
23337
23338 The RTL for a function can have several slots available: N is
23339 which slot to use. */
23340
23341 rtx
23342 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23343 {
23344 struct stack_local_entry *s;
23345
23346 gcc_assert (n < MAX_386_STACK_LOCALS);
23347
23348 /* Virtual slot is valid only before vregs are instantiated. */
23349 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
23350
23351 for (s = ix86_stack_locals; s; s = s->next)
23352 if (s->mode == mode && s->n == n)
23353 return validize_mem (copy_rtx (s->rtl));
23354
23355 s = ggc_alloc_stack_local_entry ();
23356 s->n = n;
23357 s->mode = mode;
23358 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23359
23360 s->next = ix86_stack_locals;
23361 ix86_stack_locals = s;
23362 return validize_mem (s->rtl);
23363 }
23364 \f
23365 /* Calculate the length of the memory address in the instruction encoding.
23366 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23367 or other prefixes. */
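/* Illustrative return values: (%eax) -> 0, (%esp) -> 1 (SIB byte),
   8(%ebp) -> 1 (disp8), 1024(%ebx) -> 4 (disp32), 4(%esp,%ecx,2) -> 2
   (SIB byte plus disp8); an addr32 prefix, when present, adds 1 more.  */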
23368
23369 int
23370 memory_address_length (rtx addr)
23371 {
23372 struct ix86_address parts;
23373 rtx base, index, disp;
23374 int len;
23375 int ok;
23376
23377 if (GET_CODE (addr) == PRE_DEC
23378 || GET_CODE (addr) == POST_INC
23379 || GET_CODE (addr) == PRE_MODIFY
23380 || GET_CODE (addr) == POST_MODIFY)
23381 return 0;
23382
23383 ok = ix86_decompose_address (addr, &parts);
23384 gcc_assert (ok);
23385
23386 if (parts.base && GET_CODE (parts.base) == SUBREG)
23387 parts.base = SUBREG_REG (parts.base);
23388 if (parts.index && GET_CODE (parts.index) == SUBREG)
23389 parts.index = SUBREG_REG (parts.index);
23390
23391 base = parts.base;
23392 index = parts.index;
23393 disp = parts.disp;
23394
23395 /* Add length of addr32 prefix. */
23396 len = (GET_CODE (addr) == ZERO_EXTEND
23397 || GET_CODE (addr) == AND);
23398
23399 /* Rule of thumb:
23400 - esp as the base always wants an index,
23401 - ebp as the base always wants a displacement,
23402 - r12 as the base always wants an index,
23403 - r13 as the base always wants a displacement. */
23404
23405 /* Register Indirect. */
23406 if (base && !index && !disp)
23407 {
23408 /* esp (for its index) and ebp (for its displacement) need
23409 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23410 code. */
23411 if (REG_P (addr)
23412 && (addr == arg_pointer_rtx
23413 || addr == frame_pointer_rtx
23414 || REGNO (addr) == SP_REG
23415 || REGNO (addr) == BP_REG
23416 || REGNO (addr) == R12_REG
23417 || REGNO (addr) == R13_REG))
23418 len = 1;
23419 }
23420
23421 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23422 is not disp32, but disp32(%rip), so a SIB byte
23423 is needed for plain disp32, unless print_operand_address
23424 optimizes it into disp32(%rip) or (%rip) is implied
23425 by UNSPEC. */
23426 else if (disp && !base && !index)
23427 {
23428 len = 4;
23429 if (TARGET_64BIT)
23430 {
23431 rtx symbol = disp;
23432
23433 if (GET_CODE (disp) == CONST)
23434 symbol = XEXP (disp, 0);
23435 if (GET_CODE (symbol) == PLUS
23436 && CONST_INT_P (XEXP (symbol, 1)))
23437 symbol = XEXP (symbol, 0);
23438
23439 if (GET_CODE (symbol) != LABEL_REF
23440 && (GET_CODE (symbol) != SYMBOL_REF
23441 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23442 && (GET_CODE (symbol) != UNSPEC
23443 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23444 && XINT (symbol, 1) != UNSPEC_PCREL
23445 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23446 len += 1;
23447 }
23448 }
23449
23450 else
23451 {
23452 /* Find the length of the displacement constant. */
23453 if (disp)
23454 {
23455 if (base && satisfies_constraint_K (disp))
23456 len = 1;
23457 else
23458 len = 4;
23459 }
23460 /* ebp always wants a displacement. Similarly r13. */
23461 else if (base && REG_P (base)
23462 && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23463 len = 1;
23464
23465 /* An index requires the two-byte modrm form.... */
23466 if (index
23467 /* ...like esp (or r12), which always wants an index. */
23468 || base == arg_pointer_rtx
23469 || base == frame_pointer_rtx
23470 || (base && REG_P (base)
23471 && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23472 len += 1;
23473 }
23474
23475 switch (parts.seg)
23476 {
23477 case SEG_FS:
23478 case SEG_GS:
23479 len += 1;
23480 break;
23481 default:
23482 break;
23483 }
23484
23485 return len;
23486 }
23487
23488 /* Compute default value for "length_immediate" attribute. When SHORTFORM
23489 is set, expect that the insn has an 8-bit immediate alternative. */
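/* For example, with SHORTFORM set, "add $3, %eax" can use the sign-extended
   8-bit immediate form and yields 1, while "add $1000, %eax" needs a full
   32-bit immediate and yields 4; an insn with no constant operand yields 0.  */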
23490 int
23491 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23492 {
23493 int len = 0;
23494 int i;
23495 extract_insn_cached (insn);
23496 for (i = recog_data.n_operands - 1; i >= 0; --i)
23497 if (CONSTANT_P (recog_data.operand[i]))
23498 {
23499 enum attr_mode mode = get_attr_mode (insn);
23500
23501 gcc_assert (!len);
23502 if (shortform && CONST_INT_P (recog_data.operand[i]))
23503 {
23504 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23505 switch (mode)
23506 {
23507 case MODE_QI:
23508 len = 1;
23509 continue;
23510 case MODE_HI:
23511 ival = trunc_int_for_mode (ival, HImode);
23512 break;
23513 case MODE_SI:
23514 ival = trunc_int_for_mode (ival, SImode);
23515 break;
23516 default:
23517 break;
23518 }
23519 if (IN_RANGE (ival, -128, 127))
23520 {
23521 len = 1;
23522 continue;
23523 }
23524 }
23525 switch (mode)
23526 {
23527 case MODE_QI:
23528 len = 1;
23529 break;
23530 case MODE_HI:
23531 len = 2;
23532 break;
23533 case MODE_SI:
23534 len = 4;
23535 break;
23536 /* Immediates for DImode instructions are encoded as 32-bit sign-extended values. */
23537 case MODE_DI:
23538 len = 4;
23539 break;
23540 default:
23541 fatal_insn ("unknown insn mode", insn);
23542 }
23543 }
23544 return len;
23545 }
23546 /* Compute default value for "length_address" attribute. */
23547 int
23548 ix86_attr_length_address_default (rtx insn)
23549 {
23550 int i;
23551
23552 if (get_attr_type (insn) == TYPE_LEA)
23553 {
23554 rtx set = PATTERN (insn), addr;
23555
23556 if (GET_CODE (set) == PARALLEL)
23557 set = XVECEXP (set, 0, 0);
23558
23559 gcc_assert (GET_CODE (set) == SET);
23560
23561 addr = SET_SRC (set);
23562 if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI)
23563 {
23564 if (GET_CODE (addr) == ZERO_EXTEND)
23565 addr = XEXP (addr, 0);
23566 if (GET_CODE (addr) == SUBREG)
23567 addr = SUBREG_REG (addr);
23568 }
23569
23570 return memory_address_length (addr);
23571 }
23572
23573 extract_insn_cached (insn);
23574 for (i = recog_data.n_operands - 1; i >= 0; --i)
23575 if (MEM_P (recog_data.operand[i]))
23576 {
23577 constrain_operands_cached (reload_completed);
23578 if (which_alternative != -1)
23579 {
23580 const char *constraints = recog_data.constraints[i];
23581 int alt = which_alternative;
23582
23583 while (*constraints == '=' || *constraints == '+')
23584 constraints++;
23585 while (alt-- > 0)
23586 while (*constraints++ != ',')
23587 ;
23588 /* Skip ignored operands. */
23589 if (*constraints == 'X')
23590 continue;
23591 }
23592 return memory_address_length (XEXP (recog_data.operand[i], 0));
23593 }
23594 return 0;
23595 }
23596
23597 /* Compute default value for "length_vex" attribute. It includes
23598 2 or 3 byte VEX prefix and 1 opcode byte. */
23599
23600 int
23601 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23602 {
23603 int i;
23604
23605 /* Only the 0f opcode map can use the 2-byte VEX prefix; the VEX.W bit
23606 requires the 3-byte VEX prefix. */
23607 if (!has_0f_opcode || has_vex_w)
23608 return 3 + 1;
23609
23610 /* We can always use 2 byte VEX prefix in 32bit. */
23611 if (!TARGET_64BIT)
23612 return 2 + 1;
23613
23614 extract_insn_cached (insn);
23615
23616 for (i = recog_data.n_operands - 1; i >= 0; --i)
23617 if (REG_P (recog_data.operand[i]))
23618 {
23619 /* REX.W bit uses 3 byte VEX prefix. */
23620 if (GET_MODE (recog_data.operand[i]) == DImode
23621 && GENERAL_REG_P (recog_data.operand[i]))
23622 return 3 + 1;
23623 }
23624 else
23625 {
23626 /* REX.X or REX.B bits use 3 byte VEX prefix. */
23627 if (MEM_P (recog_data.operand[i])
23628 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23629 return 3 + 1;
23630 }
23631
23632 return 2 + 1;
23633 }
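
/* For example, "vaddps %xmm1, %xmm2, %xmm0" (a 0f-map insn needing none of
   the REX.W/X/B bits) fits the 2-byte C5 prefix, giving 2 + 1 bytes here,
   while a DImode general register operand (REX.W), an extended register in a
   memory operand (REX.X/REX.B) or a non-0f opcode map all force the 3-byte
   C4 prefix, giving 3 + 1 bytes.  */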
23634 \f
23635 /* Return the maximum number of instructions a cpu can issue. */
23636
23637 static int
23638 ix86_issue_rate (void)
23639 {
23640 switch (ix86_tune)
23641 {
23642 case PROCESSOR_PENTIUM:
23643 case PROCESSOR_ATOM:
23644 case PROCESSOR_K6:
23645 return 2;
23646
23647 case PROCESSOR_PENTIUMPRO:
23648 case PROCESSOR_PENTIUM4:
23649 case PROCESSOR_CORE2_32:
23650 case PROCESSOR_CORE2_64:
23651 case PROCESSOR_COREI7_32:
23652 case PROCESSOR_COREI7_64:
23653 case PROCESSOR_ATHLON:
23654 case PROCESSOR_K8:
23655 case PROCESSOR_AMDFAM10:
23656 case PROCESSOR_NOCONA:
23657 case PROCESSOR_GENERIC32:
23658 case PROCESSOR_GENERIC64:
23659 case PROCESSOR_BDVER1:
23660 case PROCESSOR_BDVER2:
23661 case PROCESSOR_BTVER1:
23662 return 3;
23663
23664 default:
23665 return 1;
23666 }
23667 }
23668
23669 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
23670 by DEP_INSN and nothing else set by DEP_INSN. */
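/* For example, on Pentium a "cmp" that sets the flags followed by a "jcc" or
   "setcc" that only reads them satisfies this test, and ix86_adjust_cost
   below then zeroes the dependence cost, matching the hardware's
   compare/branch pairing.  */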
23671
23672 static bool
23673 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
23674 {
23675 rtx set, set2;
23676
23677 /* Simplify the test for uninteresting insns. */
23678 if (insn_type != TYPE_SETCC
23679 && insn_type != TYPE_ICMOV
23680 && insn_type != TYPE_FCMOV
23681 && insn_type != TYPE_IBR)
23682 return false;
23683
23684 if ((set = single_set (dep_insn)) != 0)
23685 {
23686 set = SET_DEST (set);
23687 set2 = NULL_RTX;
23688 }
23689 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
23690 && XVECLEN (PATTERN (dep_insn), 0) == 2
23691 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
23692 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
23693 {
23694 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
23695 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
23696 }
23697 else
23698 return false;
23699
23700 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
23701 return false;
23702
23703 /* This test is true if the dependent insn reads the flags but
23704 not any other potentially set register. */
23705 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
23706 return false;
23707
23708 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
23709 return false;
23710
23711 return true;
23712 }
23713
23714 /* Return true iff USE_INSN has a memory address with operands set by
23715 SET_INSN. */
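/* For example, "addl $4, %eax" immediately followed by "movl (%eax), %ebx"
   is such a dependence: the load's address is modified by the first insn,
   which is the Address Generation Interlock penalized for Pentium in
   ix86_adjust_cost below.  */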
23716
23717 bool
23718 ix86_agi_dependent (rtx set_insn, rtx use_insn)
23719 {
23720 int i;
23721 extract_insn_cached (use_insn);
23722 for (i = recog_data.n_operands - 1; i >= 0; --i)
23723 if (MEM_P (recog_data.operand[i]))
23724 {
23725 rtx addr = XEXP (recog_data.operand[i], 0);
23726 return modified_in_p (addr, set_insn) != 0;
23727 }
23728 return false;
23729 }
23730
23731 static int
23732 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
23733 {
23734 enum attr_type insn_type, dep_insn_type;
23735 enum attr_memory memory;
23736 rtx set, set2;
23737 int dep_insn_code_number;
23738
23739 /* Anti and output dependencies have zero cost on all CPUs. */
23740 if (REG_NOTE_KIND (link) != 0)
23741 return 0;
23742
23743 dep_insn_code_number = recog_memoized (dep_insn);
23744
23745 /* If we can't recognize the insns, we can't really do anything. */
23746 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
23747 return cost;
23748
23749 insn_type = get_attr_type (insn);
23750 dep_insn_type = get_attr_type (dep_insn);
23751
23752 switch (ix86_tune)
23753 {
23754 case PROCESSOR_PENTIUM:
23755 /* Address Generation Interlock adds a cycle of latency. */
23756 if (insn_type == TYPE_LEA)
23757 {
23758 rtx addr = PATTERN (insn);
23759
23760 if (GET_CODE (addr) == PARALLEL)
23761 addr = XVECEXP (addr, 0, 0);
23762
23763 gcc_assert (GET_CODE (addr) == SET);
23764
23765 addr = SET_SRC (addr);
23766 if (modified_in_p (addr, dep_insn))
23767 cost += 1;
23768 }
23769 else if (ix86_agi_dependent (dep_insn, insn))
23770 cost += 1;
23771
23772 /* ??? Compares pair with jump/setcc. */
23773 if (ix86_flags_dependent (insn, dep_insn, insn_type))
23774 cost = 0;
23775
23776 /* Floating point stores require value to be ready one cycle earlier. */
23777 if (insn_type == TYPE_FMOV
23778 && get_attr_memory (insn) == MEMORY_STORE
23779 && !ix86_agi_dependent (dep_insn, insn))
23780 cost += 1;
23781 break;
23782
23783 case PROCESSOR_PENTIUMPRO:
23784 memory = get_attr_memory (insn);
23785
23786 /* INT->FP conversion is expensive. */
23787 if (get_attr_fp_int_src (dep_insn))
23788 cost += 5;
23789
23790 /* There is one cycle extra latency between an FP op and a store. */
23791 if (insn_type == TYPE_FMOV
23792 && (set = single_set (dep_insn)) != NULL_RTX
23793 && (set2 = single_set (insn)) != NULL_RTX
23794 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
23795 && MEM_P (SET_DEST (set2)))
23796 cost += 1;
23797
23798 /* Show the ability of the reorder buffer to hide the latency of a load by
23799 executing it in parallel with the previous instruction when that
23800 instruction is not needed to compute the address. */
23801 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23802 && !ix86_agi_dependent (dep_insn, insn))
23803 {
23804 /* Claim moves to take one cycle, as the core can issue one load
23805 at a time and the next load can start a cycle later. */
23806 if (dep_insn_type == TYPE_IMOV
23807 || dep_insn_type == TYPE_FMOV)
23808 cost = 1;
23809 else if (cost > 1)
23810 cost--;
23811 }
23812 break;
23813
23814 case PROCESSOR_K6:
23815 memory = get_attr_memory (insn);
23816
23817 /* The esp dependency is resolved before the instruction is really
23818 finished. */
23819 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
23820 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
23821 return 1;
23822
23823 /* INT->FP conversion is expensive. */
23824 if (get_attr_fp_int_src (dep_insn))
23825 cost += 5;
23826
23827 /* Show the ability of the reorder buffer to hide the latency of a load by
23828 executing it in parallel with the previous instruction when that
23829 instruction is not needed to compute the address. */
23830 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23831 && !ix86_agi_dependent (dep_insn, insn))
23832 {
23833 /* Claim moves to take one cycle, as the core can issue one load
23834 at a time and the next load can start a cycle later. */
23835 if (dep_insn_type == TYPE_IMOV
23836 || dep_insn_type == TYPE_FMOV)
23837 cost = 1;
23838 else if (cost > 2)
23839 cost -= 2;
23840 else
23841 cost = 1;
23842 }
23843 break;
23844
23845 case PROCESSOR_ATHLON:
23846 case PROCESSOR_K8:
23847 case PROCESSOR_AMDFAM10:
23848 case PROCESSOR_BDVER1:
23849 case PROCESSOR_BDVER2:
23850 case PROCESSOR_BTVER1:
23851 case PROCESSOR_ATOM:
23852 case PROCESSOR_GENERIC32:
23853 case PROCESSOR_GENERIC64:
23854 memory = get_attr_memory (insn);
23855
23856 /* Show the ability of the reorder buffer to hide the latency of a load by
23857 executing it in parallel with the previous instruction when that
23858 instruction is not needed to compute the address. */
23859 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23860 && !ix86_agi_dependent (dep_insn, insn))
23861 {
23862 enum attr_unit unit = get_attr_unit (insn);
23863 int loadcost = 3;
23864
23865 /* Because of the difference between the length of integer and
23866 floating unit pipeline preparation stages, the memory operands
23867 for floating point are cheaper.
23868
23869 ??? For Athlon the difference is most probably 2. */
23870 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
23871 loadcost = 3;
23872 else
23873 loadcost = TARGET_ATHLON ? 2 : 0;
23874
23875 if (cost >= loadcost)
23876 cost -= loadcost;
23877 else
23878 cost = 0;
23879 }
23880
23881 default:
23882 break;
23883 }
23884
23885 return cost;
23886 }
23887
23888 /* How many alternative schedules to try. This should be as wide as the
23889 scheduling freedom in the DFA, but no wider. Making this value too
23890 large results in extra work for the scheduler. */
23891
23892 static int
23893 ia32_multipass_dfa_lookahead (void)
23894 {
23895 switch (ix86_tune)
23896 {
23897 case PROCESSOR_PENTIUM:
23898 return 2;
23899
23900 case PROCESSOR_PENTIUMPRO:
23901 case PROCESSOR_K6:
23902 return 1;
23903
23904 case PROCESSOR_CORE2_32:
23905 case PROCESSOR_CORE2_64:
23906 case PROCESSOR_COREI7_32:
23907 case PROCESSOR_COREI7_64:
23908 /* Generally, we want haifa-sched:max_issue() to look ahead as far
23909 as the number of instructions that can be executed in one cycle, i.e.,
23910 issue_rate. I wonder why tuning for many CPUs does not do this. */
23911 return ix86_issue_rate ();
23912
23913 default:
23914 return 0;
23915 }
23916 }
23917
23918 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
23919 execution. It is applied if
23920 (1) an IMUL instruction is at the top of the ready list, and
23921 (2) there is exactly one producer of an independent IMUL instruction in
23922 the ready list;
23923 (3) in that case the found producer is moved to the top of the ready list.
23924 Returns the issue rate. */
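/* For example, if the insn at the top of the ready list is an SImode IMUL
   and another ready insn is the sole producer feeding a second, independent
   SImode IMUL, that producer is issued first so the two multiplies can
   follow each other and keep Atom's pipelined IMUL unit busy.  */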
23925
23926 static int
23927 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
23928 int clock_var ATTRIBUTE_UNUSED)
23929 {
23930 static int issue_rate = -1;
23931 int n_ready = *pn_ready;
23932 rtx insn, insn1, insn2;
23933 int i;
23934 sd_iterator_def sd_it;
23935 dep_t dep;
23936 int index = -1;
23937
23938 /* Set up issue rate. */
23939 issue_rate = ix86_issue_rate ();
23940
23941 /* Do reordering for Atom only. */
23942 if (ix86_tune != PROCESSOR_ATOM)
23943 return issue_rate;
23944 /* Nothing to do if ready list contains only 1 instruction. */
23945 if (n_ready <= 1)
23946 return issue_rate;
23947
23948 /* Check that IMUL instruction is on the top of ready list. */
23949 insn = ready[n_ready - 1];
23950 if (!NONDEBUG_INSN_P (insn))
23951 return issue_rate;
23952 insn = PATTERN (insn);
23953 if (GET_CODE (insn) == PARALLEL)
23954 insn = XVECEXP (insn, 0, 0);
23955 if (GET_CODE (insn) != SET)
23956 return issue_rate;
23957 if (!(GET_CODE (SET_SRC (insn)) == MULT
23958 && GET_MODE (SET_SRC (insn)) == SImode))
23959 return issue_rate;
23960
23961 /* Search for producer of independent IMUL instruction. */
23962 for (i = n_ready - 2; i >= 0; i--)
23963 {
23964 insn = ready[i];
23965 if (!NONDEBUG_INSN_P (insn))
23966 continue;
23967 /* Skip IMUL instruction. */
23968 insn2 = PATTERN (insn);
23969 if (GET_CODE (insn2) == PARALLEL)
23970 insn2 = XVECEXP (insn2, 0, 0);
23971 if (GET_CODE (insn2) == SET
23972 && GET_CODE (SET_SRC (insn2)) == MULT
23973 && GET_MODE (SET_SRC (insn2)) == SImode)
23974 continue;
23975
23976 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
23977 {
23978 rtx con;
23979 con = DEP_CON (dep);
23980 if (!NONDEBUG_INSN_P (con))
23981 continue;
23982 insn1 = PATTERN (con);
23983 if (GET_CODE (insn1) == PARALLEL)
23984 insn1 = XVECEXP (insn1, 0, 0);
23985
23986 if (GET_CODE (insn1) == SET
23987 && GET_CODE (SET_SRC (insn1)) == MULT
23988 && GET_MODE (SET_SRC (insn1)) == SImode)
23989 {
23990 sd_iterator_def sd_it1;
23991 dep_t dep1;
23992 /* Check that INSN is the only producer this IMUL depends on. */
23993 index = i;
23994 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
23995 {
23996 rtx pro;
23997 pro = DEP_PRO (dep1);
23998 if (!NONDEBUG_INSN_P (pro))
23999 continue;
24000 if (pro != insn)
24001 index = -1;
24002 }
24003 if (index >= 0)
24004 break;
24005 }
24006 }
24007 if (index >= 0)
24008 break;
24009 }
24010 if (index < 0)
24011 return issue_rate; /* Didn't find IMUL producer. */
24012
24013 if (sched_verbose > 1)
24014 fprintf (dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
24015 INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
24016
24017 /* Put IMUL producer (ready[index]) at the top of ready list. */
24018 insn1 = ready[index];
24019 for (i = index; i < n_ready - 1; i++)
24020 ready[i] = ready[i + 1];
24021 ready[n_ready - 1] = insn1;
24022
24023 return issue_rate;
24024 }
24025
24026 \f
24027
24028 /* Model decoder of Core 2/i7.
24029 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
24030 track the instruction fetch block boundaries and make sure that long
24031 (9+ bytes) instructions are assigned to D0. */
24032
24033 /* Maximum length of an insn that can be handled by
24034 a secondary decoder unit. '8' for Core 2/i7. */
24035 static int core2i7_secondary_decoder_max_insn_size;
24036
24037 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
24038 '16' for Core 2/i7. */
24039 static int core2i7_ifetch_block_size;
24040
24041 /* Maximum number of instructions decoder can handle per cycle.
24042 '6' for Core 2/i7. */
24043 static int core2i7_ifetch_block_max_insns;
24044
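/* With the Core 2/i7 values set in ix86_sched_init_global below (8, 16 and 6
   respectively), the filtering in
   core2i7_first_cycle_multipass_filter_ready_try means that an insn longer
   than 8 bytes can only be taken as the first insn of its fetch block, and a
   block holds at most 6 insns totalling at most 16 bytes.  */
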
24045 typedef struct ix86_first_cycle_multipass_data_ *
24046 ix86_first_cycle_multipass_data_t;
24047 typedef const struct ix86_first_cycle_multipass_data_ *
24048 const_ix86_first_cycle_multipass_data_t;
24049
24050 /* A variable to store target state across calls to max_issue within
24051 one cycle. */
24052 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
24053 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
24054
24055 /* Initialize DATA. */
24056 static void
24057 core2i7_first_cycle_multipass_init (void *_data)
24058 {
24059 ix86_first_cycle_multipass_data_t data
24060 = (ix86_first_cycle_multipass_data_t) _data;
24061
24062 data->ifetch_block_len = 0;
24063 data->ifetch_block_n_insns = 0;
24064 data->ready_try_change = NULL;
24065 data->ready_try_change_size = 0;
24066 }
24067
24068 /* Advancing the cycle; reset ifetch block counts. */
24069 static void
24070 core2i7_dfa_post_advance_cycle (void)
24071 {
24072 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24073
24074 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24075
24076 data->ifetch_block_len = 0;
24077 data->ifetch_block_n_insns = 0;
24078 }
24079
24080 static int min_insn_size (rtx);
24081
24082 /* Filter out insns from ready_try that the core will not be able to issue
24083 on current cycle due to decoder. */
24084 static void
24085 core2i7_first_cycle_multipass_filter_ready_try
24086 (const_ix86_first_cycle_multipass_data_t data,
24087 char *ready_try, int n_ready, bool first_cycle_insn_p)
24088 {
24089 while (n_ready--)
24090 {
24091 rtx insn;
24092 int insn_size;
24093
24094 if (ready_try[n_ready])
24095 continue;
24096
24097 insn = get_ready_element (n_ready);
24098 insn_size = min_insn_size (insn);
24099
24100 if (/* If this is too long an insn for a secondary decoder ... */
24101 (!first_cycle_insn_p
24102 && insn_size > core2i7_secondary_decoder_max_insn_size)
24103 /* ... or it would not fit into the ifetch block ... */
24104 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24105 /* ... or the decoder is full already ... */
24106 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24107 /* ... mask the insn out. */
24108 {
24109 ready_try[n_ready] = 1;
24110
24111 if (data->ready_try_change)
24112 SET_BIT (data->ready_try_change, n_ready);
24113 }
24114 }
24115 }
24116
24117 /* Prepare for a new round of multipass lookahead scheduling. */
24118 static void
24119 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24120 bool first_cycle_insn_p)
24121 {
24122 ix86_first_cycle_multipass_data_t data
24123 = (ix86_first_cycle_multipass_data_t) _data;
24124 const_ix86_first_cycle_multipass_data_t prev_data
24125 = ix86_first_cycle_multipass_data;
24126
24127 /* Restore the state from the end of the previous round. */
24128 data->ifetch_block_len = prev_data->ifetch_block_len;
24129 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24130
24131 /* Filter instructions that cannot be issued on current cycle due to
24132 decoder restrictions. */
24133 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24134 first_cycle_insn_p);
24135 }
24136
24137 /* INSN is being issued in current solution. Account for its impact on
24138 the decoder model. */
24139 static void
24140 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24141 rtx insn, const void *_prev_data)
24142 {
24143 ix86_first_cycle_multipass_data_t data
24144 = (ix86_first_cycle_multipass_data_t) _data;
24145 const_ix86_first_cycle_multipass_data_t prev_data
24146 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24147
24148 int insn_size = min_insn_size (insn);
24149
24150 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24151 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24152 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24153 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24154
24155 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
24156 if (!data->ready_try_change)
24157 {
24158 data->ready_try_change = sbitmap_alloc (n_ready);
24159 data->ready_try_change_size = n_ready;
24160 }
24161 else if (data->ready_try_change_size < n_ready)
24162 {
24163 data->ready_try_change = sbitmap_resize (data->ready_try_change,
24164 n_ready, 0);
24165 data->ready_try_change_size = n_ready;
24166 }
24167 sbitmap_zero (data->ready_try_change);
24168
24169 /* Filter out insns from ready_try that the core will not be able to issue
24170 on current cycle due to decoder. */
24171 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24172 false);
24173 }
24174
24175 /* Revert the effect on ready_try. */
24176 static void
24177 core2i7_first_cycle_multipass_backtrack (const void *_data,
24178 char *ready_try,
24179 int n_ready ATTRIBUTE_UNUSED)
24180 {
24181 const_ix86_first_cycle_multipass_data_t data
24182 = (const_ix86_first_cycle_multipass_data_t) _data;
24183 unsigned int i = 0;
24184 sbitmap_iterator sbi;
24185
24186 gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
24187 EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
24188 {
24189 ready_try[i] = 0;
24190 }
24191 }
24192
24193 /* Save the result of multipass lookahead scheduling for the next round. */
24194 static void
24195 core2i7_first_cycle_multipass_end (const void *_data)
24196 {
24197 const_ix86_first_cycle_multipass_data_t data
24198 = (const_ix86_first_cycle_multipass_data_t) _data;
24199 ix86_first_cycle_multipass_data_t next_data
24200 = ix86_first_cycle_multipass_data;
24201
24202 if (data != NULL)
24203 {
24204 next_data->ifetch_block_len = data->ifetch_block_len;
24205 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24206 }
24207 }
24208
24209 /* Deallocate target data. */
24210 static void
24211 core2i7_first_cycle_multipass_fini (void *_data)
24212 {
24213 ix86_first_cycle_multipass_data_t data
24214 = (ix86_first_cycle_multipass_data_t) _data;
24215
24216 if (data->ready_try_change)
24217 {
24218 sbitmap_free (data->ready_try_change);
24219 data->ready_try_change = NULL;
24220 data->ready_try_change_size = 0;
24221 }
24222 }
24223
24224 /* Prepare for scheduling pass. */
24225 static void
24226 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24227 int verbose ATTRIBUTE_UNUSED,
24228 int max_uid ATTRIBUTE_UNUSED)
24229 {
24230 /* Install scheduling hooks for current CPU. Some of these hooks are used
24231 in time-critical parts of the scheduler, so we only set them up when
24232 they are actually used. */
24233 switch (ix86_tune)
24234 {
24235 case PROCESSOR_CORE2_32:
24236 case PROCESSOR_CORE2_64:
24237 case PROCESSOR_COREI7_32:
24238 case PROCESSOR_COREI7_64:
24239 targetm.sched.dfa_post_advance_cycle
24240 = core2i7_dfa_post_advance_cycle;
24241 targetm.sched.first_cycle_multipass_init
24242 = core2i7_first_cycle_multipass_init;
24243 targetm.sched.first_cycle_multipass_begin
24244 = core2i7_first_cycle_multipass_begin;
24245 targetm.sched.first_cycle_multipass_issue
24246 = core2i7_first_cycle_multipass_issue;
24247 targetm.sched.first_cycle_multipass_backtrack
24248 = core2i7_first_cycle_multipass_backtrack;
24249 targetm.sched.first_cycle_multipass_end
24250 = core2i7_first_cycle_multipass_end;
24251 targetm.sched.first_cycle_multipass_fini
24252 = core2i7_first_cycle_multipass_fini;
24253
24254 /* Set decoder parameters. */
24255 core2i7_secondary_decoder_max_insn_size = 8;
24256 core2i7_ifetch_block_size = 16;
24257 core2i7_ifetch_block_max_insns = 6;
24258 break;
24259
24260 default:
24261 targetm.sched.dfa_post_advance_cycle = NULL;
24262 targetm.sched.first_cycle_multipass_init = NULL;
24263 targetm.sched.first_cycle_multipass_begin = NULL;
24264 targetm.sched.first_cycle_multipass_issue = NULL;
24265 targetm.sched.first_cycle_multipass_backtrack = NULL;
24266 targetm.sched.first_cycle_multipass_end = NULL;
24267 targetm.sched.first_cycle_multipass_fini = NULL;
24268 break;
24269 }
24270 }
24271
24272 \f
24273 /* Compute the alignment given to a constant that is being placed in memory.
24274 EXP is the constant and ALIGN is the alignment that the object would
24275 ordinarily have.
24276 The value of this function is used instead of that alignment to align
24277 the object. */
24278
24279 int
24280 ix86_constant_alignment (tree exp, int align)
24281 {
24282 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24283 || TREE_CODE (exp) == INTEGER_CST)
24284 {
24285 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24286 return 64;
24287 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24288 return 128;
24289 }
24290 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24291 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24292 return BITS_PER_WORD;
24293
24294 return align;
24295 }
24296
24297 /* Compute the alignment for a static variable.
24298 TYPE is the data type, and ALIGN is the alignment that
24299 the object would ordinarily have. The value of this function is used
24300 instead of that alignment to align the object. */
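/* For example, on x86-64 a file-scope "double d[3]" (24 bytes) is raised to
   at least 16-byte alignment by the array rules below, while a lone "double"
   keeps its natural 8-byte alignment.  */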
24301
24302 int
24303 ix86_data_alignment (tree type, int align)
24304 {
24305 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24306
24307 if (AGGREGATE_TYPE_P (type)
24308 && TYPE_SIZE (type)
24309 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24310 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24311 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24312 && align < max_align)
24313 align = max_align;
24314
24315 /* x86-64 ABI requires arrays of 16 bytes or more to be aligned
24316 to a 16-byte boundary. */
24317 if (TARGET_64BIT)
24318 {
24319 if (AGGREGATE_TYPE_P (type)
24320 && TYPE_SIZE (type)
24321 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24322 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24323 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24324 return 128;
24325 }
24326
24327 if (TREE_CODE (type) == ARRAY_TYPE)
24328 {
24329 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24330 return 64;
24331 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24332 return 128;
24333 }
24334 else if (TREE_CODE (type) == COMPLEX_TYPE)
24335 {
24336
24337 if (TYPE_MODE (type) == DCmode && align < 64)
24338 return 64;
24339 if ((TYPE_MODE (type) == XCmode
24340 || TYPE_MODE (type) == TCmode) && align < 128)
24341 return 128;
24342 }
24343 else if ((TREE_CODE (type) == RECORD_TYPE
24344 || TREE_CODE (type) == UNION_TYPE
24345 || TREE_CODE (type) == QUAL_UNION_TYPE)
24346 && TYPE_FIELDS (type))
24347 {
24348 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24349 return 64;
24350 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24351 return 128;
24352 }
24353 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24354 || TREE_CODE (type) == INTEGER_TYPE)
24355 {
24356 if (TYPE_MODE (type) == DFmode && align < 64)
24357 return 64;
24358 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24359 return 128;
24360 }
24361
24362 return align;
24363 }
24364
24365 /* Compute the alignment for a local variable or a stack slot. EXP is
24366 the data type or decl itself, MODE is the widest mode available and
24367 ALIGN is the alignment that the object would ordinarily have. The
24368 value of this macro is used instead of that alignment to align the
24369 object. */
24370
24371 unsigned int
24372 ix86_local_alignment (tree exp, enum machine_mode mode,
24373 unsigned int align)
24374 {
24375 tree type, decl;
24376
24377 if (exp && DECL_P (exp))
24378 {
24379 type = TREE_TYPE (exp);
24380 decl = exp;
24381 }
24382 else
24383 {
24384 type = exp;
24385 decl = NULL;
24386 }
24387
24388 /* Don't do dynamic stack realignment for long long objects with
24389 -mpreferred-stack-boundary=2. */
24390 if (!TARGET_64BIT
24391 && align == 64
24392 && ix86_preferred_stack_boundary < 64
24393 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
24394 && (!type || !TYPE_USER_ALIGN (type))
24395 && (!decl || !DECL_USER_ALIGN (decl)))
24396 align = 32;
24397
24398 /* If TYPE is NULL, we are allocating a stack slot for caller-save
24399 register in MODE. We will return the largest alignment of XF
24400 and DF. */
24401 if (!type)
24402 {
24403 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
24404 align = GET_MODE_ALIGNMENT (DFmode);
24405 return align;
24406 }
24407
24408 /* x86-64 ABI requires arrays of 16 bytes or more to be aligned
24409 to a 16-byte boundary. Exact wording is:
24410
24411 An array uses the same alignment as its elements, except that a local or
24412 global array variable of length at least 16 bytes or
24413 a C99 variable-length array variable always has alignment of at least 16 bytes.
24414
24415 This was added to allow use of aligned SSE instructions on arrays. The
24416 rule is meant for static storage (where the compiler cannot do the analysis
24417 by itself). We follow it for automatic variables only when convenient:
24418 we fully control everything in the function being compiled, and functions
24419 from other units cannot rely on the alignment.
24420
24421 Exclude the va_list type. It is the common case of a local array where
24422 we cannot benefit from the alignment. */
24423 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
24424 && TARGET_SSE)
24425 {
24426 if (AGGREGATE_TYPE_P (type)
24427 && (va_list_type_node == NULL_TREE
24428 || (TYPE_MAIN_VARIANT (type)
24429 != TYPE_MAIN_VARIANT (va_list_type_node)))
24430 && TYPE_SIZE (type)
24431 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24432 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
24433 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24434 return 128;
24435 }
24436 if (TREE_CODE (type) == ARRAY_TYPE)
24437 {
24438 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24439 return 64;
24440 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24441 return 128;
24442 }
24443 else if (TREE_CODE (type) == COMPLEX_TYPE)
24444 {
24445 if (TYPE_MODE (type) == DCmode && align < 64)
24446 return 64;
24447 if ((TYPE_MODE (type) == XCmode
24448 || TYPE_MODE (type) == TCmode) && align < 128)
24449 return 128;
24450 }
24451 else if ((TREE_CODE (type) == RECORD_TYPE
24452 || TREE_CODE (type) == UNION_TYPE
24453 || TREE_CODE (type) == QUAL_UNION_TYPE)
24454 && TYPE_FIELDS (type))
24455 {
24456 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24457 return 64;
24458 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24459 return 128;
24460 }
24461 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24462 || TREE_CODE (type) == INTEGER_TYPE)
24463 {
24464
24465 if (TYPE_MODE (type) == DFmode && align < 64)
24466 return 64;
24467 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24468 return 128;
24469 }
24470 return align;
24471 }
24472
24473 /* Compute the minimum required alignment for dynamic stack realignment
24474 purposes for a local variable, parameter or a stack slot. EXP is
24475 the data type or decl itself, MODE is its mode and ALIGN is the
24476 alignment that the object would ordinarily have. */
24477
24478 unsigned int
24479 ix86_minimum_alignment (tree exp, enum machine_mode mode,
24480 unsigned int align)
24481 {
24482 tree type, decl;
24483
24484 if (exp && DECL_P (exp))
24485 {
24486 type = TREE_TYPE (exp);
24487 decl = exp;
24488 }
24489 else
24490 {
24491 type = exp;
24492 decl = NULL;
24493 }
24494
24495 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
24496 return align;
24497
24498 /* Don't do dynamic stack realignment for long long objects with
24499 -mpreferred-stack-boundary=2. */
24500 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
24501 && (!type || !TYPE_USER_ALIGN (type))
24502 && (!decl || !DECL_USER_ALIGN (decl)))
24503 return 32;
24504
24505 return align;
24506 }
24507 \f
24508 /* Find a location for the static chain incoming to a nested function.
24509 This is a register, unless all free registers are used by arguments. */
24510
24511 static rtx
24512 ix86_static_chain (const_tree fndecl, bool incoming_p)
24513 {
24514 unsigned regno;
24515
24516 if (!DECL_STATIC_CHAIN (fndecl))
24517 return NULL;
24518
24519 if (TARGET_64BIT)
24520 {
24521 /* We always use R10 in 64-bit mode. */
24522 regno = R10_REG;
24523 }
24524 else
24525 {
24526 tree fntype;
24527 unsigned int ccvt;
24528
24529 /* By default in 32-bit mode we use ECX to pass the static chain. */
24530 regno = CX_REG;
24531
24532 fntype = TREE_TYPE (fndecl);
24533 ccvt = ix86_get_callcvt (fntype);
24534 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
24535 {
24536 /* Fastcall functions use ecx/edx for arguments, which leaves
24537 us with EAX for the static chain.
24538 Thiscall functions use ecx for arguments, which also
24539 leaves us with EAX for the static chain. */
24540 regno = AX_REG;
24541 }
24542 else if (ix86_function_regparm (fntype, fndecl) == 3)
24543 {
24544 /* For regparm 3, we have no free call-clobbered registers in
24545 which to store the static chain. In order to implement this,
24546 we have the trampoline push the static chain to the stack.
24547 However, we can't push a value below the return address when
24548 we call the nested function directly, so we have to use an
24549 alternate entry point. For this we use ESI, and have the
24550 alternate entry point push ESI, so that things appear the
24551 same once we're executing the nested function. */
24552 if (incoming_p)
24553 {
24554 if (fndecl == current_function_decl)
24555 ix86_static_chain_on_stack = true;
24556 return gen_frame_mem (SImode,
24557 plus_constant (Pmode,
24558 arg_pointer_rtx, -8));
24559 }
24560 regno = SI_REG;
24561 }
24562 }
24563
24564 return gen_rtx_REG (Pmode, regno);
24565 }
24566
24567 /* Emit RTL insns to initialize the variable parts of a trampoline.
24568 FNDECL is the decl of the target address; M_TRAMP is a MEM for
24569 the trampoline, and CHAIN_VALUE is an RTX for the static chain
24570 to be passed to the target function. */
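/* For example, with ptr_mode == DImode the 64-bit trampoline written below
   is the 24-byte sequence

       49 bb <fnaddr, 8 bytes>   movabs $<fnaddr>, %r11
       49 ba <chain, 8 bytes>    movabs $<chain>, %r10
       49 ff e3                  rex.W jmp *%r11
       90                        nop (pads the final 32-bit store)

   while the 32-bit trampoline is a 10-byte "mov $<chain>, %ecx/%eax" (or
   "push $<chain>" for the regparm-3 case) followed by "jmp <fnaddr>".  */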
24571
24572 static void
24573 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
24574 {
24575 rtx mem, fnaddr;
24576 int opcode;
24577 int offset = 0;
24578
24579 fnaddr = XEXP (DECL_RTL (fndecl), 0);
24580
24581 if (TARGET_64BIT)
24582 {
24583 int size;
24584
24585 /* Load the function address to r11. Try to load address using
24586 the shorter movl instead of movabs. We may want to support
24587 movq for kernel mode, but kernel does not use trampolines at
24588 the moment. FNADDR is a 32bit address and may not be in
24589 DImode when ptr_mode == SImode. Always use movl in this
24590 case. */
24591 if (ptr_mode == SImode
24592 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
24593 {
24594 fnaddr = copy_addr_to_reg (fnaddr);
24595
24596 mem = adjust_address (m_tramp, HImode, offset);
24597 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
24598
24599 mem = adjust_address (m_tramp, SImode, offset + 2);
24600 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
24601 offset += 6;
24602 }
24603 else
24604 {
24605 mem = adjust_address (m_tramp, HImode, offset);
24606 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
24607
24608 mem = adjust_address (m_tramp, DImode, offset + 2);
24609 emit_move_insn (mem, fnaddr);
24610 offset += 10;
24611 }
24612
24613 /* Load static chain using movabs to r10. Use the shorter movl
24614 instead of movabs when ptr_mode == SImode. */
24615 if (ptr_mode == SImode)
24616 {
24617 opcode = 0xba41;
24618 size = 6;
24619 }
24620 else
24621 {
24622 opcode = 0xba49;
24623 size = 10;
24624 }
24625
24626 mem = adjust_address (m_tramp, HImode, offset);
24627 emit_move_insn (mem, gen_int_mode (opcode, HImode));
24628
24629 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
24630 emit_move_insn (mem, chain_value);
24631 offset += size;
24632
24633 /* Jump to r11; the last (unused) byte is a nop, only there to
24634 pad the write out to a single 32-bit store. */
24635 mem = adjust_address (m_tramp, SImode, offset);
24636 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
24637 offset += 4;
24638 }
24639 else
24640 {
24641 rtx disp, chain;
24642
24643 /* Depending on the static chain location, either load a register
24644 with a constant, or push the constant to the stack. All of the
24645 instructions are the same size. */
24646 chain = ix86_static_chain (fndecl, true);
24647 if (REG_P (chain))
24648 {
24649 switch (REGNO (chain))
24650 {
24651 case AX_REG:
24652 opcode = 0xb8; break;
24653 case CX_REG:
24654 opcode = 0xb9; break;
24655 default:
24656 gcc_unreachable ();
24657 }
24658 }
24659 else
24660 opcode = 0x68;
24661
24662 mem = adjust_address (m_tramp, QImode, offset);
24663 emit_move_insn (mem, gen_int_mode (opcode, QImode));
24664
24665 mem = adjust_address (m_tramp, SImode, offset + 1);
24666 emit_move_insn (mem, chain_value);
24667 offset += 5;
24668
24669 mem = adjust_address (m_tramp, QImode, offset);
24670 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
24671
24672 mem = adjust_address (m_tramp, SImode, offset + 1);
24673
24674 /* Compute offset from the end of the jmp to the target function.
24675 In the case in which the trampoline stores the static chain on
24676 the stack, we need to skip the first insn which pushes the
24677 (call-saved) register static chain; this push is 1 byte. */
24678 offset += 5;
24679 disp = expand_binop (SImode, sub_optab, fnaddr,
24680 plus_constant (Pmode, XEXP (m_tramp, 0),
24681 offset - (MEM_P (chain) ? 1 : 0)),
24682 NULL_RTX, 1, OPTAB_DIRECT);
24683 emit_move_insn (mem, disp);
24684 }
24685
24686 gcc_assert (offset <= TRAMPOLINE_SIZE);
24687
24688 #ifdef HAVE_ENABLE_EXECUTE_STACK
24689 #ifdef CHECK_EXECUTE_STACK_ENABLED
24690 if (CHECK_EXECUTE_STACK_ENABLED)
24691 #endif
24692 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
24693 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
24694 #endif
24695 }
24696 \f
24697 /* The following file contains several enumerations and data structures
24698 built from the definitions in i386-builtin-types.def. */
24699
24700 #include "i386-builtin-types.inc"
24701
24702 /* Table for the ix86 builtin non-function types. */
24703 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
24704
24705 /* Retrieve an element from the above table, building some of
24706 the types lazily. */
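/* For example, the first request for a vector code (say the V4SF entry
   generated from i386-builtin-types.def) builds the V4SFmode vector of its
   element type via build_vector_type_for_mode and caches it; later requests
   return the cached node.  */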
24707
24708 static tree
24709 ix86_get_builtin_type (enum ix86_builtin_type tcode)
24710 {
24711 unsigned int index;
24712 tree type, itype;
24713
24714 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
24715
24716 type = ix86_builtin_type_tab[(int) tcode];
24717 if (type != NULL)
24718 return type;
24719
24720 gcc_assert (tcode > IX86_BT_LAST_PRIM);
24721 if (tcode <= IX86_BT_LAST_VECT)
24722 {
24723 enum machine_mode mode;
24724
24725 index = tcode - IX86_BT_LAST_PRIM - 1;
24726 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
24727 mode = ix86_builtin_type_vect_mode[index];
24728
24729 type = build_vector_type_for_mode (itype, mode);
24730 }
24731 else
24732 {
24733 int quals;
24734
24735 index = tcode - IX86_BT_LAST_VECT - 1;
24736 if (tcode <= IX86_BT_LAST_PTR)
24737 quals = TYPE_UNQUALIFIED;
24738 else
24739 quals = TYPE_QUAL_CONST;
24740
24741 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
24742 if (quals != TYPE_UNQUALIFIED)
24743 itype = build_qualified_type (itype, quals);
24744
24745 type = build_pointer_type (itype);
24746 }
24747
24748 ix86_builtin_type_tab[(int) tcode] = type;
24749 return type;
24750 }
24751
24752 /* Table for the ix86 builtin function types. */
24753 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
24754
24755 /* Retrieve an element from the above table, building some of
24756 the types lazily. */
24757
24758 static tree
24759 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
24760 {
24761 tree type;
24762
24763 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
24764
24765 type = ix86_builtin_func_type_tab[(int) tcode];
24766 if (type != NULL)
24767 return type;
24768
24769 if (tcode <= IX86_BT_LAST_FUNC)
24770 {
24771 unsigned start = ix86_builtin_func_start[(int) tcode];
24772 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
24773 tree rtype, atype, args = void_list_node;
24774 unsigned i;
24775
24776 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
24777 for (i = after - 1; i > start; --i)
24778 {
24779 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
24780 args = tree_cons (NULL, atype, args);
24781 }
24782
24783 type = build_function_type (rtype, args);
24784 }
24785 else
24786 {
24787 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
24788 enum ix86_builtin_func_type icode;
24789
24790 icode = ix86_builtin_func_alias_base[index];
24791 type = ix86_get_builtin_func_type (icode);
24792 }
24793
24794 ix86_builtin_func_type_tab[(int) tcode] = type;
24795 return type;
24796 }
24797
24798
24799 /* Codes for all the SSE/MMX builtins. */
24800 enum ix86_builtins
24801 {
24802 IX86_BUILTIN_ADDPS,
24803 IX86_BUILTIN_ADDSS,
24804 IX86_BUILTIN_DIVPS,
24805 IX86_BUILTIN_DIVSS,
24806 IX86_BUILTIN_MULPS,
24807 IX86_BUILTIN_MULSS,
24808 IX86_BUILTIN_SUBPS,
24809 IX86_BUILTIN_SUBSS,
24810
24811 IX86_BUILTIN_CMPEQPS,
24812 IX86_BUILTIN_CMPLTPS,
24813 IX86_BUILTIN_CMPLEPS,
24814 IX86_BUILTIN_CMPGTPS,
24815 IX86_BUILTIN_CMPGEPS,
24816 IX86_BUILTIN_CMPNEQPS,
24817 IX86_BUILTIN_CMPNLTPS,
24818 IX86_BUILTIN_CMPNLEPS,
24819 IX86_BUILTIN_CMPNGTPS,
24820 IX86_BUILTIN_CMPNGEPS,
24821 IX86_BUILTIN_CMPORDPS,
24822 IX86_BUILTIN_CMPUNORDPS,
24823 IX86_BUILTIN_CMPEQSS,
24824 IX86_BUILTIN_CMPLTSS,
24825 IX86_BUILTIN_CMPLESS,
24826 IX86_BUILTIN_CMPNEQSS,
24827 IX86_BUILTIN_CMPNLTSS,
24828 IX86_BUILTIN_CMPNLESS,
24829 IX86_BUILTIN_CMPNGTSS,
24830 IX86_BUILTIN_CMPNGESS,
24831 IX86_BUILTIN_CMPORDSS,
24832 IX86_BUILTIN_CMPUNORDSS,
24833
24834 IX86_BUILTIN_COMIEQSS,
24835 IX86_BUILTIN_COMILTSS,
24836 IX86_BUILTIN_COMILESS,
24837 IX86_BUILTIN_COMIGTSS,
24838 IX86_BUILTIN_COMIGESS,
24839 IX86_BUILTIN_COMINEQSS,
24840 IX86_BUILTIN_UCOMIEQSS,
24841 IX86_BUILTIN_UCOMILTSS,
24842 IX86_BUILTIN_UCOMILESS,
24843 IX86_BUILTIN_UCOMIGTSS,
24844 IX86_BUILTIN_UCOMIGESS,
24845 IX86_BUILTIN_UCOMINEQSS,
24846
24847 IX86_BUILTIN_CVTPI2PS,
24848 IX86_BUILTIN_CVTPS2PI,
24849 IX86_BUILTIN_CVTSI2SS,
24850 IX86_BUILTIN_CVTSI642SS,
24851 IX86_BUILTIN_CVTSS2SI,
24852 IX86_BUILTIN_CVTSS2SI64,
24853 IX86_BUILTIN_CVTTPS2PI,
24854 IX86_BUILTIN_CVTTSS2SI,
24855 IX86_BUILTIN_CVTTSS2SI64,
24856
24857 IX86_BUILTIN_MAXPS,
24858 IX86_BUILTIN_MAXSS,
24859 IX86_BUILTIN_MINPS,
24860 IX86_BUILTIN_MINSS,
24861
24862 IX86_BUILTIN_LOADUPS,
24863 IX86_BUILTIN_STOREUPS,
24864 IX86_BUILTIN_MOVSS,
24865
24866 IX86_BUILTIN_MOVHLPS,
24867 IX86_BUILTIN_MOVLHPS,
24868 IX86_BUILTIN_LOADHPS,
24869 IX86_BUILTIN_LOADLPS,
24870 IX86_BUILTIN_STOREHPS,
24871 IX86_BUILTIN_STORELPS,
24872
24873 IX86_BUILTIN_MASKMOVQ,
24874 IX86_BUILTIN_MOVMSKPS,
24875 IX86_BUILTIN_PMOVMSKB,
24876
24877 IX86_BUILTIN_MOVNTPS,
24878 IX86_BUILTIN_MOVNTQ,
24879
24880 IX86_BUILTIN_LOADDQU,
24881 IX86_BUILTIN_STOREDQU,
24882
24883 IX86_BUILTIN_PACKSSWB,
24884 IX86_BUILTIN_PACKSSDW,
24885 IX86_BUILTIN_PACKUSWB,
24886
24887 IX86_BUILTIN_PADDB,
24888 IX86_BUILTIN_PADDW,
24889 IX86_BUILTIN_PADDD,
24890 IX86_BUILTIN_PADDQ,
24891 IX86_BUILTIN_PADDSB,
24892 IX86_BUILTIN_PADDSW,
24893 IX86_BUILTIN_PADDUSB,
24894 IX86_BUILTIN_PADDUSW,
24895 IX86_BUILTIN_PSUBB,
24896 IX86_BUILTIN_PSUBW,
24897 IX86_BUILTIN_PSUBD,
24898 IX86_BUILTIN_PSUBQ,
24899 IX86_BUILTIN_PSUBSB,
24900 IX86_BUILTIN_PSUBSW,
24901 IX86_BUILTIN_PSUBUSB,
24902 IX86_BUILTIN_PSUBUSW,
24903
24904 IX86_BUILTIN_PAND,
24905 IX86_BUILTIN_PANDN,
24906 IX86_BUILTIN_POR,
24907 IX86_BUILTIN_PXOR,
24908
24909 IX86_BUILTIN_PAVGB,
24910 IX86_BUILTIN_PAVGW,
24911
24912 IX86_BUILTIN_PCMPEQB,
24913 IX86_BUILTIN_PCMPEQW,
24914 IX86_BUILTIN_PCMPEQD,
24915 IX86_BUILTIN_PCMPGTB,
24916 IX86_BUILTIN_PCMPGTW,
24917 IX86_BUILTIN_PCMPGTD,
24918
24919 IX86_BUILTIN_PMADDWD,
24920
24921 IX86_BUILTIN_PMAXSW,
24922 IX86_BUILTIN_PMAXUB,
24923 IX86_BUILTIN_PMINSW,
24924 IX86_BUILTIN_PMINUB,
24925
24926 IX86_BUILTIN_PMULHUW,
24927 IX86_BUILTIN_PMULHW,
24928 IX86_BUILTIN_PMULLW,
24929
24930 IX86_BUILTIN_PSADBW,
24931 IX86_BUILTIN_PSHUFW,
24932
24933 IX86_BUILTIN_PSLLW,
24934 IX86_BUILTIN_PSLLD,
24935 IX86_BUILTIN_PSLLQ,
24936 IX86_BUILTIN_PSRAW,
24937 IX86_BUILTIN_PSRAD,
24938 IX86_BUILTIN_PSRLW,
24939 IX86_BUILTIN_PSRLD,
24940 IX86_BUILTIN_PSRLQ,
24941 IX86_BUILTIN_PSLLWI,
24942 IX86_BUILTIN_PSLLDI,
24943 IX86_BUILTIN_PSLLQI,
24944 IX86_BUILTIN_PSRAWI,
24945 IX86_BUILTIN_PSRADI,
24946 IX86_BUILTIN_PSRLWI,
24947 IX86_BUILTIN_PSRLDI,
24948 IX86_BUILTIN_PSRLQI,
24949
24950 IX86_BUILTIN_PUNPCKHBW,
24951 IX86_BUILTIN_PUNPCKHWD,
24952 IX86_BUILTIN_PUNPCKHDQ,
24953 IX86_BUILTIN_PUNPCKLBW,
24954 IX86_BUILTIN_PUNPCKLWD,
24955 IX86_BUILTIN_PUNPCKLDQ,
24956
24957 IX86_BUILTIN_SHUFPS,
24958
24959 IX86_BUILTIN_RCPPS,
24960 IX86_BUILTIN_RCPSS,
24961 IX86_BUILTIN_RSQRTPS,
24962 IX86_BUILTIN_RSQRTPS_NR,
24963 IX86_BUILTIN_RSQRTSS,
24964 IX86_BUILTIN_RSQRTF,
24965 IX86_BUILTIN_SQRTPS,
24966 IX86_BUILTIN_SQRTPS_NR,
24967 IX86_BUILTIN_SQRTSS,
24968
24969 IX86_BUILTIN_UNPCKHPS,
24970 IX86_BUILTIN_UNPCKLPS,
24971
24972 IX86_BUILTIN_ANDPS,
24973 IX86_BUILTIN_ANDNPS,
24974 IX86_BUILTIN_ORPS,
24975 IX86_BUILTIN_XORPS,
24976
24977 IX86_BUILTIN_EMMS,
24978 IX86_BUILTIN_LDMXCSR,
24979 IX86_BUILTIN_STMXCSR,
24980 IX86_BUILTIN_SFENCE,
24981
24982 /* 3DNow! Original */
24983 IX86_BUILTIN_FEMMS,
24984 IX86_BUILTIN_PAVGUSB,
24985 IX86_BUILTIN_PF2ID,
24986 IX86_BUILTIN_PFACC,
24987 IX86_BUILTIN_PFADD,
24988 IX86_BUILTIN_PFCMPEQ,
24989 IX86_BUILTIN_PFCMPGE,
24990 IX86_BUILTIN_PFCMPGT,
24991 IX86_BUILTIN_PFMAX,
24992 IX86_BUILTIN_PFMIN,
24993 IX86_BUILTIN_PFMUL,
24994 IX86_BUILTIN_PFRCP,
24995 IX86_BUILTIN_PFRCPIT1,
24996 IX86_BUILTIN_PFRCPIT2,
24997 IX86_BUILTIN_PFRSQIT1,
24998 IX86_BUILTIN_PFRSQRT,
24999 IX86_BUILTIN_PFSUB,
25000 IX86_BUILTIN_PFSUBR,
25001 IX86_BUILTIN_PI2FD,
25002 IX86_BUILTIN_PMULHRW,
25003
25004 /* 3DNow! Athlon Extensions */
25005 IX86_BUILTIN_PF2IW,
25006 IX86_BUILTIN_PFNACC,
25007 IX86_BUILTIN_PFPNACC,
25008 IX86_BUILTIN_PI2FW,
25009 IX86_BUILTIN_PSWAPDSI,
25010 IX86_BUILTIN_PSWAPDSF,
25011
25012 /* SSE2 */
25013 IX86_BUILTIN_ADDPD,
25014 IX86_BUILTIN_ADDSD,
25015 IX86_BUILTIN_DIVPD,
25016 IX86_BUILTIN_DIVSD,
25017 IX86_BUILTIN_MULPD,
25018 IX86_BUILTIN_MULSD,
25019 IX86_BUILTIN_SUBPD,
25020 IX86_BUILTIN_SUBSD,
25021
25022 IX86_BUILTIN_CMPEQPD,
25023 IX86_BUILTIN_CMPLTPD,
25024 IX86_BUILTIN_CMPLEPD,
25025 IX86_BUILTIN_CMPGTPD,
25026 IX86_BUILTIN_CMPGEPD,
25027 IX86_BUILTIN_CMPNEQPD,
25028 IX86_BUILTIN_CMPNLTPD,
25029 IX86_BUILTIN_CMPNLEPD,
25030 IX86_BUILTIN_CMPNGTPD,
25031 IX86_BUILTIN_CMPNGEPD,
25032 IX86_BUILTIN_CMPORDPD,
25033 IX86_BUILTIN_CMPUNORDPD,
25034 IX86_BUILTIN_CMPEQSD,
25035 IX86_BUILTIN_CMPLTSD,
25036 IX86_BUILTIN_CMPLESD,
25037 IX86_BUILTIN_CMPNEQSD,
25038 IX86_BUILTIN_CMPNLTSD,
25039 IX86_BUILTIN_CMPNLESD,
25040 IX86_BUILTIN_CMPORDSD,
25041 IX86_BUILTIN_CMPUNORDSD,
25042
25043 IX86_BUILTIN_COMIEQSD,
25044 IX86_BUILTIN_COMILTSD,
25045 IX86_BUILTIN_COMILESD,
25046 IX86_BUILTIN_COMIGTSD,
25047 IX86_BUILTIN_COMIGESD,
25048 IX86_BUILTIN_COMINEQSD,
25049 IX86_BUILTIN_UCOMIEQSD,
25050 IX86_BUILTIN_UCOMILTSD,
25051 IX86_BUILTIN_UCOMILESD,
25052 IX86_BUILTIN_UCOMIGTSD,
25053 IX86_BUILTIN_UCOMIGESD,
25054 IX86_BUILTIN_UCOMINEQSD,
25055
25056 IX86_BUILTIN_MAXPD,
25057 IX86_BUILTIN_MAXSD,
25058 IX86_BUILTIN_MINPD,
25059 IX86_BUILTIN_MINSD,
25060
25061 IX86_BUILTIN_ANDPD,
25062 IX86_BUILTIN_ANDNPD,
25063 IX86_BUILTIN_ORPD,
25064 IX86_BUILTIN_XORPD,
25065
25066 IX86_BUILTIN_SQRTPD,
25067 IX86_BUILTIN_SQRTSD,
25068
25069 IX86_BUILTIN_UNPCKHPD,
25070 IX86_BUILTIN_UNPCKLPD,
25071
25072 IX86_BUILTIN_SHUFPD,
25073
25074 IX86_BUILTIN_LOADUPD,
25075 IX86_BUILTIN_STOREUPD,
25076 IX86_BUILTIN_MOVSD,
25077
25078 IX86_BUILTIN_LOADHPD,
25079 IX86_BUILTIN_LOADLPD,
25080
25081 IX86_BUILTIN_CVTDQ2PD,
25082 IX86_BUILTIN_CVTDQ2PS,
25083
25084 IX86_BUILTIN_CVTPD2DQ,
25085 IX86_BUILTIN_CVTPD2PI,
25086 IX86_BUILTIN_CVTPD2PS,
25087 IX86_BUILTIN_CVTTPD2DQ,
25088 IX86_BUILTIN_CVTTPD2PI,
25089
25090 IX86_BUILTIN_CVTPI2PD,
25091 IX86_BUILTIN_CVTSI2SD,
25092 IX86_BUILTIN_CVTSI642SD,
25093
25094 IX86_BUILTIN_CVTSD2SI,
25095 IX86_BUILTIN_CVTSD2SI64,
25096 IX86_BUILTIN_CVTSD2SS,
25097 IX86_BUILTIN_CVTSS2SD,
25098 IX86_BUILTIN_CVTTSD2SI,
25099 IX86_BUILTIN_CVTTSD2SI64,
25100
25101 IX86_BUILTIN_CVTPS2DQ,
25102 IX86_BUILTIN_CVTPS2PD,
25103 IX86_BUILTIN_CVTTPS2DQ,
25104
25105 IX86_BUILTIN_MOVNTI,
25106 IX86_BUILTIN_MOVNTI64,
25107 IX86_BUILTIN_MOVNTPD,
25108 IX86_BUILTIN_MOVNTDQ,
25109
25110 IX86_BUILTIN_MOVQ128,
25111
25112 /* SSE2 MMX */
25113 IX86_BUILTIN_MASKMOVDQU,
25114 IX86_BUILTIN_MOVMSKPD,
25115 IX86_BUILTIN_PMOVMSKB128,
25116
25117 IX86_BUILTIN_PACKSSWB128,
25118 IX86_BUILTIN_PACKSSDW128,
25119 IX86_BUILTIN_PACKUSWB128,
25120
25121 IX86_BUILTIN_PADDB128,
25122 IX86_BUILTIN_PADDW128,
25123 IX86_BUILTIN_PADDD128,
25124 IX86_BUILTIN_PADDQ128,
25125 IX86_BUILTIN_PADDSB128,
25126 IX86_BUILTIN_PADDSW128,
25127 IX86_BUILTIN_PADDUSB128,
25128 IX86_BUILTIN_PADDUSW128,
25129 IX86_BUILTIN_PSUBB128,
25130 IX86_BUILTIN_PSUBW128,
25131 IX86_BUILTIN_PSUBD128,
25132 IX86_BUILTIN_PSUBQ128,
25133 IX86_BUILTIN_PSUBSB128,
25134 IX86_BUILTIN_PSUBSW128,
25135 IX86_BUILTIN_PSUBUSB128,
25136 IX86_BUILTIN_PSUBUSW128,
25137
25138 IX86_BUILTIN_PAND128,
25139 IX86_BUILTIN_PANDN128,
25140 IX86_BUILTIN_POR128,
25141 IX86_BUILTIN_PXOR128,
25142
25143 IX86_BUILTIN_PAVGB128,
25144 IX86_BUILTIN_PAVGW128,
25145
25146 IX86_BUILTIN_PCMPEQB128,
25147 IX86_BUILTIN_PCMPEQW128,
25148 IX86_BUILTIN_PCMPEQD128,
25149 IX86_BUILTIN_PCMPGTB128,
25150 IX86_BUILTIN_PCMPGTW128,
25151 IX86_BUILTIN_PCMPGTD128,
25152
25153 IX86_BUILTIN_PMADDWD128,
25154
25155 IX86_BUILTIN_PMAXSW128,
25156 IX86_BUILTIN_PMAXUB128,
25157 IX86_BUILTIN_PMINSW128,
25158 IX86_BUILTIN_PMINUB128,
25159
25160 IX86_BUILTIN_PMULUDQ,
25161 IX86_BUILTIN_PMULUDQ128,
25162 IX86_BUILTIN_PMULHUW128,
25163 IX86_BUILTIN_PMULHW128,
25164 IX86_BUILTIN_PMULLW128,
25165
25166 IX86_BUILTIN_PSADBW128,
25167 IX86_BUILTIN_PSHUFHW,
25168 IX86_BUILTIN_PSHUFLW,
25169 IX86_BUILTIN_PSHUFD,
25170
25171 IX86_BUILTIN_PSLLDQI128,
25172 IX86_BUILTIN_PSLLWI128,
25173 IX86_BUILTIN_PSLLDI128,
25174 IX86_BUILTIN_PSLLQI128,
25175 IX86_BUILTIN_PSRAWI128,
25176 IX86_BUILTIN_PSRADI128,
25177 IX86_BUILTIN_PSRLDQI128,
25178 IX86_BUILTIN_PSRLWI128,
25179 IX86_BUILTIN_PSRLDI128,
25180 IX86_BUILTIN_PSRLQI128,
25181
25182 IX86_BUILTIN_PSLLDQ128,
25183 IX86_BUILTIN_PSLLW128,
25184 IX86_BUILTIN_PSLLD128,
25185 IX86_BUILTIN_PSLLQ128,
25186 IX86_BUILTIN_PSRAW128,
25187 IX86_BUILTIN_PSRAD128,
25188 IX86_BUILTIN_PSRLW128,
25189 IX86_BUILTIN_PSRLD128,
25190 IX86_BUILTIN_PSRLQ128,
25191
25192 IX86_BUILTIN_PUNPCKHBW128,
25193 IX86_BUILTIN_PUNPCKHWD128,
25194 IX86_BUILTIN_PUNPCKHDQ128,
25195 IX86_BUILTIN_PUNPCKHQDQ128,
25196 IX86_BUILTIN_PUNPCKLBW128,
25197 IX86_BUILTIN_PUNPCKLWD128,
25198 IX86_BUILTIN_PUNPCKLDQ128,
25199 IX86_BUILTIN_PUNPCKLQDQ128,
25200
25201 IX86_BUILTIN_CLFLUSH,
25202 IX86_BUILTIN_MFENCE,
25203 IX86_BUILTIN_LFENCE,
25204 IX86_BUILTIN_PAUSE,
25205
25206 IX86_BUILTIN_BSRSI,
25207 IX86_BUILTIN_BSRDI,
25208 IX86_BUILTIN_RDPMC,
25209 IX86_BUILTIN_RDTSC,
25210 IX86_BUILTIN_RDTSCP,
25211 IX86_BUILTIN_ROLQI,
25212 IX86_BUILTIN_ROLHI,
25213 IX86_BUILTIN_RORQI,
25214 IX86_BUILTIN_RORHI,
25215
25216 /* SSE3. */
25217 IX86_BUILTIN_ADDSUBPS,
25218 IX86_BUILTIN_HADDPS,
25219 IX86_BUILTIN_HSUBPS,
25220 IX86_BUILTIN_MOVSHDUP,
25221 IX86_BUILTIN_MOVSLDUP,
25222 IX86_BUILTIN_ADDSUBPD,
25223 IX86_BUILTIN_HADDPD,
25224 IX86_BUILTIN_HSUBPD,
25225 IX86_BUILTIN_LDDQU,
25226
25227 IX86_BUILTIN_MONITOR,
25228 IX86_BUILTIN_MWAIT,
25229
25230 /* SSSE3. */
25231 IX86_BUILTIN_PHADDW,
25232 IX86_BUILTIN_PHADDD,
25233 IX86_BUILTIN_PHADDSW,
25234 IX86_BUILTIN_PHSUBW,
25235 IX86_BUILTIN_PHSUBD,
25236 IX86_BUILTIN_PHSUBSW,
25237 IX86_BUILTIN_PMADDUBSW,
25238 IX86_BUILTIN_PMULHRSW,
25239 IX86_BUILTIN_PSHUFB,
25240 IX86_BUILTIN_PSIGNB,
25241 IX86_BUILTIN_PSIGNW,
25242 IX86_BUILTIN_PSIGND,
25243 IX86_BUILTIN_PALIGNR,
25244 IX86_BUILTIN_PABSB,
25245 IX86_BUILTIN_PABSW,
25246 IX86_BUILTIN_PABSD,
25247
25248 IX86_BUILTIN_PHADDW128,
25249 IX86_BUILTIN_PHADDD128,
25250 IX86_BUILTIN_PHADDSW128,
25251 IX86_BUILTIN_PHSUBW128,
25252 IX86_BUILTIN_PHSUBD128,
25253 IX86_BUILTIN_PHSUBSW128,
25254 IX86_BUILTIN_PMADDUBSW128,
25255 IX86_BUILTIN_PMULHRSW128,
25256 IX86_BUILTIN_PSHUFB128,
25257 IX86_BUILTIN_PSIGNB128,
25258 IX86_BUILTIN_PSIGNW128,
25259 IX86_BUILTIN_PSIGND128,
25260 IX86_BUILTIN_PALIGNR128,
25261 IX86_BUILTIN_PABSB128,
25262 IX86_BUILTIN_PABSW128,
25263 IX86_BUILTIN_PABSD128,
25264
25265 /* AMDFAM10 - SSE4A New Instructions. */
25266 IX86_BUILTIN_MOVNTSD,
25267 IX86_BUILTIN_MOVNTSS,
25268 IX86_BUILTIN_EXTRQI,
25269 IX86_BUILTIN_EXTRQ,
25270 IX86_BUILTIN_INSERTQI,
25271 IX86_BUILTIN_INSERTQ,
25272
25273 /* SSE4.1. */
25274 IX86_BUILTIN_BLENDPD,
25275 IX86_BUILTIN_BLENDPS,
25276 IX86_BUILTIN_BLENDVPD,
25277 IX86_BUILTIN_BLENDVPS,
25278 IX86_BUILTIN_PBLENDVB128,
25279 IX86_BUILTIN_PBLENDW128,
25280
25281 IX86_BUILTIN_DPPD,
25282 IX86_BUILTIN_DPPS,
25283
25284 IX86_BUILTIN_INSERTPS128,
25285
25286 IX86_BUILTIN_MOVNTDQA,
25287 IX86_BUILTIN_MPSADBW128,
25288 IX86_BUILTIN_PACKUSDW128,
25289 IX86_BUILTIN_PCMPEQQ,
25290 IX86_BUILTIN_PHMINPOSUW128,
25291
25292 IX86_BUILTIN_PMAXSB128,
25293 IX86_BUILTIN_PMAXSD128,
25294 IX86_BUILTIN_PMAXUD128,
25295 IX86_BUILTIN_PMAXUW128,
25296
25297 IX86_BUILTIN_PMINSB128,
25298 IX86_BUILTIN_PMINSD128,
25299 IX86_BUILTIN_PMINUD128,
25300 IX86_BUILTIN_PMINUW128,
25301
25302 IX86_BUILTIN_PMOVSXBW128,
25303 IX86_BUILTIN_PMOVSXBD128,
25304 IX86_BUILTIN_PMOVSXBQ128,
25305 IX86_BUILTIN_PMOVSXWD128,
25306 IX86_BUILTIN_PMOVSXWQ128,
25307 IX86_BUILTIN_PMOVSXDQ128,
25308
25309 IX86_BUILTIN_PMOVZXBW128,
25310 IX86_BUILTIN_PMOVZXBD128,
25311 IX86_BUILTIN_PMOVZXBQ128,
25312 IX86_BUILTIN_PMOVZXWD128,
25313 IX86_BUILTIN_PMOVZXWQ128,
25314 IX86_BUILTIN_PMOVZXDQ128,
25315
25316 IX86_BUILTIN_PMULDQ128,
25317 IX86_BUILTIN_PMULLD128,
25318
25319 IX86_BUILTIN_ROUNDSD,
25320 IX86_BUILTIN_ROUNDSS,
25321
25322 IX86_BUILTIN_ROUNDPD,
25323 IX86_BUILTIN_ROUNDPS,
25324
25325 IX86_BUILTIN_FLOORPD,
25326 IX86_BUILTIN_CEILPD,
25327 IX86_BUILTIN_TRUNCPD,
25328 IX86_BUILTIN_RINTPD,
25329 IX86_BUILTIN_ROUNDPD_AZ,
25330
25331 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
25332 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
25333 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
25334
25335 IX86_BUILTIN_FLOORPS,
25336 IX86_BUILTIN_CEILPS,
25337 IX86_BUILTIN_TRUNCPS,
25338 IX86_BUILTIN_RINTPS,
25339 IX86_BUILTIN_ROUNDPS_AZ,
25340
25341 IX86_BUILTIN_FLOORPS_SFIX,
25342 IX86_BUILTIN_CEILPS_SFIX,
25343 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
25344
25345 IX86_BUILTIN_PTESTZ,
25346 IX86_BUILTIN_PTESTC,
25347 IX86_BUILTIN_PTESTNZC,
25348
25349 IX86_BUILTIN_VEC_INIT_V2SI,
25350 IX86_BUILTIN_VEC_INIT_V4HI,
25351 IX86_BUILTIN_VEC_INIT_V8QI,
25352 IX86_BUILTIN_VEC_EXT_V2DF,
25353 IX86_BUILTIN_VEC_EXT_V2DI,
25354 IX86_BUILTIN_VEC_EXT_V4SF,
25355 IX86_BUILTIN_VEC_EXT_V4SI,
25356 IX86_BUILTIN_VEC_EXT_V8HI,
25357 IX86_BUILTIN_VEC_EXT_V2SI,
25358 IX86_BUILTIN_VEC_EXT_V4HI,
25359 IX86_BUILTIN_VEC_EXT_V16QI,
25360 IX86_BUILTIN_VEC_SET_V2DI,
25361 IX86_BUILTIN_VEC_SET_V4SF,
25362 IX86_BUILTIN_VEC_SET_V4SI,
25363 IX86_BUILTIN_VEC_SET_V8HI,
25364 IX86_BUILTIN_VEC_SET_V4HI,
25365 IX86_BUILTIN_VEC_SET_V16QI,
25366
25367 IX86_BUILTIN_VEC_PACK_SFIX,
25368 IX86_BUILTIN_VEC_PACK_SFIX256,
25369
25370 /* SSE4.2. */
25371 IX86_BUILTIN_CRC32QI,
25372 IX86_BUILTIN_CRC32HI,
25373 IX86_BUILTIN_CRC32SI,
25374 IX86_BUILTIN_CRC32DI,
25375
25376 IX86_BUILTIN_PCMPESTRI128,
25377 IX86_BUILTIN_PCMPESTRM128,
25378 IX86_BUILTIN_PCMPESTRA128,
25379 IX86_BUILTIN_PCMPESTRC128,
25380 IX86_BUILTIN_PCMPESTRO128,
25381 IX86_BUILTIN_PCMPESTRS128,
25382 IX86_BUILTIN_PCMPESTRZ128,
25383 IX86_BUILTIN_PCMPISTRI128,
25384 IX86_BUILTIN_PCMPISTRM128,
25385 IX86_BUILTIN_PCMPISTRA128,
25386 IX86_BUILTIN_PCMPISTRC128,
25387 IX86_BUILTIN_PCMPISTRO128,
25388 IX86_BUILTIN_PCMPISTRS128,
25389 IX86_BUILTIN_PCMPISTRZ128,
25390
25391 IX86_BUILTIN_PCMPGTQ,
25392
25393 /* AES instructions */
25394 IX86_BUILTIN_AESENC128,
25395 IX86_BUILTIN_AESENCLAST128,
25396 IX86_BUILTIN_AESDEC128,
25397 IX86_BUILTIN_AESDECLAST128,
25398 IX86_BUILTIN_AESIMC128,
25399 IX86_BUILTIN_AESKEYGENASSIST128,
25400
25401 /* PCLMUL instruction */
25402 IX86_BUILTIN_PCLMULQDQ128,
25403
25404 /* AVX */
25405 IX86_BUILTIN_ADDPD256,
25406 IX86_BUILTIN_ADDPS256,
25407 IX86_BUILTIN_ADDSUBPD256,
25408 IX86_BUILTIN_ADDSUBPS256,
25409 IX86_BUILTIN_ANDPD256,
25410 IX86_BUILTIN_ANDPS256,
25411 IX86_BUILTIN_ANDNPD256,
25412 IX86_BUILTIN_ANDNPS256,
25413 IX86_BUILTIN_BLENDPD256,
25414 IX86_BUILTIN_BLENDPS256,
25415 IX86_BUILTIN_BLENDVPD256,
25416 IX86_BUILTIN_BLENDVPS256,
25417 IX86_BUILTIN_DIVPD256,
25418 IX86_BUILTIN_DIVPS256,
25419 IX86_BUILTIN_DPPS256,
25420 IX86_BUILTIN_HADDPD256,
25421 IX86_BUILTIN_HADDPS256,
25422 IX86_BUILTIN_HSUBPD256,
25423 IX86_BUILTIN_HSUBPS256,
25424 IX86_BUILTIN_MAXPD256,
25425 IX86_BUILTIN_MAXPS256,
25426 IX86_BUILTIN_MINPD256,
25427 IX86_BUILTIN_MINPS256,
25428 IX86_BUILTIN_MULPD256,
25429 IX86_BUILTIN_MULPS256,
25430 IX86_BUILTIN_ORPD256,
25431 IX86_BUILTIN_ORPS256,
25432 IX86_BUILTIN_SHUFPD256,
25433 IX86_BUILTIN_SHUFPS256,
25434 IX86_BUILTIN_SUBPD256,
25435 IX86_BUILTIN_SUBPS256,
25436 IX86_BUILTIN_XORPD256,
25437 IX86_BUILTIN_XORPS256,
25438 IX86_BUILTIN_CMPSD,
25439 IX86_BUILTIN_CMPSS,
25440 IX86_BUILTIN_CMPPD,
25441 IX86_BUILTIN_CMPPS,
25442 IX86_BUILTIN_CMPPD256,
25443 IX86_BUILTIN_CMPPS256,
25444 IX86_BUILTIN_CVTDQ2PD256,
25445 IX86_BUILTIN_CVTDQ2PS256,
25446 IX86_BUILTIN_CVTPD2PS256,
25447 IX86_BUILTIN_CVTPS2DQ256,
25448 IX86_BUILTIN_CVTPS2PD256,
25449 IX86_BUILTIN_CVTTPD2DQ256,
25450 IX86_BUILTIN_CVTPD2DQ256,
25451 IX86_BUILTIN_CVTTPS2DQ256,
25452 IX86_BUILTIN_EXTRACTF128PD256,
25453 IX86_BUILTIN_EXTRACTF128PS256,
25454 IX86_BUILTIN_EXTRACTF128SI256,
25455 IX86_BUILTIN_VZEROALL,
25456 IX86_BUILTIN_VZEROUPPER,
25457 IX86_BUILTIN_VPERMILVARPD,
25458 IX86_BUILTIN_VPERMILVARPS,
25459 IX86_BUILTIN_VPERMILVARPD256,
25460 IX86_BUILTIN_VPERMILVARPS256,
25461 IX86_BUILTIN_VPERMILPD,
25462 IX86_BUILTIN_VPERMILPS,
25463 IX86_BUILTIN_VPERMILPD256,
25464 IX86_BUILTIN_VPERMILPS256,
25465 IX86_BUILTIN_VPERMIL2PD,
25466 IX86_BUILTIN_VPERMIL2PS,
25467 IX86_BUILTIN_VPERMIL2PD256,
25468 IX86_BUILTIN_VPERMIL2PS256,
25469 IX86_BUILTIN_VPERM2F128PD256,
25470 IX86_BUILTIN_VPERM2F128PS256,
25471 IX86_BUILTIN_VPERM2F128SI256,
25472 IX86_BUILTIN_VBROADCASTSS,
25473 IX86_BUILTIN_VBROADCASTSD256,
25474 IX86_BUILTIN_VBROADCASTSS256,
25475 IX86_BUILTIN_VBROADCASTPD256,
25476 IX86_BUILTIN_VBROADCASTPS256,
25477 IX86_BUILTIN_VINSERTF128PD256,
25478 IX86_BUILTIN_VINSERTF128PS256,
25479 IX86_BUILTIN_VINSERTF128SI256,
25480 IX86_BUILTIN_LOADUPD256,
25481 IX86_BUILTIN_LOADUPS256,
25482 IX86_BUILTIN_STOREUPD256,
25483 IX86_BUILTIN_STOREUPS256,
25484 IX86_BUILTIN_LDDQU256,
25485 IX86_BUILTIN_MOVNTDQ256,
25486 IX86_BUILTIN_MOVNTPD256,
25487 IX86_BUILTIN_MOVNTPS256,
25488 IX86_BUILTIN_LOADDQU256,
25489 IX86_BUILTIN_STOREDQU256,
25490 IX86_BUILTIN_MASKLOADPD,
25491 IX86_BUILTIN_MASKLOADPS,
25492 IX86_BUILTIN_MASKSTOREPD,
25493 IX86_BUILTIN_MASKSTOREPS,
25494 IX86_BUILTIN_MASKLOADPD256,
25495 IX86_BUILTIN_MASKLOADPS256,
25496 IX86_BUILTIN_MASKSTOREPD256,
25497 IX86_BUILTIN_MASKSTOREPS256,
25498 IX86_BUILTIN_MOVSHDUP256,
25499 IX86_BUILTIN_MOVSLDUP256,
25500 IX86_BUILTIN_MOVDDUP256,
25501
25502 IX86_BUILTIN_SQRTPD256,
25503 IX86_BUILTIN_SQRTPS256,
25504 IX86_BUILTIN_SQRTPS_NR256,
25505 IX86_BUILTIN_RSQRTPS256,
25506 IX86_BUILTIN_RSQRTPS_NR256,
25507
25508 IX86_BUILTIN_RCPPS256,
25509
25510 IX86_BUILTIN_ROUNDPD256,
25511 IX86_BUILTIN_ROUNDPS256,
25512
25513 IX86_BUILTIN_FLOORPD256,
25514 IX86_BUILTIN_CEILPD256,
25515 IX86_BUILTIN_TRUNCPD256,
25516 IX86_BUILTIN_RINTPD256,
25517 IX86_BUILTIN_ROUNDPD_AZ256,
25518
25519 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
25520 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
25521 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
25522
25523 IX86_BUILTIN_FLOORPS256,
25524 IX86_BUILTIN_CEILPS256,
25525 IX86_BUILTIN_TRUNCPS256,
25526 IX86_BUILTIN_RINTPS256,
25527 IX86_BUILTIN_ROUNDPS_AZ256,
25528
25529 IX86_BUILTIN_FLOORPS_SFIX256,
25530 IX86_BUILTIN_CEILPS_SFIX256,
25531 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
25532
25533 IX86_BUILTIN_UNPCKHPD256,
25534 IX86_BUILTIN_UNPCKLPD256,
25535 IX86_BUILTIN_UNPCKHPS256,
25536 IX86_BUILTIN_UNPCKLPS256,
25537
25538 IX86_BUILTIN_SI256_SI,
25539 IX86_BUILTIN_PS256_PS,
25540 IX86_BUILTIN_PD256_PD,
25541 IX86_BUILTIN_SI_SI256,
25542 IX86_BUILTIN_PS_PS256,
25543 IX86_BUILTIN_PD_PD256,
25544
25545 IX86_BUILTIN_VTESTZPD,
25546 IX86_BUILTIN_VTESTCPD,
25547 IX86_BUILTIN_VTESTNZCPD,
25548 IX86_BUILTIN_VTESTZPS,
25549 IX86_BUILTIN_VTESTCPS,
25550 IX86_BUILTIN_VTESTNZCPS,
25551 IX86_BUILTIN_VTESTZPD256,
25552 IX86_BUILTIN_VTESTCPD256,
25553 IX86_BUILTIN_VTESTNZCPD256,
25554 IX86_BUILTIN_VTESTZPS256,
25555 IX86_BUILTIN_VTESTCPS256,
25556 IX86_BUILTIN_VTESTNZCPS256,
25557 IX86_BUILTIN_PTESTZ256,
25558 IX86_BUILTIN_PTESTC256,
25559 IX86_BUILTIN_PTESTNZC256,
25560
25561 IX86_BUILTIN_MOVMSKPD256,
25562 IX86_BUILTIN_MOVMSKPS256,
25563
25564 /* AVX2 */
25565 IX86_BUILTIN_MPSADBW256,
25566 IX86_BUILTIN_PABSB256,
25567 IX86_BUILTIN_PABSW256,
25568 IX86_BUILTIN_PABSD256,
25569 IX86_BUILTIN_PACKSSDW256,
25570 IX86_BUILTIN_PACKSSWB256,
25571 IX86_BUILTIN_PACKUSDW256,
25572 IX86_BUILTIN_PACKUSWB256,
25573 IX86_BUILTIN_PADDB256,
25574 IX86_BUILTIN_PADDW256,
25575 IX86_BUILTIN_PADDD256,
25576 IX86_BUILTIN_PADDQ256,
25577 IX86_BUILTIN_PADDSB256,
25578 IX86_BUILTIN_PADDSW256,
25579 IX86_BUILTIN_PADDUSB256,
25580 IX86_BUILTIN_PADDUSW256,
25581 IX86_BUILTIN_PALIGNR256,
25582 IX86_BUILTIN_AND256I,
25583 IX86_BUILTIN_ANDNOT256I,
25584 IX86_BUILTIN_PAVGB256,
25585 IX86_BUILTIN_PAVGW256,
25586 IX86_BUILTIN_PBLENDVB256,
25587 IX86_BUILTIN_PBLENDVW256,
25588 IX86_BUILTIN_PCMPEQB256,
25589 IX86_BUILTIN_PCMPEQW256,
25590 IX86_BUILTIN_PCMPEQD256,
25591 IX86_BUILTIN_PCMPEQQ256,
25592 IX86_BUILTIN_PCMPGTB256,
25593 IX86_BUILTIN_PCMPGTW256,
25594 IX86_BUILTIN_PCMPGTD256,
25595 IX86_BUILTIN_PCMPGTQ256,
25596 IX86_BUILTIN_PHADDW256,
25597 IX86_BUILTIN_PHADDD256,
25598 IX86_BUILTIN_PHADDSW256,
25599 IX86_BUILTIN_PHSUBW256,
25600 IX86_BUILTIN_PHSUBD256,
25601 IX86_BUILTIN_PHSUBSW256,
25602 IX86_BUILTIN_PMADDUBSW256,
25603 IX86_BUILTIN_PMADDWD256,
25604 IX86_BUILTIN_PMAXSB256,
25605 IX86_BUILTIN_PMAXSW256,
25606 IX86_BUILTIN_PMAXSD256,
25607 IX86_BUILTIN_PMAXUB256,
25608 IX86_BUILTIN_PMAXUW256,
25609 IX86_BUILTIN_PMAXUD256,
25610 IX86_BUILTIN_PMINSB256,
25611 IX86_BUILTIN_PMINSW256,
25612 IX86_BUILTIN_PMINSD256,
25613 IX86_BUILTIN_PMINUB256,
25614 IX86_BUILTIN_PMINUW256,
25615 IX86_BUILTIN_PMINUD256,
25616 IX86_BUILTIN_PMOVMSKB256,
25617 IX86_BUILTIN_PMOVSXBW256,
25618 IX86_BUILTIN_PMOVSXBD256,
25619 IX86_BUILTIN_PMOVSXBQ256,
25620 IX86_BUILTIN_PMOVSXWD256,
25621 IX86_BUILTIN_PMOVSXWQ256,
25622 IX86_BUILTIN_PMOVSXDQ256,
25623 IX86_BUILTIN_PMOVZXBW256,
25624 IX86_BUILTIN_PMOVZXBD256,
25625 IX86_BUILTIN_PMOVZXBQ256,
25626 IX86_BUILTIN_PMOVZXWD256,
25627 IX86_BUILTIN_PMOVZXWQ256,
25628 IX86_BUILTIN_PMOVZXDQ256,
25629 IX86_BUILTIN_PMULDQ256,
25630 IX86_BUILTIN_PMULHRSW256,
25631 IX86_BUILTIN_PMULHUW256,
25632 IX86_BUILTIN_PMULHW256,
25633 IX86_BUILTIN_PMULLW256,
25634 IX86_BUILTIN_PMULLD256,
25635 IX86_BUILTIN_PMULUDQ256,
25636 IX86_BUILTIN_POR256,
25637 IX86_BUILTIN_PSADBW256,
25638 IX86_BUILTIN_PSHUFB256,
25639 IX86_BUILTIN_PSHUFD256,
25640 IX86_BUILTIN_PSHUFHW256,
25641 IX86_BUILTIN_PSHUFLW256,
25642 IX86_BUILTIN_PSIGNB256,
25643 IX86_BUILTIN_PSIGNW256,
25644 IX86_BUILTIN_PSIGND256,
25645 IX86_BUILTIN_PSLLDQI256,
25646 IX86_BUILTIN_PSLLWI256,
25647 IX86_BUILTIN_PSLLW256,
25648 IX86_BUILTIN_PSLLDI256,
25649 IX86_BUILTIN_PSLLD256,
25650 IX86_BUILTIN_PSLLQI256,
25651 IX86_BUILTIN_PSLLQ256,
25652 IX86_BUILTIN_PSRAWI256,
25653 IX86_BUILTIN_PSRAW256,
25654 IX86_BUILTIN_PSRADI256,
25655 IX86_BUILTIN_PSRAD256,
25656 IX86_BUILTIN_PSRLDQI256,
25657 IX86_BUILTIN_PSRLWI256,
25658 IX86_BUILTIN_PSRLW256,
25659 IX86_BUILTIN_PSRLDI256,
25660 IX86_BUILTIN_PSRLD256,
25661 IX86_BUILTIN_PSRLQI256,
25662 IX86_BUILTIN_PSRLQ256,
25663 IX86_BUILTIN_PSUBB256,
25664 IX86_BUILTIN_PSUBW256,
25665 IX86_BUILTIN_PSUBD256,
25666 IX86_BUILTIN_PSUBQ256,
25667 IX86_BUILTIN_PSUBSB256,
25668 IX86_BUILTIN_PSUBSW256,
25669 IX86_BUILTIN_PSUBUSB256,
25670 IX86_BUILTIN_PSUBUSW256,
25671 IX86_BUILTIN_PUNPCKHBW256,
25672 IX86_BUILTIN_PUNPCKHWD256,
25673 IX86_BUILTIN_PUNPCKHDQ256,
25674 IX86_BUILTIN_PUNPCKHQDQ256,
25675 IX86_BUILTIN_PUNPCKLBW256,
25676 IX86_BUILTIN_PUNPCKLWD256,
25677 IX86_BUILTIN_PUNPCKLDQ256,
25678 IX86_BUILTIN_PUNPCKLQDQ256,
25679 IX86_BUILTIN_PXOR256,
25680 IX86_BUILTIN_MOVNTDQA256,
25681 IX86_BUILTIN_VBROADCASTSS_PS,
25682 IX86_BUILTIN_VBROADCASTSS_PS256,
25683 IX86_BUILTIN_VBROADCASTSD_PD256,
25684 IX86_BUILTIN_VBROADCASTSI256,
25685 IX86_BUILTIN_PBLENDD256,
25686 IX86_BUILTIN_PBLENDD128,
25687 IX86_BUILTIN_PBROADCASTB256,
25688 IX86_BUILTIN_PBROADCASTW256,
25689 IX86_BUILTIN_PBROADCASTD256,
25690 IX86_BUILTIN_PBROADCASTQ256,
25691 IX86_BUILTIN_PBROADCASTB128,
25692 IX86_BUILTIN_PBROADCASTW128,
25693 IX86_BUILTIN_PBROADCASTD128,
25694 IX86_BUILTIN_PBROADCASTQ128,
25695 IX86_BUILTIN_VPERMVARSI256,
25696 IX86_BUILTIN_VPERMDF256,
25697 IX86_BUILTIN_VPERMVARSF256,
25698 IX86_BUILTIN_VPERMDI256,
25699 IX86_BUILTIN_VPERMTI256,
25700 IX86_BUILTIN_VEXTRACT128I256,
25701 IX86_BUILTIN_VINSERT128I256,
25702 IX86_BUILTIN_MASKLOADD,
25703 IX86_BUILTIN_MASKLOADQ,
25704 IX86_BUILTIN_MASKLOADD256,
25705 IX86_BUILTIN_MASKLOADQ256,
25706 IX86_BUILTIN_MASKSTORED,
25707 IX86_BUILTIN_MASKSTOREQ,
25708 IX86_BUILTIN_MASKSTORED256,
25709 IX86_BUILTIN_MASKSTOREQ256,
25710 IX86_BUILTIN_PSLLVV4DI,
25711 IX86_BUILTIN_PSLLVV2DI,
25712 IX86_BUILTIN_PSLLVV8SI,
25713 IX86_BUILTIN_PSLLVV4SI,
25714 IX86_BUILTIN_PSRAVV8SI,
25715 IX86_BUILTIN_PSRAVV4SI,
25716 IX86_BUILTIN_PSRLVV4DI,
25717 IX86_BUILTIN_PSRLVV2DI,
25718 IX86_BUILTIN_PSRLVV8SI,
25719 IX86_BUILTIN_PSRLVV4SI,
25720
25721 IX86_BUILTIN_GATHERSIV2DF,
25722 IX86_BUILTIN_GATHERSIV4DF,
25723 IX86_BUILTIN_GATHERDIV2DF,
25724 IX86_BUILTIN_GATHERDIV4DF,
25725 IX86_BUILTIN_GATHERSIV4SF,
25726 IX86_BUILTIN_GATHERSIV8SF,
25727 IX86_BUILTIN_GATHERDIV4SF,
25728 IX86_BUILTIN_GATHERDIV8SF,
25729 IX86_BUILTIN_GATHERSIV2DI,
25730 IX86_BUILTIN_GATHERSIV4DI,
25731 IX86_BUILTIN_GATHERDIV2DI,
25732 IX86_BUILTIN_GATHERDIV4DI,
25733 IX86_BUILTIN_GATHERSIV4SI,
25734 IX86_BUILTIN_GATHERSIV8SI,
25735 IX86_BUILTIN_GATHERDIV4SI,
25736 IX86_BUILTIN_GATHERDIV8SI,
25737
25738 /* Alternate 4 element gather for the vectorizer where
25739 all operands are 32-byte wide. */
25740 IX86_BUILTIN_GATHERALTSIV4DF,
25741 IX86_BUILTIN_GATHERALTDIV8SF,
25742 IX86_BUILTIN_GATHERALTSIV4DI,
25743 IX86_BUILTIN_GATHERALTDIV8SI,
25744
25745 /* TFmode support builtins. */
25746 IX86_BUILTIN_INFQ,
25747 IX86_BUILTIN_HUGE_VALQ,
25748 IX86_BUILTIN_FABSQ,
25749 IX86_BUILTIN_COPYSIGNQ,
25750
25751 /* Vectorizer support builtins. */
25752 IX86_BUILTIN_CPYSGNPS,
25753 IX86_BUILTIN_CPYSGNPD,
25754 IX86_BUILTIN_CPYSGNPS256,
25755 IX86_BUILTIN_CPYSGNPD256,
25756
25757 /* FMA4 instructions. */
25758 IX86_BUILTIN_VFMADDSS,
25759 IX86_BUILTIN_VFMADDSD,
25760 IX86_BUILTIN_VFMADDPS,
25761 IX86_BUILTIN_VFMADDPD,
25762 IX86_BUILTIN_VFMADDPS256,
25763 IX86_BUILTIN_VFMADDPD256,
25764 IX86_BUILTIN_VFMADDSUBPS,
25765 IX86_BUILTIN_VFMADDSUBPD,
25766 IX86_BUILTIN_VFMADDSUBPS256,
25767 IX86_BUILTIN_VFMADDSUBPD256,
25768
25769 /* FMA3 instructions. */
25770 IX86_BUILTIN_VFMADDSS3,
25771 IX86_BUILTIN_VFMADDSD3,
25772
25773 /* XOP instructions. */
25774 IX86_BUILTIN_VPCMOV,
25775 IX86_BUILTIN_VPCMOV_V2DI,
25776 IX86_BUILTIN_VPCMOV_V4SI,
25777 IX86_BUILTIN_VPCMOV_V8HI,
25778 IX86_BUILTIN_VPCMOV_V16QI,
25779 IX86_BUILTIN_VPCMOV_V4SF,
25780 IX86_BUILTIN_VPCMOV_V2DF,
25781 IX86_BUILTIN_VPCMOV256,
25782 IX86_BUILTIN_VPCMOV_V4DI256,
25783 IX86_BUILTIN_VPCMOV_V8SI256,
25784 IX86_BUILTIN_VPCMOV_V16HI256,
25785 IX86_BUILTIN_VPCMOV_V32QI256,
25786 IX86_BUILTIN_VPCMOV_V8SF256,
25787 IX86_BUILTIN_VPCMOV_V4DF256,
25788
25789 IX86_BUILTIN_VPPERM,
25790
25791 IX86_BUILTIN_VPMACSSWW,
25792 IX86_BUILTIN_VPMACSWW,
25793 IX86_BUILTIN_VPMACSSWD,
25794 IX86_BUILTIN_VPMACSWD,
25795 IX86_BUILTIN_VPMACSSDD,
25796 IX86_BUILTIN_VPMACSDD,
25797 IX86_BUILTIN_VPMACSSDQL,
25798 IX86_BUILTIN_VPMACSSDQH,
25799 IX86_BUILTIN_VPMACSDQL,
25800 IX86_BUILTIN_VPMACSDQH,
25801 IX86_BUILTIN_VPMADCSSWD,
25802 IX86_BUILTIN_VPMADCSWD,
25803
25804 IX86_BUILTIN_VPHADDBW,
25805 IX86_BUILTIN_VPHADDBD,
25806 IX86_BUILTIN_VPHADDBQ,
25807 IX86_BUILTIN_VPHADDWD,
25808 IX86_BUILTIN_VPHADDWQ,
25809 IX86_BUILTIN_VPHADDDQ,
25810 IX86_BUILTIN_VPHADDUBW,
25811 IX86_BUILTIN_VPHADDUBD,
25812 IX86_BUILTIN_VPHADDUBQ,
25813 IX86_BUILTIN_VPHADDUWD,
25814 IX86_BUILTIN_VPHADDUWQ,
25815 IX86_BUILTIN_VPHADDUDQ,
25816 IX86_BUILTIN_VPHSUBBW,
25817 IX86_BUILTIN_VPHSUBWD,
25818 IX86_BUILTIN_VPHSUBDQ,
25819
25820 IX86_BUILTIN_VPROTB,
25821 IX86_BUILTIN_VPROTW,
25822 IX86_BUILTIN_VPROTD,
25823 IX86_BUILTIN_VPROTQ,
25824 IX86_BUILTIN_VPROTB_IMM,
25825 IX86_BUILTIN_VPROTW_IMM,
25826 IX86_BUILTIN_VPROTD_IMM,
25827 IX86_BUILTIN_VPROTQ_IMM,
25828
25829 IX86_BUILTIN_VPSHLB,
25830 IX86_BUILTIN_VPSHLW,
25831 IX86_BUILTIN_VPSHLD,
25832 IX86_BUILTIN_VPSHLQ,
25833 IX86_BUILTIN_VPSHAB,
25834 IX86_BUILTIN_VPSHAW,
25835 IX86_BUILTIN_VPSHAD,
25836 IX86_BUILTIN_VPSHAQ,
25837
25838 IX86_BUILTIN_VFRCZSS,
25839 IX86_BUILTIN_VFRCZSD,
25840 IX86_BUILTIN_VFRCZPS,
25841 IX86_BUILTIN_VFRCZPD,
25842 IX86_BUILTIN_VFRCZPS256,
25843 IX86_BUILTIN_VFRCZPD256,
25844
25845 IX86_BUILTIN_VPCOMEQUB,
25846 IX86_BUILTIN_VPCOMNEUB,
25847 IX86_BUILTIN_VPCOMLTUB,
25848 IX86_BUILTIN_VPCOMLEUB,
25849 IX86_BUILTIN_VPCOMGTUB,
25850 IX86_BUILTIN_VPCOMGEUB,
25851 IX86_BUILTIN_VPCOMFALSEUB,
25852 IX86_BUILTIN_VPCOMTRUEUB,
25853
25854 IX86_BUILTIN_VPCOMEQUW,
25855 IX86_BUILTIN_VPCOMNEUW,
25856 IX86_BUILTIN_VPCOMLTUW,
25857 IX86_BUILTIN_VPCOMLEUW,
25858 IX86_BUILTIN_VPCOMGTUW,
25859 IX86_BUILTIN_VPCOMGEUW,
25860 IX86_BUILTIN_VPCOMFALSEUW,
25861 IX86_BUILTIN_VPCOMTRUEUW,
25862
25863 IX86_BUILTIN_VPCOMEQUD,
25864 IX86_BUILTIN_VPCOMNEUD,
25865 IX86_BUILTIN_VPCOMLTUD,
25866 IX86_BUILTIN_VPCOMLEUD,
25867 IX86_BUILTIN_VPCOMGTUD,
25868 IX86_BUILTIN_VPCOMGEUD,
25869 IX86_BUILTIN_VPCOMFALSEUD,
25870 IX86_BUILTIN_VPCOMTRUEUD,
25871
25872 IX86_BUILTIN_VPCOMEQUQ,
25873 IX86_BUILTIN_VPCOMNEUQ,
25874 IX86_BUILTIN_VPCOMLTUQ,
25875 IX86_BUILTIN_VPCOMLEUQ,
25876 IX86_BUILTIN_VPCOMGTUQ,
25877 IX86_BUILTIN_VPCOMGEUQ,
25878 IX86_BUILTIN_VPCOMFALSEUQ,
25879 IX86_BUILTIN_VPCOMTRUEUQ,
25880
25881 IX86_BUILTIN_VPCOMEQB,
25882 IX86_BUILTIN_VPCOMNEB,
25883 IX86_BUILTIN_VPCOMLTB,
25884 IX86_BUILTIN_VPCOMLEB,
25885 IX86_BUILTIN_VPCOMGTB,
25886 IX86_BUILTIN_VPCOMGEB,
25887 IX86_BUILTIN_VPCOMFALSEB,
25888 IX86_BUILTIN_VPCOMTRUEB,
25889
25890 IX86_BUILTIN_VPCOMEQW,
25891 IX86_BUILTIN_VPCOMNEW,
25892 IX86_BUILTIN_VPCOMLTW,
25893 IX86_BUILTIN_VPCOMLEW,
25894 IX86_BUILTIN_VPCOMGTW,
25895 IX86_BUILTIN_VPCOMGEW,
25896 IX86_BUILTIN_VPCOMFALSEW,
25897 IX86_BUILTIN_VPCOMTRUEW,
25898
25899 IX86_BUILTIN_VPCOMEQD,
25900 IX86_BUILTIN_VPCOMNED,
25901 IX86_BUILTIN_VPCOMLTD,
25902 IX86_BUILTIN_VPCOMLED,
25903 IX86_BUILTIN_VPCOMGTD,
25904 IX86_BUILTIN_VPCOMGED,
25905 IX86_BUILTIN_VPCOMFALSED,
25906 IX86_BUILTIN_VPCOMTRUED,
25907
25908 IX86_BUILTIN_VPCOMEQQ,
25909 IX86_BUILTIN_VPCOMNEQ,
25910 IX86_BUILTIN_VPCOMLTQ,
25911 IX86_BUILTIN_VPCOMLEQ,
25912 IX86_BUILTIN_VPCOMGTQ,
25913 IX86_BUILTIN_VPCOMGEQ,
25914 IX86_BUILTIN_VPCOMFALSEQ,
25915 IX86_BUILTIN_VPCOMTRUEQ,
25916
25917 /* LWP instructions. */
25918 IX86_BUILTIN_LLWPCB,
25919 IX86_BUILTIN_SLWPCB,
25920 IX86_BUILTIN_LWPVAL32,
25921 IX86_BUILTIN_LWPVAL64,
25922 IX86_BUILTIN_LWPINS32,
25923 IX86_BUILTIN_LWPINS64,
25924
25925 IX86_BUILTIN_CLZS,
25926
25927 /* RTM */
25928 IX86_BUILTIN_XBEGIN,
25929 IX86_BUILTIN_XEND,
25930 IX86_BUILTIN_XABORT,
25931 IX86_BUILTIN_XTEST,
25932
25933 /* BMI instructions. */
25934 IX86_BUILTIN_BEXTR32,
25935 IX86_BUILTIN_BEXTR64,
25936 IX86_BUILTIN_CTZS,
25937
25938 /* TBM instructions. */
25939 IX86_BUILTIN_BEXTRI32,
25940 IX86_BUILTIN_BEXTRI64,
25941
25942 /* BMI2 instructions. */
25943 IX86_BUILTIN_BZHI32,
25944 IX86_BUILTIN_BZHI64,
25945 IX86_BUILTIN_PDEP32,
25946 IX86_BUILTIN_PDEP64,
25947 IX86_BUILTIN_PEXT32,
25948 IX86_BUILTIN_PEXT64,
25949
25950 /* FSGSBASE instructions. */
25951 IX86_BUILTIN_RDFSBASE32,
25952 IX86_BUILTIN_RDFSBASE64,
25953 IX86_BUILTIN_RDGSBASE32,
25954 IX86_BUILTIN_RDGSBASE64,
25955 IX86_BUILTIN_WRFSBASE32,
25956 IX86_BUILTIN_WRFSBASE64,
25957 IX86_BUILTIN_WRGSBASE32,
25958 IX86_BUILTIN_WRGSBASE64,
25959
25960 /* RDRND instructions. */
25961 IX86_BUILTIN_RDRAND16_STEP,
25962 IX86_BUILTIN_RDRAND32_STEP,
25963 IX86_BUILTIN_RDRAND64_STEP,
25964
25965 /* F16C instructions. */
25966 IX86_BUILTIN_CVTPH2PS,
25967 IX86_BUILTIN_CVTPH2PS256,
25968 IX86_BUILTIN_CVTPS2PH,
25969 IX86_BUILTIN_CVTPS2PH256,
25970
25971 /* CFString built-in for Darwin. */
25972 IX86_BUILTIN_CFSTRING,
25973
25974 /* Builtins to get CPU type and supported features. */
25975 IX86_BUILTIN_CPU_INIT,
25976 IX86_BUILTIN_CPU_IS,
25977 IX86_BUILTIN_CPU_SUPPORTS,
25978
25979 IX86_BUILTIN_MAX
25980 };
25981
25982 /* Table for the ix86 builtin decls. */
25983 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
25984
25985 /* Table of all the builtin functions that are possible with different ISAs
25986 but are waiting to be built until a function is declared to use that
25987 ISA. */
25988 struct builtin_isa {
25989 const char *name; /* function name */
25990 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
25991 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
25992 bool const_p; /* true if the declaration is constant */
25993 bool set_and_not_built_p; /* true if recorded but not yet built */
25994 };
25995
25996 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
25997
25998
25999 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
26000 of isa_flags this builtin requires in the ix86_builtins_isa array. Stores the
26001 function decl in the ix86_builtins array. Returns the function decl, or
26002 NULL_TREE if the builtin was not added.
26003
26004 If the front end has a special hook for builtin functions, delay adding
26005 builtin functions that aren't in the current ISA until the ISA is changed
26006 with function specific optimization. Doing so can save about 300K for the
26007 default compiler. When the builtin is expanded, check at that time whether
26008 it is valid.
26009
26010 If the front end doesn't have a special hook, record all builtins, even
26011 those whose instruction set isn't in the current ISA, in case the user uses
26012 function specific options for a different ISA, so that we don't get scope
26013 errors if a builtin is added in the middle of a function scope. */
26014
26015 static inline tree
26016 def_builtin (HOST_WIDE_INT mask, const char *name,
26017 enum ix86_builtin_func_type tcode,
26018 enum ix86_builtins code)
26019 {
26020 tree decl = NULL_TREE;
26021
26022 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
26023 {
26024 ix86_builtins_isa[(int) code].isa = mask;
26025
26026 mask &= ~OPTION_MASK_ISA_64BIT;
26027 if (mask == 0
26028 || (mask & ix86_isa_flags) != 0
26029 || (lang_hooks.builtin_function
26030 == lang_hooks.builtin_function_ext_scope))
26031
26032 {
26033 tree type = ix86_get_builtin_func_type (tcode);
26034 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
26035 NULL, NULL_TREE);
26036 ix86_builtins[(int) code] = decl;
26037 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
26038 }
26039 else
26040 {
26041 ix86_builtins[(int) code] = NULL_TREE;
26042 ix86_builtins_isa[(int) code].tcode = tcode;
26043 ix86_builtins_isa[(int) code].name = name;
26044 ix86_builtins_isa[(int) code].const_p = false;
26045 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
26046 }
26047 }
26048
26049 return decl;
26050 }
26051
26052 /* Like def_builtin, but also marks the function decl "const". */
26053
26054 static inline tree
26055 def_builtin_const (HOST_WIDE_INT mask, const char *name,
26056 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
26057 {
26058 tree decl = def_builtin (mask, name, tcode, code);
26059 if (decl)
26060 TREE_READONLY (decl) = 1;
26061 else
26062 ix86_builtins_isa[(int) code].const_p = true;
26063
26064 return decl;
26065 }
26066
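/* A rough sketch of how the helpers above are used: later in this file the
   descriptor tables (bdesc_comi, bdesc_special_args, bdesc_args, ...) are
   walked at builtin-initialization time and each row is registered through
   def_builtin / def_builtin_const, along the lines of

	ftype = (enum ix86_builtin_func_type) d->flag;
	def_builtin_const (d->mask, d->name, ftype, d->code);

   so that, for example, the bdesc_args row describing "__builtin_ia32_addps"
   with mask OPTION_MASK_ISA_SSE only becomes a visible, const-marked
   declaration once the SSE ISA is (or becomes) enabled.  */
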
26067 /* Add any new builtin functions for a given ISA that may not have been
26068 declared yet. This saves a bit of space compared to adding all of the
26069 declarations to the tree up front, even when they are never used. */
26070
26071 static void
26072 ix86_add_new_builtins (HOST_WIDE_INT isa)
26073 {
26074 int i;
26075
26076 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
26077 {
26078 if ((ix86_builtins_isa[i].isa & isa) != 0
26079 && ix86_builtins_isa[i].set_and_not_built_p)
26080 {
26081 tree decl, type;
26082
26083 /* Don't define the builtin again. */
26084 ix86_builtins_isa[i].set_and_not_built_p = false;
26085
26086 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26087 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26088 type, i, BUILT_IN_MD, NULL,
26089 NULL_TREE);
26090
26091 ix86_builtins[i] = decl;
26092 if (ix86_builtins_isa[i].const_p)
26093 TREE_READONLY (decl) = 1;
26094 }
26095 }
26096 }
26097
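/* Illustrative example of when the re-declaration above is triggered (the
   function name below is only a placeholder): with a generic -march, an AVX2
   builtin such as __builtin_ia32_pabsb256 starts out "set and not built".
   Compiling a function like

	__attribute__((target ("avx2")))
	__v32qi use_pabsb (__v32qi x)
	{
	  return __builtin_ia32_pabsb256 (x);
	}

   switches the active ISA for that function, and the deferred AVX2 builtins
   are then declared through this hook so the call resolves without a scope
   error.  */
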
26098 /* Bits for builtin_description.flag. */
26099
26100 /* Set when we don't support the comparison natively, and should
26101 swap_comparison in order to support it. */
26102 #define BUILTIN_DESC_SWAP_OPERANDS 1
26103
26104 struct builtin_description
26105 {
26106 const HOST_WIDE_INT mask;
26107 const enum insn_code icode;
26108 const char *const name;
26109 const enum ix86_builtins code;
26110 const enum rtx_code comparison;
26111 const int flag;
26112 };
26113
26114 static const struct builtin_description bdesc_comi[] =
26115 {
26116 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
26117 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
26118 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
26119 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
26120 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
26121 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
26122 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
26123 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
26124 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
26125 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
26126 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
26127 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
26128 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
26129 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
26130 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
26131 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
26132 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
26133 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26134 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26135 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26136 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26137 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26138 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26139 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
26140 };
26141
26142 static const struct builtin_description bdesc_pcmpestr[] =
26143 {
26144 /* SSE4.2 */
26145 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26146 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26147 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26148 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26149 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26150 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26151 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
26152 };
26153
26154 static const struct builtin_description bdesc_pcmpistr[] =
26155 {
26156 /* SSE4.2 */
26157 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26158 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26159 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26160 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26161 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26162 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26163 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
26164 };
26165
26166 /* Special builtins with variable number of arguments. */
26167 static const struct builtin_description bdesc_special_args[] =
26168 {
26169 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26170 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26171 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26172
26173 /* MMX */
26174 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26175
26176 /* 3DNow! */
26177 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26178
26179 /* SSE */
26180 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26181 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26182 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26183
26184 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26185 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26186 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26187 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26188
26189 /* SSE or 3DNow!A */
26190 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26191 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26192
26193 /* SSE2 */
26194 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26195 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26196 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26197 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26198 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26199 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26200 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26201 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26202 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26203 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26204
26205 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26206 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26207
26208 /* SSE3 */
26209 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26210
26211 /* SSE4.1 */
26212 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26213
26214 /* SSE4A */
26215 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26216 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26217
26218 /* AVX */
26219 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26220 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26221
26222 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26223 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26224 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26225 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26226 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26227
26228 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26229 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26230 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26231 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26232 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26233 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26234 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26235
26236 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26237 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26238 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26239
26240 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26241 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26242 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26243 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26244 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26245 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26246 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26247 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26248
26249 /* AVX2 */
26250 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26251 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26252 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26253 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26254 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26255 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26256 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26257 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26258 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26259
26260 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26261 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26262 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26263 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26264 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26265 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26266
26267 /* FSGSBASE */
26268 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26269 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26270 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26271 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26272 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26273 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26274 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26275 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26276
26277 /* RTM */
26278 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26279 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26280 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
26281 };
26282
26283 /* Builtins with variable number of arguments. */
26284 static const struct builtin_description bdesc_args[] =
26285 {
26286 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26287 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26288 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26289 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26290 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26291 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26292 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26293
26294 /* MMX */
26295 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26296 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26297 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26298 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26299 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26300 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26301
26302 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26303 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26304 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26305 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26306 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26307 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26308 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26309 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26310
26311 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26312 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26313
26314 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26315 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26316 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26317 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26318
26319 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26320 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26321 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26322 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26323 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26324 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26325
26326 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26327 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26328 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26329 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26330 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26331 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26332
26333 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26334 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
26335 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26336
26337 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
26338
26339 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26340 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26341 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26342 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26343 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26344 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26345
26346 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26347 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26348 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26349 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26350 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26351 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26352
26353 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26354 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26355 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26356 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26357
26358 /* 3DNow! */
26359 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26360 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26361 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26362 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26363
26364 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26365 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26366 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26367 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26368 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26369 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26370 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26371 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26372 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26373 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26374 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26375 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26376 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26377 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26378 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26379
26380 /* 3DNow!A */
26381 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26382 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26383 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26384 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26385 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26386 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26387
26388 /* SSE */
26389 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
26390 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26391 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26392 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26393 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26394 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26395 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26396 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26397 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26398 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26399 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26400 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26401
26402 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26403
26404 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26405 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26406 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26407 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26408 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26409 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26410 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26411 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26412
26413 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26414 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26415 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26416 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26417 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26418 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26419 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26420 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26421 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26422 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26423 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
26424 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26425 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26426 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26427 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26428 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26429 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26430 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26431 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26432 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26433 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26434 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26435
26436 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26437 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26438 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26439 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26440
26441 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26442 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26443 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26444 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26445
26446 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26447
26448 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26449 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26450 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26451 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26452 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26453
26454 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
26455 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
26456 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
26457
26458 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
26459
26460 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26461 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26462 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26463
26464 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
26465 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
26466
26467 /* SSE MMX or 3Dnow!A */
26468 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26469 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26470 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26471
26472 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26473 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26474 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26475 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26476
26477 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
26478 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
26479
26480 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
26481
26482 /* SSE2 */
26483 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26484
26485 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
26486 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
26487 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26488 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
26489 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
26490
26491 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26492 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26493 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
26494 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26495 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26496
26497 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
26498
26499 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26500 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26501 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26502 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26503
26504 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26505 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
26506 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26507
26508 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26509 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26510 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26511 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26512 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26513 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26514 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26515 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26516
26517 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26518 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26519 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26520 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26521 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26522 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26523 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26524 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26525 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26526 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26527 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26528 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26529 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26530 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26531 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26532 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26533 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26534 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26535 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26536 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26537
26538 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26539 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26540 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26541 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26542
26543 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26544 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26545 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26546 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26547
26548 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26549
26550 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26551 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26552 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26553
26554 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26555
26556 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26557 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26558 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26559 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26560 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26561 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26562 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26563 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26564
26565 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26566 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26567 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26568 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26569 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26570 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26571 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26572 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26573
26574 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26575 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26576
26577 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26578 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26579 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26580 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26581
26582 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26583 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26584
26585 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26586 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26587 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26588 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26589 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26590 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26591
26592 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26593 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26594 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26595 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26596
26597 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26598 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26599 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26600 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26601 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26602 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26603 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26604 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26605
26606 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26607 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26608 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26609
26610 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26611 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
26612
26613 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
26614 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26615
26616 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
26617
26618 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
26619 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
26620 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
26621 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
26622
26623 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26624 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26625 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26626 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26627 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26628 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26629 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26630
26631 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26632 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26633 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26634 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26635 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26636 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26637 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26638
26639 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26640 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26641 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26642 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26643
26644 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
26645 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26646 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26647
26648 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
26649
26650 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26651
26652 /* SSE2 MMX */
26653 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26654 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26655
26656 /* SSE3 */
26657 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26658 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26659
26660 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26661 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26662 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26663 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26664 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26665 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26666
26667 /* SSSE3 */
26668 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26669 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
26670 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26671 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
26672 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26673 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26674
26675 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26676 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26677 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26678 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26679 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26680 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26681 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26682 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26683 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26684 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26685 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26686 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26687 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
26688 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
26689 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26690 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26691 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26692 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26693 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26694 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26695 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26696 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26697 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26698 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26699
26700 /* SSSE3. */
26701 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
26702 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
26703
26704 /* SSE4.1 */
26705 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26706 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26707 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
26708 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
26709 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26710 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26711 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26712 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
26713 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
26714 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
26715
26716 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26717 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26718 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26719 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26720 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26721 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26722 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26723 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26724 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26725 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26726 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26727 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26728 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26729
26730 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26731 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26732 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26733 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26734 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26735 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26736 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26737 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26738 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26739 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26740 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26741 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26742
26743 /* SSE4.1 */
26744 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26745 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26746 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26747 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26748
26749 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
26750 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
26751 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
26752 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
26753
26754 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26755 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26756
26757 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26758 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26759
26760 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
26761 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
26762 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
26763 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
26764
26765 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
26766 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
26767
26768 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26769 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26770
26771 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26772 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26773 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26774
26775 /* SSE4.2 */
26776 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26777 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
26778 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
26779 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26780 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26781
26782 /* SSE4A */
26783 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
26784 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
26785 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
26786 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26787
26788 /* AES */
26789 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
26790 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26791
26792 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26793 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26794 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26795 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26796
26797 /* PCLMUL */
26798 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
26799
26800 /* AVX */
26801 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26802 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26803 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26804 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26805 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26806 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26807 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26808 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26809 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26810 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26811 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26812 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26813 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26814 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26815 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26816 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26817 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26818 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26819 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26820 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26821 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26822 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26823 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26824 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26825 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26826 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26827
26828 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
26829 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
26830 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
26831 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
26832
26833 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26834 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26835 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
26836 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
26837 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26838 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26839 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26840 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26841 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26842 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26843 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26844 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26845 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26846 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
26847 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
26848 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
26849 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
26850 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
26851 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
26852 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26853 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
26854 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26855 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26856 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26857 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26858 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26859 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26860 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26861 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26862 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26863 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26864 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
26865 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
26866 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
26867
26868 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26869 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26870 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26871
26872 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26873 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26874 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26875 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26876 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26877
26878 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26879
26880 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26881 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26882
26883 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
26884 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
26885 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
26886 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
26887
26888 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26889 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26890
26891 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26892 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26893
26894 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
26895 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
26896 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
26897 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
26898
26899 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
26900 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
26901
26902 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26903 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26904
26905 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26906 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26907 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26908 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26909
26910 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26911 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26912 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26913 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
26914 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
26915 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
26916
26917 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26918 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26919 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26920 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26921 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26922 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26923 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26924 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26925 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26926 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26927 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26928 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26929 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26930 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26931 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26932
26933 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
26934 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
26935
26936 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26937 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26938
26939 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26940
26941 /* AVX2 */
26942 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
26943 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
26944 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
26945 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
26946 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26947 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26948 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26949 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26950 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26951 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26952 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26953 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26954 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26955 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26956 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26957 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26958 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
26959 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26960 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26961 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26962 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26963 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
26964 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
26965 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26966 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26967 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26968 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26969 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26970 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26971 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26972 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26973 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26974 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26975 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26976 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26977 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26978 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26979 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26980 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
26981 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26982 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26983 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26984 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26985 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26986 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26987 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26988 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26989 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26990 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26991 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26992 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26993 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
26994 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26995 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26996 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26997 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26998 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26999 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27000 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27001 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27002 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27003 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27004 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27005 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27006 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3 , "__builtin_ia32_pmuldq256" , IX86_BUILTIN_PMULDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27007 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27008 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27009 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27010 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27011 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27012 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3 , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27013 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27014 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27015 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27016 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
27017 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27018 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27019 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27020 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27021 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27022 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27023 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27024 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27025 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27026 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27027 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27028 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27029 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27030 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27031 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27032 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27033 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27034 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27035 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27036 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27037 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27038 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27039 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27040 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27041 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27042 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27043 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27044 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27045 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27046 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27047 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27048 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27049 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27050 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27051 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27052 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27053 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27054 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27055 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27056 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27057 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27058 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27059 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27060 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27061 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27062 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27063 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27064 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27065 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27066 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27067 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27068 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27069 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27070 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27071 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27072 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27073 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27074 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27075 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27076 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27077 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27078 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27079 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27080 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27081 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27082 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27083 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27084 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27085 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27086 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27087 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27088
27089 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27090
27091 /* BMI */
27092 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27093 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27094 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27095
27096 /* TBM */
27097 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27098 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27099
27100 /* F16C */
27101 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
27102 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
27103 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
27104 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
27105
27106 /* BMI2 */
27107 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27108 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27109 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27110 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27111 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27112 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27113 };
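
/* Illustrative sketch (guarded out, not used by the compiler; the
   example_* name is invented for illustration): each row of the table
   above binds an ISA mask, a named insn pattern, the builtin's
   source-level name, its IX86_BUILTIN_* enumerator, an optional
   comparison code and a function-type code.  ix86_init_mmx_sse_builtins
   below walks the array and registers every named row via
   def_builtin_const.  Assuming a module compiled with -mbmi2, one of
   the BMI2 rows then surfaces to user code as follows.  */
#if 0
static unsigned long long
example_pdep_use (unsigned long long src, unsigned long long mask)
{
  /* Registered from the BMI2 row above with type
     UINT64_FTYPE_UINT64_UINT64 and expanded via CODE_FOR_bmi2_pdep_di3.  */
  return __builtin_ia32_pdep_di (src, mask);
}
#endif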
27114
27115 /* FMA4 and XOP. */
27116 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
27117 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
27118 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
27119 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
27120 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
27121 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
27122 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
27123 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
27124 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
27125 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
27126 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
27127 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
27128 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
27129 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
27130 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
27131 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
27132 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
27133 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
27134 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
27135 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
27136 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
27137 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
27138 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
27139 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
27140 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
27141 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
27142 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
27143 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
27144 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
27145 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
27146 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
27147 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
27148 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
27149 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
27150 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
27151 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
27152 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
27153 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
27154 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
27155 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
27156 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
27157 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
27158 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
27159 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
27160 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
27161 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
27162 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
27163 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
27164 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
27165 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
27166 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
27167 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
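
/* Illustrative note: the MULTI_ARG_* macros above are just shorthands
   for the generic V*_FTYPE_* function-type codes, named by operand
   count and element type so the FMA4/XOP table below is easier to
   read; e.g. MULTI_ARG_3_SF is a three-operand V4SF operation and the
   "2" suffix (MULTI_ARG_3_SF2) denotes the 256-bit form.  */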
27168
27169 static const struct builtin_description bdesc_multi_arg[] =
27170 {
27171 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27172 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27173 UNKNOWN, (int)MULTI_ARG_3_SF },
27174 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27175 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27176 UNKNOWN, (int)MULTI_ARG_3_DF },
27177
27178 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27179 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27180 UNKNOWN, (int)MULTI_ARG_3_SF },
27181 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27182 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27183 UNKNOWN, (int)MULTI_ARG_3_DF },
27184
27185 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27186 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27187 UNKNOWN, (int)MULTI_ARG_3_SF },
27188 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27189 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27190 UNKNOWN, (int)MULTI_ARG_3_DF },
27191 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27192 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27193 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27194 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27195 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27196 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27197
27198 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27199 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27200 UNKNOWN, (int)MULTI_ARG_3_SF },
27201 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27202 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27203 UNKNOWN, (int)MULTI_ARG_3_DF },
27204 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27205 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27206 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27207 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27208 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27209 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27210
27211 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
27212 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
27213 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
27214 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
27215 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
27216 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
27217 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
27218
27219 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27220 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27221 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
27222 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
27223 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
27224 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
27225 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27226
27227 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27228
27229 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27230 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27231 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27232 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27233 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27234 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27235 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27236 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27237 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27238 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27239 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27240 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27241
27242 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27243 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27244 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27245 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27246 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27247 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27248 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27249 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27250 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27251 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27252 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27253 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27254 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27255 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27256 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27257 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27258
27259 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
27260 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
27261 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27262 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27263 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27264 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27265
27266 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27267 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27268 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27269 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27270 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27271 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27272 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27273 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27274 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27275 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27276 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27277 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27278 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27279 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27280 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27281
27282 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27283 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27284 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27285 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27286 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27287 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27288 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
27289
27290 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
27291 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27292 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27293 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27294 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
27295 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
27296 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
27297
27298 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
27299 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27300 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27301 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
27302 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
27303 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
27304 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
27305
27306 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27307 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27308 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27309 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
27310 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
27311 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
27312 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
27313
27314 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
27315 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27316 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27317 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
27318 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
27319 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
27320 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
27321
27322 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
27323 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27324 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27325 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
27326 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
27327 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
27328 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
27329
27330 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
27331 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27332 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27333 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
27334 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
27335 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
27336 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
27337
27338 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27339 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27340 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27341 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
27342 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
27343 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
27344 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
27345
27346 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27347 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27348 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27349 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27350 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27351 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27352 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27353 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27354
27355 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27356 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27357 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27358 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27359 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27360 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27361 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27362 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27363
27364 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
27365 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
27366 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
27367 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
27368
27369 };
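
/* Illustrative note: unlike bdesc_args, these FMA4/XOP rows carry an
   explicit comparison code (EQ, NE, LTU, ..., or the PCOM_FALSE /
   PCOM_TRUE pseudo-codes) in the fifth field; the multi-argument
   builtin expander elsewhere in this file uses that code to pick the
   condition for the vpcom* patterns, while the "neq" spellings
   deliberately alias the same IX86_BUILTIN_VPCOMNE* enumerators as the
   "ne" ones.  */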
27370 \f
27371 /* TM vector builtins. */
27372
27373 /* Reuse the existing x86-specific `struct builtin_description' because
27374 we're lazy. Add casts to make them fit. */
27375 static const struct builtin_description bdesc_tm[] =
27376 {
27377 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27378 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27379 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27380 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27381 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27382 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27383 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27384
27385 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27386 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27387 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27388 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27389 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27390 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27391 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27392
27393 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27394 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27395 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27396 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27397 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27398 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27399 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27400
27401 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
27402 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
27403 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
27404 };
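
/* Illustrative note: each name above is a libitm entry point
   (_ITM_WM64, _ITM_RaRM128, ...) spelled with a "__builtin_" prefix,
   and the vector width in the name (M64/M128/M256) matches the ISA
   mask of its row.  ix86_init_tm_builtins below registers the prefixed
   name and strips the prefix to form the name used for the actual
   call.  */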
27405
27406 /* TM callbacks. */
27407
27408 /* Return the builtin decl needed to load a vector of TYPE. */
27409
27410 static tree
27411 ix86_builtin_tm_load (tree type)
27412 {
27413 if (TREE_CODE (type) == VECTOR_TYPE)
27414 {
27415 switch (tree_low_cst (TYPE_SIZE (type), 1))
27416 {
27417 case 64:
27418 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
27419 case 128:
27420 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
27421 case 256:
27422 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
27423 }
27424 }
27425 return NULL_TREE;
27426 }
27427
27428 /* Return the builtin decl needed to store a vector of TYPE. */
27429
27430 static tree
27431 ix86_builtin_tm_store (tree type)
27432 {
27433 if (TREE_CODE (type) == VECTOR_TYPE)
27434 {
27435 switch (tree_low_cst (TYPE_SIZE (type), 1))
27436 {
27437 case 64:
27438 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
27439 case 128:
27440 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
27441 case 256:
27442 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
27443 }
27444 }
27445 return NULL_TREE;
27446 }
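
/* Illustrative note: ix86_builtin_tm_load and ix86_builtin_tm_store
   are presumably wired up as the target's builtin_tm_load /
   builtin_tm_store hooks further down in this file, letting the
   trans-mem lowering pass ask for a 64/128/256-bit vector load or
   store primitive by the size of the access.  */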
27447 \f
27448 /* Initialize the transactional memory vector load/store builtins. */
27449
27450 static void
27451 ix86_init_tm_builtins (void)
27452 {
27453 enum ix86_builtin_func_type ftype;
27454 const struct builtin_description *d;
27455 size_t i;
27456 tree decl;
27457 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
27458 tree attrs_log, attrs_type_log;
27459
27460 if (!flag_tm)
27461 return;
27462
27463 /* If there are no builtins defined, we must be compiling in a
27464 language without trans-mem support. */
27465 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
27466 return;
27467
27468 /* Use whatever attributes a normal TM load has. */
27469 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
27470 attrs_load = DECL_ATTRIBUTES (decl);
27471 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27472 /* Use whatever attributes a normal TM store has. */
27473 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
27474 attrs_store = DECL_ATTRIBUTES (decl);
27475 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27476 /* Use whatever attributes a normal TM log has. */
27477 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
27478 attrs_log = DECL_ATTRIBUTES (decl);
27479 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27480
27481 for (i = 0, d = bdesc_tm;
27482 i < ARRAY_SIZE (bdesc_tm);
27483 i++, d++)
27484 {
27485 if ((d->mask & ix86_isa_flags) != 0
27486 || (lang_hooks.builtin_function
27487 == lang_hooks.builtin_function_ext_scope))
27488 {
27489 tree type, attrs, attrs_type;
27490 enum built_in_function code = (enum built_in_function) d->code;
27491
27492 ftype = (enum ix86_builtin_func_type) d->flag;
27493 type = ix86_get_builtin_func_type (ftype);
27494
27495 if (BUILTIN_TM_LOAD_P (code))
27496 {
27497 attrs = attrs_load;
27498 attrs_type = attrs_type_load;
27499 }
27500 else if (BUILTIN_TM_STORE_P (code))
27501 {
27502 attrs = attrs_store;
27503 attrs_type = attrs_type_store;
27504 }
27505 else
27506 {
27507 attrs = attrs_log;
27508 attrs_type = attrs_type_log;
27509 }
27510 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
27511 /* The builtin without the prefix for
27512 calling it directly. */
27513 d->name + strlen ("__builtin_"),
27514 attrs);
27515 /* add_builtin_function () will set the DECL_ATTRIBUTES; now
27516 set the TYPE_ATTRIBUTES.  */
27517 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
27518
27519 set_builtin_decl (code, decl, false);
27520 }
27521 }
27522 }
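
/* Note that the library name passed to add_builtin_function above (the
   builtin name with the "__builtin_" prefix stripped) is what calls to
   these builtins are actually emitted as, so "__builtin__ITM_WM256"
   compiles down to a call to the libitm routine "_ITM_WM256".  */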
27523
27524 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
27525 not in the current target ISA, so that the user can compile particular
27526 modules with target-specific options that differ from the command-line
27527 options.  */
27528 static void
27529 ix86_init_mmx_sse_builtins (void)
27530 {
27531 const struct builtin_description * d;
27532 enum ix86_builtin_func_type ftype;
27533 size_t i;
27534
27535 /* Add all special builtins with variable number of operands. */
27536 for (i = 0, d = bdesc_special_args;
27537 i < ARRAY_SIZE (bdesc_special_args);
27538 i++, d++)
27539 {
27540 if (d->name == 0)
27541 continue;
27542
27543 ftype = (enum ix86_builtin_func_type) d->flag;
27544 def_builtin (d->mask, d->name, ftype, d->code);
27545 }
27546
27547 /* Add all builtins with variable number of operands. */
27548 for (i = 0, d = bdesc_args;
27549 i < ARRAY_SIZE (bdesc_args);
27550 i++, d++)
27551 {
27552 if (d->name == 0)
27553 continue;
27554
27555 ftype = (enum ix86_builtin_func_type) d->flag;
27556 def_builtin_const (d->mask, d->name, ftype, d->code);
27557 }
27558
27559 /* pcmpestr[im] insns. */
27560 for (i = 0, d = bdesc_pcmpestr;
27561 i < ARRAY_SIZE (bdesc_pcmpestr);
27562 i++, d++)
27563 {
27564 if (d->code == IX86_BUILTIN_PCMPESTRM128)
27565 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
27566 else
27567 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
27568 def_builtin_const (d->mask, d->name, ftype, d->code);
27569 }
27570
27571 /* pcmpistr[im] insns. */
27572 for (i = 0, d = bdesc_pcmpistr;
27573 i < ARRAY_SIZE (bdesc_pcmpistr);
27574 i++, d++)
27575 {
27576 if (d->code == IX86_BUILTIN_PCMPISTRM128)
27577 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
27578 else
27579 ftype = INT_FTYPE_V16QI_V16QI_INT;
27580 def_builtin_const (d->mask, d->name, ftype, d->code);
27581 }
27582
27583 /* comi/ucomi insns. */
27584 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
27585 {
27586 if (d->mask == OPTION_MASK_ISA_SSE2)
27587 ftype = INT_FTYPE_V2DF_V2DF;
27588 else
27589 ftype = INT_FTYPE_V4SF_V4SF;
27590 def_builtin_const (d->mask, d->name, ftype, d->code);
27591 }
27592
27593 /* SSE */
27594 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
27595 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
27596 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
27597 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
27598
27599 /* SSE or 3DNow!A */
27600 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27601 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
27602 IX86_BUILTIN_MASKMOVQ);
27603
27604 /* SSE2 */
27605 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
27606 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
27607
27608 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
27609 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
27610 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
27611 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
27612
27613 /* SSE3. */
27614 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
27615 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
27616 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
27617 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
27618
27619 /* AES */
27620 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
27621 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
27622 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
27623 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
27624 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
27625 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
27626 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
27627 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
27628 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
27629 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
27630 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
27631 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
27632
27633 /* PCLMUL */
27634 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
27635 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
27636
27637 /* RDRND */
27638 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
27639 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
27640 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
27641 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
27642 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
27643 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
27644 IX86_BUILTIN_RDRAND64_STEP);
27645
27646 /* AVX2 */
27647 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
27648 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
27649 IX86_BUILTIN_GATHERSIV2DF);
27650
27651 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
27652 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
27653 IX86_BUILTIN_GATHERSIV4DF);
27654
27655 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
27656 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
27657 IX86_BUILTIN_GATHERDIV2DF);
27658
27659 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
27660 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
27661 IX86_BUILTIN_GATHERDIV4DF);
27662
27663 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
27664 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
27665 IX86_BUILTIN_GATHERSIV4SF);
27666
27667 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
27668 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
27669 IX86_BUILTIN_GATHERSIV8SF);
27670
27671 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
27672 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
27673 IX86_BUILTIN_GATHERDIV4SF);
27674
27675 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
27676 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
27677 IX86_BUILTIN_GATHERDIV8SF);
27678
27679 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
27680 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
27681 IX86_BUILTIN_GATHERSIV2DI);
27682
27683 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
27684 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
27685 IX86_BUILTIN_GATHERSIV4DI);
27686
27687 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
27688 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
27689 IX86_BUILTIN_GATHERDIV2DI);
27690
27691 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
27692 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
27693 IX86_BUILTIN_GATHERDIV4DI);
27694
27695 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
27696 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
27697 IX86_BUILTIN_GATHERSIV4SI);
27698
27699 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
27700 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
27701 IX86_BUILTIN_GATHERSIV8SI);
27702
27703 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
27704 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
27705 IX86_BUILTIN_GATHERDIV4SI);
27706
27707 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
27708 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
27709 IX86_BUILTIN_GATHERDIV8SI);
27710
27711 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
27712 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
27713 IX86_BUILTIN_GATHERALTSIV4DF);
27714
27715 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
27716 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
27717 IX86_BUILTIN_GATHERALTDIV8SF);
27718
27719 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
27720 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
27721 IX86_BUILTIN_GATHERALTSIV4DI);
27722
27723 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
27724 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
27725 IX86_BUILTIN_GATHERALTDIV8SI);
27726
27727 /* RTM. */
27728 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
27729 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
27730
27731 /* MMX access to the vec_init patterns. */
27732 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
27733 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
27734
27735 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
27736 V4HI_FTYPE_HI_HI_HI_HI,
27737 IX86_BUILTIN_VEC_INIT_V4HI);
27738
27739 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
27740 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
27741 IX86_BUILTIN_VEC_INIT_V8QI);
27742
27743 /* Access to the vec_extract patterns. */
27744 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
27745 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
27746 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
27747 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
27748 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
27749 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
27750 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
27751 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
27752 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
27753 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
27754
27755 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27756 "__builtin_ia32_vec_ext_v4hi",
27757 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
27758
27759 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
27760 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
27761
27762 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
27763 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
27764
27765 /* Access to the vec_set patterns. */
27766 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
27767 "__builtin_ia32_vec_set_v2di",
27768 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
27769
27770 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
27771 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
27772
27773 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
27774 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
27775
27776 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
27777 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
27778
27779 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27780 "__builtin_ia32_vec_set_v4hi",
27781 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
27782
27783 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
27784 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
27785
27786 /* Add FMA4 multi-arg instructions.  */
27787 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
27788 {
27789 if (d->name == 0)
27790 continue;
27791
27792 ftype = (enum ix86_builtin_func_type) d->flag;
27793 def_builtin_const (d->mask, d->name, ftype, d->code);
27794 }
27795 }
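
/* Illustrative example of why every builtin is registered up front
   (the function name add256 is made up for this sketch): a translation
   unit compiled without -mavx2 may still contain

     __attribute__ ((target ("avx2")))
     __m256i add256 (__m256i a, __m256i b)
     {
       return _mm256_add_epi32 (a, b);
     }

   The AVX2 builtin behind the intrinsic must already exist when the
   function is parsed; whether using it is legal is checked later against
   the ISA flags in effect for that particular function.  */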
27796
27797 /* This builds the processor_model struct type defined in
27798 libgcc/config/i386/cpuinfo.c.  */
27799
27800 static tree
27801 build_processor_model_struct (void)
27802 {
27803 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
27804 "__cpu_features"};
27805 tree field = NULL_TREE, field_chain = NULL_TREE;
27806 int i;
27807 tree type = make_node (RECORD_TYPE);
27808
27809 /* The first 3 fields are unsigned int. */
27810 for (i = 0; i < 3; ++i)
27811 {
27812 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
27813 get_identifier (field_name[i]), unsigned_type_node);
27814 if (field_chain != NULL_TREE)
27815 DECL_CHAIN (field) = field_chain;
27816 field_chain = field;
27817 }
27818
27819 /* The last field is an array of unsigned integers of size one. */
27820 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
27821 get_identifier (field_name[3]),
27822 build_array_type (unsigned_type_node,
27823 build_index_type (size_one_node)));
27824 if (field_chain != NULL_TREE)
27825 DECL_CHAIN (field) = field_chain;
27826 field_chain = field;
27827
27828 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
27829 return type;
27830 }
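
/* For reference, the record built above is intended to match (roughly)
   the definition used by libgcc/config/i386/cpuinfo.c:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };  */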
27831
27832 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME.  */
27833
27834 static tree
27835 make_var_decl (tree type, const char *name)
27836 {
27837 tree new_decl;
27838
27839 new_decl = build_decl (UNKNOWN_LOCATION,
27840 VAR_DECL,
27841 get_identifier(name),
27842 type);
27843
27844 DECL_EXTERNAL (new_decl) = 1;
27845 TREE_STATIC (new_decl) = 1;
27846 TREE_PUBLIC (new_decl) = 1;
27847 DECL_INITIAL (new_decl) = 0;
27848 DECL_ARTIFICIAL (new_decl) = 0;
27849 DECL_PRESERVE_P (new_decl) = 1;
27850
27851 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
27852 assemble_variable (new_decl, 0, 0, 0);
27853
27854 return new_decl;
27855 }
27856
27857 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
27858 into a check against the __cpu_model variable defined in libgcc/config/i386/cpuinfo.c.  */
27859
27860 static tree
27861 fold_builtin_cpu (tree fndecl, tree *args)
27862 {
27863 unsigned int i;
27864 enum ix86_builtins fn_code = (enum ix86_builtins)
27865 DECL_FUNCTION_CODE (fndecl);
27866 tree param_string_cst = NULL;
27867
27868 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
27869 enum processor_features
27870 {
27871 F_CMOV = 0,
27872 F_MMX,
27873 F_POPCNT,
27874 F_SSE,
27875 F_SSE2,
27876 F_SSE3,
27877 F_SSSE3,
27878 F_SSE4_1,
27879 F_SSE4_2,
27880 F_AVX,
27881 F_AVX2,
27882 F_MAX
27883 };
27884
27885 /* These are the values for vendor types and cpu types and subtypes
27886 in cpuinfo.c.  A cpu type or subtype enum value has the corresponding
27887 start value subtracted from it before it is compared against __cpu_model.  */
27888 enum processor_model
27889 {
27890 M_INTEL = 1,
27891 M_AMD,
27892 M_CPU_TYPE_START,
27893 M_INTEL_ATOM,
27894 M_INTEL_CORE2,
27895 M_INTEL_COREI7,
27896 M_AMDFAM10H,
27897 M_AMDFAM15H,
27898 M_CPU_SUBTYPE_START,
27899 M_INTEL_COREI7_NEHALEM,
27900 M_INTEL_COREI7_WESTMERE,
27901 M_INTEL_COREI7_SANDYBRIDGE,
27902 M_AMDFAM10H_BARCELONA,
27903 M_AMDFAM10H_SHANGHAI,
27904 M_AMDFAM10H_ISTANBUL,
27905 M_AMDFAM15H_BDVER1,
27906 M_AMDFAM15H_BDVER2
27907 };
27908
27909 static struct _arch_names_table
27910 {
27911 const char *const name;
27912 const enum processor_model model;
27913 }
27914 const arch_names_table[] =
27915 {
27916 {"amd", M_AMD},
27917 {"intel", M_INTEL},
27918 {"atom", M_INTEL_ATOM},
27919 {"core2", M_INTEL_CORE2},
27920 {"corei7", M_INTEL_COREI7},
27921 {"nehalem", M_INTEL_COREI7_NEHALEM},
27922 {"westmere", M_INTEL_COREI7_WESTMERE},
27923 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
27924 {"amdfam10h", M_AMDFAM10H},
27925 {"barcelona", M_AMDFAM10H_BARCELONA},
27926 {"shanghai", M_AMDFAM10H_SHANGHAI},
27927 {"istanbul", M_AMDFAM10H_ISTANBUL},
27928 {"amdfam15h", M_AMDFAM15H},
27929 {"bdver1", M_AMDFAM15H_BDVER1},
27930 {"bdver2", M_AMDFAM15H_BDVER2},
27931 };
27932
27933 static struct _isa_names_table
27934 {
27935 const char *const name;
27936 const enum processor_features feature;
27937 }
27938 const isa_names_table[] =
27939 {
27940 {"cmov", F_CMOV},
27941 {"mmx", F_MMX},
27942 {"popcnt", F_POPCNT},
27943 {"sse", F_SSE},
27944 {"sse2", F_SSE2},
27945 {"sse3", F_SSE3},
27946 {"ssse3", F_SSSE3},
27947 {"sse4.1", F_SSE4_1},
27948 {"sse4.2", F_SSE4_2},
27949 {"avx", F_AVX},
27950 {"avx2", F_AVX2}
27951 };
27952
27953 static tree __processor_model_type = NULL_TREE;
27954 static tree __cpu_model_var = NULL_TREE;
27955
27956 if (__processor_model_type == NULL_TREE)
27957 __processor_model_type = build_processor_model_struct ();
27958
27959 if (__cpu_model_var == NULL_TREE)
27960 __cpu_model_var = make_var_decl (__processor_model_type,
27961 "__cpu_model");
27962
27963 gcc_assert ((args != NULL) && (*args != NULL));
27964
27965 param_string_cst = *args;
27966 while (param_string_cst
27967 && TREE_CODE (param_string_cst) != STRING_CST)
27968 {
27969 /* *args must be an expr that can contain other EXPRs leading to a
27970 STRING_CST.  */
27971 if (!EXPR_P (param_string_cst))
27972 {
27973 error ("Parameter to builtin must be a string constant or literal");
27974 return integer_zero_node;
27975 }
27976 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
27977 }
27978
27979 gcc_assert (param_string_cst);
27980
27981 if (fn_code == IX86_BUILTIN_CPU_IS)
27982 {
27983 tree ref;
27984 tree field;
27985 unsigned int field_val = 0;
27986 unsigned int NUM_ARCH_NAMES
27987 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
27988
27989 for (i = 0; i < NUM_ARCH_NAMES; i++)
27990 if (strcmp (arch_names_table[i].name,
27991 TREE_STRING_POINTER (param_string_cst)) == 0)
27992 break;
27993
27994 if (i == NUM_ARCH_NAMES)
27995 {
27996 error ("Parameter to builtin not valid: %s",
27997 TREE_STRING_POINTER (param_string_cst));
27998 return integer_zero_node;
27999 }
28000
28001 field = TYPE_FIELDS (__processor_model_type);
28002 field_val = arch_names_table[i].model;
28003
28004 /* CPU types are stored in the next field. */
28005 if (field_val > M_CPU_TYPE_START
28006 && field_val < M_CPU_SUBTYPE_START)
28007 {
28008 field = DECL_CHAIN (field);
28009 field_val -= M_CPU_TYPE_START;
28010 }
28011
28012 /* CPU subtypes are stored in the next field. */
28013 if (field_val > M_CPU_SUBTYPE_START)
28014 {
28015 field = DECL_CHAIN (DECL_CHAIN (field));
28016 field_val -= M_CPU_SUBTYPE_START;
28017 }
28018
28019 /* Get the appropriate field in __cpu_model. */
28020 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
28021 field, NULL_TREE);
28022
28023 /* Check the value. */
28024 return build2 (EQ_EXPR, unsigned_type_node, ref,
28025 build_int_cstu (unsigned_type_node, field_val));
28026 }
28027 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
28028 {
28029 tree ref;
28030 tree array_elt;
28031 tree field;
28032 unsigned int field_val = 0;
28033 unsigned int NUM_ISA_NAMES
28034 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
28035
28036 for (i = 0; i < NUM_ISA_NAMES; i++)
28037 if (strcmp (isa_names_table[i].name,
28038 TREE_STRING_POINTER (param_string_cst)) == 0)
28039 break;
28040
28041 if (i == NUM_ISA_NAMES)
28042 {
28043 error ("Parameter to builtin not valid: %s",
28044 TREE_STRING_POINTER (param_string_cst));
28045 return integer_zero_node;
28046 }
28047
28048 field = TYPE_FIELDS (__processor_model_type);
28049 /* Get the last field, which is __cpu_features. */
28050 while (DECL_CHAIN (field))
28051 field = DECL_CHAIN (field);
28052
28053 /* Get the appropriate field: __cpu_model.__cpu_features */
28054 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
28055 field, NULL_TREE);
28056
28057 /* Access the 0th element of __cpu_features array. */
28058 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
28059 integer_zero_node, NULL_TREE, NULL_TREE);
28060
28061 field_val = (1 << isa_names_table[i].feature);
28062 /* Return __cpu_model.__cpu_features[0] & field_val */
28063 return build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
28064 build_int_cstu (unsigned_type_node, field_val));
28065 }
28066 gcc_unreachable ();
28067 }
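
/* Roughly, the folds produced above look like this (pseudo-C):

     __builtin_cpu_is ("corei7")
       ==> __cpu_model.__cpu_type == (M_INTEL_COREI7 - M_CPU_TYPE_START)

     __builtin_cpu_supports ("avx")
       ==> __cpu_model.__cpu_features[0] & (1 << F_AVX)  */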
28068
28069 static tree
28070 ix86_fold_builtin (tree fndecl, int n_args,
28071 tree *args, bool ignore ATTRIBUTE_UNUSED)
28072 {
28073 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
28074 {
28075 enum ix86_builtins fn_code = (enum ix86_builtins)
28076 DECL_FUNCTION_CODE (fndecl);
28077 if (fn_code == IX86_BUILTIN_CPU_IS
28078 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
28079 {
28080 gcc_assert (n_args == 1);
28081 return fold_builtin_cpu (fndecl, args);
28082 }
28083 }
28084
28085 return NULL_TREE;
28086 }
28087
28088 /* Make builtins to detect cpu type and features supported. NAME is
28089 the builtin name, CODE is the builtin code, and FTYPE is the function
28090 type of the builtin. */
28091
28092 static void
28093 make_cpu_type_builtin (const char* name, int code,
28094 enum ix86_builtin_func_type ftype, bool is_const)
28095 {
28096 tree decl;
28097 tree type;
28098
28099 type = ix86_get_builtin_func_type (ftype);
28100 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
28101 NULL, NULL_TREE);
28102 gcc_assert (decl != NULL_TREE);
28103 ix86_builtins[(int) code] = decl;
28104 TREE_READONLY (decl) = is_const;
28105 }
28106
28107 /* Make builtins to get CPU type and features supported. The created
28108 builtins are:
28109
28110 __builtin_cpu_init (), to detect cpu type and features,
28111 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
28112 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
28113 */
28114
28115 static void
28116 ix86_init_platform_type_builtins (void)
28117 {
28118 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
28119 INT_FTYPE_VOID, false);
28120 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
28121 INT_FTYPE_PCCHAR, true);
28122 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
28123 INT_FTYPE_PCCHAR, true);
28124 }
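
/* Example of the user-level interface these builtins provide
   (illustrative; have_fast_path is a made-up name):

     int
     have_fast_path (void)
     {
       __builtin_cpu_init ();
       return __builtin_cpu_is ("corei7") && __builtin_cpu_supports ("avx");
     }  */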
28125
28126 /* Internal method for ix86_init_builtins. */
28127
28128 static void
28129 ix86_init_builtins_va_builtins_abi (void)
28130 {
28131 tree ms_va_ref, sysv_va_ref;
28132 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
28133 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
28134 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
28135 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
28136
28137 if (!TARGET_64BIT)
28138 return;
28139 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
28140 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
28141 ms_va_ref = build_reference_type (ms_va_list_type_node);
28142 sysv_va_ref =
28143 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
28144
28145 fnvoid_va_end_ms =
28146 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
28147 fnvoid_va_start_ms =
28148 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
28149 fnvoid_va_end_sysv =
28150 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
28151 fnvoid_va_start_sysv =
28152 build_varargs_function_type_list (void_type_node, sysv_va_ref,
28153 NULL_TREE);
28154 fnvoid_va_copy_ms =
28155 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
28156 NULL_TREE);
28157 fnvoid_va_copy_sysv =
28158 build_function_type_list (void_type_node, sysv_va_ref,
28159 sysv_va_ref, NULL_TREE);
28160
28161 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
28162 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
28163 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
28164 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
28165 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
28166 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
28167 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
28168 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28169 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
28170 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28171 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
28172 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28173 }
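
/* Minimal sketch of how the ms_abi flavour is used from 64-bit code
   (ms_vsum is a made-up name):

     __attribute__ ((ms_abi)) int
     ms_vsum (int n, ...)
     {
       __builtin_ms_va_list ap;
       int i, s = 0;
       __builtin_ms_va_start (ap, n);
       for (i = 0; i < n; i++)
         s += __builtin_va_arg (ap, int);
       __builtin_ms_va_end (ap);
       return s;
     }  */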
28174
28175 static void
28176 ix86_init_builtin_types (void)
28177 {
28178 tree float128_type_node, float80_type_node;
28179
28180 /* The __float80 type. */
28181 float80_type_node = long_double_type_node;
28182 if (TYPE_MODE (float80_type_node) != XFmode)
28183 {
28184 /* The __float80 type. */
28185 float80_type_node = make_node (REAL_TYPE);
28186
28187 TYPE_PRECISION (float80_type_node) = 80;
28188 layout_type (float80_type_node);
28189 }
28190 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
28191
28192 /* The __float128 type. */
28193 float128_type_node = make_node (REAL_TYPE);
28194 TYPE_PRECISION (float128_type_node) = 128;
28195 layout_type (float128_type_node);
28196 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
28197
28198 /* This macro is built by i386-builtin-types.awk. */
28199 DEFINE_BUILTIN_PRIMITIVE_TYPES;
28200 }
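
/* Once registered these type names are usable directly from C, e.g.
   (illustrative):

     __float80  e = 1.0w;
     __float128 q = 2.0q;

   where the 'w' and 'q' suffixes give constants of the matching type.  */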
28201
28202 static void
28203 ix86_init_builtins (void)
28204 {
28205 tree t;
28206
28207 ix86_init_builtin_types ();
28208
28209 /* Builtins to get CPU type and features. */
28210 ix86_init_platform_type_builtins ();
28211
28212 /* TFmode support builtins. */
28213 def_builtin_const (0, "__builtin_infq",
28214 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
28215 def_builtin_const (0, "__builtin_huge_valq",
28216 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
28217
28218 /* We will expand them to a normal call if SSE isn't available, since
28219 they are used by libgcc.  */
28220 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
28221 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
28222 BUILT_IN_MD, "__fabstf2", NULL_TREE);
28223 TREE_READONLY (t) = 1;
28224 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
28225
28226 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
28227 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
28228 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
28229 TREE_READONLY (t) = 1;
28230 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
28231
28232 ix86_init_tm_builtins ();
28233 ix86_init_mmx_sse_builtins ();
28234
28235 if (TARGET_LP64)
28236 ix86_init_builtins_va_builtins_abi ();
28237
28238 #ifdef SUBTARGET_INIT_BUILTINS
28239 SUBTARGET_INIT_BUILTINS;
28240 #endif
28241 }
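
/* The TFmode builtins above are what user code such as

     __float128 a = __builtin_fabsq (b);
     __float128 c = __builtin_copysignq (a, d);

   resolves to; per the comment above, without SSE these simply become
   calls to __fabstf2 and __copysigntf3 in libgcc.  */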
28242
28243 /* Return the ix86 builtin for CODE. */
28244
28245 static tree
28246 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
28247 {
28248 if (code >= IX86_BUILTIN_MAX)
28249 return error_mark_node;
28250
28251 return ix86_builtins[code];
28252 }
28253
28254 /* Errors in the source file can cause expand_expr to return const0_rtx
28255 where we expect a vector. To avoid crashing, use one of the vector
28256 clear instructions. */
28257 static rtx
28258 safe_vector_operand (rtx x, enum machine_mode mode)
28259 {
28260 if (x == const0_rtx)
28261 x = CONST0_RTX (mode);
28262 return x;
28263 }
28264
28265 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
28266
28267 static rtx
28268 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
28269 {
28270 rtx pat;
28271 tree arg0 = CALL_EXPR_ARG (exp, 0);
28272 tree arg1 = CALL_EXPR_ARG (exp, 1);
28273 rtx op0 = expand_normal (arg0);
28274 rtx op1 = expand_normal (arg1);
28275 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28276 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
28277 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
28278
28279 if (VECTOR_MODE_P (mode0))
28280 op0 = safe_vector_operand (op0, mode0);
28281 if (VECTOR_MODE_P (mode1))
28282 op1 = safe_vector_operand (op1, mode1);
28283
28284 if (optimize || !target
28285 || GET_MODE (target) != tmode
28286 || !insn_data[icode].operand[0].predicate (target, tmode))
28287 target = gen_reg_rtx (tmode);
28288
28289 if (GET_MODE (op1) == SImode && mode1 == TImode)
28290 {
28291 rtx x = gen_reg_rtx (V4SImode);
28292 emit_insn (gen_sse2_loadd (x, op1));
28293 op1 = gen_lowpart (TImode, x);
28294 }
28295
28296 if (!insn_data[icode].operand[1].predicate (op0, mode0))
28297 op0 = copy_to_mode_reg (mode0, op0);
28298 if (!insn_data[icode].operand[2].predicate (op1, mode1))
28299 op1 = copy_to_mode_reg (mode1, op1);
28300
28301 pat = GEN_FCN (icode) (target, op0, op1);
28302 if (! pat)
28303 return 0;
28304
28305 emit_insn (pat);
28306
28307 return target;
28308 }
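
/* Illustrative walk-through: a two-operand builtin such as
   __builtin_ia32_paddw128 (V8HI_FTYPE_V8HI_V8HI) reaches this expander
   from ix86_expand_args_builtin; both vector arguments are forced into
   registers whenever the insn's operand predicates require it, and
   GEN_FCN (icode) then emits the single add pattern.  */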
28309
28310 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
28311
28312 static rtx
28313 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
28314 enum ix86_builtin_func_type m_type,
28315 enum rtx_code sub_code)
28316 {
28317 rtx pat;
28318 int i;
28319 int nargs;
28320 bool comparison_p = false;
28321 bool tf_p = false;
28322 bool last_arg_constant = false;
28323 int num_memory = 0;
28324 struct {
28325 rtx op;
28326 enum machine_mode mode;
28327 } args[4];
28328
28329 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28330
28331 switch (m_type)
28332 {
28333 case MULTI_ARG_4_DF2_DI_I:
28334 case MULTI_ARG_4_DF2_DI_I1:
28335 case MULTI_ARG_4_SF2_SI_I:
28336 case MULTI_ARG_4_SF2_SI_I1:
28337 nargs = 4;
28338 last_arg_constant = true;
28339 break;
28340
28341 case MULTI_ARG_3_SF:
28342 case MULTI_ARG_3_DF:
28343 case MULTI_ARG_3_SF2:
28344 case MULTI_ARG_3_DF2:
28345 case MULTI_ARG_3_DI:
28346 case MULTI_ARG_3_SI:
28347 case MULTI_ARG_3_SI_DI:
28348 case MULTI_ARG_3_HI:
28349 case MULTI_ARG_3_HI_SI:
28350 case MULTI_ARG_3_QI:
28351 case MULTI_ARG_3_DI2:
28352 case MULTI_ARG_3_SI2:
28353 case MULTI_ARG_3_HI2:
28354 case MULTI_ARG_3_QI2:
28355 nargs = 3;
28356 break;
28357
28358 case MULTI_ARG_2_SF:
28359 case MULTI_ARG_2_DF:
28360 case MULTI_ARG_2_DI:
28361 case MULTI_ARG_2_SI:
28362 case MULTI_ARG_2_HI:
28363 case MULTI_ARG_2_QI:
28364 nargs = 2;
28365 break;
28366
28367 case MULTI_ARG_2_DI_IMM:
28368 case MULTI_ARG_2_SI_IMM:
28369 case MULTI_ARG_2_HI_IMM:
28370 case MULTI_ARG_2_QI_IMM:
28371 nargs = 2;
28372 last_arg_constant = true;
28373 break;
28374
28375 case MULTI_ARG_1_SF:
28376 case MULTI_ARG_1_DF:
28377 case MULTI_ARG_1_SF2:
28378 case MULTI_ARG_1_DF2:
28379 case MULTI_ARG_1_DI:
28380 case MULTI_ARG_1_SI:
28381 case MULTI_ARG_1_HI:
28382 case MULTI_ARG_1_QI:
28383 case MULTI_ARG_1_SI_DI:
28384 case MULTI_ARG_1_HI_DI:
28385 case MULTI_ARG_1_HI_SI:
28386 case MULTI_ARG_1_QI_DI:
28387 case MULTI_ARG_1_QI_SI:
28388 case MULTI_ARG_1_QI_HI:
28389 nargs = 1;
28390 break;
28391
28392 case MULTI_ARG_2_DI_CMP:
28393 case MULTI_ARG_2_SI_CMP:
28394 case MULTI_ARG_2_HI_CMP:
28395 case MULTI_ARG_2_QI_CMP:
28396 nargs = 2;
28397 comparison_p = true;
28398 break;
28399
28400 case MULTI_ARG_2_SF_TF:
28401 case MULTI_ARG_2_DF_TF:
28402 case MULTI_ARG_2_DI_TF:
28403 case MULTI_ARG_2_SI_TF:
28404 case MULTI_ARG_2_HI_TF:
28405 case MULTI_ARG_2_QI_TF:
28406 nargs = 2;
28407 tf_p = true;
28408 break;
28409
28410 default:
28411 gcc_unreachable ();
28412 }
28413
28414 if (optimize || !target
28415 || GET_MODE (target) != tmode
28416 || !insn_data[icode].operand[0].predicate (target, tmode))
28417 target = gen_reg_rtx (tmode);
28418
28419 gcc_assert (nargs <= 4);
28420
28421 for (i = 0; i < nargs; i++)
28422 {
28423 tree arg = CALL_EXPR_ARG (exp, i);
28424 rtx op = expand_normal (arg);
28425 int adjust = (comparison_p) ? 1 : 0;
28426 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
28427
28428 if (last_arg_constant && i == nargs - 1)
28429 {
28430 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
28431 {
28432 enum insn_code new_icode = icode;
28433 switch (icode)
28434 {
28435 case CODE_FOR_xop_vpermil2v2df3:
28436 case CODE_FOR_xop_vpermil2v4sf3:
28437 case CODE_FOR_xop_vpermil2v4df3:
28438 case CODE_FOR_xop_vpermil2v8sf3:
28439 error ("the last argument must be a 2-bit immediate");
28440 return gen_reg_rtx (tmode);
28441 case CODE_FOR_xop_rotlv2di3:
28442 new_icode = CODE_FOR_rotlv2di3;
28443 goto xop_rotl;
28444 case CODE_FOR_xop_rotlv4si3:
28445 new_icode = CODE_FOR_rotlv4si3;
28446 goto xop_rotl;
28447 case CODE_FOR_xop_rotlv8hi3:
28448 new_icode = CODE_FOR_rotlv8hi3;
28449 goto xop_rotl;
28450 case CODE_FOR_xop_rotlv16qi3:
28451 new_icode = CODE_FOR_rotlv16qi3;
28452 xop_rotl:
28453 if (CONST_INT_P (op))
28454 {
28455 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
28456 op = GEN_INT (INTVAL (op) & mask);
28457 gcc_checking_assert
28458 (insn_data[icode].operand[i + 1].predicate (op, mode));
28459 }
28460 else
28461 {
28462 gcc_checking_assert
28463 (nargs == 2
28464 && insn_data[new_icode].operand[0].mode == tmode
28465 && insn_data[new_icode].operand[1].mode == tmode
28466 && insn_data[new_icode].operand[2].mode == mode
28467 && insn_data[new_icode].operand[0].predicate
28468 == insn_data[icode].operand[0].predicate
28469 && insn_data[new_icode].operand[1].predicate
28470 == insn_data[icode].operand[1].predicate);
28471 icode = new_icode;
28472 goto non_constant;
28473 }
28474 break;
28475 default:
28476 gcc_unreachable ();
28477 }
28478 }
28479 }
28480 else
28481 {
28482 non_constant:
28483 if (VECTOR_MODE_P (mode))
28484 op = safe_vector_operand (op, mode);
28485
28486 /* If we aren't optimizing, only allow one memory operand to be
28487 generated. */
28488 if (memory_operand (op, mode))
28489 num_memory++;
28490
28491 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
28492
28493 if (optimize
28494 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
28495 || num_memory > 1)
28496 op = force_reg (mode, op);
28497 }
28498
28499 args[i].op = op;
28500 args[i].mode = mode;
28501 }
28502
28503 switch (nargs)
28504 {
28505 case 1:
28506 pat = GEN_FCN (icode) (target, args[0].op);
28507 break;
28508
28509 case 2:
28510 if (tf_p)
28511 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
28512 GEN_INT ((int)sub_code));
28513 else if (! comparison_p)
28514 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
28515 else
28516 {
28517 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
28518 args[0].op,
28519 args[1].op);
28520
28521 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
28522 }
28523 break;
28524
28525 case 3:
28526 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
28527 break;
28528
28529 case 4:
28530 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
28531 break;
28532
28533 default:
28534 gcc_unreachable ();
28535 }
28536
28537 if (! pat)
28538 return 0;
28539
28540 emit_insn (pat);
28541 return target;
28542 }
28543
28544 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
28545 insns with vec_merge. */
28546
28547 static rtx
28548 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
28549 rtx target)
28550 {
28551 rtx pat;
28552 tree arg0 = CALL_EXPR_ARG (exp, 0);
28553 rtx op1, op0 = expand_normal (arg0);
28554 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28555 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
28556
28557 if (optimize || !target
28558 || GET_MODE (target) != tmode
28559 || !insn_data[icode].operand[0].predicate (target, tmode))
28560 target = gen_reg_rtx (tmode);
28561
28562 if (VECTOR_MODE_P (mode0))
28563 op0 = safe_vector_operand (op0, mode0);
28564
28565 if ((optimize && !register_operand (op0, mode0))
28566 || !insn_data[icode].operand[1].predicate (op0, mode0))
28567 op0 = copy_to_mode_reg (mode0, op0);
28568
28569 op1 = op0;
28570 if (!insn_data[icode].operand[2].predicate (op1, mode0))
28571 op1 = copy_to_mode_reg (mode0, op1);
28572
28573 pat = GEN_FCN (icode) (target, op0, op1);
28574 if (! pat)
28575 return 0;
28576 emit_insn (pat);
28577 return target;
28578 }
28579
28580 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
28581
28582 static rtx
28583 ix86_expand_sse_compare (const struct builtin_description *d,
28584 tree exp, rtx target, bool swap)
28585 {
28586 rtx pat;
28587 tree arg0 = CALL_EXPR_ARG (exp, 0);
28588 tree arg1 = CALL_EXPR_ARG (exp, 1);
28589 rtx op0 = expand_normal (arg0);
28590 rtx op1 = expand_normal (arg1);
28591 rtx op2;
28592 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28593 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28594 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28595 enum rtx_code comparison = d->comparison;
28596
28597 if (VECTOR_MODE_P (mode0))
28598 op0 = safe_vector_operand (op0, mode0);
28599 if (VECTOR_MODE_P (mode1))
28600 op1 = safe_vector_operand (op1, mode1);
28601
28602 /* Swap operands if we have a comparison that isn't available in
28603 hardware. */
28604 if (swap)
28605 {
28606 rtx tmp = gen_reg_rtx (mode1);
28607 emit_move_insn (tmp, op1);
28608 op1 = op0;
28609 op0 = tmp;
28610 }
28611
28612 if (optimize || !target
28613 || GET_MODE (target) != tmode
28614 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28615 target = gen_reg_rtx (tmode);
28616
28617 if ((optimize && !register_operand (op0, mode0))
28618 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
28619 op0 = copy_to_mode_reg (mode0, op0);
28620 if ((optimize && !register_operand (op1, mode1))
28621 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
28622 op1 = copy_to_mode_reg (mode1, op1);
28623
28624 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
28625 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28626 if (! pat)
28627 return 0;
28628 emit_insn (pat);
28629 return target;
28630 }
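
/* Example of the SWAP case (illustrative): the SSE compare instructions
   only encode the "less-than" style predicates directly, so a builtin
   such as __builtin_ia32_cmpgtss is described with comparison LT and a
   *_SWAP function type, and the code above exchanges the two operands
   before emitting the compare.  */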
28631
28632 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
28633
28634 static rtx
28635 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
28636 rtx target)
28637 {
28638 rtx pat;
28639 tree arg0 = CALL_EXPR_ARG (exp, 0);
28640 tree arg1 = CALL_EXPR_ARG (exp, 1);
28641 rtx op0 = expand_normal (arg0);
28642 rtx op1 = expand_normal (arg1);
28643 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28644 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28645 enum rtx_code comparison = d->comparison;
28646
28647 if (VECTOR_MODE_P (mode0))
28648 op0 = safe_vector_operand (op0, mode0);
28649 if (VECTOR_MODE_P (mode1))
28650 op1 = safe_vector_operand (op1, mode1);
28651
28652 /* Swap operands if we have a comparison that isn't available in
28653 hardware. */
28654 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
28655 {
28656 rtx tmp = op1;
28657 op1 = op0;
28658 op0 = tmp;
28659 }
28660
28661 target = gen_reg_rtx (SImode);
28662 emit_move_insn (target, const0_rtx);
28663 target = gen_rtx_SUBREG (QImode, target, 0);
28664
28665 if ((optimize && !register_operand (op0, mode0))
28666 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28667 op0 = copy_to_mode_reg (mode0, op0);
28668 if ((optimize && !register_operand (op1, mode1))
28669 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28670 op1 = copy_to_mode_reg (mode1, op1);
28671
28672 pat = GEN_FCN (d->icode) (op0, op1);
28673 if (! pat)
28674 return 0;
28675 emit_insn (pat);
28676 emit_insn (gen_rtx_SET (VOIDmode,
28677 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28678 gen_rtx_fmt_ee (comparison, QImode,
28679 SET_DEST (pat),
28680 const0_rtx)));
28681
28682 return SUBREG_REG (target);
28683 }
28684
28685 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
28686
28687 static rtx
28688 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
28689 rtx target)
28690 {
28691 rtx pat;
28692 tree arg0 = CALL_EXPR_ARG (exp, 0);
28693 rtx op1, op0 = expand_normal (arg0);
28694 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28695 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28696
28697 if (optimize || target == 0
28698 || GET_MODE (target) != tmode
28699 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28700 target = gen_reg_rtx (tmode);
28701
28702 if (VECTOR_MODE_P (mode0))
28703 op0 = safe_vector_operand (op0, mode0);
28704
28705 if ((optimize && !register_operand (op0, mode0))
28706 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28707 op0 = copy_to_mode_reg (mode0, op0);
28708
28709 op1 = GEN_INT (d->comparison);
28710
28711 pat = GEN_FCN (d->icode) (target, op0, op1);
28712 if (! pat)
28713 return 0;
28714 emit_insn (pat);
28715 return target;
28716 }
28717
28718 static rtx
28719 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
28720 tree exp, rtx target)
28721 {
28722 rtx pat;
28723 tree arg0 = CALL_EXPR_ARG (exp, 0);
28724 tree arg1 = CALL_EXPR_ARG (exp, 1);
28725 rtx op0 = expand_normal (arg0);
28726 rtx op1 = expand_normal (arg1);
28727 rtx op2;
28728 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28729 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28730 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28731
28732 if (optimize || target == 0
28733 || GET_MODE (target) != tmode
28734 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28735 target = gen_reg_rtx (tmode);
28736
28737 op0 = safe_vector_operand (op0, mode0);
28738 op1 = safe_vector_operand (op1, mode1);
28739
28740 if ((optimize && !register_operand (op0, mode0))
28741 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28742 op0 = copy_to_mode_reg (mode0, op0);
28743 if ((optimize && !register_operand (op1, mode1))
28744 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28745 op1 = copy_to_mode_reg (mode1, op1);
28746
28747 op2 = GEN_INT (d->comparison);
28748
28749 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28750 if (! pat)
28751 return 0;
28752 emit_insn (pat);
28753 return target;
28754 }
28755
28756 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
28757
28758 static rtx
28759 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
28760 rtx target)
28761 {
28762 rtx pat;
28763 tree arg0 = CALL_EXPR_ARG (exp, 0);
28764 tree arg1 = CALL_EXPR_ARG (exp, 1);
28765 rtx op0 = expand_normal (arg0);
28766 rtx op1 = expand_normal (arg1);
28767 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28768 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28769 enum rtx_code comparison = d->comparison;
28770
28771 if (VECTOR_MODE_P (mode0))
28772 op0 = safe_vector_operand (op0, mode0);
28773 if (VECTOR_MODE_P (mode1))
28774 op1 = safe_vector_operand (op1, mode1);
28775
28776 target = gen_reg_rtx (SImode);
28777 emit_move_insn (target, const0_rtx);
28778 target = gen_rtx_SUBREG (QImode, target, 0);
28779
28780 if ((optimize && !register_operand (op0, mode0))
28781 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28782 op0 = copy_to_mode_reg (mode0, op0);
28783 if ((optimize && !register_operand (op1, mode1))
28784 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28785 op1 = copy_to_mode_reg (mode1, op1);
28786
28787 pat = GEN_FCN (d->icode) (op0, op1);
28788 if (! pat)
28789 return 0;
28790 emit_insn (pat);
28791 emit_insn (gen_rtx_SET (VOIDmode,
28792 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28793 gen_rtx_fmt_ee (comparison, QImode,
28794 SET_DEST (pat),
28795 const0_rtx)));
28796
28797 return SUBREG_REG (target);
28798 }
28799
28800 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
28801
28802 static rtx
28803 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
28804 tree exp, rtx target)
28805 {
28806 rtx pat;
28807 tree arg0 = CALL_EXPR_ARG (exp, 0);
28808 tree arg1 = CALL_EXPR_ARG (exp, 1);
28809 tree arg2 = CALL_EXPR_ARG (exp, 2);
28810 tree arg3 = CALL_EXPR_ARG (exp, 3);
28811 tree arg4 = CALL_EXPR_ARG (exp, 4);
28812 rtx scratch0, scratch1;
28813 rtx op0 = expand_normal (arg0);
28814 rtx op1 = expand_normal (arg1);
28815 rtx op2 = expand_normal (arg2);
28816 rtx op3 = expand_normal (arg3);
28817 rtx op4 = expand_normal (arg4);
28818 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
28819
28820 tmode0 = insn_data[d->icode].operand[0].mode;
28821 tmode1 = insn_data[d->icode].operand[1].mode;
28822 modev2 = insn_data[d->icode].operand[2].mode;
28823 modei3 = insn_data[d->icode].operand[3].mode;
28824 modev4 = insn_data[d->icode].operand[4].mode;
28825 modei5 = insn_data[d->icode].operand[5].mode;
28826 modeimm = insn_data[d->icode].operand[6].mode;
28827
28828 if (VECTOR_MODE_P (modev2))
28829 op0 = safe_vector_operand (op0, modev2);
28830 if (VECTOR_MODE_P (modev4))
28831 op2 = safe_vector_operand (op2, modev4);
28832
28833 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28834 op0 = copy_to_mode_reg (modev2, op0);
28835 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
28836 op1 = copy_to_mode_reg (modei3, op1);
28837 if ((optimize && !register_operand (op2, modev4))
28838 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
28839 op2 = copy_to_mode_reg (modev4, op2);
28840 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
28841 op3 = copy_to_mode_reg (modei5, op3);
28842
28843 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
28844 {
28845 error ("the fifth argument must be an 8-bit immediate");
28846 return const0_rtx;
28847 }
28848
28849 if (d->code == IX86_BUILTIN_PCMPESTRI128)
28850 {
28851 if (optimize || !target
28852 || GET_MODE (target) != tmode0
28853 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28854 target = gen_reg_rtx (tmode0);
28855
28856 scratch1 = gen_reg_rtx (tmode1);
28857
28858 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
28859 }
28860 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
28861 {
28862 if (optimize || !target
28863 || GET_MODE (target) != tmode1
28864 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28865 target = gen_reg_rtx (tmode1);
28866
28867 scratch0 = gen_reg_rtx (tmode0);
28868
28869 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
28870 }
28871 else
28872 {
28873 gcc_assert (d->flag);
28874
28875 scratch0 = gen_reg_rtx (tmode0);
28876 scratch1 = gen_reg_rtx (tmode1);
28877
28878 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
28879 }
28880
28881 if (! pat)
28882 return 0;
28883
28884 emit_insn (pat);
28885
28886 if (d->flag)
28887 {
28888 target = gen_reg_rtx (SImode);
28889 emit_move_insn (target, const0_rtx);
28890 target = gen_rtx_SUBREG (QImode, target, 0);
28891
28892 emit_insn
28893 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28894 gen_rtx_fmt_ee (EQ, QImode,
28895 gen_rtx_REG ((enum machine_mode) d->flag,
28896 FLAGS_REG),
28897 const0_rtx)));
28898 return SUBREG_REG (target);
28899 }
28900 else
28901 return target;
28902 }
28903
28904
28905 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
28906
28907 static rtx
28908 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
28909 tree exp, rtx target)
28910 {
28911 rtx pat;
28912 tree arg0 = CALL_EXPR_ARG (exp, 0);
28913 tree arg1 = CALL_EXPR_ARG (exp, 1);
28914 tree arg2 = CALL_EXPR_ARG (exp, 2);
28915 rtx scratch0, scratch1;
28916 rtx op0 = expand_normal (arg0);
28917 rtx op1 = expand_normal (arg1);
28918 rtx op2 = expand_normal (arg2);
28919 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
28920
28921 tmode0 = insn_data[d->icode].operand[0].mode;
28922 tmode1 = insn_data[d->icode].operand[1].mode;
28923 modev2 = insn_data[d->icode].operand[2].mode;
28924 modev3 = insn_data[d->icode].operand[3].mode;
28925 modeimm = insn_data[d->icode].operand[4].mode;
28926
28927 if (VECTOR_MODE_P (modev2))
28928 op0 = safe_vector_operand (op0, modev2);
28929 if (VECTOR_MODE_P (modev3))
28930 op1 = safe_vector_operand (op1, modev3);
28931
28932 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28933 op0 = copy_to_mode_reg (modev2, op0);
28934 if ((optimize && !register_operand (op1, modev3))
28935 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
28936 op1 = copy_to_mode_reg (modev3, op1);
28937
28938 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
28939 {
28940 error ("the third argument must be an 8-bit immediate");
28941 return const0_rtx;
28942 }
28943
28944 if (d->code == IX86_BUILTIN_PCMPISTRI128)
28945 {
28946 if (optimize || !target
28947 || GET_MODE (target) != tmode0
28948 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28949 target = gen_reg_rtx (tmode0);
28950
28951 scratch1 = gen_reg_rtx (tmode1);
28952
28953 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
28954 }
28955 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
28956 {
28957 if (optimize || !target
28958 || GET_MODE (target) != tmode1
28959 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28960 target = gen_reg_rtx (tmode1);
28961
28962 scratch0 = gen_reg_rtx (tmode0);
28963
28964 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
28965 }
28966 else
28967 {
28968 gcc_assert (d->flag);
28969
28970 scratch0 = gen_reg_rtx (tmode0);
28971 scratch1 = gen_reg_rtx (tmode1);
28972
28973 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
28974 }
28975
28976 if (! pat)
28977 return 0;
28978
28979 emit_insn (pat);
28980
28981 if (d->flag)
28982 {
28983 target = gen_reg_rtx (SImode);
28984 emit_move_insn (target, const0_rtx);
28985 target = gen_rtx_SUBREG (QImode, target, 0);
28986
28987 emit_insn
28988 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28989 gen_rtx_fmt_ee (EQ, QImode,
28990 gen_rtx_REG ((enum machine_mode) d->flag,
28991 FLAGS_REG),
28992 const0_rtx)));
28993 return SUBREG_REG (target);
28994 }
28995 else
28996 return target;
28997 }
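
/* Illustrative use of the pcmpistr expanders (the control byte must be a
   literal constant; first_match and the 0x0c value are just an example):

     typedef char v16qi __attribute__ ((vector_size (16)));
     int
     first_match (v16qi a, v16qi b)
     {
       return __builtin_ia32_pcmpistri128 (a, b, 0x0c);
     }

   A non-constant last argument is rejected with the error above instead
   of being forced into a register.  */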
28998
28999 /* Subroutine of ix86_expand_builtin to take care of insns with
29000 variable number of operands. */
29001
29002 static rtx
29003 ix86_expand_args_builtin (const struct builtin_description *d,
29004 tree exp, rtx target)
29005 {
29006 rtx pat, real_target;
29007 unsigned int i, nargs;
29008 unsigned int nargs_constant = 0;
29009 int num_memory = 0;
29010 struct
29011 {
29012 rtx op;
29013 enum machine_mode mode;
29014 } args[4];
29015 bool last_arg_count = false;
29016 enum insn_code icode = d->icode;
29017 const struct insn_data_d *insn_p = &insn_data[icode];
29018 enum machine_mode tmode = insn_p->operand[0].mode;
29019 enum machine_mode rmode = VOIDmode;
29020 bool swap = false;
29021 enum rtx_code comparison = d->comparison;
29022
29023 switch ((enum ix86_builtin_func_type) d->flag)
29024 {
29025 case V2DF_FTYPE_V2DF_ROUND:
29026 case V4DF_FTYPE_V4DF_ROUND:
29027 case V4SF_FTYPE_V4SF_ROUND:
29028 case V8SF_FTYPE_V8SF_ROUND:
29029 case V4SI_FTYPE_V4SF_ROUND:
29030 case V8SI_FTYPE_V8SF_ROUND:
29031 return ix86_expand_sse_round (d, exp, target);
29032 case V4SI_FTYPE_V2DF_V2DF_ROUND:
29033 case V8SI_FTYPE_V4DF_V4DF_ROUND:
29034 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
29035 case INT_FTYPE_V8SF_V8SF_PTEST:
29036 case INT_FTYPE_V4DI_V4DI_PTEST:
29037 case INT_FTYPE_V4DF_V4DF_PTEST:
29038 case INT_FTYPE_V4SF_V4SF_PTEST:
29039 case INT_FTYPE_V2DI_V2DI_PTEST:
29040 case INT_FTYPE_V2DF_V2DF_PTEST:
29041 return ix86_expand_sse_ptest (d, exp, target);
29042 case FLOAT128_FTYPE_FLOAT128:
29043 case FLOAT_FTYPE_FLOAT:
29044 case INT_FTYPE_INT:
29045 case UINT64_FTYPE_INT:
29046 case UINT16_FTYPE_UINT16:
29047 case INT64_FTYPE_INT64:
29048 case INT64_FTYPE_V4SF:
29049 case INT64_FTYPE_V2DF:
29050 case INT_FTYPE_V16QI:
29051 case INT_FTYPE_V8QI:
29052 case INT_FTYPE_V8SF:
29053 case INT_FTYPE_V4DF:
29054 case INT_FTYPE_V4SF:
29055 case INT_FTYPE_V2DF:
29056 case INT_FTYPE_V32QI:
29057 case V16QI_FTYPE_V16QI:
29058 case V8SI_FTYPE_V8SF:
29059 case V8SI_FTYPE_V4SI:
29060 case V8HI_FTYPE_V8HI:
29061 case V8HI_FTYPE_V16QI:
29062 case V8QI_FTYPE_V8QI:
29063 case V8SF_FTYPE_V8SF:
29064 case V8SF_FTYPE_V8SI:
29065 case V8SF_FTYPE_V4SF:
29066 case V8SF_FTYPE_V8HI:
29067 case V4SI_FTYPE_V4SI:
29068 case V4SI_FTYPE_V16QI:
29069 case V4SI_FTYPE_V4SF:
29070 case V4SI_FTYPE_V8SI:
29071 case V4SI_FTYPE_V8HI:
29072 case V4SI_FTYPE_V4DF:
29073 case V4SI_FTYPE_V2DF:
29074 case V4HI_FTYPE_V4HI:
29075 case V4DF_FTYPE_V4DF:
29076 case V4DF_FTYPE_V4SI:
29077 case V4DF_FTYPE_V4SF:
29078 case V4DF_FTYPE_V2DF:
29079 case V4SF_FTYPE_V4SF:
29080 case V4SF_FTYPE_V4SI:
29081 case V4SF_FTYPE_V8SF:
29082 case V4SF_FTYPE_V4DF:
29083 case V4SF_FTYPE_V8HI:
29084 case V4SF_FTYPE_V2DF:
29085 case V2DI_FTYPE_V2DI:
29086 case V2DI_FTYPE_V16QI:
29087 case V2DI_FTYPE_V8HI:
29088 case V2DI_FTYPE_V4SI:
29089 case V2DF_FTYPE_V2DF:
29090 case V2DF_FTYPE_V4SI:
29091 case V2DF_FTYPE_V4DF:
29092 case V2DF_FTYPE_V4SF:
29093 case V2DF_FTYPE_V2SI:
29094 case V2SI_FTYPE_V2SI:
29095 case V2SI_FTYPE_V4SF:
29096 case V2SI_FTYPE_V2SF:
29097 case V2SI_FTYPE_V2DF:
29098 case V2SF_FTYPE_V2SF:
29099 case V2SF_FTYPE_V2SI:
29100 case V32QI_FTYPE_V32QI:
29101 case V32QI_FTYPE_V16QI:
29102 case V16HI_FTYPE_V16HI:
29103 case V16HI_FTYPE_V8HI:
29104 case V8SI_FTYPE_V8SI:
29105 case V16HI_FTYPE_V16QI:
29106 case V8SI_FTYPE_V16QI:
29107 case V4DI_FTYPE_V16QI:
29108 case V8SI_FTYPE_V8HI:
29109 case V4DI_FTYPE_V8HI:
29110 case V4DI_FTYPE_V4SI:
29111 case V4DI_FTYPE_V2DI:
29112 nargs = 1;
29113 break;
29114 case V4SF_FTYPE_V4SF_VEC_MERGE:
29115 case V2DF_FTYPE_V2DF_VEC_MERGE:
29116 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
29117 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
29118 case V16QI_FTYPE_V16QI_V16QI:
29119 case V16QI_FTYPE_V8HI_V8HI:
29120 case V8QI_FTYPE_V8QI_V8QI:
29121 case V8QI_FTYPE_V4HI_V4HI:
29122 case V8HI_FTYPE_V8HI_V8HI:
29123 case V8HI_FTYPE_V16QI_V16QI:
29124 case V8HI_FTYPE_V4SI_V4SI:
29125 case V8SF_FTYPE_V8SF_V8SF:
29126 case V8SF_FTYPE_V8SF_V8SI:
29127 case V4SI_FTYPE_V4SI_V4SI:
29128 case V4SI_FTYPE_V8HI_V8HI:
29129 case V4SI_FTYPE_V4SF_V4SF:
29130 case V4SI_FTYPE_V2DF_V2DF:
29131 case V4HI_FTYPE_V4HI_V4HI:
29132 case V4HI_FTYPE_V8QI_V8QI:
29133 case V4HI_FTYPE_V2SI_V2SI:
29134 case V4DF_FTYPE_V4DF_V4DF:
29135 case V4DF_FTYPE_V4DF_V4DI:
29136 case V4SF_FTYPE_V4SF_V4SF:
29137 case V4SF_FTYPE_V4SF_V4SI:
29138 case V4SF_FTYPE_V4SF_V2SI:
29139 case V4SF_FTYPE_V4SF_V2DF:
29140 case V4SF_FTYPE_V4SF_DI:
29141 case V4SF_FTYPE_V4SF_SI:
29142 case V2DI_FTYPE_V2DI_V2DI:
29143 case V2DI_FTYPE_V16QI_V16QI:
29144 case V2DI_FTYPE_V4SI_V4SI:
29145 case V2DI_FTYPE_V2DI_V16QI:
29146 case V2DI_FTYPE_V2DF_V2DF:
29147 case V2SI_FTYPE_V2SI_V2SI:
29148 case V2SI_FTYPE_V4HI_V4HI:
29149 case V2SI_FTYPE_V2SF_V2SF:
29150 case V2DF_FTYPE_V2DF_V2DF:
29151 case V2DF_FTYPE_V2DF_V4SF:
29152 case V2DF_FTYPE_V2DF_V2DI:
29153 case V2DF_FTYPE_V2DF_DI:
29154 case V2DF_FTYPE_V2DF_SI:
29155 case V2SF_FTYPE_V2SF_V2SF:
29156 case V1DI_FTYPE_V1DI_V1DI:
29157 case V1DI_FTYPE_V8QI_V8QI:
29158 case V1DI_FTYPE_V2SI_V2SI:
29159 case V32QI_FTYPE_V16HI_V16HI:
29160 case V16HI_FTYPE_V8SI_V8SI:
29161 case V32QI_FTYPE_V32QI_V32QI:
29162 case V16HI_FTYPE_V32QI_V32QI:
29163 case V16HI_FTYPE_V16HI_V16HI:
29164 case V8SI_FTYPE_V4DF_V4DF:
29165 case V8SI_FTYPE_V8SI_V8SI:
29166 case V8SI_FTYPE_V16HI_V16HI:
29167 case V4DI_FTYPE_V4DI_V4DI:
29168 case V4DI_FTYPE_V8SI_V8SI:
29169 if (comparison == UNKNOWN)
29170 return ix86_expand_binop_builtin (icode, exp, target);
29171 nargs = 2;
29172 break;
29173 case V4SF_FTYPE_V4SF_V4SF_SWAP:
29174 case V2DF_FTYPE_V2DF_V2DF_SWAP:
29175 gcc_assert (comparison != UNKNOWN);
29176 nargs = 2;
29177 swap = true;
29178 break;
29179 case V16HI_FTYPE_V16HI_V8HI_COUNT:
29180 case V16HI_FTYPE_V16HI_SI_COUNT:
29181 case V8SI_FTYPE_V8SI_V4SI_COUNT:
29182 case V8SI_FTYPE_V8SI_SI_COUNT:
29183 case V4DI_FTYPE_V4DI_V2DI_COUNT:
29184 case V4DI_FTYPE_V4DI_INT_COUNT:
29185 case V8HI_FTYPE_V8HI_V8HI_COUNT:
29186 case V8HI_FTYPE_V8HI_SI_COUNT:
29187 case V4SI_FTYPE_V4SI_V4SI_COUNT:
29188 case V4SI_FTYPE_V4SI_SI_COUNT:
29189 case V4HI_FTYPE_V4HI_V4HI_COUNT:
29190 case V4HI_FTYPE_V4HI_SI_COUNT:
29191 case V2DI_FTYPE_V2DI_V2DI_COUNT:
29192 case V2DI_FTYPE_V2DI_SI_COUNT:
29193 case V2SI_FTYPE_V2SI_V2SI_COUNT:
29194 case V2SI_FTYPE_V2SI_SI_COUNT:
29195 case V1DI_FTYPE_V1DI_V1DI_COUNT:
29196 case V1DI_FTYPE_V1DI_SI_COUNT:
29197 nargs = 2;
29198 last_arg_count = true;
29199 break;
29200 case UINT64_FTYPE_UINT64_UINT64:
29201 case UINT_FTYPE_UINT_UINT:
29202 case UINT_FTYPE_UINT_USHORT:
29203 case UINT_FTYPE_UINT_UCHAR:
29204 case UINT16_FTYPE_UINT16_INT:
29205 case UINT8_FTYPE_UINT8_INT:
29206 nargs = 2;
29207 break;
29208 case V2DI_FTYPE_V2DI_INT_CONVERT:
29209 nargs = 2;
29210 rmode = V1TImode;
29211 nargs_constant = 1;
29212 break;
29213 case V4DI_FTYPE_V4DI_INT_CONVERT:
29214 nargs = 2;
29215 rmode = V2TImode;
29216 nargs_constant = 1;
29217 break;
29218 case V8HI_FTYPE_V8HI_INT:
29219 case V8HI_FTYPE_V8SF_INT:
29220 case V8HI_FTYPE_V4SF_INT:
29221 case V8SF_FTYPE_V8SF_INT:
29222 case V4SI_FTYPE_V4SI_INT:
29223 case V4SI_FTYPE_V8SI_INT:
29224 case V4HI_FTYPE_V4HI_INT:
29225 case V4DF_FTYPE_V4DF_INT:
29226 case V4SF_FTYPE_V4SF_INT:
29227 case V4SF_FTYPE_V8SF_INT:
29228 case V2DI_FTYPE_V2DI_INT:
29229 case V2DF_FTYPE_V2DF_INT:
29230 case V2DF_FTYPE_V4DF_INT:
29231 case V16HI_FTYPE_V16HI_INT:
29232 case V8SI_FTYPE_V8SI_INT:
29233 case V4DI_FTYPE_V4DI_INT:
29234 case V2DI_FTYPE_V4DI_INT:
29235 nargs = 2;
29236 nargs_constant = 1;
29237 break;
29238 case V16QI_FTYPE_V16QI_V16QI_V16QI:
29239 case V8SF_FTYPE_V8SF_V8SF_V8SF:
29240 case V4DF_FTYPE_V4DF_V4DF_V4DF:
29241 case V4SF_FTYPE_V4SF_V4SF_V4SF:
29242 case V2DF_FTYPE_V2DF_V2DF_V2DF:
29243 case V32QI_FTYPE_V32QI_V32QI_V32QI:
29244 nargs = 3;
29245 break;
29246 case V32QI_FTYPE_V32QI_V32QI_INT:
29247 case V16HI_FTYPE_V16HI_V16HI_INT:
29248 case V16QI_FTYPE_V16QI_V16QI_INT:
29249 case V4DI_FTYPE_V4DI_V4DI_INT:
29250 case V8HI_FTYPE_V8HI_V8HI_INT:
29251 case V8SI_FTYPE_V8SI_V8SI_INT:
29252 case V8SI_FTYPE_V8SI_V4SI_INT:
29253 case V8SF_FTYPE_V8SF_V8SF_INT:
29254 case V8SF_FTYPE_V8SF_V4SF_INT:
29255 case V4SI_FTYPE_V4SI_V4SI_INT:
29256 case V4DF_FTYPE_V4DF_V4DF_INT:
29257 case V4DF_FTYPE_V4DF_V2DF_INT:
29258 case V4SF_FTYPE_V4SF_V4SF_INT:
29259 case V2DI_FTYPE_V2DI_V2DI_INT:
29260 case V4DI_FTYPE_V4DI_V2DI_INT:
29261 case V2DF_FTYPE_V2DF_V2DF_INT:
29262 nargs = 3;
29263 nargs_constant = 1;
29264 break;
29265 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
29266 nargs = 3;
29267 rmode = V4DImode;
29268 nargs_constant = 1;
29269 break;
29270 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
29271 nargs = 3;
29272 rmode = V2DImode;
29273 nargs_constant = 1;
29274 break;
29275 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
29276 nargs = 3;
29277 rmode = DImode;
29278 nargs_constant = 1;
29279 break;
29280 case V2DI_FTYPE_V2DI_UINT_UINT:
29281 nargs = 3;
29282 nargs_constant = 2;
29283 break;
29284 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
29285 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
29286 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
29287 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
29288 nargs = 4;
29289 nargs_constant = 1;
29290 break;
29291 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
29292 nargs = 4;
29293 nargs_constant = 2;
29294 break;
29295 default:
29296 gcc_unreachable ();
29297 }
29298
29299 gcc_assert (nargs <= ARRAY_SIZE (args));
29300
29301 if (comparison != UNKNOWN)
29302 {
29303 gcc_assert (nargs == 2);
29304 return ix86_expand_sse_compare (d, exp, target, swap);
29305 }
29306
29307 if (rmode == VOIDmode || rmode == tmode)
29308 {
29309 if (optimize
29310 || target == 0
29311 || GET_MODE (target) != tmode
29312 || !insn_p->operand[0].predicate (target, tmode))
29313 target = gen_reg_rtx (tmode);
29314 real_target = target;
29315 }
29316 else
29317 {
29318 target = gen_reg_rtx (rmode);
29319 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
29320 }
29321
29322 for (i = 0; i < nargs; i++)
29323 {
29324 tree arg = CALL_EXPR_ARG (exp, i);
29325 rtx op = expand_normal (arg);
29326 enum machine_mode mode = insn_p->operand[i + 1].mode;
29327 bool match = insn_p->operand[i + 1].predicate (op, mode);
29328
29329 if (last_arg_count && (i + 1) == nargs)
29330 {
29331 /* SIMD shift insns take either an 8-bit immediate or a
29332 register as the count, but the builtin functions take an int.
29333 If the count operand doesn't match, put it in a register. */
29334 if (!match)
29335 {
29336 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
29337 if (!insn_p->operand[i + 1].predicate (op, mode))
29338 op = copy_to_reg (op);
29339 }
29340 }
29341 else if ((nargs - i) <= nargs_constant)
29342 {
29343 if (!match)
29344 switch (icode)
29345 {
29346 case CODE_FOR_avx2_inserti128:
29347 case CODE_FOR_avx2_extracti128:
29348 error ("the last argument must be an 1-bit immediate");
29349 return const0_rtx;
29350
29351 case CODE_FOR_sse4_1_roundsd:
29352 case CODE_FOR_sse4_1_roundss:
29353
29354 case CODE_FOR_sse4_1_roundpd:
29355 case CODE_FOR_sse4_1_roundps:
29356 case CODE_FOR_avx_roundpd256:
29357 case CODE_FOR_avx_roundps256:
29358
29359 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
29360 case CODE_FOR_sse4_1_roundps_sfix:
29361 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
29362 case CODE_FOR_avx_roundps_sfix256:
29363
29364 case CODE_FOR_sse4_1_blendps:
29365 case CODE_FOR_avx_blendpd256:
29366 case CODE_FOR_avx_vpermilv4df:
29367 error ("the last argument must be a 4-bit immediate");
29368 return const0_rtx;
29369
29370 case CODE_FOR_sse4_1_blendpd:
29371 case CODE_FOR_avx_vpermilv2df:
29372 case CODE_FOR_xop_vpermil2v2df3:
29373 case CODE_FOR_xop_vpermil2v4sf3:
29374 case CODE_FOR_xop_vpermil2v4df3:
29375 case CODE_FOR_xop_vpermil2v8sf3:
29376 error ("the last argument must be a 2-bit immediate");
29377 return const0_rtx;
29378
29379 case CODE_FOR_avx_vextractf128v4df:
29380 case CODE_FOR_avx_vextractf128v8sf:
29381 case CODE_FOR_avx_vextractf128v8si:
29382 case CODE_FOR_avx_vinsertf128v4df:
29383 case CODE_FOR_avx_vinsertf128v8sf:
29384 case CODE_FOR_avx_vinsertf128v8si:
29385 error ("the last argument must be a 1-bit immediate");
29386 return const0_rtx;
29387
29388 case CODE_FOR_avx_vmcmpv2df3:
29389 case CODE_FOR_avx_vmcmpv4sf3:
29390 case CODE_FOR_avx_cmpv2df3:
29391 case CODE_FOR_avx_cmpv4sf3:
29392 case CODE_FOR_avx_cmpv4df3:
29393 case CODE_FOR_avx_cmpv8sf3:
29394 error ("the last argument must be a 5-bit immediate");
29395 return const0_rtx;
29396
29397 default:
29398 switch (nargs_constant)
29399 {
29400 case 2:
29401 if ((nargs - i) == nargs_constant)
29402 {
29403 error ("the next to last argument must be an 8-bit immediate");
29404 break;
29405 }
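/* FALLTHRU */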
29406 case 1:
29407 error ("the last argument must be an 8-bit immediate");
29408 break;
29409 default:
29410 gcc_unreachable ();
29411 }
29412 return const0_rtx;
29413 }
29414 }
29415 else
29416 {
29417 if (VECTOR_MODE_P (mode))
29418 op = safe_vector_operand (op, mode);
29419
29420 /* If we aren't optimizing, only allow one memory operand to
29421 be generated. */
29422 if (memory_operand (op, mode))
29423 num_memory++;
29424
29425 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
29426 {
29427 if (optimize || !match || num_memory > 1)
29428 op = copy_to_mode_reg (mode, op);
29429 }
29430 else
29431 {
29432 op = copy_to_reg (op);
29433 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
29434 }
29435 }
29436
29437 args[i].op = op;
29438 args[i].mode = mode;
29439 }
29440
29441 switch (nargs)
29442 {
29443 case 1:
29444 pat = GEN_FCN (icode) (real_target, args[0].op);
29445 break;
29446 case 2:
29447 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
29448 break;
29449 case 3:
29450 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
29451 args[2].op);
29452 break;
29453 case 4:
29454 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
29455 args[2].op, args[3].op);
29456 break;
29457 default:
29458 gcc_unreachable ();
29459 }
29460
29461 if (! pat)
29462 return 0;
29463
29464 emit_insn (pat);
29465 return target;
29466 }
29467
29468 /* Subroutine of ix86_expand_builtin to take care of special insns
29469 with variable number of operands. */
29470
29471 static rtx
29472 ix86_expand_special_args_builtin (const struct builtin_description *d,
29473 tree exp, rtx target)
29474 {
29475 tree arg;
29476 rtx pat, op;
29477 unsigned int i, nargs, arg_adjust, memory;
29478 struct
29479 {
29480 rtx op;
29481 enum machine_mode mode;
29482 } args[3];
29483 enum insn_code icode = d->icode;
29484 bool last_arg_constant = false;
29485 const struct insn_data_d *insn_p = &insn_data[icode];
29486 enum machine_mode tmode = insn_p->operand[0].mode;
29487 enum { load, store } klass;
29488
29489 switch ((enum ix86_builtin_func_type) d->flag)
29490 {
29491 case VOID_FTYPE_VOID:
29492 if (icode == CODE_FOR_avx_vzeroupper)
29493 target = GEN_INT (vzeroupper_intrinsic);
29494 emit_insn (GEN_FCN (icode) (target));
29495 return 0;
29496 case VOID_FTYPE_UINT64:
29497 case VOID_FTYPE_UNSIGNED:
29498 nargs = 0;
29499 klass = store;
29500 memory = 0;
29501 break;
29502
29503 case INT_FTYPE_VOID:
29504 case UINT64_FTYPE_VOID:
29505 case UNSIGNED_FTYPE_VOID:
29506 nargs = 0;
29507 klass = load;
29508 memory = 0;
29509 break;
29510 case UINT64_FTYPE_PUNSIGNED:
29511 case V2DI_FTYPE_PV2DI:
29512 case V4DI_FTYPE_PV4DI:
29513 case V32QI_FTYPE_PCCHAR:
29514 case V16QI_FTYPE_PCCHAR:
29515 case V8SF_FTYPE_PCV4SF:
29516 case V8SF_FTYPE_PCFLOAT:
29517 case V4SF_FTYPE_PCFLOAT:
29518 case V4DF_FTYPE_PCV2DF:
29519 case V4DF_FTYPE_PCDOUBLE:
29520 case V2DF_FTYPE_PCDOUBLE:
29521 case VOID_FTYPE_PVOID:
29522 nargs = 1;
29523 klass = load;
29524 memory = 0;
29525 break;
29526 case VOID_FTYPE_PV2SF_V4SF:
29527 case VOID_FTYPE_PV4DI_V4DI:
29528 case VOID_FTYPE_PV2DI_V2DI:
29529 case VOID_FTYPE_PCHAR_V32QI:
29530 case VOID_FTYPE_PCHAR_V16QI:
29531 case VOID_FTYPE_PFLOAT_V8SF:
29532 case VOID_FTYPE_PFLOAT_V4SF:
29533 case VOID_FTYPE_PDOUBLE_V4DF:
29534 case VOID_FTYPE_PDOUBLE_V2DF:
29535 case VOID_FTYPE_PLONGLONG_LONGLONG:
29536 case VOID_FTYPE_PULONGLONG_ULONGLONG:
29537 case VOID_FTYPE_PINT_INT:
29538 nargs = 1;
29539 klass = store;
29540 /* Reserve memory operand for target. */
29541 memory = ARRAY_SIZE (args);
29542 break;
29543 case V4SF_FTYPE_V4SF_PCV2SF:
29544 case V2DF_FTYPE_V2DF_PCDOUBLE:
29545 nargs = 2;
29546 klass = load;
29547 memory = 1;
29548 break;
29549 case V8SF_FTYPE_PCV8SF_V8SI:
29550 case V4DF_FTYPE_PCV4DF_V4DI:
29551 case V4SF_FTYPE_PCV4SF_V4SI:
29552 case V2DF_FTYPE_PCV2DF_V2DI:
29553 case V8SI_FTYPE_PCV8SI_V8SI:
29554 case V4DI_FTYPE_PCV4DI_V4DI:
29555 case V4SI_FTYPE_PCV4SI_V4SI:
29556 case V2DI_FTYPE_PCV2DI_V2DI:
29557 nargs = 2;
29558 klass = load;
29559 memory = 0;
29560 break;
29561 case VOID_FTYPE_PV8SF_V8SI_V8SF:
29562 case VOID_FTYPE_PV4DF_V4DI_V4DF:
29563 case VOID_FTYPE_PV4SF_V4SI_V4SF:
29564 case VOID_FTYPE_PV2DF_V2DI_V2DF:
29565 case VOID_FTYPE_PV8SI_V8SI_V8SI:
29566 case VOID_FTYPE_PV4DI_V4DI_V4DI:
29567 case VOID_FTYPE_PV4SI_V4SI_V4SI:
29568 case VOID_FTYPE_PV2DI_V2DI_V2DI:
29569 nargs = 2;
29570 klass = store;
29571 /* Reserve memory operand for target. */
29572 memory = ARRAY_SIZE (args);
29573 break;
29574 case VOID_FTYPE_UINT_UINT_UINT:
29575 case VOID_FTYPE_UINT64_UINT_UINT:
29576 case UCHAR_FTYPE_UINT_UINT_UINT:
29577 case UCHAR_FTYPE_UINT64_UINT_UINT:
29578 nargs = 3;
29579 klass = load;
29580 memory = ARRAY_SIZE (args);
29581 last_arg_constant = true;
29582 break;
29583 default:
29584 gcc_unreachable ();
29585 }
29586
29587 gcc_assert (nargs <= ARRAY_SIZE (args));
29588
29589 if (klass == store)
29590 {
29591 arg = CALL_EXPR_ARG (exp, 0);
29592 op = expand_normal (arg);
29593 gcc_assert (target == 0);
29594 if (memory)
29595 {
29596 if (GET_MODE (op) != Pmode)
29597 op = convert_to_mode (Pmode, op, 1);
29598 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
29599 }
29600 else
29601 target = force_reg (tmode, op);
29602 arg_adjust = 1;
29603 }
29604 else
29605 {
29606 arg_adjust = 0;
29607 if (optimize
29608 || target == 0
29609 || !register_operand (target, tmode)
29610 || GET_MODE (target) != tmode)
29611 target = gen_reg_rtx (tmode);
29612 }
29613
29614 for (i = 0; i < nargs; i++)
29615 {
29616 enum machine_mode mode = insn_p->operand[i + 1].mode;
29617 bool match;
29618
29619 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
29620 op = expand_normal (arg);
29621 match = insn_p->operand[i + 1].predicate (op, mode);
29622
29623 if (last_arg_constant && (i + 1) == nargs)
29624 {
29625 if (!match)
29626 {
29627 if (icode == CODE_FOR_lwp_lwpvalsi3
29628 || icode == CODE_FOR_lwp_lwpinssi3
29629 || icode == CODE_FOR_lwp_lwpvaldi3
29630 || icode == CODE_FOR_lwp_lwpinsdi3)
29631 error ("the last argument must be a 32-bit immediate");
29632 else
29633 error ("the last argument must be an 8-bit immediate");
29634 return const0_rtx;
29635 }
29636 }
29637 else
29638 {
29639 if (i == memory)
29640 {
29641 /* This must be the memory operand. */
29642 if (GET_MODE (op) != Pmode)
29643 op = convert_to_mode (Pmode, op, 1);
29644 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
29645 gcc_assert (GET_MODE (op) == mode
29646 || GET_MODE (op) == VOIDmode);
29647 }
29648 else
29649 {
29650 /* This must be a register. */
29651 if (VECTOR_MODE_P (mode))
29652 op = safe_vector_operand (op, mode);
29653
29654 gcc_assert (GET_MODE (op) == mode
29655 || GET_MODE (op) == VOIDmode);
29656 op = copy_to_mode_reg (mode, op);
29657 }
29658 }
29659
29660 args[i].op = op;
29661 args[i].mode = mode;
29662 }
29663
29664 switch (nargs)
29665 {
29666 case 0:
29667 pat = GEN_FCN (icode) (target);
29668 break;
29669 case 1:
29670 pat = GEN_FCN (icode) (target, args[0].op);
29671 break;
29672 case 2:
29673 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
29674 break;
29675 case 3:
29676 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
29677 break;
29678 default:
29679 gcc_unreachable ();
29680 }
29681
29682 if (! pat)
29683 return 0;
29684 emit_insn (pat);
29685 return klass == store ? 0 : target;
29686 }
29687
29688 /* Return the integer constant in ARG. Constrain it to be in the range
29689 of the subparts of VEC_TYPE; issue an error if not. */
29690
29691 static int
29692 get_element_number (tree vec_type, tree arg)
29693 {
29694 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
29695
29696 if (!host_integerp (arg, 1)
29697 || (elt = tree_low_cst (arg, 1), elt > max))
29698 {
29699 error ("selector must be an integer constant in the range 0..%wi", max);
29700 return 0;
29701 }
29702
29703 return elt;
29704 }
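
/* Illustrative note (derived from the code above): for a V4SF vector
   type TYPE_VECTOR_SUBPARTS is 4, so MAX is 3 and only the integer
   constants 0..3 are accepted as selectors; anything else triggers the
   range diagnostic above and 0 is returned after the error is
   reported.  */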
29705
29706 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29707 ix86_expand_vector_init. We DO have language-level syntax for this, in
29708 the form of (type){ init-list }. Except that since we can't place emms
29709 instructions from inside the compiler, we can't allow the use of MMX
29710 registers unless the user explicitly asks for it. So we do *not* define
29711 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
29712 we have builtins invoked by mmintrin.h that give us license to emit
29713 these sorts of instructions. */
29714
29715 static rtx
29716 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
29717 {
29718 enum machine_mode tmode = TYPE_MODE (type);
29719 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
29720 int i, n_elt = GET_MODE_NUNITS (tmode);
29721 rtvec v = rtvec_alloc (n_elt);
29722
29723 gcc_assert (VECTOR_MODE_P (tmode));
29724 gcc_assert (call_expr_nargs (exp) == n_elt);
29725
29726 for (i = 0; i < n_elt; ++i)
29727 {
29728 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
29729 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
29730 }
29731
29732 if (!target || !register_operand (target, tmode))
29733 target = gen_reg_rtx (tmode);
29734
29735 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
29736 return target;
29737 }
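
/* Illustrative example: IX86_BUILTIN_VEC_INIT_V2SI (dispatched from
   ix86_expand_builtin below) reaches this function with a V2SI result
   type, so n_elt is 2; both call arguments are expanded, narrowed to
   SImode with gen_lowpart, collected into a two-element PARALLEL and
   handed to ix86_expand_vector_init, which chooses the actual insn
   sequence.  */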
29738
29739 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29740 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
29741 had a language-level syntax for referencing vector elements. */
29742
29743 static rtx
29744 ix86_expand_vec_ext_builtin (tree exp, rtx target)
29745 {
29746 enum machine_mode tmode, mode0;
29747 tree arg0, arg1;
29748 int elt;
29749 rtx op0;
29750
29751 arg0 = CALL_EXPR_ARG (exp, 0);
29752 arg1 = CALL_EXPR_ARG (exp, 1);
29753
29754 op0 = expand_normal (arg0);
29755 elt = get_element_number (TREE_TYPE (arg0), arg1);
29756
29757 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29758 mode0 = TYPE_MODE (TREE_TYPE (arg0));
29759 gcc_assert (VECTOR_MODE_P (mode0));
29760
29761 op0 = force_reg (mode0, op0);
29762
29763 if (optimize || !target || !register_operand (target, tmode))
29764 target = gen_reg_rtx (tmode);
29765
29766 ix86_expand_vector_extract (true, target, op0, elt);
29767
29768 return target;
29769 }
29770
29771 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29772 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
29773 a language-level syntax for referencing vector elements. */
29774
29775 static rtx
29776 ix86_expand_vec_set_builtin (tree exp)
29777 {
29778 enum machine_mode tmode, mode1;
29779 tree arg0, arg1, arg2;
29780 int elt;
29781 rtx op0, op1, target;
29782
29783 arg0 = CALL_EXPR_ARG (exp, 0);
29784 arg1 = CALL_EXPR_ARG (exp, 1);
29785 arg2 = CALL_EXPR_ARG (exp, 2);
29786
29787 tmode = TYPE_MODE (TREE_TYPE (arg0));
29788 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29789 gcc_assert (VECTOR_MODE_P (tmode));
29790
29791 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
29792 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
29793 elt = get_element_number (TREE_TYPE (arg0), arg2);
29794
29795 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
29796 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
29797
29798 op0 = force_reg (tmode, op0);
29799 op1 = force_reg (mode1, op1);
29800
29801 /* OP0 is the source of these builtin functions and shouldn't be
29802 modified. Create a copy, use it and return it as target. */
29803 target = gen_reg_rtx (tmode);
29804 emit_move_insn (target, op0);
29805 ix86_expand_vector_set (true, target, op1, elt);
29806
29807 return target;
29808 }
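
/* Illustrative example: IX86_BUILTIN_VEC_SET_V8HI (dispatched from
   ix86_expand_builtin below) arrives here with a V8HI first argument,
   an HImode element and a selector constrained to 0..7 by
   get_element_number; the element is converted and forced into HImode,
   and a fresh copy of the source vector is updated and returned, so
   the original operand is never modified in place.  */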
29809
29810 /* Expand an expression EXP that calls a built-in function,
29811 with result going to TARGET if that's convenient
29812 (and in mode MODE if that's convenient).
29813 SUBTARGET may be used as the target for computing one of EXP's operands.
29814 IGNORE is nonzero if the value is to be ignored. */
29815
29816 static rtx
29817 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
29818 enum machine_mode mode ATTRIBUTE_UNUSED,
29819 int ignore ATTRIBUTE_UNUSED)
29820 {
29821 const struct builtin_description *d;
29822 size_t i;
29823 enum insn_code icode;
29824 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
29825 tree arg0, arg1, arg2, arg3, arg4;
29826 rtx op0, op1, op2, op3, op4, pat;
29827 enum machine_mode mode0, mode1, mode2, mode3, mode4;
29828 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
29829
29830 /* For CPU builtins that can be folded, fold first and expand the fold. */
29831 switch (fcode)
29832 {
29833 case IX86_BUILTIN_CPU_INIT:
29834 {
29835 /* Make it call __cpu_indicator_init in libgcc. */
29836 tree call_expr, fndecl, type;
29837 type = build_function_type_list (integer_type_node, NULL_TREE);
29838 fndecl = build_fn_decl ("__cpu_indicator_init", type);
29839 call_expr = build_call_expr (fndecl, 0);
29840 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
29841 }
29842 case IX86_BUILTIN_CPU_IS:
29843 case IX86_BUILTIN_CPU_SUPPORTS:
29844 {
29845 tree arg0 = CALL_EXPR_ARG (exp, 0);
29846 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
29847 gcc_assert (fold_expr != NULL_TREE);
29848 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
29849 }
29850 }
29851
29852 /* Determine whether the builtin function is available under the current ISA.
29853 Originally the builtin was not created if it wasn't applicable to the
29854 current ISA based on the command line switches. With function specific
29855 options, we need to check in the context of the function making the call
29856 whether it is supported. */
29857 if (ix86_builtins_isa[fcode].isa
29858 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
29859 {
29860 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
29861 NULL, (enum fpmath_unit) 0, false);
29862
29863 if (!opts)
29864 error ("%qE needs unknown isa option", fndecl);
29865 else
29866 {
29867 gcc_assert (opts != NULL);
29868 error ("%qE needs isa option %s", fndecl, opts);
29869 free (opts);
29870 }
29871 return const0_rtx;
29872 }
29873
29874 switch (fcode)
29875 {
29876 case IX86_BUILTIN_MASKMOVQ:
29877 case IX86_BUILTIN_MASKMOVDQU:
29878 icode = (fcode == IX86_BUILTIN_MASKMOVQ
29879 ? CODE_FOR_mmx_maskmovq
29880 : CODE_FOR_sse2_maskmovdqu);
29881 /* Note the arg order is different from the operand order. */
29882 arg1 = CALL_EXPR_ARG (exp, 0);
29883 arg2 = CALL_EXPR_ARG (exp, 1);
29884 arg0 = CALL_EXPR_ARG (exp, 2);
29885 op0 = expand_normal (arg0);
29886 op1 = expand_normal (arg1);
29887 op2 = expand_normal (arg2);
29888 mode0 = insn_data[icode].operand[0].mode;
29889 mode1 = insn_data[icode].operand[1].mode;
29890 mode2 = insn_data[icode].operand[2].mode;
29891
29892 if (GET_MODE (op0) != Pmode)
29893 op0 = convert_to_mode (Pmode, op0, 1);
29894 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
29895
29896 if (!insn_data[icode].operand[0].predicate (op0, mode0))
29897 op0 = copy_to_mode_reg (mode0, op0);
29898 if (!insn_data[icode].operand[1].predicate (op1, mode1))
29899 op1 = copy_to_mode_reg (mode1, op1);
29900 if (!insn_data[icode].operand[2].predicate (op2, mode2))
29901 op2 = copy_to_mode_reg (mode2, op2);
29902 pat = GEN_FCN (icode) (op0, op1, op2);
29903 if (! pat)
29904 return 0;
29905 emit_insn (pat);
29906 return 0;
29907
29908 case IX86_BUILTIN_LDMXCSR:
29909 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
29910 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29911 emit_move_insn (target, op0);
29912 emit_insn (gen_sse_ldmxcsr (target));
29913 return 0;
29914
29915 case IX86_BUILTIN_STMXCSR:
29916 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29917 emit_insn (gen_sse_stmxcsr (target));
29918 return copy_to_mode_reg (SImode, target);
29919
29920 case IX86_BUILTIN_CLFLUSH:
29921 arg0 = CALL_EXPR_ARG (exp, 0);
29922 op0 = expand_normal (arg0);
29923 icode = CODE_FOR_sse2_clflush;
29924 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29925 {
29926 if (GET_MODE (op0) != Pmode)
29927 op0 = convert_to_mode (Pmode, op0, 1);
29928 op0 = force_reg (Pmode, op0);
29929 }
29930
29931 emit_insn (gen_sse2_clflush (op0));
29932 return 0;
29933
29934 case IX86_BUILTIN_MONITOR:
29935 arg0 = CALL_EXPR_ARG (exp, 0);
29936 arg1 = CALL_EXPR_ARG (exp, 1);
29937 arg2 = CALL_EXPR_ARG (exp, 2);
29938 op0 = expand_normal (arg0);
29939 op1 = expand_normal (arg1);
29940 op2 = expand_normal (arg2);
29941 if (!REG_P (op0))
29942 {
29943 if (GET_MODE (op0) != Pmode)
29944 op0 = convert_to_mode (Pmode, op0, 1);
29945 op0 = force_reg (Pmode, op0);
29946 }
29947 if (!REG_P (op1))
29948 op1 = copy_to_mode_reg (SImode, op1);
29949 if (!REG_P (op2))
29950 op2 = copy_to_mode_reg (SImode, op2);
29951 emit_insn (ix86_gen_monitor (op0, op1, op2));
29952 return 0;
29953
29954 case IX86_BUILTIN_MWAIT:
29955 arg0 = CALL_EXPR_ARG (exp, 0);
29956 arg1 = CALL_EXPR_ARG (exp, 1);
29957 op0 = expand_normal (arg0);
29958 op1 = expand_normal (arg1);
29959 if (!REG_P (op0))
29960 op0 = copy_to_mode_reg (SImode, op0);
29961 if (!REG_P (op1))
29962 op1 = copy_to_mode_reg (SImode, op1);
29963 emit_insn (gen_sse3_mwait (op0, op1));
29964 return 0;
29965
29966 case IX86_BUILTIN_VEC_INIT_V2SI:
29967 case IX86_BUILTIN_VEC_INIT_V4HI:
29968 case IX86_BUILTIN_VEC_INIT_V8QI:
29969 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
29970
29971 case IX86_BUILTIN_VEC_EXT_V2DF:
29972 case IX86_BUILTIN_VEC_EXT_V2DI:
29973 case IX86_BUILTIN_VEC_EXT_V4SF:
29974 case IX86_BUILTIN_VEC_EXT_V4SI:
29975 case IX86_BUILTIN_VEC_EXT_V8HI:
29976 case IX86_BUILTIN_VEC_EXT_V2SI:
29977 case IX86_BUILTIN_VEC_EXT_V4HI:
29978 case IX86_BUILTIN_VEC_EXT_V16QI:
29979 return ix86_expand_vec_ext_builtin (exp, target);
29980
29981 case IX86_BUILTIN_VEC_SET_V2DI:
29982 case IX86_BUILTIN_VEC_SET_V4SF:
29983 case IX86_BUILTIN_VEC_SET_V4SI:
29984 case IX86_BUILTIN_VEC_SET_V8HI:
29985 case IX86_BUILTIN_VEC_SET_V4HI:
29986 case IX86_BUILTIN_VEC_SET_V16QI:
29987 return ix86_expand_vec_set_builtin (exp);
29988
29989 case IX86_BUILTIN_INFQ:
29990 case IX86_BUILTIN_HUGE_VALQ:
29991 {
29992 REAL_VALUE_TYPE inf;
29993 rtx tmp;
29994
29995 real_inf (&inf);
29996 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
29997
29998 tmp = validize_mem (force_const_mem (mode, tmp));
29999
30000 if (target == 0)
30001 target = gen_reg_rtx (mode);
30002
30003 emit_move_insn (target, tmp);
30004 return target;
30005 }
30006
30007 case IX86_BUILTIN_LLWPCB:
30008 arg0 = CALL_EXPR_ARG (exp, 0);
30009 op0 = expand_normal (arg0);
30010 icode = CODE_FOR_lwp_llwpcb;
30011 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
30012 {
30013 if (GET_MODE (op0) != Pmode)
30014 op0 = convert_to_mode (Pmode, op0, 1);
30015 op0 = force_reg (Pmode, op0);
30016 }
30017 emit_insn (gen_lwp_llwpcb (op0));
30018 return 0;
30019
30020 case IX86_BUILTIN_SLWPCB:
30021 icode = CODE_FOR_lwp_slwpcb;
30022 if (!target
30023 || !insn_data[icode].operand[0].predicate (target, Pmode))
30024 target = gen_reg_rtx (Pmode);
30025 emit_insn (gen_lwp_slwpcb (target));
30026 return target;
30027
30028 case IX86_BUILTIN_BEXTRI32:
30029 case IX86_BUILTIN_BEXTRI64:
30030 arg0 = CALL_EXPR_ARG (exp, 0);
30031 arg1 = CALL_EXPR_ARG (exp, 1);
30032 op0 = expand_normal (arg0);
30033 op1 = expand_normal (arg1);
30034 icode = (fcode == IX86_BUILTIN_BEXTRI32
30035 ? CODE_FOR_tbm_bextri_si
30036 : CODE_FOR_tbm_bextri_di);
30037 if (!CONST_INT_P (op1))
30038 {
30039 error ("last argument must be an immediate");
30040 return const0_rtx;
30041 }
30042 else
30043 {
30044 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
30045 unsigned char lsb_index = INTVAL (op1) & 0xFF;
30046 op1 = GEN_INT (length);
30047 op2 = GEN_INT (lsb_index);
30048 pat = GEN_FCN (icode) (target, op0, op1, op2);
30049 if (pat)
30050 emit_insn (pat);
30051 return target;
30052 }
30053
30054 case IX86_BUILTIN_RDRAND16_STEP:
30055 icode = CODE_FOR_rdrandhi_1;
30056 mode0 = HImode;
30057 goto rdrand_step;
30058
30059 case IX86_BUILTIN_RDRAND32_STEP:
30060 icode = CODE_FOR_rdrandsi_1;
30061 mode0 = SImode;
30062 goto rdrand_step;
30063
30064 case IX86_BUILTIN_RDRAND64_STEP:
30065 icode = CODE_FOR_rdranddi_1;
30066 mode0 = DImode;
30067
30068 rdrand_step:
30069 op0 = gen_reg_rtx (mode0);
30070 emit_insn (GEN_FCN (icode) (op0));
30071
30072 arg0 = CALL_EXPR_ARG (exp, 0);
30073 op1 = expand_normal (arg0);
30074 if (!address_operand (op1, VOIDmode))
30075 {
30076 op1 = convert_memory_address (Pmode, op1);
30077 op1 = copy_addr_to_reg (op1);
30078 }
30079 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
30080
30081 op1 = gen_reg_rtx (SImode);
30082 emit_move_insn (op1, CONST1_RTX (SImode));
30083
30084 /* Emit SImode conditional move. */
30085 if (mode0 == HImode)
30086 {
30087 op2 = gen_reg_rtx (SImode);
30088 emit_insn (gen_zero_extendhisi2 (op2, op0));
30089 }
30090 else if (mode0 == SImode)
30091 op2 = op0;
30092 else
30093 op2 = gen_rtx_SUBREG (SImode, op0, 0);
30094
30095 if (target == 0)
30096 target = gen_reg_rtx (SImode);
30097
30098 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
30099 const0_rtx);
30100 emit_insn (gen_rtx_SET (VOIDmode, target,
30101 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
30102 return target;
30103
30104 case IX86_BUILTIN_GATHERSIV2DF:
30105 icode = CODE_FOR_avx2_gathersiv2df;
30106 goto gather_gen;
30107 case IX86_BUILTIN_GATHERSIV4DF:
30108 icode = CODE_FOR_avx2_gathersiv4df;
30109 goto gather_gen;
30110 case IX86_BUILTIN_GATHERDIV2DF:
30111 icode = CODE_FOR_avx2_gatherdiv2df;
30112 goto gather_gen;
30113 case IX86_BUILTIN_GATHERDIV4DF:
30114 icode = CODE_FOR_avx2_gatherdiv4df;
30115 goto gather_gen;
30116 case IX86_BUILTIN_GATHERSIV4SF:
30117 icode = CODE_FOR_avx2_gathersiv4sf;
30118 goto gather_gen;
30119 case IX86_BUILTIN_GATHERSIV8SF:
30120 icode = CODE_FOR_avx2_gathersiv8sf;
30121 goto gather_gen;
30122 case IX86_BUILTIN_GATHERDIV4SF:
30123 icode = CODE_FOR_avx2_gatherdiv4sf;
30124 goto gather_gen;
30125 case IX86_BUILTIN_GATHERDIV8SF:
30126 icode = CODE_FOR_avx2_gatherdiv8sf;
30127 goto gather_gen;
30128 case IX86_BUILTIN_GATHERSIV2DI:
30129 icode = CODE_FOR_avx2_gathersiv2di;
30130 goto gather_gen;
30131 case IX86_BUILTIN_GATHERSIV4DI:
30132 icode = CODE_FOR_avx2_gathersiv4di;
30133 goto gather_gen;
30134 case IX86_BUILTIN_GATHERDIV2DI:
30135 icode = CODE_FOR_avx2_gatherdiv2di;
30136 goto gather_gen;
30137 case IX86_BUILTIN_GATHERDIV4DI:
30138 icode = CODE_FOR_avx2_gatherdiv4di;
30139 goto gather_gen;
30140 case IX86_BUILTIN_GATHERSIV4SI:
30141 icode = CODE_FOR_avx2_gathersiv4si;
30142 goto gather_gen;
30143 case IX86_BUILTIN_GATHERSIV8SI:
30144 icode = CODE_FOR_avx2_gathersiv8si;
30145 goto gather_gen;
30146 case IX86_BUILTIN_GATHERDIV4SI:
30147 icode = CODE_FOR_avx2_gatherdiv4si;
30148 goto gather_gen;
30149 case IX86_BUILTIN_GATHERDIV8SI:
30150 icode = CODE_FOR_avx2_gatherdiv8si;
30151 goto gather_gen;
30152 case IX86_BUILTIN_GATHERALTSIV4DF:
30153 icode = CODE_FOR_avx2_gathersiv4df;
30154 goto gather_gen;
30155 case IX86_BUILTIN_GATHERALTDIV8SF:
30156 icode = CODE_FOR_avx2_gatherdiv8sf;
30157 goto gather_gen;
30158 case IX86_BUILTIN_GATHERALTSIV4DI:
30159 icode = CODE_FOR_avx2_gathersiv4di;
30160 goto gather_gen;
30161 case IX86_BUILTIN_GATHERALTDIV8SI:
30162 icode = CODE_FOR_avx2_gatherdiv8si;
30163 goto gather_gen;
30164
30165 gather_gen:
30166 arg0 = CALL_EXPR_ARG (exp, 0);
30167 arg1 = CALL_EXPR_ARG (exp, 1);
30168 arg2 = CALL_EXPR_ARG (exp, 2);
30169 arg3 = CALL_EXPR_ARG (exp, 3);
30170 arg4 = CALL_EXPR_ARG (exp, 4);
30171 op0 = expand_normal (arg0);
30172 op1 = expand_normal (arg1);
30173 op2 = expand_normal (arg2);
30174 op3 = expand_normal (arg3);
30175 op4 = expand_normal (arg4);
30176 /* Note the arg order is different from the operand order. */
30177 mode0 = insn_data[icode].operand[1].mode;
30178 mode2 = insn_data[icode].operand[3].mode;
30179 mode3 = insn_data[icode].operand[4].mode;
30180 mode4 = insn_data[icode].operand[5].mode;
30181
30182 if (target == NULL_RTX
30183 || GET_MODE (target) != insn_data[icode].operand[0].mode)
30184 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
30185 else
30186 subtarget = target;
30187
30188 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
30189 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
30190 {
30191 rtx half = gen_reg_rtx (V4SImode);
30192 if (!nonimmediate_operand (op2, V8SImode))
30193 op2 = copy_to_mode_reg (V8SImode, op2);
30194 emit_insn (gen_vec_extract_lo_v8si (half, op2));
30195 op2 = half;
30196 }
30197 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
30198 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
30199 {
30200 rtx (*gen) (rtx, rtx);
30201 rtx half = gen_reg_rtx (mode0);
30202 if (mode0 == V4SFmode)
30203 gen = gen_vec_extract_lo_v8sf;
30204 else
30205 gen = gen_vec_extract_lo_v8si;
30206 if (!nonimmediate_operand (op0, GET_MODE (op0)))
30207 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
30208 emit_insn (gen (half, op0));
30209 op0 = half;
30210 if (!nonimmediate_operand (op3, GET_MODE (op3)))
30211 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
30212 emit_insn (gen (half, op3));
30213 op3 = half;
30214 }
30215
30216 /* Force the memory operand to use only a base register here; we
30217 don't want to do this to the memory operands of other builtin
30218 functions. */
30219 if (GET_MODE (op1) != Pmode)
30220 op1 = convert_to_mode (Pmode, op1, 1);
30221 op1 = force_reg (Pmode, op1);
30222
30223 if (!insn_data[icode].operand[1].predicate (op0, mode0))
30224 op0 = copy_to_mode_reg (mode0, op0);
30225 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
30226 op1 = copy_to_mode_reg (Pmode, op1);
30227 if (!insn_data[icode].operand[3].predicate (op2, mode2))
30228 op2 = copy_to_mode_reg (mode2, op2);
30229 if (!insn_data[icode].operand[4].predicate (op3, mode3))
30230 op3 = copy_to_mode_reg (mode3, op3);
30231 if (!insn_data[icode].operand[5].predicate (op4, mode4))
30232 {
30233 error ("last argument must be scale 1, 2, 4, 8");
30234 return const0_rtx;
30235 }
30236
30237 /* Optimize. If mask is known to have all high bits set,
30238 replace op0 with pc_rtx to signal that the instruction
30239 overwrites the whole destination and doesn't use its
30240 previous contents. */
30241 if (optimize)
30242 {
30243 if (TREE_CODE (arg3) == VECTOR_CST)
30244 {
30245 unsigned int negative = 0;
30246 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
30247 {
30248 tree cst = VECTOR_CST_ELT (arg3, i);
30249 if (TREE_CODE (cst) == INTEGER_CST
30250 && tree_int_cst_sign_bit (cst))
30251 negative++;
30252 else if (TREE_CODE (cst) == REAL_CST
30253 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
30254 negative++;
30255 }
30256 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
30257 op0 = pc_rtx;
30258 }
30259 else if (TREE_CODE (arg3) == SSA_NAME)
30260 {
30261 /* Recognize also when mask is like:
30262 __v2df src = _mm_setzero_pd ();
30263 __v2df mask = _mm_cmpeq_pd (src, src);
30264 or
30265 __v8sf src = _mm256_setzero_ps ();
30266 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
30267 as that is a cheaper way to load all ones into
30268 a register than having to load a constant from
30269 memory. */
30270 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
30271 if (is_gimple_call (def_stmt))
30272 {
30273 tree fndecl = gimple_call_fndecl (def_stmt);
30274 if (fndecl
30275 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
30276 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
30277 {
30278 case IX86_BUILTIN_CMPPD:
30279 case IX86_BUILTIN_CMPPS:
30280 case IX86_BUILTIN_CMPPD256:
30281 case IX86_BUILTIN_CMPPS256:
30282 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
30283 break;
30284 /* FALLTHRU */
30285 case IX86_BUILTIN_CMPEQPD:
30286 case IX86_BUILTIN_CMPEQPS:
30287 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
30288 && initializer_zerop (gimple_call_arg (def_stmt,
30289 1)))
30290 op0 = pc_rtx;
30291 break;
30292 default:
30293 break;
30294 }
30295 }
30296 }
30297 }
30298
30299 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
30300 if (! pat)
30301 return const0_rtx;
30302 emit_insn (pat);
30303
30304 if (fcode == IX86_BUILTIN_GATHERDIV8SF
30305 || fcode == IX86_BUILTIN_GATHERDIV8SI)
30306 {
30307 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
30308 ? V4SFmode : V4SImode;
30309 if (target == NULL_RTX)
30310 target = gen_reg_rtx (tmode);
30311 if (tmode == V4SFmode)
30312 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
30313 else
30314 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
30315 }
30316 else
30317 target = subtarget;
30318
30319 return target;
30320
30321 case IX86_BUILTIN_XABORT:
30322 icode = CODE_FOR_xabort;
30323 arg0 = CALL_EXPR_ARG (exp, 0);
30324 op0 = expand_normal (arg0);
30325 mode0 = insn_data[icode].operand[0].mode;
30326 if (!insn_data[icode].operand[0].predicate (op0, mode0))
30327 {
30328 error ("the xabort's argument must be an 8-bit immediate");
30329 return const0_rtx;
30330 }
30331 emit_insn (gen_xabort (op0));
30332 return 0;
30333
30334 default:
30335 break;
30336 }
30337
30338 for (i = 0, d = bdesc_special_args;
30339 i < ARRAY_SIZE (bdesc_special_args);
30340 i++, d++)
30341 if (d->code == fcode)
30342 return ix86_expand_special_args_builtin (d, exp, target);
30343
30344 for (i = 0, d = bdesc_args;
30345 i < ARRAY_SIZE (bdesc_args);
30346 i++, d++)
30347 if (d->code == fcode)
30348 switch (fcode)
30349 {
30350 case IX86_BUILTIN_FABSQ:
30351 case IX86_BUILTIN_COPYSIGNQ:
30352 if (!TARGET_SSE)
30353 /* Emit a normal call if SSE isn't available. */
30354 return expand_call (exp, target, ignore);
30355 default:
30356 return ix86_expand_args_builtin (d, exp, target);
30357 }
30358
30359 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
30360 if (d->code == fcode)
30361 return ix86_expand_sse_comi (d, exp, target);
30362
30363 for (i = 0, d = bdesc_pcmpestr;
30364 i < ARRAY_SIZE (bdesc_pcmpestr);
30365 i++, d++)
30366 if (d->code == fcode)
30367 return ix86_expand_sse_pcmpestr (d, exp, target);
30368
30369 for (i = 0, d = bdesc_pcmpistr;
30370 i < ARRAY_SIZE (bdesc_pcmpistr);
30371 i++, d++)
30372 if (d->code == fcode)
30373 return ix86_expand_sse_pcmpistr (d, exp, target);
30374
30375 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
30376 if (d->code == fcode)
30377 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
30378 (enum ix86_builtin_func_type)
30379 d->flag, d->comparison);
30380
30381 gcc_unreachable ();
30382 }
30383
30384 /* Returns a function decl for a vectorized version of the builtin function
30385 FNDECL with result vector type TYPE_OUT and argument vector type TYPE_IN,
30386 or NULL_TREE if it is not available. */
30387
30388 static tree
30389 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
30390 tree type_in)
30391 {
30392 enum machine_mode in_mode, out_mode;
30393 int in_n, out_n;
30394 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
30395
30396 if (TREE_CODE (type_out) != VECTOR_TYPE
30397 || TREE_CODE (type_in) != VECTOR_TYPE
30398 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
30399 return NULL_TREE;
30400
30401 out_mode = TYPE_MODE (TREE_TYPE (type_out));
30402 out_n = TYPE_VECTOR_SUBPARTS (type_out);
30403 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30404 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30405
30406 switch (fn)
30407 {
30408 case BUILT_IN_SQRT:
30409 if (out_mode == DFmode && in_mode == DFmode)
30410 {
30411 if (out_n == 2 && in_n == 2)
30412 return ix86_builtins[IX86_BUILTIN_SQRTPD];
30413 else if (out_n == 4 && in_n == 4)
30414 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
30415 }
30416 break;
30417
30418 case BUILT_IN_SQRTF:
30419 if (out_mode == SFmode && in_mode == SFmode)
30420 {
30421 if (out_n == 4 && in_n == 4)
30422 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
30423 else if (out_n == 8 && in_n == 8)
30424 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
30425 }
30426 break;
30427
30428 case BUILT_IN_IFLOOR:
30429 case BUILT_IN_LFLOOR:
30430 case BUILT_IN_LLFLOOR:
30431 /* The round insn does not trap on denormals. */
30432 if (flag_trapping_math || !TARGET_ROUND)
30433 break;
30434
30435 if (out_mode == SImode && in_mode == DFmode)
30436 {
30437 if (out_n == 4 && in_n == 2)
30438 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
30439 else if (out_n == 8 && in_n == 4)
30440 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
30441 }
30442 break;
30443
30444 case BUILT_IN_IFLOORF:
30445 case BUILT_IN_LFLOORF:
30446 case BUILT_IN_LLFLOORF:
30447 /* The round insn does not trap on denormals. */
30448 if (flag_trapping_math || !TARGET_ROUND)
30449 break;
30450
30451 if (out_mode == SImode && in_mode == SFmode)
30452 {
30453 if (out_n == 4 && in_n == 4)
30454 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
30455 else if (out_n == 8 && in_n == 8)
30456 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
30457 }
30458 break;
30459
30460 case BUILT_IN_ICEIL:
30461 case BUILT_IN_LCEIL:
30462 case BUILT_IN_LLCEIL:
30463 /* The round insn does not trap on denormals. */
30464 if (flag_trapping_math || !TARGET_ROUND)
30465 break;
30466
30467 if (out_mode == SImode && in_mode == DFmode)
30468 {
30469 if (out_n == 4 && in_n == 2)
30470 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
30471 else if (out_n == 8 && in_n == 4)
30472 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
30473 }
30474 break;
30475
30476 case BUILT_IN_ICEILF:
30477 case BUILT_IN_LCEILF:
30478 case BUILT_IN_LLCEILF:
30479 /* The round insn does not trap on denormals. */
30480 if (flag_trapping_math || !TARGET_ROUND)
30481 break;
30482
30483 if (out_mode == SImode && in_mode == SFmode)
30484 {
30485 if (out_n == 4 && in_n == 4)
30486 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
30487 else if (out_n == 8 && in_n == 8)
30488 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
30489 }
30490 break;
30491
30492 case BUILT_IN_IRINT:
30493 case BUILT_IN_LRINT:
30494 case BUILT_IN_LLRINT:
30495 if (out_mode == SImode && in_mode == DFmode)
30496 {
30497 if (out_n == 4 && in_n == 2)
30498 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
30499 else if (out_n == 8 && in_n == 4)
30500 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
30501 }
30502 break;
30503
30504 case BUILT_IN_IRINTF:
30505 case BUILT_IN_LRINTF:
30506 case BUILT_IN_LLRINTF:
30507 if (out_mode == SImode && in_mode == SFmode)
30508 {
30509 if (out_n == 4 && in_n == 4)
30510 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
30511 else if (out_n == 8 && in_n == 8)
30512 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
30513 }
30514 break;
30515
30516 case BUILT_IN_IROUND:
30517 case BUILT_IN_LROUND:
30518 case BUILT_IN_LLROUND:
30519 /* The round insn does not trap on denormals. */
30520 if (flag_trapping_math || !TARGET_ROUND)
30521 break;
30522
30523 if (out_mode == SImode && in_mode == DFmode)
30524 {
30525 if (out_n == 4 && in_n == 2)
30526 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
30527 else if (out_n == 8 && in_n == 4)
30528 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
30529 }
30530 break;
30531
30532 case BUILT_IN_IROUNDF:
30533 case BUILT_IN_LROUNDF:
30534 case BUILT_IN_LLROUNDF:
30535 /* The round insn does not trap on denormals. */
30536 if (flag_trapping_math || !TARGET_ROUND)
30537 break;
30538
30539 if (out_mode == SImode && in_mode == SFmode)
30540 {
30541 if (out_n == 4 && in_n == 4)
30542 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
30543 else if (out_n == 8 && in_n == 8)
30544 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
30545 }
30546 break;
30547
30548 case BUILT_IN_COPYSIGN:
30549 if (out_mode == DFmode && in_mode == DFmode)
30550 {
30551 if (out_n == 2 && in_n == 2)
30552 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
30553 else if (out_n == 4 && in_n == 4)
30554 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
30555 }
30556 break;
30557
30558 case BUILT_IN_COPYSIGNF:
30559 if (out_mode == SFmode && in_mode == SFmode)
30560 {
30561 if (out_n == 4 && in_n == 4)
30562 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
30563 else if (out_n == 8 && in_n == 8)
30564 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
30565 }
30566 break;
30567
30568 case BUILT_IN_FLOOR:
30569 /* The round insn does not trap on denormals. */
30570 if (flag_trapping_math || !TARGET_ROUND)
30571 break;
30572
30573 if (out_mode == DFmode && in_mode == DFmode)
30574 {
30575 if (out_n == 2 && in_n == 2)
30576 return ix86_builtins[IX86_BUILTIN_FLOORPD];
30577 else if (out_n == 4 && in_n == 4)
30578 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
30579 }
30580 break;
30581
30582 case BUILT_IN_FLOORF:
30583 /* The round insn does not trap on denormals. */
30584 if (flag_trapping_math || !TARGET_ROUND)
30585 break;
30586
30587 if (out_mode == SFmode && in_mode == SFmode)
30588 {
30589 if (out_n == 4 && in_n == 4)
30590 return ix86_builtins[IX86_BUILTIN_FLOORPS];
30591 else if (out_n == 8 && in_n == 8)
30592 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
30593 }
30594 break;
30595
30596 case BUILT_IN_CEIL:
30597 /* The round insn does not trap on denormals. */
30598 if (flag_trapping_math || !TARGET_ROUND)
30599 break;
30600
30601 if (out_mode == DFmode && in_mode == DFmode)
30602 {
30603 if (out_n == 2 && in_n == 2)
30604 return ix86_builtins[IX86_BUILTIN_CEILPD];
30605 else if (out_n == 4 && in_n == 4)
30606 return ix86_builtins[IX86_BUILTIN_CEILPD256];
30607 }
30608 break;
30609
30610 case BUILT_IN_CEILF:
30611 /* The round insn does not trap on denormals. */
30612 if (flag_trapping_math || !TARGET_ROUND)
30613 break;
30614
30615 if (out_mode == SFmode && in_mode == SFmode)
30616 {
30617 if (out_n == 4 && in_n == 4)
30618 return ix86_builtins[IX86_BUILTIN_CEILPS];
30619 else if (out_n == 8 && in_n == 8)
30620 return ix86_builtins[IX86_BUILTIN_CEILPS256];
30621 }
30622 break;
30623
30624 case BUILT_IN_TRUNC:
30625 /* The round insn does not trap on denormals. */
30626 if (flag_trapping_math || !TARGET_ROUND)
30627 break;
30628
30629 if (out_mode == DFmode && in_mode == DFmode)
30630 {
30631 if (out_n == 2 && in_n == 2)
30632 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
30633 else if (out_n == 4 && in_n == 4)
30634 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
30635 }
30636 break;
30637
30638 case BUILT_IN_TRUNCF:
30639 /* The round insn does not trap on denormals. */
30640 if (flag_trapping_math || !TARGET_ROUND)
30641 break;
30642
30643 if (out_mode == SFmode && in_mode == SFmode)
30644 {
30645 if (out_n == 4 && in_n == 4)
30646 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
30647 else if (out_n == 8 && in_n == 8)
30648 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
30649 }
30650 break;
30651
30652 case BUILT_IN_RINT:
30653 /* The round insn does not trap on denormals. */
30654 if (flag_trapping_math || !TARGET_ROUND)
30655 break;
30656
30657 if (out_mode == DFmode && in_mode == DFmode)
30658 {
30659 if (out_n == 2 && in_n == 2)
30660 return ix86_builtins[IX86_BUILTIN_RINTPD];
30661 else if (out_n == 4 && in_n == 4)
30662 return ix86_builtins[IX86_BUILTIN_RINTPD256];
30663 }
30664 break;
30665
30666 case BUILT_IN_RINTF:
30667 /* The round insn does not trap on denormals. */
30668 if (flag_trapping_math || !TARGET_ROUND)
30669 break;
30670
30671 if (out_mode == SFmode && in_mode == SFmode)
30672 {
30673 if (out_n == 4 && in_n == 4)
30674 return ix86_builtins[IX86_BUILTIN_RINTPS];
30675 else if (out_n == 8 && in_n == 8)
30676 return ix86_builtins[IX86_BUILTIN_RINTPS256];
30677 }
30678 break;
30679
30680 case BUILT_IN_ROUND:
30681 /* The round insn does not trap on denormals. */
30682 if (flag_trapping_math || !TARGET_ROUND)
30683 break;
30684
30685 if (out_mode == DFmode && in_mode == DFmode)
30686 {
30687 if (out_n == 2 && in_n == 2)
30688 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
30689 else if (out_n == 4 && in_n == 4)
30690 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
30691 }
30692 break;
30693
30694 case BUILT_IN_ROUNDF:
30695 /* The round insn does not trap on denormals. */
30696 if (flag_trapping_math || !TARGET_ROUND)
30697 break;
30698
30699 if (out_mode == SFmode && in_mode == SFmode)
30700 {
30701 if (out_n == 4 && in_n == 4)
30702 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
30703 else if (out_n == 8 && in_n == 8)
30704 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
30705 }
30706 break;
30707
30708 case BUILT_IN_FMA:
30709 if (out_mode == DFmode && in_mode == DFmode)
30710 {
30711 if (out_n == 2 && in_n == 2)
30712 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
30713 if (out_n == 4 && in_n == 4)
30714 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
30715 }
30716 break;
30717
30718 case BUILT_IN_FMAF:
30719 if (out_mode == SFmode && in_mode == SFmode)
30720 {
30721 if (out_n == 4 && in_n == 4)
30722 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
30723 if (out_n == 8 && in_n == 8)
30724 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
30725 }
30726 break;
30727
30728 default:
30729 break;
30730 }
30731
30732 /* Dispatch to a handler for a vectorization library. */
30733 if (ix86_veclib_handler)
30734 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
30735 type_in);
30736
30737 return NULL_TREE;
30738 }
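
/* Illustrative example (a sketch of how this hook is used, not taken
   from the original source): a loop such as

     double a[1024], b[1024];
     for (int i = 0; i < 1024; i++)
       a[i] = __builtin_floor (b[i]);

   vectorized with 128-bit vectors asks for BUILT_IN_FLOOR with V2DF
   input and output; with TARGET_ROUND available and -fno-trapping-math
   the switch above returns the decl of IX86_BUILTIN_FLOORPD.  With
   trapping math (the default) or without TARGET_ROUND the case breaks
   out, and unless a vector math library handler is installed via
   ix86_veclib_handler, NULL_TREE is returned.  */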
30739
30740 /* Handler for an SVML-style interface to
30741 a library with vectorized intrinsics. */
30742
30743 static tree
30744 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
30745 {
30746 char name[20];
30747 tree fntype, new_fndecl, args;
30748 unsigned arity;
30749 const char *bname;
30750 enum machine_mode el_mode, in_mode;
30751 int n, in_n;
30752
30753 /* The SVML library is suitable for unsafe math only. */
30754 if (!flag_unsafe_math_optimizations)
30755 return NULL_TREE;
30756
30757 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30758 n = TYPE_VECTOR_SUBPARTS (type_out);
30759 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30760 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30761 if (el_mode != in_mode
30762 || n != in_n)
30763 return NULL_TREE;
30764
30765 switch (fn)
30766 {
30767 case BUILT_IN_EXP:
30768 case BUILT_IN_LOG:
30769 case BUILT_IN_LOG10:
30770 case BUILT_IN_POW:
30771 case BUILT_IN_TANH:
30772 case BUILT_IN_TAN:
30773 case BUILT_IN_ATAN:
30774 case BUILT_IN_ATAN2:
30775 case BUILT_IN_ATANH:
30776 case BUILT_IN_CBRT:
30777 case BUILT_IN_SINH:
30778 case BUILT_IN_SIN:
30779 case BUILT_IN_ASINH:
30780 case BUILT_IN_ASIN:
30781 case BUILT_IN_COSH:
30782 case BUILT_IN_COS:
30783 case BUILT_IN_ACOSH:
30784 case BUILT_IN_ACOS:
30785 if (el_mode != DFmode || n != 2)
30786 return NULL_TREE;
30787 break;
30788
30789 case BUILT_IN_EXPF:
30790 case BUILT_IN_LOGF:
30791 case BUILT_IN_LOG10F:
30792 case BUILT_IN_POWF:
30793 case BUILT_IN_TANHF:
30794 case BUILT_IN_TANF:
30795 case BUILT_IN_ATANF:
30796 case BUILT_IN_ATAN2F:
30797 case BUILT_IN_ATANHF:
30798 case BUILT_IN_CBRTF:
30799 case BUILT_IN_SINHF:
30800 case BUILT_IN_SINF:
30801 case BUILT_IN_ASINHF:
30802 case BUILT_IN_ASINF:
30803 case BUILT_IN_COSHF:
30804 case BUILT_IN_COSF:
30805 case BUILT_IN_ACOSHF:
30806 case BUILT_IN_ACOSF:
30807 if (el_mode != SFmode || n != 4)
30808 return NULL_TREE;
30809 break;
30810
30811 default:
30812 return NULL_TREE;
30813 }
30814
30815 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30816
30817 if (fn == BUILT_IN_LOGF)
30818 strcpy (name, "vmlsLn4");
30819 else if (fn == BUILT_IN_LOG)
30820 strcpy (name, "vmldLn2");
30821 else if (n == 4)
30822 {
30823 sprintf (name, "vmls%s", bname+10);
30824 name[strlen (name)-1] = '4';
30825 }
30826 else
30827 sprintf (name, "vmld%s2", bname+10);
30828
30829 /* Convert to uppercase. */
30830 name[4] &= ~0x20;
30831
30832 arity = 0;
30833 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30834 args;
30835 args = TREE_CHAIN (args))
30836 arity++;
30837
30838 if (arity == 1)
30839 fntype = build_function_type_list (type_out, type_in, NULL);
30840 else
30841 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30842
30843 /* Build a function declaration for the vectorized function. */
30844 new_fndecl = build_decl (BUILTINS_LOCATION,
30845 FUNCTION_DECL, get_identifier (name), fntype);
30846 TREE_PUBLIC (new_fndecl) = 1;
30847 DECL_EXTERNAL (new_fndecl) = 1;
30848 DECL_IS_NOVOPS (new_fndecl) = 1;
30849 TREE_READONLY (new_fndecl) = 1;
30850
30851 return new_fndecl;
30852 }
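
/* Worked example of the name mangling above: for BUILT_IN_SINF with
   4 x SFmode vectors, BNAME is "__builtin_sinf" and bname+10 is "sinf",
   so sprintf yields "vmlssinf"; the trailing character is overwritten
   with '4' giving "vmlssin4", and clearing bit 0x20 of name[4]
   upper-cases the first letter of the math function, producing
   "vmlsSin4".  The DFmode BUILT_IN_SIN goes through the "vmld%s2"
   branch and becomes "vmldSin2".  */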
30853
30854 /* Handler for an ACML-style interface to
30855 a library with vectorized intrinsics. */
30856
30857 static tree
30858 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
30859 {
30860 char name[20] = "__vr.._";
30861 tree fntype, new_fndecl, args;
30862 unsigned arity;
30863 const char *bname;
30864 enum machine_mode el_mode, in_mode;
30865 int n, in_n;
30866
30867 /* The ACML library is 64-bit only and suitable for unsafe math only,
30868 as it does not correctly support parts of IEEE with the required
30869 precision, such as denormals. */
30870 if (!TARGET_64BIT
30871 || !flag_unsafe_math_optimizations)
30872 return NULL_TREE;
30873
30874 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30875 n = TYPE_VECTOR_SUBPARTS (type_out);
30876 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30877 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30878 if (el_mode != in_mode
30879 || n != in_n)
30880 return NULL_TREE;
30881
30882 switch (fn)
30883 {
30884 case BUILT_IN_SIN:
30885 case BUILT_IN_COS:
30886 case BUILT_IN_EXP:
30887 case BUILT_IN_LOG:
30888 case BUILT_IN_LOG2:
30889 case BUILT_IN_LOG10:
30890 name[4] = 'd';
30891 name[5] = '2';
30892 if (el_mode != DFmode
30893 || n != 2)
30894 return NULL_TREE;
30895 break;
30896
30897 case BUILT_IN_SINF:
30898 case BUILT_IN_COSF:
30899 case BUILT_IN_EXPF:
30900 case BUILT_IN_POWF:
30901 case BUILT_IN_LOGF:
30902 case BUILT_IN_LOG2F:
30903 case BUILT_IN_LOG10F:
30904 name[4] = 's';
30905 name[5] = '4';
30906 if (el_mode != SFmode
30907 || n != 4)
30908 return NULL_TREE;
30909 break;
30910
30911 default:
30912 return NULL_TREE;
30913 }
30914
30915 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30916 sprintf (name + 7, "%s", bname+10);
30917
30918 arity = 0;
30919 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30920 args;
30921 args = TREE_CHAIN (args))
30922 arity++;
30923
30924 if (arity == 1)
30925 fntype = build_function_type_list (type_out, type_in, NULL);
30926 else
30927 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30928
30929 /* Build a function declaration for the vectorized function. */
30930 new_fndecl = build_decl (BUILTINS_LOCATION,
30931 FUNCTION_DECL, get_identifier (name), fntype);
30932 TREE_PUBLIC (new_fndecl) = 1;
30933 DECL_EXTERNAL (new_fndecl) = 1;
30934 DECL_IS_NOVOPS (new_fndecl) = 1;
30935 TREE_READONLY (new_fndecl) = 1;
30936
30937 return new_fndecl;
30938 }
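
/* Worked example of the mangling above: the template starts as
   "__vr.._".  For BUILT_IN_SIN on 2 x DFmode vectors the two dots
   become "d2" and bname+10 is "sin", giving "__vrd2_sin"; for
   BUILT_IN_SINF on 4 x SFmode vectors the result is "__vrs4_sinf",
   matching the ACML-style names this handler targets.  */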
30939
30940 /* Returns a decl of a function that implements gather load with
30941 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
30942 Return NULL_TREE if it is not available. */
30943
30944 static tree
30945 ix86_vectorize_builtin_gather (const_tree mem_vectype,
30946 const_tree index_type, int scale)
30947 {
30948 bool si;
30949 enum ix86_builtins code;
30950
30951 if (! TARGET_AVX2)
30952 return NULL_TREE;
30953
30954 if ((TREE_CODE (index_type) != INTEGER_TYPE
30955 && !POINTER_TYPE_P (index_type))
30956 || (TYPE_MODE (index_type) != SImode
30957 && TYPE_MODE (index_type) != DImode))
30958 return NULL_TREE;
30959
30960 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
30961 return NULL_TREE;
30962
30963 /* The v*gather* insns sign-extend the index to pointer mode. */
30964 if (TYPE_PRECISION (index_type) < POINTER_SIZE
30965 && TYPE_UNSIGNED (index_type))
30966 return NULL_TREE;
30967
30968 if (scale <= 0
30969 || scale > 8
30970 || (scale & (scale - 1)) != 0)
30971 return NULL_TREE;
30972
30973 si = TYPE_MODE (index_type) == SImode;
30974 switch (TYPE_MODE (mem_vectype))
30975 {
30976 case V2DFmode:
30977 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
30978 break;
30979 case V4DFmode:
30980 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
30981 break;
30982 case V2DImode:
30983 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
30984 break;
30985 case V4DImode:
30986 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
30987 break;
30988 case V4SFmode:
30989 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
30990 break;
30991 case V8SFmode:
30992 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
30993 break;
30994 case V4SImode:
30995 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
30996 break;
30997 case V8SImode:
30998 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
30999 break;
31000 default:
31001 return NULL_TREE;
31002 }
31003
31004 return ix86_builtins[code];
31005 }
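
/* Illustrative example: a gather of V4DF data with a signed SImode
   index and scale 4 passes all of the checks above (4 is a power of
   two not larger than 8) and yields IX86_BUILTIN_GATHERALTSIV4DF,
   while a DImode index yields IX86_BUILTIN_GATHERDIV4DF.  A scale of
   3, or an unsigned index narrower than a pointer, makes this hook
   return NULL_TREE and no gather builtin is used.  */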
31006
31007 /* Returns a decl of a target-specific builtin that implements the
31008 reciprocal of the function FN, or NULL_TREE if not available. */
31009
31010 static tree
31011 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
31012 bool sqrt ATTRIBUTE_UNUSED)
31013 {
31014 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
31015 && flag_finite_math_only && !flag_trapping_math
31016 && flag_unsafe_math_optimizations))
31017 return NULL_TREE;
31018
31019 if (md_fn)
31020 /* Machine dependent builtins. */
31021 switch (fn)
31022 {
31023 /* Vectorized version of sqrt to rsqrt conversion. */
31024 case IX86_BUILTIN_SQRTPS_NR:
31025 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
31026
31027 case IX86_BUILTIN_SQRTPS_NR256:
31028 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
31029
31030 default:
31031 return NULL_TREE;
31032 }
31033 else
31034 /* Normal builtins. */
31035 switch (fn)
31036 {
31037 /* Sqrt to rsqrt conversion. */
31038 case BUILT_IN_SQRTF:
31039 return ix86_builtins[IX86_BUILTIN_RSQRTF];
31040
31041 default:
31042 return NULL_TREE;
31043 }
31044 }
31045 \f
31046 /* Helper for avx_vpermilps256_operand et al. This is also used by
31047 the expansion functions to turn the parallel back into a mask.
31048 The return value is 0 for no match and the imm8+1 for a match. */
31049
31050 int
31051 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
31052 {
31053 unsigned i, nelt = GET_MODE_NUNITS (mode);
31054 unsigned mask = 0;
31055 unsigned char ipar[8];
31056
31057 if (XVECLEN (par, 0) != (int) nelt)
31058 return 0;
31059
31060 /* Validate that all of the elements are constants, and not totally
31061 out of range. Copy the data into an integral array to make the
31062 subsequent checks easier. */
31063 for (i = 0; i < nelt; ++i)
31064 {
31065 rtx er = XVECEXP (par, 0, i);
31066 unsigned HOST_WIDE_INT ei;
31067
31068 if (!CONST_INT_P (er))
31069 return 0;
31070 ei = INTVAL (er);
31071 if (ei >= nelt)
31072 return 0;
31073 ipar[i] = ei;
31074 }
31075
31076 switch (mode)
31077 {
31078 case V4DFmode:
31079 /* In the 256-bit DFmode case, we can only move elements within
31080 a 128-bit lane. */
31081 for (i = 0; i < 2; ++i)
31082 {
31083 if (ipar[i] >= 2)
31084 return 0;
31085 mask |= ipar[i] << i;
31086 }
31087 for (i = 2; i < 4; ++i)
31088 {
31089 if (ipar[i] < 2)
31090 return 0;
31091 mask |= (ipar[i] - 2) << i;
31092 }
31093 break;
31094
31095 case V8SFmode:
31096 /* In the 256-bit SFmode case, we have full freedom of movement
31097 within the low 128-bit lane, but the high 128-bit lane must
31098 mirror the exact same pattern. */
31099 for (i = 0; i < 4; ++i)
31100 if (ipar[i] + 4 != ipar[i + 4])
31101 return 0;
31102 nelt = 4;
31103 /* FALLTHRU */
31104
31105 case V2DFmode:
31106 case V4SFmode:
31107 /* In the 128-bit case, we've full freedom in the placement of
31108 the elements from the source operand. */
31109 for (i = 0; i < nelt; ++i)
31110 mask |= ipar[i] << (i * (nelt / 2));
31111 break;
31112
31113 default:
31114 gcc_unreachable ();
31115 }
31116
31117 /* Make sure success has a non-zero value by adding one. */
31118 return mask + 1;
31119 }
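/* Worked examples for avx_vpermilp_parallel: in V4DFmode the parallel
   [1 0 3 2] (swap within each 128-bit lane) reconstructs imm8 = 0x05, so
   the function returns 0x06; in V8SFmode the parallel [3 2 1 0 7 6 5 4]
   (reverse within each lane, with the high lane mirroring the low one)
   reconstructs imm8 = 0x1b and the function returns 0x1c.  */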
31120
31121 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
31122 the expansion functions to turn the parallel back into a mask.
31123 The return value is 0 for no match and the imm8+1 for a match. */
31124
31125 int
31126 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
31127 {
31128 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
31129 unsigned mask = 0;
31130 unsigned char ipar[8];
31131
31132 if (XVECLEN (par, 0) != (int) nelt)
31133 return 0;
31134
31135 /* Validate that all of the elements are constants, and not totally
31136 out of range. Copy the data into an integral array to make the
31137 subsequent checks easier. */
31138 for (i = 0; i < nelt; ++i)
31139 {
31140 rtx er = XVECEXP (par, 0, i);
31141 unsigned HOST_WIDE_INT ei;
31142
31143 if (!CONST_INT_P (er))
31144 return 0;
31145 ei = INTVAL (er);
31146 if (ei >= 2 * nelt)
31147 return 0;
31148 ipar[i] = ei;
31149 }
31150
31151 /* Validate that each half of the permute consists of consecutive elements. */
31152 for (i = 0; i < nelt2 - 1; ++i)
31153 if (ipar[i] + 1 != ipar[i + 1])
31154 return 0;
31155 for (i = nelt2; i < nelt - 1; ++i)
31156 if (ipar[i] + 1 != ipar[i + 1])
31157 return 0;
31158
31159 /* Reconstruct the mask. */
31160 for (i = 0; i < 2; ++i)
31161 {
31162 unsigned e = ipar[i * nelt2];
31163 if (e % nelt2)
31164 return 0;
31165 e /= nelt2;
31166 mask |= e << (i * 4);
31167 }
31168
31169 /* Make sure success has a non-zero value by adding one. */
31170 return mask + 1;
31171 }
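/* Worked example for avx_vperm2f128_parallel: in V8SFmode the parallel
   [4 5 6 7 0 1 2 3] selects 128-bit halves 1 and 0 of the concatenated
   input (a lane swap), so the reconstructed imm8 is 0x01 and the function
   returns 0x02.  */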
31172 \f
31173 /* Store OPERAND to memory after reload is completed. This means
31174 that we can't easily use assign_stack_local. */
31175 rtx
31176 ix86_force_to_memory (enum machine_mode mode, rtx operand)
31177 {
31178 rtx result;
31179
31180 gcc_assert (reload_completed);
31181 if (ix86_using_red_zone ())
31182 {
31183 result = gen_rtx_MEM (mode,
31184 gen_rtx_PLUS (Pmode,
31185 stack_pointer_rtx,
31186 GEN_INT (-RED_ZONE_SIZE)));
31187 emit_move_insn (result, operand);
31188 }
31189 else if (TARGET_64BIT)
31190 {
31191 switch (mode)
31192 {
31193 case HImode:
31194 case SImode:
31195 operand = gen_lowpart (DImode, operand);
31196 /* FALLTHRU */
31197 case DImode:
31198 emit_insn (
31199 gen_rtx_SET (VOIDmode,
31200 gen_rtx_MEM (DImode,
31201 gen_rtx_PRE_DEC (DImode,
31202 stack_pointer_rtx)),
31203 operand));
31204 break;
31205 default:
31206 gcc_unreachable ();
31207 }
31208 result = gen_rtx_MEM (mode, stack_pointer_rtx);
31209 }
31210 else
31211 {
31212 switch (mode)
31213 {
31214 case DImode:
31215 {
31216 rtx operands[2];
31217 split_double_mode (mode, &operand, 1, operands, operands + 1);
31218 emit_insn (
31219 gen_rtx_SET (VOIDmode,
31220 gen_rtx_MEM (SImode,
31221 gen_rtx_PRE_DEC (Pmode,
31222 stack_pointer_rtx)),
31223 operands[1]));
31224 emit_insn (
31225 gen_rtx_SET (VOIDmode,
31226 gen_rtx_MEM (SImode,
31227 gen_rtx_PRE_DEC (Pmode,
31228 stack_pointer_rtx)),
31229 operands[0]));
31230 }
31231 break;
31232 case HImode:
31233 /* Store HImodes as SImodes. */
31234 operand = gen_lowpart (SImode, operand);
31235 /* FALLTHRU */
31236 case SImode:
31237 emit_insn (
31238 gen_rtx_SET (VOIDmode,
31239 gen_rtx_MEM (GET_MODE (operand),
31240 gen_rtx_PRE_DEC (SImode,
31241 stack_pointer_rtx)),
31242 operand));
31243 break;
31244 default:
31245 gcc_unreachable ();
31246 }
31247 result = gen_rtx_MEM (mode, stack_pointer_rtx);
31248 }
31249 return result;
31250 }
31251
31252 /* Free the operand from memory. */
31253 void
31254 ix86_free_from_memory (enum machine_mode mode)
31255 {
31256 if (!ix86_using_red_zone ())
31257 {
31258 int size;
31259
31260 if (mode == DImode || TARGET_64BIT)
31261 size = 8;
31262 else
31263 size = 4;
31264 /* Use LEA to deallocate stack space. In peephole2 it will be converted
31265 to a pop or add instruction if registers are available. */
31266 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
31267 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
31268 GEN_INT (size))));
31269 }
31270 }
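/* A sketch of how the two helpers above pair up for a DImode value on a
   32-bit target: ix86_force_to_memory pushes the high SImode half first
   and then the low half, so the returned MEM at the stack pointer sees the
   value laid out in little-endian order; ix86_free_from_memory later
   releases the 8 bytes with a stack-pointer adjustment (an LEA, or a
   pop/add after peephole2).  */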
31271
31272 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
31273
31274 Put float CONST_DOUBLE in the constant pool instead of fp regs.
31275 QImode must go into class Q_REGS.
31276 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
31277 movdf to do mem-to-mem moves through integer regs. */
31278
31279 static reg_class_t
31280 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
31281 {
31282 enum machine_mode mode = GET_MODE (x);
31283
31284 /* We're only allowed to return a subclass of CLASS. Many of the
31285 following checks fail for NO_REGS, so eliminate that early. */
31286 if (regclass == NO_REGS)
31287 return NO_REGS;
31288
31289 /* All classes can load zeros. */
31290 if (x == CONST0_RTX (mode))
31291 return regclass;
31292
31293 /* Force constants into memory if we are loading a (nonzero) constant into
31294 an MMX or SSE register. This is because there are no MMX/SSE instructions
31295 that can load a constant directly; it must come from memory. */
31296 if (CONSTANT_P (x)
31297 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
31298 return NO_REGS;
31299
31300 /* Prefer SSE regs only if we can use them for math. */
31301 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
31302 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
31303
31304 /* Floating-point constants need more complex checks. */
31305 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
31306 {
31307 /* General regs can load everything. */
31308 if (reg_class_subset_p (regclass, GENERAL_REGS))
31309 return regclass;
31310
31311 /* Floats can load 0 and 1 plus some others. Note that we eliminated
31312 zero above. We only want to wind up preferring 80387 registers if
31313 we plan on doing computation with them. */
31314 if (TARGET_80387
31315 && standard_80387_constant_p (x) > 0)
31316 {
31317 /* Limit class to non-sse. */
31318 if (regclass == FLOAT_SSE_REGS)
31319 return FLOAT_REGS;
31320 if (regclass == FP_TOP_SSE_REGS)
31321 return FP_TOP_REG;
31322 if (regclass == FP_SECOND_SSE_REGS)
31323 return FP_SECOND_REG;
31324 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
31325 return regclass;
31326 }
31327
31328 return NO_REGS;
31329 }
31330
31331 /* Generally when we see PLUS here, it's the function invariant
31332 (plus soft-fp const_int), which can only be computed into general
31333 regs. */
31334 if (GET_CODE (x) == PLUS)
31335 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
31336
31337 /* QImode constants are easy to load, but non-constant QImode data
31338 must go into Q_REGS. */
31339 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
31340 {
31341 if (reg_class_subset_p (regclass, Q_REGS))
31342 return regclass;
31343 if (reg_class_subset_p (Q_REGS, regclass))
31344 return Q_REGS;
31345 return NO_REGS;
31346 }
31347
31348 return regclass;
31349 }
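/* For example: a nonzero floating-point constant headed for an SSE class
   comes back as NO_REGS above, which forces it into the constant pool and
   reloads it from memory; the x87 classes accept it only when it is one of
   the special 80387 constants (such as 0.0 or 1.0) that the FPU can
   materialize directly.  */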
31350
31351 /* Discourage putting floating-point values in SSE registers unless
31352 SSE math is being used, and likewise for the 387 registers. */
31353 static reg_class_t
31354 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
31355 {
31356 enum machine_mode mode = GET_MODE (x);
31357
31358 /* Restrict the output reload class to the register bank that we are doing
31359 math on. If we would like not to return a subset of CLASS, reject this
31360 alternative: if reload cannot do this, it will still use its choice. */
31361 mode = GET_MODE (x);
31362 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
31363 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
31364
31365 if (X87_FLOAT_MODE_P (mode))
31366 {
31367 if (regclass == FP_TOP_SSE_REGS)
31368 return FP_TOP_REG;
31369 else if (regclass == FP_SECOND_SSE_REGS)
31370 return FP_SECOND_REG;
31371 else
31372 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
31373 }
31374
31375 return regclass;
31376 }
31377
31378 static reg_class_t
31379 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
31380 enum machine_mode mode, secondary_reload_info *sri)
31381 {
31382 /* Double-word spills from general registers to non-offsettable memory
31383 references (zero-extended addresses) require special handling. */
31384 if (TARGET_64BIT
31385 && MEM_P (x)
31386 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
31387 && rclass == GENERAL_REGS
31388 && !offsettable_memref_p (x))
31389 {
31390 sri->icode = (in_p
31391 ? CODE_FOR_reload_noff_load
31392 : CODE_FOR_reload_noff_store);
31393 /* Add the cost of moving address to a temporary. */
31394 sri->extra_cost = 1;
31395
31396 return NO_REGS;
31397 }
31398
31399 /* QImode spills from non-QI registers require
31400 an intermediate register on 32-bit targets. */
31401 if (!TARGET_64BIT
31402 && !in_p && mode == QImode
31403 && (rclass == GENERAL_REGS
31404 || rclass == LEGACY_REGS
31405 || rclass == INDEX_REGS))
31406 {
31407 int regno;
31408
31409 if (REG_P (x))
31410 regno = REGNO (x);
31411 else
31412 regno = -1;
31413
31414 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
31415 regno = true_regnum (x);
31416
31417 /* Return Q_REGS if the operand is in memory. */
31418 if (regno == -1)
31419 return Q_REGS;
31420 }
31421
31422 /* This condition handles corner case where an expression involving
31423 pointers gets vectorized. We're trying to use the address of a
31424 stack slot as a vector initializer.
31425
31426 (set (reg:V2DI 74 [ vect_cst_.2 ])
31427 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
31428
31429 Eventually frame gets turned into sp+offset like this:
31430
31431 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31432 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
31433 (const_int 392 [0x188]))))
31434
31435 That later gets turned into:
31436
31437 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31438 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
31439 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
31440
31441 We'll have the following reload recorded:
31442
31443 Reload 0: reload_in (DI) =
31444 (plus:DI (reg/f:DI 7 sp)
31445 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
31446 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31447 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
31448 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
31449 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31450 reload_reg_rtx: (reg:V2DI 22 xmm1)
31451
31452 Which isn't going to work since SSE instructions can't handle scalar
31453 additions. Returning GENERAL_REGS forces the addition into integer
31454 register and reload can handle subsequent reloads without problems. */
31455
31456 if (in_p && GET_CODE (x) == PLUS
31457 && SSE_CLASS_P (rclass)
31458 && SCALAR_INT_MODE_P (mode))
31459 return GENERAL_REGS;
31460
31461 return NO_REGS;
31462 }
31463
31464 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
31465
31466 static bool
31467 ix86_class_likely_spilled_p (reg_class_t rclass)
31468 {
31469 switch (rclass)
31470 {
31471 case AREG:
31472 case DREG:
31473 case CREG:
31474 case BREG:
31475 case AD_REGS:
31476 case SIREG:
31477 case DIREG:
31478 case SSE_FIRST_REG:
31479 case FP_TOP_REG:
31480 case FP_SECOND_REG:
31481 return true;
31482
31483 default:
31484 break;
31485 }
31486
31487 return false;
31488 }
31489
31490 /* If we are copying between general and FP registers, we need a memory
31491 location. The same is true for SSE and MMX registers.
31492
31493 To optimize register_move_cost performance, allow an inline variant.
31494
31495 The macro can't work reliably when one of the CLASSES is a class containing
31496 registers from multiple units (SSE, MMX, integer). We avoid this by never
31497 combining those units in a single alternative in the machine description.
31498 Ensure that this constraint holds to avoid unexpected surprises.
31499
31500 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
31501 enforce these sanity checks. */
31502
31503 static inline bool
31504 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
31505 enum machine_mode mode, int strict)
31506 {
31507 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
31508 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
31509 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
31510 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
31511 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
31512 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
31513 {
31514 gcc_assert (!strict);
31515 return true;
31516 }
31517
31518 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
31519 return true;
31520
31521 /* ??? This is a lie. We do have moves between mmx/general and between
31522 mmx/sse2. But by saying we need secondary memory we discourage the
31523 register allocator from using the mmx registers unless needed. */
31524 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
31525 return true;
31526
31527 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31528 {
31529 /* SSE1 doesn't have any direct moves from other classes. */
31530 if (!TARGET_SSE2)
31531 return true;
31532
31533 /* If the target says that inter-unit moves are more expensive
31534 than moving through memory, then don't generate them. */
31535 if (!TARGET_INTER_UNIT_MOVES)
31536 return true;
31537
31538 /* Between SSE and general, we have moves no larger than word size. */
31539 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
31540 return true;
31541 }
31542
31543 return false;
31544 }
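/* Two consequences of the predicate above, for illustration: a DFmode copy
   between SSE_REGS and FLOAT_REGS always goes through memory, and an
   SImode copy between SSE_REGS and GENERAL_REGS avoids memory only when
   both TARGET_SSE2 and TARGET_INTER_UNIT_MOVES hold.  */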
31545
31546 bool
31547 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
31548 enum machine_mode mode, int strict)
31549 {
31550 return inline_secondary_memory_needed (class1, class2, mode, strict);
31551 }
31552
31553 /* Implement the TARGET_CLASS_MAX_NREGS hook.
31554
31555 On the 80386, this is the size of MODE in words,
31556 except in the FP regs, where a single reg is always enough. */
31557
31558 static unsigned char
31559 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
31560 {
31561 if (MAYBE_INTEGER_CLASS_P (rclass))
31562 {
31563 if (mode == XFmode)
31564 return (TARGET_64BIT ? 2 : 3);
31565 else if (mode == XCmode)
31566 return (TARGET_64BIT ? 4 : 6);
31567 else
31568 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
31569 }
31570 else
31571 {
31572 if (COMPLEX_MODE_P (mode))
31573 return 2;
31574 else
31575 return 1;
31576 }
31577 }
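/* For instance: an XFmode value in the integer classes needs three
   registers on a 32-bit target (12 bytes) but only two on a 64-bit target,
   while in the x87 or SSE classes a single register is always enough.  */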
31578
31579 /* Return true if the registers in CLASS cannot represent the change from
31580 modes FROM to TO. */
31581
31582 bool
31583 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
31584 enum reg_class regclass)
31585 {
31586 if (from == to)
31587 return false;
31588
31589 /* x87 registers can't do subreg at all, as all values are reformatted
31590 to extended precision. */
31591 if (MAYBE_FLOAT_CLASS_P (regclass))
31592 return true;
31593
31594 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
31595 {
31596 /* Vector registers do not support QI or HImode loads. If we don't
31597 disallow a change to these modes, reload will assume it's ok to
31598 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
31599 the vec_dupv4hi pattern. */
31600 if (GET_MODE_SIZE (from) < 4)
31601 return true;
31602
31603 /* Vector registers do not support subreg with nonzero offsets, which
31604 are otherwise valid for integer registers. Since we can't see
31605 whether we have a nonzero offset from here, prohibit all
31606 nonparadoxical subregs changing size. */
31607 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
31608 return true;
31609 }
31610
31611 return false;
31612 }
31613
31614 /* Return the cost of moving data of mode M between a
31615 register and memory. A value of 2 is the default; this cost is
31616 relative to those in `REGISTER_MOVE_COST'.
31617
31618 This function is used extensively by register_move_cost, which builds
31619 its tables at startup, so it is kept inline.
31620 When IN is 2, return the maximum of the load and store costs.
31621
31622 If moving between registers and memory is more expensive than
31623 between two registers, you should define this macro to express the
31624 relative cost.
31625
31626 Also model the increased cost of moving QImode values in classes
31627 other than Q_REGS.
31628 */
31629 static inline int
31630 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
31631 int in)
31632 {
31633 int cost;
31634 if (FLOAT_CLASS_P (regclass))
31635 {
31636 int index;
31637 switch (mode)
31638 {
31639 case SFmode:
31640 index = 0;
31641 break;
31642 case DFmode:
31643 index = 1;
31644 break;
31645 case XFmode:
31646 index = 2;
31647 break;
31648 default:
31649 return 100;
31650 }
31651 if (in == 2)
31652 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
31653 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
31654 }
31655 if (SSE_CLASS_P (regclass))
31656 {
31657 int index;
31658 switch (GET_MODE_SIZE (mode))
31659 {
31660 case 4:
31661 index = 0;
31662 break;
31663 case 8:
31664 index = 1;
31665 break;
31666 case 16:
31667 index = 2;
31668 break;
31669 default:
31670 return 100;
31671 }
31672 if (in == 2)
31673 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
31674 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
31675 }
31676 if (MMX_CLASS_P (regclass))
31677 {
31678 int index;
31679 switch (GET_MODE_SIZE (mode))
31680 {
31681 case 4:
31682 index = 0;
31683 break;
31684 case 8:
31685 index = 1;
31686 break;
31687 default:
31688 return 100;
31689 }
31690 if (in == 2)
31691 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
31692 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
31693 }
31694 switch (GET_MODE_SIZE (mode))
31695 {
31696 case 1:
31697 if (Q_CLASS_P (regclass) || TARGET_64BIT)
31698 {
31699 if (!in)
31700 return ix86_cost->int_store[0];
31701 if (TARGET_PARTIAL_REG_DEPENDENCY
31702 && optimize_function_for_speed_p (cfun))
31703 cost = ix86_cost->movzbl_load;
31704 else
31705 cost = ix86_cost->int_load[0];
31706 if (in == 2)
31707 return MAX (cost, ix86_cost->int_store[0]);
31708 return cost;
31709 }
31710 else
31711 {
31712 if (in == 2)
31713 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
31714 if (in)
31715 return ix86_cost->movzbl_load;
31716 else
31717 return ix86_cost->int_store[0] + 4;
31718 }
31719 break;
31720 case 2:
31721 if (in == 2)
31722 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
31723 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
31724 default:
31725 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
31726 if (mode == TFmode)
31727 mode = XFmode;
31728 if (in == 2)
31729 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
31730 else if (in)
31731 cost = ix86_cost->int_load[2];
31732 else
31733 cost = ix86_cost->int_store[2];
31734 return (cost * (((int) GET_MODE_SIZE (mode)
31735 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
31736 }
31737 }
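/* Worked example of the scaling in the default case above: on a 32-bit
   target, a DImode value in GENERAL_REGS uses the int_load[2]/int_store[2]
   entries scaled by (8 + 3) / 4 = 2, i.e. twice the cost of a single
   word-sized move.  */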
31738
31739 static int
31740 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
31741 bool in)
31742 {
31743 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
31744 }
31745
31746
31747 /* Return the cost of moving data from a register in class CLASS1 to
31748 one in class CLASS2.
31749
31750 It is not required that the cost always equal 2 when FROM is the same as TO;
31751 on some machines it is expensive to move between registers if they are not
31752 general registers. */
31753
31754 static int
31755 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
31756 reg_class_t class2_i)
31757 {
31758 enum reg_class class1 = (enum reg_class) class1_i;
31759 enum reg_class class2 = (enum reg_class) class2_i;
31760
31761 /* If secondary memory is required, compute the cost of the store followed
31762 by the load. In order to avoid bad register allocation choices, we need
31763 this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
31764
31765 if (inline_secondary_memory_needed (class1, class2, mode, 0))
31766 {
31767 int cost = 1;
31768
31769 cost += inline_memory_move_cost (mode, class1, 2);
31770 cost += inline_memory_move_cost (mode, class2, 2);
31771
31772 /* When copying from a general purpose register we may emit multiple
31773 stores followed by a single load, causing a memory size mismatch stall.
31774 Count this as an arbitrarily high cost of 20. */
31775 if (targetm.class_max_nregs (class1, mode)
31776 > targetm.class_max_nregs (class2, mode))
31777 cost += 20;
31778
31779 /* In the case of FP/MMX moves, the registers actually overlap, and we
31780 have to switch modes in order to treat them differently. */
31781 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
31782 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
31783 cost += 20;
31784
31785 return cost;
31786 }
31787
31788 /* Moves between SSE/MMX and integer unit are expensive. */
31789 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
31790 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31791
31792 /* ??? By keeping the returned value relatively high, we limit the number
31793 of moves between integer and MMX/SSE registers for all targets.
31794 Additionally, a high value prevents a problem with x86_modes_tieable_p (),
31795 where integer modes in MMX/SSE registers are not tieable
31796 because of missing QImode and HImode moves to, from or between
31797 MMX/SSE registers. */
31798 return MAX (8, ix86_cost->mmxsse_to_integer);
31799
31800 if (MAYBE_FLOAT_CLASS_P (class1))
31801 return ix86_cost->fp_move;
31802 if (MAYBE_SSE_CLASS_P (class1))
31803 return ix86_cost->sse_move;
31804 if (MAYBE_MMX_CLASS_P (class1))
31805 return ix86_cost->mmx_move;
31806 return 2;
31807 }
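/* For example, assuming TARGET_SSE2 and TARGET_INTER_UNIT_MOVES: an SImode
   move between SSE_REGS and GENERAL_REGS does not need secondary memory,
   so it falls through to the inter-unit case above and costs
   MAX (8, ix86_cost->mmxsse_to_integer).  */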
31808
31809 /* Return TRUE if hard register REGNO can hold a value of machine-mode
31810 MODE. */
31811
31812 bool
31813 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
31814 {
31815 /* Flags, and only flags, can hold CCmode values. */
31816 if (CC_REGNO_P (regno))
31817 return GET_MODE_CLASS (mode) == MODE_CC;
31818 if (GET_MODE_CLASS (mode) == MODE_CC
31819 || GET_MODE_CLASS (mode) == MODE_RANDOM
31820 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
31821 return false;
31822 if (FP_REGNO_P (regno))
31823 return VALID_FP_MODE_P (mode);
31824 if (SSE_REGNO_P (regno))
31825 {
31826 /* We implement the move patterns for all vector modes into and
31827 out of SSE registers, even when no operation instructions
31828 are available. OImode move is available only when AVX is
31829 enabled. */
31830 return ((TARGET_AVX && mode == OImode)
31831 || VALID_AVX256_REG_MODE (mode)
31832 || VALID_SSE_REG_MODE (mode)
31833 || VALID_SSE2_REG_MODE (mode)
31834 || VALID_MMX_REG_MODE (mode)
31835 || VALID_MMX_REG_MODE_3DNOW (mode));
31836 }
31837 if (MMX_REGNO_P (regno))
31838 {
31839 /* We implement the move patterns for 3DNOW modes even in MMX mode,
31840 so if the register is available at all, then we can move data of
31841 the given mode into or out of it. */
31842 return (VALID_MMX_REG_MODE (mode)
31843 || VALID_MMX_REG_MODE_3DNOW (mode));
31844 }
31845
31846 if (mode == QImode)
31847 {
31848 /* Handle QImode values with care - they can live in non-QI regs,
31849 but then they can cause partial register stalls. */
31850 if (regno <= BX_REG || TARGET_64BIT)
31851 return true;
31852 if (!TARGET_PARTIAL_REG_STALL)
31853 return true;
31854 return !can_create_pseudo_p ();
31855 }
31856 /* We handle both integer and floats in the general purpose registers. */
31857 else if (VALID_INT_MODE_P (mode))
31858 return true;
31859 else if (VALID_FP_MODE_P (mode))
31860 return true;
31861 else if (VALID_DFP_MODE_P (mode))
31862 return true;
31863 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
31864 on to use that value in smaller contexts, this can easily force a
31865 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
31866 supporting DImode, allow it. */
31867 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
31868 return true;
31869
31870 return false;
31871 }
31872
31873 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
31874 tieable integer mode. */
31875
31876 static bool
31877 ix86_tieable_integer_mode_p (enum machine_mode mode)
31878 {
31879 switch (mode)
31880 {
31881 case HImode:
31882 case SImode:
31883 return true;
31884
31885 case QImode:
31886 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
31887
31888 case DImode:
31889 return TARGET_64BIT;
31890
31891 default:
31892 return false;
31893 }
31894 }
31895
31896 /* Return true if MODE1 is accessible in a register that can hold MODE2
31897 without copying. That is, all register classes that can hold MODE2
31898 can also hold MODE1. */
31899
31900 bool
31901 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
31902 {
31903 if (mode1 == mode2)
31904 return true;
31905
31906 if (ix86_tieable_integer_mode_p (mode1)
31907 && ix86_tieable_integer_mode_p (mode2))
31908 return true;
31909
31910 /* MODE2 being XFmode implies fp stack or general regs, which means we
31911 can tie any smaller floating point modes to it. Note that we do not
31912 tie this with TFmode. */
31913 if (mode2 == XFmode)
31914 return mode1 == SFmode || mode1 == DFmode;
31915
31916 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
31917 that we can tie it with SFmode. */
31918 if (mode2 == DFmode)
31919 return mode1 == SFmode;
31920
31921 /* If MODE2 is only appropriate for an SSE register, then tie with
31922 any other mode acceptable to SSE registers. */
31923 if (GET_MODE_SIZE (mode2) == 32
31924 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
31925 return (GET_MODE_SIZE (mode1) == 32
31926 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
31927 if (GET_MODE_SIZE (mode2) == 16
31928 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
31929 return (GET_MODE_SIZE (mode1) == 16
31930 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
31931
31932 /* If MODE2 is appropriate for an MMX register, then tie
31933 with any other mode acceptable to MMX registers. */
31934 if (GET_MODE_SIZE (mode2) == 8
31935 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
31936 return (GET_MODE_SIZE (mode1) == 8
31937 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
31938
31939 return false;
31940 }
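/* Note the asymmetry in the function above: ix86_modes_tieable_p (SFmode,
   DFmode) is true, because every register class that can hold DFmode can
   also hold SFmode, but ix86_modes_tieable_p (DFmode, SFmode) is false.  */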
31941
31942 /* Return the cost of moving between two registers of mode MODE. */
31943
31944 static int
31945 ix86_set_reg_reg_cost (enum machine_mode mode)
31946 {
31947 unsigned int units = UNITS_PER_WORD;
31948
31949 switch (GET_MODE_CLASS (mode))
31950 {
31951 default:
31952 break;
31953
31954 case MODE_CC:
31955 units = GET_MODE_SIZE (CCmode);
31956 break;
31957
31958 case MODE_FLOAT:
31959 if ((TARGET_SSE2 && mode == TFmode)
31960 || (TARGET_80387 && mode == XFmode)
31961 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
31962 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
31963 units = GET_MODE_SIZE (mode);
31964 break;
31965
31966 case MODE_COMPLEX_FLOAT:
31967 if ((TARGET_SSE2 && mode == TCmode)
31968 || (TARGET_80387 && mode == XCmode)
31969 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
31970 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
31971 units = GET_MODE_SIZE (mode);
31972 break;
31973
31974 case MODE_VECTOR_INT:
31975 case MODE_VECTOR_FLOAT:
31976 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
31977 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
31978 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
31979 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
31980 units = GET_MODE_SIZE (mode);
31981 }
31982
31983 /* Return the cost of moving between two registers of mode MODE,
31984 assuming that the move will be in pieces of at most UNITS bytes. */
31985 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
31986 }
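/* Illustration of the effect of UNITS above: with SSE enabled, a V4SFmode
   register copy is COSTS_N_INSNS (1), since the whole 16 bytes move in one
   piece; without SSE, UNITS stays word-sized and the computed cost is
   COSTS_N_INSNS (4) on a 32-bit target.  */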
31987
31988 /* Compute a (partial) cost for rtx X. Return true if the complete
31989 cost has been computed, and false if subexpressions should be
31990 scanned. In either case, *TOTAL contains the cost result. */
31991
31992 static bool
31993 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
31994 bool speed)
31995 {
31996 enum rtx_code code = (enum rtx_code) code_i;
31997 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
31998 enum machine_mode mode = GET_MODE (x);
31999 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
32000
32001 switch (code)
32002 {
32003 case SET:
32004 if (register_operand (SET_DEST (x), VOIDmode)
32005 && reg_or_0_operand (SET_SRC (x), VOIDmode))
32006 {
32007 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
32008 return true;
32009 }
32010 return false;
32011
32012 case CONST_INT:
32013 case CONST:
32014 case LABEL_REF:
32015 case SYMBOL_REF:
32016 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
32017 *total = 3;
32018 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
32019 *total = 2;
32020 else if (flag_pic && SYMBOLIC_CONST (x)
32021 && (!TARGET_64BIT
32022 || (GET_CODE (x) != LABEL_REF
32023 && (GET_CODE (x) != SYMBOL_REF
32024 || !SYMBOL_REF_LOCAL_P (x)))))
32025 *total = 1;
32026 else
32027 *total = 0;
32028 return true;
32029
32030 case CONST_DOUBLE:
32031 if (mode == VOIDmode)
32032 *total = 0;
32033 else
32034 switch (standard_80387_constant_p (x))
32035 {
32036 case 1: /* 0.0 */
32037 *total = 1;
32038 break;
32039 default: /* Other constants */
32040 *total = 2;
32041 break;
32042 case 0:
32043 case -1:
32044 break;
32045 }
32046 /* FALLTHRU */
32047
32048 case CONST_VECTOR:
32049 /* Start with (MEM (SYMBOL_REF)), since that's where
32050 it'll probably end up. Add a penalty for size. */
32051 *total = (COSTS_N_INSNS (1)
32052 + (flag_pic != 0 && !TARGET_64BIT)
32053 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
32054 return true;
32055
32056 case ZERO_EXTEND:
32057 /* Zero extension is often completely free on x86_64, so make
32058 it as cheap as possible. */
32059 if (TARGET_64BIT && mode == DImode
32060 && GET_MODE (XEXP (x, 0)) == SImode)
32061 *total = 1;
32062 else if (TARGET_ZERO_EXTEND_WITH_AND)
32063 *total = cost->add;
32064 else
32065 *total = cost->movzx;
32066 return false;
32067
32068 case SIGN_EXTEND:
32069 *total = cost->movsx;
32070 return false;
32071
32072 case ASHIFT:
32073 if (SCALAR_INT_MODE_P (mode)
32074 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
32075 && CONST_INT_P (XEXP (x, 1)))
32076 {
32077 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
32078 if (value == 1)
32079 {
32080 *total = cost->add;
32081 return false;
32082 }
32083 if ((value == 2 || value == 3)
32084 && cost->lea <= cost->shift_const)
32085 {
32086 *total = cost->lea;
32087 return false;
32088 }
32089 }
32090 /* FALLTHRU */
32091
32092 case ROTATE:
32093 case ASHIFTRT:
32094 case LSHIFTRT:
32095 case ROTATERT:
32096 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
32097 {
32098 /* ??? Should be SSE vector operation cost. */
32099 /* At least for published AMD latencies, this really is the same
32100 as the latency for a simple fpu operation like fabs. */
32101 /* V*QImode is emulated with 1-11 insns. */
32102 if (mode == V16QImode || mode == V32QImode)
32103 {
32104 int count;
32105 if (TARGET_XOP && mode == V16QImode)
32106 {
32107 /* For XOP we use vpshab, which requires a broadcast of the
32108 value to the variable shift insn. For constants this
32109 means a V16QImode constant in memory; even when we can perform the
32110 shift with one insn, set the cost to prefer paddb. */
32111 if (CONSTANT_P (XEXP (x, 1)))
32112 {
32113 *total = (cost->fabs
32114 + rtx_cost (XEXP (x, 0), code, 0, speed)
32115 + (speed ? 2 : COSTS_N_BYTES (16)));
32116 return true;
32117 }
32118 count = 3;
32119 }
32120 else
32121 count = TARGET_SSSE3 ? 7 : 11;
32122 *total = cost->fabs * count;
32123 }
32124 else
32125 *total = cost->fabs;
32126 return false;
32127 }
32128 if (GET_MODE_SIZE (mode) < UNITS_PER_WORD)
32129 {
32130 if (CONST_INT_P (XEXP (x, 1)))
32131 {
32132 if (INTVAL (XEXP (x, 1)) > 32)
32133 *total = cost->shift_const + COSTS_N_INSNS (2);
32134 else
32135 *total = cost->shift_const * 2;
32136 }
32137 else
32138 {
32139 if (GET_CODE (XEXP (x, 1)) == AND)
32140 *total = cost->shift_var * 2;
32141 else
32142 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
32143 }
32144 }
32145 else
32146 {
32147 if (CONST_INT_P (XEXP (x, 1)))
32148 *total = cost->shift_const;
32149 else
32150 *total = cost->shift_var;
32151 }
32152 return false;
32153
32154 case FMA:
32155 {
32156 rtx sub;
32157
32158 gcc_assert (FLOAT_MODE_P (mode));
32159 gcc_assert (TARGET_FMA || TARGET_FMA4);
32160
32161 /* ??? SSE scalar/vector cost should be used here. */
32162 /* ??? Bald assumption that fma has the same cost as fmul. */
32163 *total = cost->fmul;
32164 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
32165
32166 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
32167 sub = XEXP (x, 0);
32168 if (GET_CODE (sub) == NEG)
32169 sub = XEXP (sub, 0);
32170 *total += rtx_cost (sub, FMA, 0, speed);
32171
32172 sub = XEXP (x, 2);
32173 if (GET_CODE (sub) == NEG)
32174 sub = XEXP (sub, 0);
32175 *total += rtx_cost (sub, FMA, 2, speed);
32176 return true;
32177 }
32178
32179 case MULT:
32180 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32181 {
32182 /* ??? SSE scalar cost should be used here. */
32183 *total = cost->fmul;
32184 return false;
32185 }
32186 else if (X87_FLOAT_MODE_P (mode))
32187 {
32188 *total = cost->fmul;
32189 return false;
32190 }
32191 else if (FLOAT_MODE_P (mode))
32192 {
32193 /* ??? SSE vector cost should be used here. */
32194 *total = cost->fmul;
32195 return false;
32196 }
32197 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
32198 {
32199 /* V*QImode is emulated with 7-13 insns. */
32200 if (mode == V16QImode || mode == V32QImode)
32201 {
32202 int extra = TARGET_XOP ? 5 : TARGET_SSSE3 ? 6 : 11;
32203 *total = cost->fmul * 2 + cost->fabs * extra;
32204 }
32205 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
32206 insns, including two PMULUDQ. */
32207 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
32208 *total = cost->fmul * 2 + cost->fabs * 5;
32209 else
32210 *total = cost->fmul;
32211 return false;
32212 }
32213 else
32214 {
32215 rtx op0 = XEXP (x, 0);
32216 rtx op1 = XEXP (x, 1);
32217 int nbits;
32218 if (CONST_INT_P (XEXP (x, 1)))
32219 {
32220 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
32221 for (nbits = 0; value != 0; value &= value - 1)
32222 nbits++;
32223 }
32224 else
32225 /* This is arbitrary. */
32226 nbits = 7;
32227
32228 /* Compute costs correctly for widening multiplication. */
32229 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
32230 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
32231 == GET_MODE_SIZE (mode))
32232 {
32233 int is_mulwiden = 0;
32234 enum machine_mode inner_mode = GET_MODE (op0);
32235
32236 if (GET_CODE (op0) == GET_CODE (op1))
32237 is_mulwiden = 1, op1 = XEXP (op1, 0);
32238 else if (CONST_INT_P (op1))
32239 {
32240 if (GET_CODE (op0) == SIGN_EXTEND)
32241 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
32242 == INTVAL (op1);
32243 else
32244 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
32245 }
32246
32247 if (is_mulwiden)
32248 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
32249 }
32250
32251 *total = (cost->mult_init[MODE_INDEX (mode)]
32252 + nbits * cost->mult_bit
32253 + rtx_cost (op0, outer_code, opno, speed)
32254 + rtx_cost (op1, outer_code, opno, speed));
32255
32256 return true;
32257 }
32258
32259 case DIV:
32260 case UDIV:
32261 case MOD:
32262 case UMOD:
32263 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32264 /* ??? SSE cost should be used here. */
32265 *total = cost->fdiv;
32266 else if (X87_FLOAT_MODE_P (mode))
32267 *total = cost->fdiv;
32268 else if (FLOAT_MODE_P (mode))
32269 /* ??? SSE vector cost should be used here. */
32270 *total = cost->fdiv;
32271 else
32272 *total = cost->divide[MODE_INDEX (mode)];
32273 return false;
32274
32275 case PLUS:
32276 if (GET_MODE_CLASS (mode) == MODE_INT
32277 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
32278 {
32279 if (GET_CODE (XEXP (x, 0)) == PLUS
32280 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
32281 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
32282 && CONSTANT_P (XEXP (x, 1)))
32283 {
32284 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
32285 if (val == 2 || val == 4 || val == 8)
32286 {
32287 *total = cost->lea;
32288 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
32289 outer_code, opno, speed);
32290 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
32291 outer_code, opno, speed);
32292 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
32293 return true;
32294 }
32295 }
32296 else if (GET_CODE (XEXP (x, 0)) == MULT
32297 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
32298 {
32299 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
32300 if (val == 2 || val == 4 || val == 8)
32301 {
32302 *total = cost->lea;
32303 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
32304 outer_code, opno, speed);
32305 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
32306 return true;
32307 }
32308 }
32309 else if (GET_CODE (XEXP (x, 0)) == PLUS)
32310 {
32311 *total = cost->lea;
32312 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
32313 outer_code, opno, speed);
32314 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
32315 outer_code, opno, speed);
32316 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
32317 return true;
32318 }
32319 }
32320 /* FALLTHRU */
32321
32322 case MINUS:
32323 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32324 {
32325 /* ??? SSE cost should be used here. */
32326 *total = cost->fadd;
32327 return false;
32328 }
32329 else if (X87_FLOAT_MODE_P (mode))
32330 {
32331 *total = cost->fadd;
32332 return false;
32333 }
32334 else if (FLOAT_MODE_P (mode))
32335 {
32336 /* ??? SSE vector cost should be used here. */
32337 *total = cost->fadd;
32338 return false;
32339 }
32340 /* FALLTHRU */
32341
32342 case AND:
32343 case IOR:
32344 case XOR:
32345 if (!TARGET_64BIT && mode == DImode)
32346 {
32347 *total = (cost->add * 2
32348 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
32349 << (GET_MODE (XEXP (x, 0)) != DImode))
32350 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
32351 << (GET_MODE (XEXP (x, 1)) != DImode)));
32352 return true;
32353 }
32354 /* FALLTHRU */
32355
32356 case NEG:
32357 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32358 {
32359 /* ??? SSE cost should be used here. */
32360 *total = cost->fchs;
32361 return false;
32362 }
32363 else if (X87_FLOAT_MODE_P (mode))
32364 {
32365 *total = cost->fchs;
32366 return false;
32367 }
32368 else if (FLOAT_MODE_P (mode))
32369 {
32370 /* ??? SSE vector cost should be used here. */
32371 *total = cost->fchs;
32372 return false;
32373 }
32374 /* FALLTHRU */
32375
32376 case NOT:
32377 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
32378 {
32379 /* ??? Should be SSE vector operation cost. */
32380 /* At least for published AMD latencies, this really is the same
32381 as the latency for a simple fpu operation like fabs. */
32382 *total = cost->fabs;
32383 return false;
32384 }
32385 if (!TARGET_64BIT && mode == DImode)
32386 *total = cost->add * 2;
32387 else
32388 *total = cost->add;
32389 return false;
32390
32391 case COMPARE:
32392 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
32393 && XEXP (XEXP (x, 0), 1) == const1_rtx
32394 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
32395 && XEXP (x, 1) == const0_rtx)
32396 {
32397 /* This kind of construct is implemented using test[bwl].
32398 Treat it as if we had an AND. */
32399 *total = (cost->add
32400 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
32401 + rtx_cost (const1_rtx, outer_code, opno, speed));
32402 return true;
32403 }
32404 return false;
32405
32406 case FLOAT_EXTEND:
32407 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
32408 *total = 0;
32409 return false;
32410
32411 case ABS:
32412 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32413 /* ??? SSE cost should be used here. */
32414 *total = cost->fabs;
32415 else if (X87_FLOAT_MODE_P (mode))
32416 *total = cost->fabs;
32417 else if (FLOAT_MODE_P (mode))
32418 /* ??? SSE vector cost should be used here. */
32419 *total = cost->fabs;
32420 return false;
32421
32422 case SQRT:
32423 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32424 /* ??? SSE cost should be used here. */
32425 *total = cost->fsqrt;
32426 else if (X87_FLOAT_MODE_P (mode))
32427 *total = cost->fsqrt;
32428 else if (FLOAT_MODE_P (mode))
32429 /* ??? SSE vector cost should be used here. */
32430 *total = cost->fsqrt;
32431 return false;
32432
32433 case UNSPEC:
32434 if (XINT (x, 1) == UNSPEC_TP)
32435 *total = 0;
32436 return false;
32437
32438 case VEC_SELECT:
32439 case VEC_CONCAT:
32440 case VEC_MERGE:
32441 case VEC_DUPLICATE:
32442 /* ??? Assume all of these vector manipulation patterns are
32443 recognizable, in which case they all pretty much have the
32444 same cost. */
32445 *total = cost->fabs;
32446 return true;
32447
32448 default:
32449 return false;
32450 }
32451 }
32452
32453 #if TARGET_MACHO
32454
32455 static int current_machopic_label_num;
32456
32457 /* Given a symbol name and its associated stub, write out the
32458 definition of the stub. */
32459
32460 void
32461 machopic_output_stub (FILE *file, const char *symb, const char *stub)
32462 {
32463 unsigned int length;
32464 char *binder_name, *symbol_name, lazy_ptr_name[32];
32465 int label = ++current_machopic_label_num;
32466
32467 /* For 64-bit we shouldn't get here. */
32468 gcc_assert (!TARGET_64BIT);
32469
32470 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
32471 symb = targetm.strip_name_encoding (symb);
32472
32473 length = strlen (stub);
32474 binder_name = XALLOCAVEC (char, length + 32);
32475 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
32476
32477 length = strlen (symb);
32478 symbol_name = XALLOCAVEC (char, length + 32);
32479 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
32480
32481 sprintf (lazy_ptr_name, "L%d$lz", label);
32482
32483 if (MACHOPIC_ATT_STUB)
32484 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
32485 else if (MACHOPIC_PURE)
32486 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
32487 else
32488 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
32489
32490 fprintf (file, "%s:\n", stub);
32491 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
32492
32493 if (MACHOPIC_ATT_STUB)
32494 {
32495 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
32496 }
32497 else if (MACHOPIC_PURE)
32498 {
32499 /* PIC stub. */
32500 /* 25-byte PIC stub using "CALL get_pc_thunk". */
32501 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
32502 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
32503 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
32504 label, lazy_ptr_name, label);
32505 fprintf (file, "\tjmp\t*%%ecx\n");
32506 }
32507 else
32508 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
32509
32510 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
32511 it needs no stub-binding-helper. */
32512 if (MACHOPIC_ATT_STUB)
32513 return;
32514
32515 fprintf (file, "%s:\n", binder_name);
32516
32517 if (MACHOPIC_PURE)
32518 {
32519 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
32520 fprintf (file, "\tpushl\t%%ecx\n");
32521 }
32522 else
32523 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
32524
32525 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
32526
32527 /* N.B. Keep the correspondence of these
32528 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
32529 old-pic/new-pic/non-pic stubs; altering this will break
32530 compatibility with existing dylibs. */
32531 if (MACHOPIC_PURE)
32532 {
32533 /* 25-byte PIC stub using "CALL get_pc_thunk". */
32534 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
32535 }
32536 else
32537 /* 16-byte -mdynamic-no-pic stub. */
32538 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
32539
32540 fprintf (file, "%s:\n", lazy_ptr_name);
32541 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
32542 fprintf (file, ASM_LONG "%s\n", binder_name);
32543 }
32544 #endif /* TARGET_MACHO */
32545
32546 /* Order the registers for register allocator. */
32547
32548 void
32549 x86_order_regs_for_local_alloc (void)
32550 {
32551 int pos = 0;
32552 int i;
32553
32554 /* First allocate the call-used (caller-saved) general purpose registers. */
32555 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
32556 if (GENERAL_REGNO_P (i) && call_used_regs[i])
32557 reg_alloc_order [pos++] = i;
32558
32559 /* Then the call-saved general purpose registers. */
32560 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
32561 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
32562 reg_alloc_order [pos++] = i;
32563
32564 /* x87 registers come first in case we are doing FP math
32565 using them. */
32566 if (!TARGET_SSE_MATH)
32567 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
32568 reg_alloc_order [pos++] = i;
32569
32570 /* SSE registers. */
32571 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
32572 reg_alloc_order [pos++] = i;
32573 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
32574 reg_alloc_order [pos++] = i;
32575
32576 /* x87 registers. */
32577 if (TARGET_SSE_MATH)
32578 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
32579 reg_alloc_order [pos++] = i;
32580
32581 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
32582 reg_alloc_order [pos++] = i;
32583
32584 /* Initialize the rest of the array, as we do not allocate some registers
32585 at all. */
32586 while (pos < FIRST_PSEUDO_REGISTER)
32587 reg_alloc_order [pos++] = 0;
32588 }
32589
32590 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
32591 in struct attribute_spec.handler. */
32592 static tree
32593 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
32594 tree args,
32595 int flags ATTRIBUTE_UNUSED,
32596 bool *no_add_attrs)
32597 {
32598 if (TREE_CODE (*node) != FUNCTION_TYPE
32599 && TREE_CODE (*node) != METHOD_TYPE
32600 && TREE_CODE (*node) != FIELD_DECL
32601 && TREE_CODE (*node) != TYPE_DECL)
32602 {
32603 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32604 name);
32605 *no_add_attrs = true;
32606 return NULL_TREE;
32607 }
32608 if (TARGET_64BIT)
32609 {
32610 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
32611 name);
32612 *no_add_attrs = true;
32613 return NULL_TREE;
32614 }
32615 if (is_attribute_p ("callee_pop_aggregate_return", name))
32616 {
32617 tree cst;
32618
32619 cst = TREE_VALUE (args);
32620 if (TREE_CODE (cst) != INTEGER_CST)
32621 {
32622 warning (OPT_Wattributes,
32623 "%qE attribute requires an integer constant argument",
32624 name);
32625 *no_add_attrs = true;
32626 }
32627 else if (compare_tree_int (cst, 0) != 0
32628 && compare_tree_int (cst, 1) != 0)
32629 {
32630 warning (OPT_Wattributes,
32631 "argument to %qE attribute is neither zero, nor one",
32632 name);
32633 *no_add_attrs = true;
32634 }
32635
32636 return NULL_TREE;
32637 }
32638
32639 return NULL_TREE;
32640 }
32641
32642 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
32643 struct attribute_spec.handler. */
32644 static tree
32645 ix86_handle_abi_attribute (tree *node, tree name,
32646 tree args ATTRIBUTE_UNUSED,
32647 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32648 {
32649 if (TREE_CODE (*node) != FUNCTION_TYPE
32650 && TREE_CODE (*node) != METHOD_TYPE
32651 && TREE_CODE (*node) != FIELD_DECL
32652 && TREE_CODE (*node) != TYPE_DECL)
32653 {
32654 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32655 name);
32656 *no_add_attrs = true;
32657 return NULL_TREE;
32658 }
32659
32660 /* The ms_abi and sysv_abi attributes are mutually exclusive. */
32661 if (is_attribute_p ("ms_abi", name))
32662 {
32663 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
32664 {
32665 error ("ms_abi and sysv_abi attributes are not compatible");
32666 }
32667
32668 return NULL_TREE;
32669 }
32670 else if (is_attribute_p ("sysv_abi", name))
32671 {
32672 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
32673 {
32674 error ("ms_abi and sysv_abi attributes are not compatible");
32675 }
32676
32677 return NULL_TREE;
32678 }
32679
32680 return NULL_TREE;
32681 }
32682
32683 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
32684 struct attribute_spec.handler. */
32685 static tree
32686 ix86_handle_struct_attribute (tree *node, tree name,
32687 tree args ATTRIBUTE_UNUSED,
32688 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32689 {
32690 tree *type = NULL;
32691 if (DECL_P (*node))
32692 {
32693 if (TREE_CODE (*node) == TYPE_DECL)
32694 type = &TREE_TYPE (*node);
32695 }
32696 else
32697 type = node;
32698
32699 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
32700 {
32701 warning (OPT_Wattributes, "%qE attribute ignored",
32702 name);
32703 *no_add_attrs = true;
32704 }
32705
32706 else if ((is_attribute_p ("ms_struct", name)
32707 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
32708 || ((is_attribute_p ("gcc_struct", name)
32709 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
32710 {
32711 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
32712 name);
32713 *no_add_attrs = true;
32714 }
32715
32716 return NULL_TREE;
32717 }
32718
32719 static tree
32720 ix86_handle_fndecl_attribute (tree *node, tree name,
32721 tree args ATTRIBUTE_UNUSED,
32722 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32723 {
32724 if (TREE_CODE (*node) != FUNCTION_DECL)
32725 {
32726 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32727 name);
32728 *no_add_attrs = true;
32729 }
32730 return NULL_TREE;
32731 }
32732
32733 static bool
32734 ix86_ms_bitfield_layout_p (const_tree record_type)
32735 {
32736 return ((TARGET_MS_BITFIELD_LAYOUT
32737 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
32738 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
32739 }
32740
32741 /* Returns an expression indicating where the this parameter is
32742 located on entry to the FUNCTION. */
32743
32744 static rtx
32745 x86_this_parameter (tree function)
32746 {
32747 tree type = TREE_TYPE (function);
32748 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
32749 int nregs;
32750
32751 if (TARGET_64BIT)
32752 {
32753 const int *parm_regs;
32754
32755 if (ix86_function_type_abi (type) == MS_ABI)
32756 parm_regs = x86_64_ms_abi_int_parameter_registers;
32757 else
32758 parm_regs = x86_64_int_parameter_registers;
32759 return gen_rtx_REG (Pmode, parm_regs[aggr]);
32760 }
32761
32762 nregs = ix86_function_regparm (type, function);
32763
32764 if (nregs > 0 && !stdarg_p (type))
32765 {
32766 int regno;
32767 unsigned int ccvt = ix86_get_callcvt (type);
32768
32769 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
32770 regno = aggr ? DX_REG : CX_REG;
32771 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
32772 {
32773 regno = CX_REG;
32774 if (aggr)
32775 return gen_rtx_MEM (SImode,
32776 plus_constant (Pmode, stack_pointer_rtx, 4));
32777 }
32778 else
32779 {
32780 regno = AX_REG;
32781 if (aggr)
32782 {
32783 regno = DX_REG;
32784 if (nregs == 1)
32785 return gen_rtx_MEM (SImode,
32786 plus_constant (Pmode,
32787 stack_pointer_rtx, 4));
32788 }
32789 }
32790 return gen_rtx_REG (SImode, regno);
32791 }
32792
32793 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
32794 aggr ? 8 : 4));
32795 }
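/* Examples for the 32-bit cases above: a fastcall method finds `this' in
   %ecx, or in %edx when the hidden aggregate-return pointer takes %ecx; a
   thiscall method uses %ecx, falling back to the stack slot at 4(%esp) for
   the aggregate-return case; without register parameters, `this' is at
   4(%esp), or at 8(%esp) past a hidden return pointer.  */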
32796
32797 /* Determine whether x86_output_mi_thunk can succeed. */
32798
32799 static bool
32800 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
32801 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
32802 HOST_WIDE_INT vcall_offset, const_tree function)
32803 {
32804 /* 64-bit can handle anything. */
32805 if (TARGET_64BIT)
32806 return true;
32807
32808 /* For 32-bit, everything's fine if we have one free register. */
32809 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
32810 return true;
32811
32812 /* Need a free register for vcall_offset. */
32813 if (vcall_offset)
32814 return false;
32815
32816 /* Need a free register for GOT references. */
32817 if (flag_pic && !targetm.binds_local_p (function))
32818 return false;
32819
32820 /* Otherwise ok. */
32821 return true;
32822 }
32823
32824 /* Output the assembler code for a thunk function. THUNK_DECL is the
32825 declaration for the thunk function itself, FUNCTION is the decl for
32826 the target function. DELTA is an immediate constant offset to be
32827 added to THIS. If VCALL_OFFSET is nonzero, the word at
32828 *(*this + vcall_offset) should be added to THIS. */
32829
32830 static void
32831 x86_output_mi_thunk (FILE *file,
32832 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
32833 HOST_WIDE_INT vcall_offset, tree function)
32834 {
32835 rtx this_param = x86_this_parameter (function);
32836 rtx this_reg, tmp, fnaddr;
32837
32838 emit_note (NOTE_INSN_PROLOGUE_END);
32839
32840 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
32841 pull it in now and let DELTA benefit. */
32842 if (REG_P (this_param))
32843 this_reg = this_param;
32844 else if (vcall_offset)
32845 {
32846 /* Put the this parameter into %eax. */
32847 this_reg = gen_rtx_REG (Pmode, AX_REG);
32848 emit_move_insn (this_reg, this_param);
32849 }
32850 else
32851 this_reg = NULL_RTX;
32852
32853 /* Adjust the this parameter by a fixed constant. */
32854 if (delta)
32855 {
32856 rtx delta_rtx = GEN_INT (delta);
32857 rtx delta_dst = this_reg ? this_reg : this_param;
32858
32859 if (TARGET_64BIT)
32860 {
32861 if (!x86_64_general_operand (delta_rtx, Pmode))
32862 {
32863 tmp = gen_rtx_REG (Pmode, R10_REG);
32864 emit_move_insn (tmp, delta_rtx);
32865 delta_rtx = tmp;
32866 }
32867 }
32868
32869 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
32870 }
32871
32872 /* Adjust the this parameter by a value stored in the vtable. */
32873 if (vcall_offset)
32874 {
32875 rtx vcall_addr, vcall_mem, this_mem;
32876 unsigned int tmp_regno;
32877
32878 if (TARGET_64BIT)
32879 tmp_regno = R10_REG;
32880 else
32881 {
32882 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
32883 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
32884 tmp_regno = AX_REG;
32885 else
32886 tmp_regno = CX_REG;
32887 }
32888 tmp = gen_rtx_REG (Pmode, tmp_regno);
32889
32890 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
32891 if (Pmode != ptr_mode)
32892 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
32893 emit_move_insn (tmp, this_mem);
32894
32895 /* Adjust the this parameter. */
32896 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
32897 if (TARGET_64BIT
32898 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
32899 {
32900 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
32901 emit_move_insn (tmp2, GEN_INT (vcall_offset));
32902 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
32903 }
32904
32905 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
32906 if (Pmode != ptr_mode)
32907 emit_insn (gen_addsi_1_zext (this_reg,
32908 gen_rtx_REG (ptr_mode,
32909 REGNO (this_reg)),
32910 vcall_mem));
32911 else
32912 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
32913 }
32914
32915 /* If necessary, drop THIS back to its stack slot. */
32916 if (this_reg && this_reg != this_param)
32917 emit_move_insn (this_param, this_reg);
32918
32919 fnaddr = XEXP (DECL_RTL (function), 0);
32920 if (TARGET_64BIT)
32921 {
32922 if (!flag_pic || targetm.binds_local_p (function)
32923 || cfun->machine->call_abi == MS_ABI)
32924 ;
32925 else
32926 {
32927 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
32928 tmp = gen_rtx_CONST (Pmode, tmp);
32929 fnaddr = gen_rtx_MEM (Pmode, tmp);
32930 }
32931 }
32932 else
32933 {
32934 if (!flag_pic || targetm.binds_local_p (function))
32935 ;
32936 #if TARGET_MACHO
32937 else if (TARGET_MACHO)
32938 {
32939 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
32940 fnaddr = XEXP (fnaddr, 0);
32941 }
32942 #endif /* TARGET_MACHO */
32943 else
32944 {
32945 tmp = gen_rtx_REG (Pmode, CX_REG);
32946 output_set_got (tmp, NULL_RTX);
32947
32948 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
32949 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
32950 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
32951 }
32952 }
32953
32954 /* Our sibling call patterns do not allow memories, because we have no
32955 predicate that can distinguish between frame and non-frame memory.
32956 For our purposes here, we can get away with (ab)using a jump pattern,
32957 because we're going to do no optimization. */
32958 if (MEM_P (fnaddr))
32959 emit_jump_insn (gen_indirect_jump (fnaddr));
32960 else
32961 {
32962 tmp = gen_rtx_MEM (QImode, fnaddr);
32963 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
32964 tmp = emit_call_insn (tmp);
32965 SIBLING_CALL_P (tmp) = 1;
32966 }
32967 emit_barrier ();
32968
32969 /* Emit just enough of rest_of_compilation to get the insns emitted.
32970 Note that use_thunk calls assemble_start_function et al. */
32971 tmp = get_insns ();
32972 insn_locators_alloc ();
32973 shorten_branches (tmp);
32974 final_start_function (tmp, file, 1);
32975 final (tmp, file, 1);
32976 final_end_function ();
32977 }
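/* Purely illustrative sketch (not taken from the original sources): for a
   32-bit cdecl target with `this' passed on the stack, a small nonzero
   DELTA and a zero VCALL_OFFSET, the thunk emitted by the routine above
   typically reduces to something like

	addl	$DELTA, 4(%esp)		adjust the incoming `this'
	jmp	target_function		tail call the real method

   The exact sequence depends on the ABI, on PIC mode and on whether
   `this' arrives in a register.  */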
32978
32979 static void
32980 x86_file_start (void)
32981 {
32982 default_file_start ();
32983 #if TARGET_MACHO
32984 darwin_file_start ();
32985 #endif
32986 if (X86_FILE_START_VERSION_DIRECTIVE)
32987 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
32988 if (X86_FILE_START_FLTUSED)
32989 fputs ("\t.global\t__fltused\n", asm_out_file);
32990 if (ix86_asm_dialect == ASM_INTEL)
32991 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
32992 }
32993
32994 int
32995 x86_field_alignment (tree field, int computed)
32996 {
32997 enum machine_mode mode;
32998 tree type = TREE_TYPE (field);
32999
33000 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
33001 return computed;
33002 mode = TYPE_MODE (strip_array_types (type));
33003 if (mode == DFmode || mode == DCmode
33004 || GET_MODE_CLASS (mode) == MODE_INT
33005 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
33006 return MIN (32, computed);
33007 return computed;
33008 }
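/* Illustrative example for the hook above: on ia32 without
   -malign-double, a `double' (DFmode) structure field whose computed
   alignment would otherwise be 64 bits is capped to MIN (32, 64) = 32
   bits, matching the traditional ia32 System V struct layout; with
   TARGET_64BIT or -malign-double the computed alignment is returned
   unchanged.  */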
33009
33010 /* Output assembler code to FILE to increment profiler label # LABELNO
33011 for profiling a function entry. */
33012 void
33013 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
33014 {
33015 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
33016 : MCOUNT_NAME);
33017
33018 if (TARGET_64BIT)
33019 {
33020 #ifndef NO_PROFILE_COUNTERS
33021 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
33022 #endif
33023
33024 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
33025 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
33026 else
33027 fprintf (file, "\tcall\t%s\n", mcount_name);
33028 }
33029 else if (flag_pic)
33030 {
33031 #ifndef NO_PROFILE_COUNTERS
33032 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
33033 LPREFIX, labelno);
33034 #endif
33035 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
33036 }
33037 else
33038 {
33039 #ifndef NO_PROFILE_COUNTERS
33040 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
33041 LPREFIX, labelno);
33042 #endif
33043 fprintf (file, "\tcall\t%s\n", mcount_name);
33044 }
33045 }
33046
33047 /* We don't have exact information about the insn sizes, but we may assume
33048 quite safely that we are informed about all 1 byte insns and memory
33049 address sizes. This is enough to eliminate unnecessary padding in
33050 99% of cases. */
33051
33052 static int
33053 min_insn_size (rtx insn)
33054 {
33055 int l = 0, len;
33056
33057 if (!INSN_P (insn) || !active_insn_p (insn))
33058 return 0;
33059
33060 /* Discard alignments we've emitted and jump instructions. */
33061 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
33062 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
33063 return 0;
33064 if (JUMP_TABLE_DATA_P (insn))
33065 return 0;
33066
33067 /* Important case - calls are always 5 bytes.
33068 It is common to have many calls in a row. */
33069 if (CALL_P (insn)
33070 && symbolic_reference_mentioned_p (PATTERN (insn))
33071 && !SIBLING_CALL_P (insn))
33072 return 5;
33073 len = get_attr_length (insn);
33074 if (len <= 1)
33075 return 1;
33076
33077 /* For normal instructions we rely on get_attr_length being exact,
33078 with a few exceptions. */
33079 if (!JUMP_P (insn))
33080 {
33081 enum attr_type type = get_attr_type (insn);
33082
33083 switch (type)
33084 {
33085 case TYPE_MULTI:
33086 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
33087 || asm_noperands (PATTERN (insn)) >= 0)
33088 return 0;
33089 break;
33090 case TYPE_OTHER:
33091 case TYPE_FCMP:
33092 break;
33093 default:
33094 /* Otherwise trust get_attr_length. */
33095 return len;
33096 }
33097
33098 l = get_attr_length_address (insn);
33099 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
33100 l = 4;
33101 }
33102 if (l)
33103 return 1+l;
33104 else
33105 return 2;
33106 }
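/* Rough examples of the estimate above (illustrative only): a direct
   `call foo' counts as 5 bytes; a non-jump insn that falls through to
   the address-length path and mentions a symbol counts as at least
   1 + 4 = 5 bytes; an insn with no known address length counts as 2
   bytes.  The result only has to be a safe minimum.  */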
33107
33108 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
33109
33110 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in a 16 byte
33111 window. */
33112
33113 static void
33114 ix86_avoid_jump_mispredicts (void)
33115 {
33116 rtx insn, start = get_insns ();
33117 int nbytes = 0, njumps = 0;
33118 int isjump = 0;
33119
33120 /* Look for all minimal intervals of instructions containing 4 jumps.
33121 The intervals are bounded by START and INSN. NBYTES is the total
33122 size of instructions in the interval including INSN and not including
33123 START. When NBYTES is smaller than 16, it is possible that the
33124 ends of START and INSN fall in the same 16 byte page.
33125 
33126 The smallest in-page offset at which INSN can start corresponds to START
33127 ending at offset 0; INSN then starts at offset NBYTES - sizeof (INSN).
33128 We add a p2align to the 16 byte window with max skip 15 - NBYTES + sizeof (INSN).
33129 */
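/* Worked example (illustrative): if NBYTES is 14 and INSN itself is
   2 bytes long, the worst case has START ending at offset 0, so INSN
   starts at offset 14 - 2 = 12 and still lies entirely inside that
   16 byte window; a p2align with max skip 15 - 14 + 2 = 3 emitted in
   front of INSN is enough to push it past the window boundary when
   needed.  */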
33130 for (insn = start; insn; insn = NEXT_INSN (insn))
33131 {
33132 int min_size;
33133
33134 if (LABEL_P (insn))
33135 {
33136 int align = label_to_alignment (insn);
33137 int max_skip = label_to_max_skip (insn);
33138
33139 if (max_skip > 15)
33140 max_skip = 15;
33141 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
33142 already in the current 16 byte page, because otherwise
33143 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
33144 bytes to reach 16 byte boundary. */
33145 if (align <= 0
33146 || (align <= 3 && max_skip != (1 << align) - 1))
33147 max_skip = 0;
33148 if (dump_file)
33149 fprintf (dump_file, "Label %i with max_skip %i\n",
33150 INSN_UID (insn), max_skip);
33151 if (max_skip)
33152 {
33153 while (nbytes + max_skip >= 16)
33154 {
33155 start = NEXT_INSN (start);
33156 if ((JUMP_P (start)
33157 && GET_CODE (PATTERN (start)) != ADDR_VEC
33158 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
33159 || CALL_P (start))
33160 njumps--, isjump = 1;
33161 else
33162 isjump = 0;
33163 nbytes -= min_insn_size (start);
33164 }
33165 }
33166 continue;
33167 }
33168
33169 min_size = min_insn_size (insn);
33170 nbytes += min_size;
33171 if (dump_file)
33172 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
33173 INSN_UID (insn), min_size);
33174 if ((JUMP_P (insn)
33175 && GET_CODE (PATTERN (insn)) != ADDR_VEC
33176 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
33177 || CALL_P (insn))
33178 njumps++;
33179 else
33180 continue;
33181
33182 while (njumps > 3)
33183 {
33184 start = NEXT_INSN (start);
33185 if ((JUMP_P (start)
33186 && GET_CODE (PATTERN (start)) != ADDR_VEC
33187 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
33188 || CALL_P (start))
33189 njumps--, isjump = 1;
33190 else
33191 isjump = 0;
33192 nbytes -= min_insn_size (start);
33193 }
33194 gcc_assert (njumps >= 0);
33195 if (dump_file)
33196 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
33197 INSN_UID (start), INSN_UID (insn), nbytes);
33198
33199 if (njumps == 3 && isjump && nbytes < 16)
33200 {
33201 int padsize = 15 - nbytes + min_insn_size (insn);
33202
33203 if (dump_file)
33204 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
33205 INSN_UID (insn), padsize);
33206 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
33207 }
33208 }
33209 }
33210 #endif
33211
33212 /* AMD Athlon works faster
33213 when RET is not the destination of a conditional jump or directly preceded
33214 by another jump instruction. We avoid the penalty by inserting NOP just
33215 before the RET instructions in such cases. */
33216 static void
33217 ix86_pad_returns (void)
33218 {
33219 edge e;
33220 edge_iterator ei;
33221
33222 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
33223 {
33224 basic_block bb = e->src;
33225 rtx ret = BB_END (bb);
33226 rtx prev;
33227 bool replace = false;
33228
33229 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
33230 || optimize_bb_for_size_p (bb))
33231 continue;
33232 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
33233 if (active_insn_p (prev) || LABEL_P (prev))
33234 break;
33235 if (prev && LABEL_P (prev))
33236 {
33237 edge e;
33238 edge_iterator ei;
33239
33240 FOR_EACH_EDGE (e, ei, bb->preds)
33241 if (EDGE_FREQUENCY (e) && e->src->index >= 0
33242 && !(e->flags & EDGE_FALLTHRU))
33243 replace = true;
33244 }
33245 if (!replace)
33246 {
33247 prev = prev_active_insn (ret);
33248 if (prev
33249 && ((JUMP_P (prev) && any_condjump_p (prev))
33250 || CALL_P (prev)))
33251 replace = true;
33252 /* Empty functions get a branch mispredict even when
33253 the jump destination is not visible to us. */
33254 if (!prev && !optimize_function_for_size_p (cfun))
33255 replace = true;
33256 }
33257 if (replace)
33258 {
33259 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
33260 delete_insn (ret);
33261 }
33262 }
33263 }
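/* Note (illustrative, not from the original sources): the replacement
   return emitted above via gen_simple_return_internal_long is the
   "long" return form, conventionally assembled as the two byte
   sequence `rep ret' on these CPUs, which sidesteps the predictor
   penalty without changing semantics.  */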
33264
33265 /* Count the minimum number of instructions in BB. Return 4 if the
33266 number of instructions >= 4. */
33267
33268 static int
33269 ix86_count_insn_bb (basic_block bb)
33270 {
33271 rtx insn;
33272 int insn_count = 0;
33273
33274 /* Count number of instructions in this block. Return 4 if the number
33275 of instructions >= 4. */
33276 FOR_BB_INSNS (bb, insn)
33277 {
33278 /* This only happens in exit blocks. */
33279 if (JUMP_P (insn)
33280 && ANY_RETURN_P (PATTERN (insn)))
33281 break;
33282
33283 if (NONDEBUG_INSN_P (insn)
33284 && GET_CODE (PATTERN (insn)) != USE
33285 && GET_CODE (PATTERN (insn)) != CLOBBER)
33286 {
33287 insn_count++;
33288 if (insn_count >= 4)
33289 return insn_count;
33290 }
33291 }
33292
33293 return insn_count;
33294 }
33295
33296
33297 /* Count the minimum number of instructions in a code path through BB.
33298 Return 4 if the number of instructions >= 4. */
33299
33300 static int
33301 ix86_count_insn (basic_block bb)
33302 {
33303 edge e;
33304 edge_iterator ei;
33305 int min_prev_count;
33306
33307 /* Only bother counting instructions along paths with no
33308 more than 2 basic blocks between entry and exit. Given
33309 that BB has an edge to exit, determine if a predecessor
33310 of BB has an edge from entry. If so, compute the number
33311 of instructions in the predecessor block. If there
33312 happen to be multiple such blocks, compute the minimum. */
33313 min_prev_count = 4;
33314 FOR_EACH_EDGE (e, ei, bb->preds)
33315 {
33316 edge prev_e;
33317 edge_iterator prev_ei;
33318
33319 if (e->src == ENTRY_BLOCK_PTR)
33320 {
33321 min_prev_count = 0;
33322 break;
33323 }
33324 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
33325 {
33326 if (prev_e->src == ENTRY_BLOCK_PTR)
33327 {
33328 int count = ix86_count_insn_bb (e->src);
33329 if (count < min_prev_count)
33330 min_prev_count = count;
33331 break;
33332 }
33333 }
33334 }
33335
33336 if (min_prev_count < 4)
33337 min_prev_count += ix86_count_insn_bb (bb);
33338
33339 return min_prev_count;
33340 }
33341
33342 /* Pad short functions to 4 instructions. */
33343
33344 static void
33345 ix86_pad_short_function (void)
33346 {
33347 edge e;
33348 edge_iterator ei;
33349
33350 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
33351 {
33352 rtx ret = BB_END (e->src);
33353 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
33354 {
33355 int insn_count = ix86_count_insn (e->src);
33356
33357 /* Pad short function. */
33358 if (insn_count < 4)
33359 {
33360 rtx insn = ret;
33361
33362 /* Find epilogue. */
33363 while (insn
33364 && (!NOTE_P (insn)
33365 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
33366 insn = PREV_INSN (insn);
33367
33368 if (!insn)
33369 insn = ret;
33370
33371 /* Two NOPs count as one instruction. */
33372 insn_count = 2 * (4 - insn_count);
33373 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
33374 }
33375 }
33376 }
33377 }
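/* Illustrative example: if the shortest path from function entry to
   this return contains only one real instruction, 2 * (4 - 1) = 6 NOPs
   are emitted just before the epilogue, so that (counting each NOP pair
   as one instruction) at least four instructions precede the return.  */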
33378
33379 /* Implement machine specific optimizations. We implement padding of returns
33380 for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window. */
33381 static void
33382 ix86_reorg (void)
33383 {
33384 /* We are freeing block_for_insn in the toplev to keep compatibility
33385 with old MDEP_REORGS that are not CFG based. Recompute it now. */
33386 compute_bb_for_insn ();
33387
33388 /* Run the vzeroupper optimization if needed. */
33389 if (TARGET_VZEROUPPER)
33390 move_or_delete_vzeroupper ();
33391
33392 if (optimize && optimize_function_for_speed_p (cfun))
33393 {
33394 if (TARGET_PAD_SHORT_FUNCTION)
33395 ix86_pad_short_function ();
33396 else if (TARGET_PAD_RETURNS)
33397 ix86_pad_returns ();
33398 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
33399 if (TARGET_FOUR_JUMP_LIMIT)
33400 ix86_avoid_jump_mispredicts ();
33401 #endif
33402 }
33403 }
33404
33405 /* Return nonzero when a QImode register that must be represented via a REX prefix
33406 is used. */
33407 bool
33408 x86_extended_QIreg_mentioned_p (rtx insn)
33409 {
33410 int i;
33411 extract_insn_cached (insn);
33412 for (i = 0; i < recog_data.n_operands; i++)
33413 if (REG_P (recog_data.operand[i])
33414 && REGNO (recog_data.operand[i]) > BX_REG)
33415 return true;
33416 return false;
33417 }
33418
33419 /* Return nonzero when P points to a register encoded via a REX prefix.
33420 Called via for_each_rtx. */
33421 static int
33422 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
33423 {
33424 unsigned int regno;
33425 if (!REG_P (*p))
33426 return 0;
33427 regno = REGNO (*p);
33428 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
33429 }
33430
33431 /* Return true when INSN mentions a register that must be encoded using a REX
33432 prefix. */
33433 bool
33434 x86_extended_reg_mentioned_p (rtx insn)
33435 {
33436 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
33437 extended_reg_mentioned_1, NULL);
33438 }
33439
33440 /* If profitable, negate (without causing overflow) integer constant
33441 of mode MODE at location LOC. Return true in this case. */
33442 bool
33443 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
33444 {
33445 HOST_WIDE_INT val;
33446
33447 if (!CONST_INT_P (*loc))
33448 return false;
33449
33450 switch (mode)
33451 {
33452 case DImode:
33453 /* DImode x86_64 constants must fit in 32 bits. */
33454 gcc_assert (x86_64_immediate_operand (*loc, mode));
33455
33456 mode = SImode;
33457 break;
33458
33459 case SImode:
33460 case HImode:
33461 case QImode:
33462 break;
33463
33464 default:
33465 gcc_unreachable ();
33466 }
33467
33468 /* Avoid overflows. */
33469 if (mode_signbit_p (mode, *loc))
33470 return false;
33471
33472 val = INTVAL (*loc);
33473
33474 /* Make things pretty and use `subl $4,%eax' rather than `addl $-4,%eax'.
33475 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
33476 if ((val < 0 && val != -128)
33477 || val == 128)
33478 {
33479 *loc = GEN_INT (-val);
33480 return true;
33481 }
33482
33483 return false;
33484 }
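/* Worked examples for the routine above (illustrative):
     addl $-4, %eax    ->  constant negated, giving  subl $4, %eax
     addl $128, %eax   ->  constant negated, giving  subl $-128, %eax
                           (-128 fits in an imm8, +128 does not)
     addl $-128, %eax  ->  left alone for the same reason.
   Per the comment above, the caller swaps the ADD/SUB operation when
   this function returns true.  */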
33485
33486 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
33487 optabs would emit if we didn't have TFmode patterns. */
33488
33489 void
33490 x86_emit_floatuns (rtx operands[2])
33491 {
33492 rtx neglab, donelab, i0, i1, f0, in, out;
33493 enum machine_mode mode, inmode;
33494
33495 inmode = GET_MODE (operands[1]);
33496 gcc_assert (inmode == SImode || inmode == DImode);
33497
33498 out = operands[0];
33499 in = force_reg (inmode, operands[1]);
33500 mode = GET_MODE (out);
33501 neglab = gen_label_rtx ();
33502 donelab = gen_label_rtx ();
33503 f0 = gen_reg_rtx (mode);
33504
33505 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
33506
33507 expand_float (out, in, 0);
33508
33509 emit_jump_insn (gen_jump (donelab));
33510 emit_barrier ();
33511
33512 emit_label (neglab);
33513
33514 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
33515 1, OPTAB_DIRECT);
33516 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
33517 1, OPTAB_DIRECT);
33518 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
33519
33520 expand_float (f0, i0, 0);
33521
33522 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
33523
33524 emit_label (donelab);
33525 }
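/* The RTL generated above corresponds roughly to the following C
   sketch for a DImode input IN converted to floating point OUT
   (illustrative only):

     if ((long long) in >= 0)
       out = (double) (long long) in;
     else
       {
         unsigned long long half = (in >> 1) | (in & 1);
         out = (double) (long long) half;
         out = out + out;
       }

   The low bit is ORed back in before halving so that rounding of the
   doubled result matches a direct conversion of the full value.  */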
33526 \f
33527 /* AVX2 does support 32-byte integer vector operations,
33528 thus the longest vector we are faced with is V32QImode. */
33529 #define MAX_VECT_LEN 32
33530
33531 struct expand_vec_perm_d
33532 {
33533 rtx target, op0, op1;
33534 unsigned char perm[MAX_VECT_LEN];
33535 enum machine_mode vmode;
33536 unsigned char nelt;
33537 bool one_operand_p;
33538 bool testing_p;
33539 };
33540
33541 static bool canonicalize_perm (struct expand_vec_perm_d *d);
33542 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
33543 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
33544
33545 /* Get a vector mode of the same size as the original but with elements
33546 twice as wide. This is only guaranteed to apply to integral vectors. */
33547
33548 static inline enum machine_mode
33549 get_mode_wider_vector (enum machine_mode o)
33550 {
33551 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
33552 enum machine_mode n = GET_MODE_WIDER_MODE (o);
33553 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
33554 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
33555 return n;
33556 }
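/* For example (illustrative): V16QImode widens to V8HImode, and
   V8HImode to V4SImode -- the same 16 byte vector with half as many
   elements, each twice as wide, exactly what the asserts above check.  */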
33557
33558 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33559 with all elements equal to VAR. Return true if successful. */
33560
33561 static bool
33562 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
33563 rtx target, rtx val)
33564 {
33565 bool ok;
33566
33567 switch (mode)
33568 {
33569 case V2SImode:
33570 case V2SFmode:
33571 if (!mmx_ok)
33572 return false;
33573 /* FALLTHRU */
33574
33575 case V4DFmode:
33576 case V4DImode:
33577 case V8SFmode:
33578 case V8SImode:
33579 case V2DFmode:
33580 case V2DImode:
33581 case V4SFmode:
33582 case V4SImode:
33583 {
33584 rtx insn, dup;
33585
33586 /* First attempt to recognize VAL as-is. */
33587 dup = gen_rtx_VEC_DUPLICATE (mode, val);
33588 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
33589 if (recog_memoized (insn) < 0)
33590 {
33591 rtx seq;
33592 /* If that fails, force VAL into a register. */
33593
33594 start_sequence ();
33595 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
33596 seq = get_insns ();
33597 end_sequence ();
33598 if (seq)
33599 emit_insn_before (seq, insn);
33600
33601 ok = recog_memoized (insn) >= 0;
33602 gcc_assert (ok);
33603 }
33604 }
33605 return true;
33606
33607 case V4HImode:
33608 if (!mmx_ok)
33609 return false;
33610 if (TARGET_SSE || TARGET_3DNOW_A)
33611 {
33612 rtx x;
33613
33614 val = gen_lowpart (SImode, val);
33615 x = gen_rtx_TRUNCATE (HImode, val);
33616 x = gen_rtx_VEC_DUPLICATE (mode, x);
33617 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33618 return true;
33619 }
33620 goto widen;
33621
33622 case V8QImode:
33623 if (!mmx_ok)
33624 return false;
33625 goto widen;
33626
33627 case V8HImode:
33628 if (TARGET_SSE2)
33629 {
33630 struct expand_vec_perm_d dperm;
33631 rtx tmp1, tmp2;
33632
33633 permute:
33634 memset (&dperm, 0, sizeof (dperm));
33635 dperm.target = target;
33636 dperm.vmode = mode;
33637 dperm.nelt = GET_MODE_NUNITS (mode);
33638 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
33639 dperm.one_operand_p = true;
33640
33641 /* Extend to SImode using a paradoxical SUBREG. */
33642 tmp1 = gen_reg_rtx (SImode);
33643 emit_move_insn (tmp1, gen_lowpart (SImode, val));
33644
33645 /* Insert the SImode value as low element of a V4SImode vector. */
33646 tmp2 = gen_lowpart (V4SImode, dperm.op0);
33647 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
33648
33649 ok = (expand_vec_perm_1 (&dperm)
33650 || expand_vec_perm_broadcast_1 (&dperm));
33651 gcc_assert (ok);
33652 return ok;
33653 }
33654 goto widen;
33655
33656 case V16QImode:
33657 if (TARGET_SSE2)
33658 goto permute;
33659 goto widen;
33660
33661 widen:
33662 /* Replicate the value once into the next wider mode and recurse. */
33663 {
33664 enum machine_mode smode, wsmode, wvmode;
33665 rtx x;
33666
33667 smode = GET_MODE_INNER (mode);
33668 wvmode = get_mode_wider_vector (mode);
33669 wsmode = GET_MODE_INNER (wvmode);
33670
33671 val = convert_modes (wsmode, smode, val, true);
33672 x = expand_simple_binop (wsmode, ASHIFT, val,
33673 GEN_INT (GET_MODE_BITSIZE (smode)),
33674 NULL_RTX, 1, OPTAB_LIB_WIDEN);
33675 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
33676
33677 x = gen_lowpart (wvmode, target);
33678 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
33679 gcc_assert (ok);
33680 return ok;
33681 }
33682
33683 case V16HImode:
33684 case V32QImode:
33685 {
33686 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
33687 rtx x = gen_reg_rtx (hvmode);
33688
33689 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
33690 gcc_assert (ok);
33691
33692 x = gen_rtx_VEC_CONCAT (mode, x, x);
33693 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33694 }
33695 return true;
33696
33697 default:
33698 return false;
33699 }
33700 }
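/* Illustrative example of the `widen' path above: broadcasting a
   QImode value B into V8QImode without SSE first forms the HImode
   value (B << 8) | B, recurses to broadcast that into V4HImode, and
   the result is then viewed as V8QImode through gen_lowpart.  */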
33701
33702 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33703 whose ONE_VAR element is VAR, and other elements are zero. Return true
33704 if successful. */
33705
33706 static bool
33707 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
33708 rtx target, rtx var, int one_var)
33709 {
33710 enum machine_mode vsimode;
33711 rtx new_target;
33712 rtx x, tmp;
33713 bool use_vector_set = false;
33714
33715 switch (mode)
33716 {
33717 case V2DImode:
33718 /* For SSE4.1, we normally use vector set. But if the second
33719 element is zero and inter-unit moves are OK, we use movq
33720 instead. */
33721 use_vector_set = (TARGET_64BIT
33722 && TARGET_SSE4_1
33723 && !(TARGET_INTER_UNIT_MOVES
33724 && one_var == 0));
33725 break;
33726 case V16QImode:
33727 case V4SImode:
33728 case V4SFmode:
33729 use_vector_set = TARGET_SSE4_1;
33730 break;
33731 case V8HImode:
33732 use_vector_set = TARGET_SSE2;
33733 break;
33734 case V4HImode:
33735 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
33736 break;
33737 case V32QImode:
33738 case V16HImode:
33739 case V8SImode:
33740 case V8SFmode:
33741 case V4DFmode:
33742 use_vector_set = TARGET_AVX;
33743 break;
33744 case V4DImode:
33745 /* Use ix86_expand_vector_set in 64bit mode only. */
33746 use_vector_set = TARGET_AVX && TARGET_64BIT;
33747 break;
33748 default:
33749 break;
33750 }
33751
33752 if (use_vector_set)
33753 {
33754 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
33755 var = force_reg (GET_MODE_INNER (mode), var);
33756 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33757 return true;
33758 }
33759
33760 switch (mode)
33761 {
33762 case V2SFmode:
33763 case V2SImode:
33764 if (!mmx_ok)
33765 return false;
33766 /* FALLTHRU */
33767
33768 case V2DFmode:
33769 case V2DImode:
33770 if (one_var != 0)
33771 return false;
33772 var = force_reg (GET_MODE_INNER (mode), var);
33773 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
33774 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33775 return true;
33776
33777 case V4SFmode:
33778 case V4SImode:
33779 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
33780 new_target = gen_reg_rtx (mode);
33781 else
33782 new_target = target;
33783 var = force_reg (GET_MODE_INNER (mode), var);
33784 x = gen_rtx_VEC_DUPLICATE (mode, var);
33785 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
33786 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
33787 if (one_var != 0)
33788 {
33789 /* We need to shuffle the value to the correct position, so
33790 create a new pseudo to store the intermediate result. */
33791
33792 /* With SSE2, we can use the integer shuffle insns. */
33793 if (mode != V4SFmode && TARGET_SSE2)
33794 {
33795 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
33796 const1_rtx,
33797 GEN_INT (one_var == 1 ? 0 : 1),
33798 GEN_INT (one_var == 2 ? 0 : 1),
33799 GEN_INT (one_var == 3 ? 0 : 1)));
33800 if (target != new_target)
33801 emit_move_insn (target, new_target);
33802 return true;
33803 }
33804
33805 /* Otherwise convert the intermediate result to V4SFmode and
33806 use the SSE1 shuffle instructions. */
33807 if (mode != V4SFmode)
33808 {
33809 tmp = gen_reg_rtx (V4SFmode);
33810 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
33811 }
33812 else
33813 tmp = new_target;
33814
33815 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
33816 const1_rtx,
33817 GEN_INT (one_var == 1 ? 0 : 1),
33818 GEN_INT (one_var == 2 ? 0+4 : 1+4),
33819 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
33820
33821 if (mode != V4SFmode)
33822 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
33823 else if (tmp != target)
33824 emit_move_insn (target, tmp);
33825 }
33826 else if (target != new_target)
33827 emit_move_insn (target, new_target);
33828 return true;
33829
33830 case V8HImode:
33831 case V16QImode:
33832 vsimode = V4SImode;
33833 goto widen;
33834 case V4HImode:
33835 case V8QImode:
33836 if (!mmx_ok)
33837 return false;
33838 vsimode = V2SImode;
33839 goto widen;
33840 widen:
33841 if (one_var != 0)
33842 return false;
33843
33844 /* Zero extend the variable element to SImode and recurse. */
33845 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
33846
33847 x = gen_reg_rtx (vsimode);
33848 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
33849 var, one_var))
33850 gcc_unreachable ();
33851
33852 emit_move_insn (target, gen_lowpart (mode, x));
33853 return true;
33854
33855 default:
33856 return false;
33857 }
33858 }
33859
33860 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33861 consisting of the values in VALS. It is known that all elements
33862 except ONE_VAR are constants. Return true if successful. */
33863
33864 static bool
33865 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
33866 rtx target, rtx vals, int one_var)
33867 {
33868 rtx var = XVECEXP (vals, 0, one_var);
33869 enum machine_mode wmode;
33870 rtx const_vec, x;
33871
33872 const_vec = copy_rtx (vals);
33873 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
33874 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
33875
33876 switch (mode)
33877 {
33878 case V2DFmode:
33879 case V2DImode:
33880 case V2SFmode:
33881 case V2SImode:
33882 /* For the two element vectors, it's just as easy to use
33883 the general case. */
33884 return false;
33885
33886 case V4DImode:
33887 /* Use ix86_expand_vector_set in 64bit mode only. */
33888 if (!TARGET_64BIT)
33889 return false;
33890 case V4DFmode:
33891 case V8SFmode:
33892 case V8SImode:
33893 case V16HImode:
33894 case V32QImode:
33895 case V4SFmode:
33896 case V4SImode:
33897 case V8HImode:
33898 case V4HImode:
33899 break;
33900
33901 case V16QImode:
33902 if (TARGET_SSE4_1)
33903 break;
33904 wmode = V8HImode;
33905 goto widen;
33906 case V8QImode:
33907 wmode = V4HImode;
33908 goto widen;
33909 widen:
33910 /* There's no way to set one QImode entry easily. Combine
33911 the variable value with its adjacent constant value, and
33912 promote to an HImode set. */
33913 x = XVECEXP (vals, 0, one_var ^ 1);
33914 if (one_var & 1)
33915 {
33916 var = convert_modes (HImode, QImode, var, true);
33917 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
33918 NULL_RTX, 1, OPTAB_LIB_WIDEN);
33919 x = GEN_INT (INTVAL (x) & 0xff);
33920 }
33921 else
33922 {
33923 var = convert_modes (HImode, QImode, var, true);
33924 x = gen_int_mode (INTVAL (x) << 8, HImode);
33925 }
33926 if (x != const0_rtx)
33927 var = expand_simple_binop (HImode, IOR, var, x, var,
33928 1, OPTAB_LIB_WIDEN);
33929
33930 x = gen_reg_rtx (wmode);
33931 emit_move_insn (x, gen_lowpart (wmode, const_vec));
33932 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
33933
33934 emit_move_insn (target, gen_lowpart (mode, x));
33935 return true;
33936
33937 default:
33938 return false;
33939 }
33940
33941 emit_move_insn (target, const_vec);
33942 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33943 return true;
33944 }
33945
33946 /* A subroutine of ix86_expand_vector_init_general. Use vector
33947 concatenate to handle the most general case: all values variable,
33948 and none identical. */
33949
33950 static void
33951 ix86_expand_vector_init_concat (enum machine_mode mode,
33952 rtx target, rtx *ops, int n)
33953 {
33954 enum machine_mode cmode, hmode = VOIDmode;
33955 rtx first[8], second[4];
33956 rtvec v;
33957 int i, j;
33958
33959 switch (n)
33960 {
33961 case 2:
33962 switch (mode)
33963 {
33964 case V8SImode:
33965 cmode = V4SImode;
33966 break;
33967 case V8SFmode:
33968 cmode = V4SFmode;
33969 break;
33970 case V4DImode:
33971 cmode = V2DImode;
33972 break;
33973 case V4DFmode:
33974 cmode = V2DFmode;
33975 break;
33976 case V4SImode:
33977 cmode = V2SImode;
33978 break;
33979 case V4SFmode:
33980 cmode = V2SFmode;
33981 break;
33982 case V2DImode:
33983 cmode = DImode;
33984 break;
33985 case V2SImode:
33986 cmode = SImode;
33987 break;
33988 case V2DFmode:
33989 cmode = DFmode;
33990 break;
33991 case V2SFmode:
33992 cmode = SFmode;
33993 break;
33994 default:
33995 gcc_unreachable ();
33996 }
33997
33998 if (!register_operand (ops[1], cmode))
33999 ops[1] = force_reg (cmode, ops[1]);
34000 if (!register_operand (ops[0], cmode))
34001 ops[0] = force_reg (cmode, ops[0]);
34002 emit_insn (gen_rtx_SET (VOIDmode, target,
34003 gen_rtx_VEC_CONCAT (mode, ops[0],
34004 ops[1])));
34005 break;
34006
34007 case 4:
34008 switch (mode)
34009 {
34010 case V4DImode:
34011 cmode = V2DImode;
34012 break;
34013 case V4DFmode:
34014 cmode = V2DFmode;
34015 break;
34016 case V4SImode:
34017 cmode = V2SImode;
34018 break;
34019 case V4SFmode:
34020 cmode = V2SFmode;
34021 break;
34022 default:
34023 gcc_unreachable ();
34024 }
34025 goto half;
34026
34027 case 8:
34028 switch (mode)
34029 {
34030 case V8SImode:
34031 cmode = V2SImode;
34032 hmode = V4SImode;
34033 break;
34034 case V8SFmode:
34035 cmode = V2SFmode;
34036 hmode = V4SFmode;
34037 break;
34038 default:
34039 gcc_unreachable ();
34040 }
34041 goto half;
34042
34043 half:
34044 /* FIXME: We process inputs backward to help RA. PR 36222. */
34045 i = n - 1;
34046 j = (n >> 1) - 1;
34047 for (; i > 0; i -= 2, j--)
34048 {
34049 first[j] = gen_reg_rtx (cmode);
34050 v = gen_rtvec (2, ops[i - 1], ops[i]);
34051 ix86_expand_vector_init (false, first[j],
34052 gen_rtx_PARALLEL (cmode, v));
34053 }
34054
34055 n >>= 1;
34056 if (n > 2)
34057 {
34058 gcc_assert (hmode != VOIDmode);
34059 for (i = j = 0; i < n; i += 2, j++)
34060 {
34061 second[j] = gen_reg_rtx (hmode);
34062 ix86_expand_vector_init_concat (hmode, second [j],
34063 &first [i], 2);
34064 }
34065 n >>= 1;
34066 ix86_expand_vector_init_concat (mode, target, second, n);
34067 }
34068 else
34069 ix86_expand_vector_init_concat (mode, target, first, n);
34070 break;
34071
34072 default:
34073 gcc_unreachable ();
34074 }
34075 }
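/* Illustrative example of the recursion above: a V8SFmode vector built
   from 8 scalar operands is first combined (back to front, to help the
   register allocator) into four V2SFmode pairs, those pairs into two
   V4SFmode halves, and finally the two halves into the V8SFmode target
   via VEC_CONCAT.  */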
34076
34077 /* A subroutine of ix86_expand_vector_init_general. Use vector
34078 interleave to handle the most general case: all values variable,
34079 and none identical. */
34080
34081 static void
34082 ix86_expand_vector_init_interleave (enum machine_mode mode,
34083 rtx target, rtx *ops, int n)
34084 {
34085 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
34086 int i, j;
34087 rtx op0, op1;
34088 rtx (*gen_load_even) (rtx, rtx, rtx);
34089 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
34090 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
34091
34092 switch (mode)
34093 {
34094 case V8HImode:
34095 gen_load_even = gen_vec_setv8hi;
34096 gen_interleave_first_low = gen_vec_interleave_lowv4si;
34097 gen_interleave_second_low = gen_vec_interleave_lowv2di;
34098 inner_mode = HImode;
34099 first_imode = V4SImode;
34100 second_imode = V2DImode;
34101 third_imode = VOIDmode;
34102 break;
34103 case V16QImode:
34104 gen_load_even = gen_vec_setv16qi;
34105 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
34106 gen_interleave_second_low = gen_vec_interleave_lowv4si;
34107 inner_mode = QImode;
34108 first_imode = V8HImode;
34109 second_imode = V4SImode;
34110 third_imode = V2DImode;
34111 break;
34112 default:
34113 gcc_unreachable ();
34114 }
34115
34116 for (i = 0; i < n; i++)
34117 {
34118 /* Extend the odd element to SImode using a paradoxical SUBREG. */
34119 op0 = gen_reg_rtx (SImode);
34120 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
34121
34122 /* Insert the SImode value as low element of V4SImode vector. */
34123 op1 = gen_reg_rtx (V4SImode);
34124 op0 = gen_rtx_VEC_MERGE (V4SImode,
34125 gen_rtx_VEC_DUPLICATE (V4SImode,
34126 op0),
34127 CONST0_RTX (V4SImode),
34128 const1_rtx);
34129 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
34130
34131 /* Cast the V4SImode vector back to a vector in the original mode. */
34132 op0 = gen_reg_rtx (mode);
34133 emit_move_insn (op0, gen_lowpart (mode, op1));
34134
34135 /* Load even elements into the second position. */
34136 emit_insn (gen_load_even (op0,
34137 force_reg (inner_mode,
34138 ops [i + i + 1]),
34139 const1_rtx));
34140
34141 /* Cast vector to FIRST_IMODE vector. */
34142 ops[i] = gen_reg_rtx (first_imode);
34143 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
34144 }
34145
34146 /* Interleave low FIRST_IMODE vectors. */
34147 for (i = j = 0; i < n; i += 2, j++)
34148 {
34149 op0 = gen_reg_rtx (first_imode);
34150 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
34151
34152 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
34153 ops[j] = gen_reg_rtx (second_imode);
34154 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
34155 }
34156
34157 /* Interleave low SECOND_IMODE vectors. */
34158 switch (second_imode)
34159 {
34160 case V4SImode:
34161 for (i = j = 0; i < n / 2; i += 2, j++)
34162 {
34163 op0 = gen_reg_rtx (second_imode);
34164 emit_insn (gen_interleave_second_low (op0, ops[i],
34165 ops[i + 1]));
34166
34167 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
34168 vector. */
34169 ops[j] = gen_reg_rtx (third_imode);
34170 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
34171 }
34172 second_imode = V2DImode;
34173 gen_interleave_second_low = gen_vec_interleave_lowv2di;
34174 /* FALLTHRU */
34175
34176 case V2DImode:
34177 op0 = gen_reg_rtx (second_imode);
34178 emit_insn (gen_interleave_second_low (op0, ops[0],
34179 ops[1]));
34180
34181 /* Cast the SECOND_IMODE vector back to a vector in the original
34182 mode. */
34183 emit_insn (gen_rtx_SET (VOIDmode, target,
34184 gen_lowpart (mode, op0)));
34185 break;
34186
34187 default:
34188 gcc_unreachable ();
34189 }
34190 }
34191
34192 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
34193 all values variable, and none identical. */
34194
34195 static void
34196 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
34197 rtx target, rtx vals)
34198 {
34199 rtx ops[32], op0, op1;
34200 enum machine_mode half_mode = VOIDmode;
34201 int n, i;
34202
34203 switch (mode)
34204 {
34205 case V2SFmode:
34206 case V2SImode:
34207 if (!mmx_ok && !TARGET_SSE)
34208 break;
34209 /* FALLTHRU */
34210
34211 case V8SFmode:
34212 case V8SImode:
34213 case V4DFmode:
34214 case V4DImode:
34215 case V4SFmode:
34216 case V4SImode:
34217 case V2DFmode:
34218 case V2DImode:
34219 n = GET_MODE_NUNITS (mode);
34220 for (i = 0; i < n; i++)
34221 ops[i] = XVECEXP (vals, 0, i);
34222 ix86_expand_vector_init_concat (mode, target, ops, n);
34223 return;
34224
34225 case V32QImode:
34226 half_mode = V16QImode;
34227 goto half;
34228
34229 case V16HImode:
34230 half_mode = V8HImode;
34231 goto half;
34232
34233 half:
34234 n = GET_MODE_NUNITS (mode);
34235 for (i = 0; i < n; i++)
34236 ops[i] = XVECEXP (vals, 0, i);
34237 op0 = gen_reg_rtx (half_mode);
34238 op1 = gen_reg_rtx (half_mode);
34239 ix86_expand_vector_init_interleave (half_mode, op0, ops,
34240 n >> 2);
34241 ix86_expand_vector_init_interleave (half_mode, op1,
34242 &ops [n >> 1], n >> 2);
34243 emit_insn (gen_rtx_SET (VOIDmode, target,
34244 gen_rtx_VEC_CONCAT (mode, op0, op1)));
34245 return;
34246
34247 case V16QImode:
34248 if (!TARGET_SSE4_1)
34249 break;
34250 /* FALLTHRU */
34251
34252 case V8HImode:
34253 if (!TARGET_SSE2)
34254 break;
34255
34256 /* Don't use ix86_expand_vector_init_interleave if we can't
34257 move from GPR to SSE register directly. */
34258 if (!TARGET_INTER_UNIT_MOVES)
34259 break;
34260
34261 n = GET_MODE_NUNITS (mode);
34262 for (i = 0; i < n; i++)
34263 ops[i] = XVECEXP (vals, 0, i);
34264 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
34265 return;
34266
34267 case V4HImode:
34268 case V8QImode:
34269 break;
34270
34271 default:
34272 gcc_unreachable ();
34273 }
34274
34275 {
34276 int i, j, n_elts, n_words, n_elt_per_word;
34277 enum machine_mode inner_mode;
34278 rtx words[4], shift;
34279
34280 inner_mode = GET_MODE_INNER (mode);
34281 n_elts = GET_MODE_NUNITS (mode);
34282 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
34283 n_elt_per_word = n_elts / n_words;
34284 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
34285
34286 for (i = 0; i < n_words; ++i)
34287 {
34288 rtx word = NULL_RTX;
34289
34290 for (j = 0; j < n_elt_per_word; ++j)
34291 {
34292 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
34293 elt = convert_modes (word_mode, inner_mode, elt, true);
34294
34295 if (j == 0)
34296 word = elt;
34297 else
34298 {
34299 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
34300 word, 1, OPTAB_LIB_WIDEN);
34301 word = expand_simple_binop (word_mode, IOR, word, elt,
34302 word, 1, OPTAB_LIB_WIDEN);
34303 }
34304 }
34305
34306 words[i] = word;
34307 }
34308
34309 if (n_words == 1)
34310 emit_move_insn (target, gen_lowpart (mode, words[0]));
34311 else if (n_words == 2)
34312 {
34313 rtx tmp = gen_reg_rtx (mode);
34314 emit_clobber (tmp);
34315 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
34316 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
34317 emit_move_insn (target, tmp);
34318 }
34319 else if (n_words == 4)
34320 {
34321 rtx tmp = gen_reg_rtx (V4SImode);
34322 gcc_assert (word_mode == SImode);
34323 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
34324 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
34325 emit_move_insn (target, gen_lowpart (mode, tmp));
34326 }
34327 else
34328 gcc_unreachable ();
34329 }
34330 }
34331
34332 /* Initialize vector TARGET via VALS. Suppress the use of MMX
34333 instructions unless MMX_OK is true. */
34334
34335 void
34336 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
34337 {
34338 enum machine_mode mode = GET_MODE (target);
34339 enum machine_mode inner_mode = GET_MODE_INNER (mode);
34340 int n_elts = GET_MODE_NUNITS (mode);
34341 int n_var = 0, one_var = -1;
34342 bool all_same = true, all_const_zero = true;
34343 int i;
34344 rtx x;
34345
34346 for (i = 0; i < n_elts; ++i)
34347 {
34348 x = XVECEXP (vals, 0, i);
34349 if (!(CONST_INT_P (x)
34350 || GET_CODE (x) == CONST_DOUBLE
34351 || GET_CODE (x) == CONST_FIXED))
34352 n_var++, one_var = i;
34353 else if (x != CONST0_RTX (inner_mode))
34354 all_const_zero = false;
34355 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
34356 all_same = false;
34357 }
34358
34359 /* Constants are best loaded from the constant pool. */
34360 if (n_var == 0)
34361 {
34362 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
34363 return;
34364 }
34365
34366 /* If all values are identical, broadcast the value. */
34367 if (all_same
34368 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
34369 XVECEXP (vals, 0, 0)))
34370 return;
34371
34372 /* Values where only one field is non-constant are best loaded from
34373 the pool and overwritten via move later. */
34374 if (n_var == 1)
34375 {
34376 if (all_const_zero
34377 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
34378 XVECEXP (vals, 0, one_var),
34379 one_var))
34380 return;
34381
34382 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
34383 return;
34384 }
34385
34386 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
34387 }
34388
34389 void
34390 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
34391 {
34392 enum machine_mode mode = GET_MODE (target);
34393 enum machine_mode inner_mode = GET_MODE_INNER (mode);
34394 enum machine_mode half_mode;
34395 bool use_vec_merge = false;
34396 rtx tmp;
34397 static rtx (*gen_extract[6][2]) (rtx, rtx)
34398 = {
34399 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
34400 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
34401 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
34402 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
34403 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
34404 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
34405 };
34406 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
34407 = {
34408 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
34409 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
34410 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
34411 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
34412 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
34413 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
34414 };
34415 int i, j, n;
34416
34417 switch (mode)
34418 {
34419 case V2SFmode:
34420 case V2SImode:
34421 if (mmx_ok)
34422 {
34423 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
34424 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
34425 if (elt == 0)
34426 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
34427 else
34428 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
34429 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34430 return;
34431 }
34432 break;
34433
34434 case V2DImode:
34435 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
34436 if (use_vec_merge)
34437 break;
34438
34439 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
34440 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
34441 if (elt == 0)
34442 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
34443 else
34444 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
34445 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34446 return;
34447
34448 case V2DFmode:
34449 {
34450 rtx op0, op1;
34451
34452 /* For the two element vectors, we implement a VEC_CONCAT with
34453 the extraction of the other element. */
34454
34455 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
34456 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
34457
34458 if (elt == 0)
34459 op0 = val, op1 = tmp;
34460 else
34461 op0 = tmp, op1 = val;
34462
34463 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
34464 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34465 }
34466 return;
34467
34468 case V4SFmode:
34469 use_vec_merge = TARGET_SSE4_1;
34470 if (use_vec_merge)
34471 break;
34472
34473 switch (elt)
34474 {
34475 case 0:
34476 use_vec_merge = true;
34477 break;
34478
34479 case 1:
34480 /* tmp = target = A B C D */
34481 tmp = copy_to_reg (target);
34482 /* target = A A B B */
34483 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
34484 /* target = X A B B */
34485 ix86_expand_vector_set (false, target, val, 0);
34486 /* target = A X C D */
34487 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
34488 const1_rtx, const0_rtx,
34489 GEN_INT (2+4), GEN_INT (3+4)));
34490 return;
34491
34492 case 2:
34493 /* tmp = target = A B C D */
34494 tmp = copy_to_reg (target);
34495 /* tmp = X B C D */
34496 ix86_expand_vector_set (false, tmp, val, 0);
34497 /* target = A B X D */
34498 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
34499 const0_rtx, const1_rtx,
34500 GEN_INT (0+4), GEN_INT (3+4)));
34501 return;
34502
34503 case 3:
34504 /* tmp = target = A B C D */
34505 tmp = copy_to_reg (target);
34506 /* tmp = X B C D */
34507 ix86_expand_vector_set (false, tmp, val, 0);
34508 /* target = A B C X */
34509 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
34510 const0_rtx, const1_rtx,
34511 GEN_INT (2+4), GEN_INT (0+4)));
34512 return;
34513
34514 default:
34515 gcc_unreachable ();
34516 }
34517 break;
34518
34519 case V4SImode:
34520 use_vec_merge = TARGET_SSE4_1;
34521 if (use_vec_merge)
34522 break;
34523
34524 /* Element 0 handled by vec_merge below. */
34525 if (elt == 0)
34526 {
34527 use_vec_merge = true;
34528 break;
34529 }
34530
34531 if (TARGET_SSE2)
34532 {
34533 /* With SSE2, use integer shuffles to swap element 0 and ELT,
34534 store into element 0, then shuffle them back. */
34535
34536 rtx order[4];
34537
34538 order[0] = GEN_INT (elt);
34539 order[1] = const1_rtx;
34540 order[2] = const2_rtx;
34541 order[3] = GEN_INT (3);
34542 order[elt] = const0_rtx;
34543
34544 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
34545 order[1], order[2], order[3]));
34546
34547 ix86_expand_vector_set (false, target, val, 0);
34548
34549 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
34550 order[1], order[2], order[3]));
34551 }
34552 else
34553 {
34554 /* For SSE1, we have to reuse the V4SF code. */
34555 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
34556 gen_lowpart (SFmode, val), elt);
34557 }
34558 return;
34559
34560 case V8HImode:
34561 use_vec_merge = TARGET_SSE2;
34562 break;
34563 case V4HImode:
34564 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
34565 break;
34566
34567 case V16QImode:
34568 use_vec_merge = TARGET_SSE4_1;
34569 break;
34570
34571 case V8QImode:
34572 break;
34573
34574 case V32QImode:
34575 half_mode = V16QImode;
34576 j = 0;
34577 n = 16;
34578 goto half;
34579
34580 case V16HImode:
34581 half_mode = V8HImode;
34582 j = 1;
34583 n = 8;
34584 goto half;
34585
34586 case V8SImode:
34587 half_mode = V4SImode;
34588 j = 2;
34589 n = 4;
34590 goto half;
34591
34592 case V4DImode:
34593 half_mode = V2DImode;
34594 j = 3;
34595 n = 2;
34596 goto half;
34597
34598 case V8SFmode:
34599 half_mode = V4SFmode;
34600 j = 4;
34601 n = 4;
34602 goto half;
34603
34604 case V4DFmode:
34605 half_mode = V2DFmode;
34606 j = 5;
34607 n = 2;
34608 goto half;
34609
34610 half:
34611 /* Compute offset. */
34612 i = elt / n;
34613 elt %= n;
34614
34615 gcc_assert (i <= 1);
34616
34617 /* Extract the half. */
34618 tmp = gen_reg_rtx (half_mode);
34619 emit_insn (gen_extract[j][i] (tmp, target));
34620
34621 /* Put val in tmp at elt. */
34622 ix86_expand_vector_set (false, tmp, val, elt);
34623
34624 /* Put it back. */
34625 emit_insn (gen_insert[j][i] (target, target, tmp));
34626 return;
34627
34628 default:
34629 break;
34630 }
34631
34632 if (use_vec_merge)
34633 {
34634 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
34635 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
34636 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34637 }
34638 else
34639 {
34640 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
34641
34642 emit_move_insn (mem, target);
34643
34644 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
34645 emit_move_insn (tmp, val);
34646
34647 emit_move_insn (target, mem);
34648 }
34649 }
34650
34651 void
34652 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
34653 {
34654 enum machine_mode mode = GET_MODE (vec);
34655 enum machine_mode inner_mode = GET_MODE_INNER (mode);
34656 bool use_vec_extr = false;
34657 rtx tmp;
34658
34659 switch (mode)
34660 {
34661 case V2SImode:
34662 case V2SFmode:
34663 if (!mmx_ok)
34664 break;
34665 /* FALLTHRU */
34666
34667 case V2DFmode:
34668 case V2DImode:
34669 use_vec_extr = true;
34670 break;
34671
34672 case V4SFmode:
34673 use_vec_extr = TARGET_SSE4_1;
34674 if (use_vec_extr)
34675 break;
34676
34677 switch (elt)
34678 {
34679 case 0:
34680 tmp = vec;
34681 break;
34682
34683 case 1:
34684 case 3:
34685 tmp = gen_reg_rtx (mode);
34686 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
34687 GEN_INT (elt), GEN_INT (elt),
34688 GEN_INT (elt+4), GEN_INT (elt+4)));
34689 break;
34690
34691 case 2:
34692 tmp = gen_reg_rtx (mode);
34693 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
34694 break;
34695
34696 default:
34697 gcc_unreachable ();
34698 }
34699 vec = tmp;
34700 use_vec_extr = true;
34701 elt = 0;
34702 break;
34703
34704 case V4SImode:
34705 use_vec_extr = TARGET_SSE4_1;
34706 if (use_vec_extr)
34707 break;
34708
34709 if (TARGET_SSE2)
34710 {
34711 switch (elt)
34712 {
34713 case 0:
34714 tmp = vec;
34715 break;
34716
34717 case 1:
34718 case 3:
34719 tmp = gen_reg_rtx (mode);
34720 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
34721 GEN_INT (elt), GEN_INT (elt),
34722 GEN_INT (elt), GEN_INT (elt)));
34723 break;
34724
34725 case 2:
34726 tmp = gen_reg_rtx (mode);
34727 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
34728 break;
34729
34730 default:
34731 gcc_unreachable ();
34732 }
34733 vec = tmp;
34734 use_vec_extr = true;
34735 elt = 0;
34736 }
34737 else
34738 {
34739 /* For SSE1, we have to reuse the V4SF code. */
34740 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
34741 gen_lowpart (V4SFmode, vec), elt);
34742 return;
34743 }
34744 break;
34745
34746 case V8HImode:
34747 use_vec_extr = TARGET_SSE2;
34748 break;
34749 case V4HImode:
34750 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
34751 break;
34752
34753 case V16QImode:
34754 use_vec_extr = TARGET_SSE4_1;
34755 break;
34756
34757 case V8SFmode:
34758 if (TARGET_AVX)
34759 {
34760 tmp = gen_reg_rtx (V4SFmode);
34761 if (elt < 4)
34762 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
34763 else
34764 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
34765 ix86_expand_vector_extract (false, target, tmp, elt & 3);
34766 return;
34767 }
34768 break;
34769
34770 case V4DFmode:
34771 if (TARGET_AVX)
34772 {
34773 tmp = gen_reg_rtx (V2DFmode);
34774 if (elt < 2)
34775 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
34776 else
34777 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
34778 ix86_expand_vector_extract (false, target, tmp, elt & 1);
34779 return;
34780 }
34781 break;
34782
34783 case V32QImode:
34784 if (TARGET_AVX)
34785 {
34786 tmp = gen_reg_rtx (V16QImode);
34787 if (elt < 16)
34788 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
34789 else
34790 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
34791 ix86_expand_vector_extract (false, target, tmp, elt & 15);
34792 return;
34793 }
34794 break;
34795
34796 case V16HImode:
34797 if (TARGET_AVX)
34798 {
34799 tmp = gen_reg_rtx (V8HImode);
34800 if (elt < 8)
34801 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
34802 else
34803 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
34804 ix86_expand_vector_extract (false, target, tmp, elt & 7);
34805 return;
34806 }
34807 break;
34808
34809 case V8SImode:
34810 if (TARGET_AVX)
34811 {
34812 tmp = gen_reg_rtx (V4SImode);
34813 if (elt < 4)
34814 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
34815 else
34816 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
34817 ix86_expand_vector_extract (false, target, tmp, elt & 3);
34818 return;
34819 }
34820 break;
34821
34822 case V4DImode:
34823 if (TARGET_AVX)
34824 {
34825 tmp = gen_reg_rtx (V2DImode);
34826 if (elt < 2)
34827 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
34828 else
34829 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
34830 ix86_expand_vector_extract (false, target, tmp, elt & 1);
34831 return;
34832 }
34833 break;
34834
34835 case V8QImode:
34836 /* ??? Could extract the appropriate HImode element and shift. */
34837 default:
34838 break;
34839 }
34840
34841 if (use_vec_extr)
34842 {
34843 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
34844 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
34845
34846 /* Let the rtl optimizers know about the zero extension performed. */
34847 if (inner_mode == QImode || inner_mode == HImode)
34848 {
34849 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
34850 target = gen_lowpart (SImode, target);
34851 }
34852
34853 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34854 }
34855 else
34856 {
34857 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
34858
34859 emit_move_insn (mem, vec);
34860
34861 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
34862 emit_move_insn (target, tmp);
34863 }
34864 }
34865
34866 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
34867 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
34868 The upper bits of DEST are undefined, though they shouldn't cause
34869 exceptions (some bits from src or all zeros are ok). */
34870
34871 static void
34872 emit_reduc_half (rtx dest, rtx src, int i)
34873 {
34874 rtx tem;
34875 switch (GET_MODE (src))
34876 {
34877 case V4SFmode:
34878 if (i == 128)
34879 tem = gen_sse_movhlps (dest, src, src);
34880 else
34881 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
34882 GEN_INT (1 + 4), GEN_INT (1 + 4));
34883 break;
34884 case V2DFmode:
34885 tem = gen_vec_interleave_highv2df (dest, src, src);
34886 break;
34887 case V16QImode:
34888 case V8HImode:
34889 case V4SImode:
34890 case V2DImode:
34891 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
34892 gen_lowpart (V1TImode, src),
34893 GEN_INT (i / 2));
34894 break;
34895 case V8SFmode:
34896 if (i == 256)
34897 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
34898 else
34899 tem = gen_avx_shufps256 (dest, src, src,
34900 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
34901 break;
34902 case V4DFmode:
34903 if (i == 256)
34904 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
34905 else
34906 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
34907 break;
34908 case V32QImode:
34909 case V16HImode:
34910 case V8SImode:
34911 case V4DImode:
34912 if (i == 256)
34913 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
34914 gen_lowpart (V4DImode, src),
34915 gen_lowpart (V4DImode, src),
34916 const1_rtx);
34917 else
34918 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
34919 gen_lowpart (V2TImode, src),
34920 GEN_INT (i / 2));
34921 break;
34922 default:
34923 gcc_unreachable ();
34924 }
34925 emit_insn (tem);
34926 }
34927
34928 /* Expand a vector reduction. FN is the binary pattern to reduce;
34929 DEST is the destination; IN is the input vector. */
34930
34931 void
34932 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
34933 {
34934 rtx half, dst, vec = in;
34935 enum machine_mode mode = GET_MODE (in);
34936 int i;
34937
34938 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
34939 if (TARGET_SSE4_1
34940 && mode == V8HImode
34941 && fn == gen_uminv8hi3)
34942 {
34943 emit_insn (gen_sse4_1_phminposuw (dest, in));
34944 return;
34945 }
34946
34947 for (i = GET_MODE_BITSIZE (mode);
34948 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
34949 i >>= 1)
34950 {
34951 half = gen_reg_rtx (mode);
34952 emit_reduc_half (half, vec, i);
34953 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
34954 dst = dest;
34955 else
34956 dst = gen_reg_rtx (mode);
34957 emit_insn (fn (dst, half, vec));
34958 vec = dst;
34959 }
34960 }
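/* Illustrative walk-through of the loop above for a V4SFmode maximum
   reduction: the first emit_reduc_half (i == 128) brings the high
   64 bits of the vector down and FN combines them with the low half;
   the second (i == 64) shuffles element 1 down next to element 0 and
   combines again, leaving the overall maximum in element 0 of DEST,
   which the caller then extracts.  */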
34961 \f
34962 /* Target hook for scalar_mode_supported_p. */
34963 static bool
34964 ix86_scalar_mode_supported_p (enum machine_mode mode)
34965 {
34966 if (DECIMAL_FLOAT_MODE_P (mode))
34967 return default_decimal_float_supported_p ();
34968 else if (mode == TFmode)
34969 return true;
34970 else
34971 return default_scalar_mode_supported_p (mode);
34972 }
34973
34974 /* Implements target hook vector_mode_supported_p. */
34975 static bool
34976 ix86_vector_mode_supported_p (enum machine_mode mode)
34977 {
34978 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34979 return true;
34980 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34981 return true;
34982 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34983 return true;
34984 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
34985 return true;
34986 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
34987 return true;
34988 return false;
34989 }
34990
34991 /* Target hook for c_mode_for_suffix. */
34992 static enum machine_mode
34993 ix86_c_mode_for_suffix (char suffix)
34994 {
34995 if (suffix == 'q')
34996 return TFmode;
34997 if (suffix == 'w')
34998 return XFmode;
34999
35000 return VOIDmode;
35001 }
35002
35003 /* Worker function for TARGET_MD_ASM_CLOBBERS.
35004
35005 We do this in the new i386 backend to maintain source compatibility
35006 with the old cc0-based compiler. */
35007
35008 static tree
35009 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
35010 tree inputs ATTRIBUTE_UNUSED,
35011 tree clobbers)
35012 {
35013 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
35014 clobbers);
35015 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
35016 clobbers);
35017 return clobbers;
35018 }
35019
35020 /* Implements the targetm.asm.encode_section_info target hook. */
35021
35022 static void ATTRIBUTE_UNUSED
35023 ix86_encode_section_info (tree decl, rtx rtl, int first)
35024 {
35025 default_encode_section_info (decl, rtl, first);
35026
35027 if (TREE_CODE (decl) == VAR_DECL
35028 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
35029 && ix86_in_large_data_p (decl))
35030 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
35031 }
35032
35033 /* Worker function for REVERSE_CONDITION. */
35034
35035 enum rtx_code
35036 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
35037 {
35038 return (mode != CCFPmode && mode != CCFPUmode
35039 ? reverse_condition (code)
35040 : reverse_condition_maybe_unordered (code));
35041 }
35042
35043 /* Output code to perform an x87 FP register move, from OPERANDS[1]
35044 to OPERANDS[0]. */
35045
35046 const char *
35047 output_387_reg_move (rtx insn, rtx *operands)
35048 {
35049 if (REG_P (operands[0]))
35050 {
35051 if (REG_P (operands[1])
35052 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
35053 {
35054 if (REGNO (operands[0]) == FIRST_STACK_REG)
35055 return output_387_ffreep (operands, 0);
35056 return "fstp\t%y0";
35057 }
35058 if (STACK_TOP_P (operands[0]))
35059 return "fld%Z1\t%y1";
35060 return "fst\t%y0";
35061 }
35062 else if (MEM_P (operands[0]))
35063 {
35064 gcc_assert (REG_P (operands[1]));
35065 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
35066 return "fstp%Z0\t%y0";
35067 else
35068 {
35069 /* There is no non-popping store to memory for XFmode.
35070 So if we need one, follow the store with a load. */
35071 if (GET_MODE (operands[0]) == XFmode)
35072 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
35073 else
35074 return "fst%Z0\t%y0";
35075 }
35076 }
35077 else
35078 gcc_unreachable();
35079 }
35080
35081 /* Output code to perform a conditional jump to LABEL, if C2 flag in
35082 FP status register is set. */
35083
35084 void
35085 ix86_emit_fp_unordered_jump (rtx label)
35086 {
35087 rtx reg = gen_reg_rtx (HImode);
35088 rtx temp;
35089
35090 emit_insn (gen_x86_fnstsw_1 (reg));
35091
35092 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
35093 {
35094 emit_insn (gen_x86_sahf_1 (reg));
35095
35096 temp = gen_rtx_REG (CCmode, FLAGS_REG);
35097 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
35098 }
35099 else
35100 {
35101 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
35102
35103 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
35104 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
35105 }
35106
35107 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
35108 gen_rtx_LABEL_REF (VOIDmode, label),
35109 pc_rtx);
35110 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
35111
35112 emit_jump_insn (temp);
35113 predict_jump (REG_BR_PROB_BASE * 10 / 100);
35114 }
35115
35116 /* Output code to perform a log1p XFmode calculation. */
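/* A rough C sketch of the expansion below (illustrative only):

	if (fabs (op1) >= 1 - sqrt (2) / 2)	about 0.29289, the fyl2xp1 limit
	  op0 = log (1.0 + op1);		via fld1 + fadd + fyl2x
	else
	  op0 = log1p (op1);			via fyl2xp1

   fyl2xp1 is only accurate for small arguments, so larger inputs fall
   back to an explicit 1.0 + op1 followed by fyl2x.  */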
35117
35118 void ix86_emit_i387_log1p (rtx op0, rtx op1)
35119 {
35120 rtx label1 = gen_label_rtx ();
35121 rtx label2 = gen_label_rtx ();
35122
35123 rtx tmp = gen_reg_rtx (XFmode);
35124 rtx tmp2 = gen_reg_rtx (XFmode);
35125 rtx test;
35126
35127 emit_insn (gen_absxf2 (tmp, op1));
35128 test = gen_rtx_GE (VOIDmode, tmp,
35129 CONST_DOUBLE_FROM_REAL_VALUE (
35130 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
35131 XFmode));
35132 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
35133
35134 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
35135 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
35136 emit_jump (label2);
35137
35138 emit_label (label1);
35139 emit_move_insn (tmp, CONST1_RTX (XFmode));
35140 emit_insn (gen_addxf3 (tmp, op1, tmp));
35141 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
35142 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
35143
35144 emit_label (label2);
35145 }
35146
35147 /* Emit x87 code to round OP1 to nearest, halfway cases away from zero, into OP0. */
35148 void ix86_emit_i387_round (rtx op0, rtx op1)
35149 {
35150 enum machine_mode inmode = GET_MODE (op1);
35151 enum machine_mode outmode = GET_MODE (op0);
35152 rtx e1, e2, res, tmp, tmp1, half;
35153 rtx scratch = gen_reg_rtx (HImode);
35154 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
35155 rtx jump_label = gen_label_rtx ();
35156 rtx insn;
35157 rtx (*gen_abs) (rtx, rtx);
35158 rtx (*gen_neg) (rtx, rtx);
35159
35160 switch (inmode)
35161 {
35162 case SFmode:
35163 gen_abs = gen_abssf2;
35164 break;
35165 case DFmode:
35166 gen_abs = gen_absdf2;
35167 break;
35168 case XFmode:
35169 gen_abs = gen_absxf2;
35170 break;
35171 default:
35172 gcc_unreachable ();
35173 }
35174
35175 switch (outmode)
35176 {
35177 case SFmode:
35178 gen_neg = gen_negsf2;
35179 break;
35180 case DFmode:
35181 gen_neg = gen_negdf2;
35182 break;
35183 case XFmode:
35184 gen_neg = gen_negxf2;
35185 break;
35186 case HImode:
35187 gen_neg = gen_neghi2;
35188 break;
35189 case SImode:
35190 gen_neg = gen_negsi2;
35191 break;
35192 case DImode:
35193 gen_neg = gen_negdi2;
35194 break;
35195 default:
35196 gcc_unreachable ();
35197 }
35198
35199 e1 = gen_reg_rtx (inmode);
35200 e2 = gen_reg_rtx (inmode);
35201 res = gen_reg_rtx (outmode);
35202
35203 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
35204
35205 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
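/* For illustration: with op1 == -2.5 this computes floor (fabs (-2.5) + 0.5)
   == 3 and then negates it because the fxam sign bit is set, yielding -3,
   so halfway cases round away from zero.  */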
35206
35207 /* scratch = fxam(op1) */
35208 emit_insn (gen_rtx_SET (VOIDmode, scratch,
35209 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
35210 UNSPEC_FXAM)));
35211 /* e1 = fabs(op1) */
35212 emit_insn (gen_abs (e1, op1));
35213
35214 /* e2 = e1 + 0.5 */
35215 half = force_reg (inmode, half);
35216 emit_insn (gen_rtx_SET (VOIDmode, e2,
35217 gen_rtx_PLUS (inmode, e1, half)));
35218
35219 /* res = floor(e2) */
35220 if (inmode != XFmode)
35221 {
35222 tmp1 = gen_reg_rtx (XFmode);
35223
35224 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
35225 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
35226 }
35227 else
35228 tmp1 = e2;
35229
35230 switch (outmode)
35231 {
35232 case SFmode:
35233 case DFmode:
35234 {
35235 rtx tmp0 = gen_reg_rtx (XFmode);
35236
35237 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
35238
35239 emit_insn (gen_rtx_SET (VOIDmode, res,
35240 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
35241 UNSPEC_TRUNC_NOOP)));
35242 }
35243 break;
35244 case XFmode:
35245 emit_insn (gen_frndintxf2_floor (res, tmp1));
35246 break;
35247 case HImode:
35248 emit_insn (gen_lfloorxfhi2 (res, tmp1));
35249 break;
35250 case SImode:
35251 emit_insn (gen_lfloorxfsi2 (res, tmp1));
35252 break;
35253 case DImode:
35254 emit_insn (gen_lfloorxfdi2 (res, tmp1));
35255 break;
35256 default:
35257 gcc_unreachable ();
35258 }
35259
35260 /* flags = signbit(a) */
35261 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
35262
35263 /* if (flags) then res = -res */
35264 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
35265 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
35266 gen_rtx_LABEL_REF (VOIDmode, jump_label),
35267 pc_rtx);
35268 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
35269 predict_jump (REG_BR_PROB_BASE * 50 / 100);
35270 JUMP_LABEL (insn) = jump_label;
35271
35272 emit_insn (gen_neg (res, res));
35273
35274 emit_label (jump_label);
35275 LABEL_NUSES (jump_label) = 1;
35276
35277 emit_move_insn (op0, res);
35278 }
35279
35280 /* Output code to perform a Newton-Raphson approximation of a single precision
35281 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
35282
35283 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
35284 {
35285 rtx x0, x1, e0, e1;
35286
35287 x0 = gen_reg_rtx (mode);
35288 e0 = gen_reg_rtx (mode);
35289 e1 = gen_reg_rtx (mode);
35290 x1 = gen_reg_rtx (mode);
35291
35292 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
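/* This is one Newton-Raphson refinement step for 1/b: with x0 ~ 1/b,

	x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0 * x0)

   which is e1 - e0 in the code below; the quotient is then a * x1.  */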
35293
35294 b = force_reg (mode, b);
35295
35296 /* x0 = rcp(b) estimate */
35297 emit_insn (gen_rtx_SET (VOIDmode, x0,
35298 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
35299 UNSPEC_RCP)));
35300 /* e0 = x0 * b */
35301 emit_insn (gen_rtx_SET (VOIDmode, e0,
35302 gen_rtx_MULT (mode, x0, b)));
35303
35304 /* e0 = x0 * e0 */
35305 emit_insn (gen_rtx_SET (VOIDmode, e0,
35306 gen_rtx_MULT (mode, x0, e0)));
35307
35308 /* e1 = x0 + x0 */
35309 emit_insn (gen_rtx_SET (VOIDmode, e1,
35310 gen_rtx_PLUS (mode, x0, x0)));
35311
35312 /* x1 = e1 - e0 */
35313 emit_insn (gen_rtx_SET (VOIDmode, x1,
35314 gen_rtx_MINUS (mode, e1, e0)));
35315
35316 /* res = a * x1 */
35317 emit_insn (gen_rtx_SET (VOIDmode, res,
35318 gen_rtx_MULT (mode, a, x1)));
35319 }
35320
35321 /* Output code to perform a Newton-Raphson approximation of a
35322 single precision floating point [reciprocal] square root. */
35323
35324 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
35325 bool recip)
35326 {
35327 rtx x0, e0, e1, e2, e3, mthree, mhalf;
35328 REAL_VALUE_TYPE r;
35329
35330 x0 = gen_reg_rtx (mode);
35331 e0 = gen_reg_rtx (mode);
35332 e1 = gen_reg_rtx (mode);
35333 e2 = gen_reg_rtx (mode);
35334 e3 = gen_reg_rtx (mode);
35335
35336 real_from_integer (&r, VOIDmode, -3, -1, 0);
35337 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
35338
35339 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
35340 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
35341
35342 if (VECTOR_MODE_P (mode))
35343 {
35344 mthree = ix86_build_const_vector (mode, true, mthree);
35345 mhalf = ix86_build_const_vector (mode, true, mhalf);
35346 }
35347
35348 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
35349 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
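/* Both forms are one Newton-Raphson step for 1/sqrt(a): with x0 ~ 1/sqrt(a),

	x1 = 0.5 * x0 * (3 - a * x0 * x0) = -0.5 * x0 * (a * x0 * x0 - 3)

   and sqrt(a) is recovered as a * x1, i.e. -0.5 * (a * x0) * (a * x0 * x0 - 3),
   which is the e0-based variant used when !RECIP.  */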
35350
35351 a = force_reg (mode, a);
35352
35353 /* x0 = rsqrt(a) estimate */
35354 emit_insn (gen_rtx_SET (VOIDmode, x0,
35355 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
35356 UNSPEC_RSQRT)));
35357
35358 /* If a == 0.0, zero the rsqrt estimate to avoid NaN (0.0 * inf) for sqrt (0.0). */
35359 if (!recip)
35360 {
35361 rtx zero, mask;
35362
35363 zero = gen_reg_rtx (mode);
35364 mask = gen_reg_rtx (mode);
35365
35366 zero = force_reg (mode, CONST0_RTX(mode));
35367 emit_insn (gen_rtx_SET (VOIDmode, mask,
35368 gen_rtx_NE (mode, zero, a)));
35369
35370 emit_insn (gen_rtx_SET (VOIDmode, x0,
35371 gen_rtx_AND (mode, x0, mask)));
35372 }
35373
35374 /* e0 = x0 * a */
35375 emit_insn (gen_rtx_SET (VOIDmode, e0,
35376 gen_rtx_MULT (mode, x0, a)));
35377 /* e1 = e0 * x0 */
35378 emit_insn (gen_rtx_SET (VOIDmode, e1,
35379 gen_rtx_MULT (mode, e0, x0)));
35380
35381 /* e2 = e1 - 3. */
35382 mthree = force_reg (mode, mthree);
35383 emit_insn (gen_rtx_SET (VOIDmode, e2,
35384 gen_rtx_PLUS (mode, e1, mthree)));
35385
35386 mhalf = force_reg (mode, mhalf);
35387 if (recip)
35388 /* e3 = -.5 * x0 */
35389 emit_insn (gen_rtx_SET (VOIDmode, e3,
35390 gen_rtx_MULT (mode, x0, mhalf)));
35391 else
35392 /* e3 = -.5 * e0 */
35393 emit_insn (gen_rtx_SET (VOIDmode, e3,
35394 gen_rtx_MULT (mode, e0, mhalf)));
35395 /* ret = e2 * e3 */
35396 emit_insn (gen_rtx_SET (VOIDmode, res,
35397 gen_rtx_MULT (mode, e2, e3)));
35398 }
35399
35400 #ifdef TARGET_SOLARIS
35401 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
35402
35403 static void
35404 i386_solaris_elf_named_section (const char *name, unsigned int flags,
35405 tree decl)
35406 {
35407 /* With Binutils 2.15, the "@unwind" marker must be specified on
35408 every occurrence of the ".eh_frame" section, not just the first
35409 one. */
35410 if (TARGET_64BIT
35411 && strcmp (name, ".eh_frame") == 0)
35412 {
35413 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
35414 flags & SECTION_WRITE ? "aw" : "a");
35415 return;
35416 }
35417
35418 #ifndef USE_GAS
35419 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
35420 {
35421 solaris_elf_asm_comdat_section (name, flags, decl);
35422 return;
35423 }
35424 #endif
35425
35426 default_elf_asm_named_section (name, flags, decl);
35427 }
35428 #endif /* TARGET_SOLARIS */
35429
35430 /* Return the mangling of TYPE if it is an extended fundamental type. */
35431
35432 static const char *
35433 ix86_mangle_type (const_tree type)
35434 {
35435 type = TYPE_MAIN_VARIANT (type);
35436
35437 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
35438 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
35439 return NULL;
35440
35441 switch (TYPE_MODE (type))
35442 {
35443 case TFmode:
35444 /* __float128 is "g". */
35445 return "g";
35446 case XFmode:
35447 /* "long double" or __float80 is "e". */
35448 return "e";
35449 default:
35450 return NULL;
35451 }
35452 }
35453
35454 /* For 32-bit code we can save PIC register setup by using
35455 __stack_chk_fail_local hidden function instead of calling
35456 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
35457 register, so it is better to call __stack_chk_fail directly. */
35458
35459 static tree ATTRIBUTE_UNUSED
35460 ix86_stack_protect_fail (void)
35461 {
35462 return TARGET_64BIT
35463 ? default_external_stack_protect_fail ()
35464 : default_hidden_stack_protect_fail ();
35465 }
35466
35467 /* Select a format to encode pointers in exception handling data. CODE
35468 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
35469 true if the symbol may be affected by dynamic relocations.
35470
35471 ??? All x86 object file formats are capable of representing this.
35472 After all, the relocation needed is the same as for the call insn.
35473 Whether or not a particular assembler allows us to enter such, I
35474 guess we'll have to see. */
35475 int
35476 asm_preferred_eh_data_format (int code, int global)
35477 {
35478 if (flag_pic)
35479 {
35480 int type = DW_EH_PE_sdata8;
35481 if (!TARGET_64BIT
35482 || ix86_cmodel == CM_SMALL_PIC
35483 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
35484 type = DW_EH_PE_sdata4;
35485 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
35486 }
35487 if (ix86_cmodel == CM_SMALL
35488 || (ix86_cmodel == CM_MEDIUM && code))
35489 return DW_EH_PE_udata4;
35490 return DW_EH_PE_absptr;
35491 }
35492 \f
35493 /* Expand copysign from SIGN to the positive value ABS_VALUE
35494 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
35495 the sign-bit. */
35496 static void
35497 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
35498 {
35499 enum machine_mode mode = GET_MODE (sign);
35500 rtx sgn = gen_reg_rtx (mode);
35501 if (mask == NULL_RTX)
35502 {
35503 enum machine_mode vmode;
35504
35505 if (mode == SFmode)
35506 vmode = V4SFmode;
35507 else if (mode == DFmode)
35508 vmode = V2DFmode;
35509 else
35510 vmode = mode;
35511
35512 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
35513 if (!VECTOR_MODE_P (mode))
35514 {
35515 /* We need to generate a scalar mode mask in this case. */
35516 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
35517 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
35518 mask = gen_reg_rtx (mode);
35519 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
35520 }
35521 }
35522 else
35523 mask = gen_rtx_NOT (mode, mask);
35524 emit_insn (gen_rtx_SET (VOIDmode, sgn,
35525 gen_rtx_AND (mode, mask, sign)));
35526 emit_insn (gen_rtx_SET (VOIDmode, result,
35527 gen_rtx_IOR (mode, abs_value, sgn)));
35528 }
35529
35530 /* Expand fabs (OP0) and return a new rtx that holds the result. The
35531 mask for masking out the sign-bit is stored in *SMASK, if that is
35532 non-null. */
35533 static rtx
35534 ix86_expand_sse_fabs (rtx op0, rtx *smask)
35535 {
35536 enum machine_mode vmode, mode = GET_MODE (op0);
35537 rtx xa, mask;
35538
35539 xa = gen_reg_rtx (mode);
35540 if (mode == SFmode)
35541 vmode = V4SFmode;
35542 else if (mode == DFmode)
35543 vmode = V2DFmode;
35544 else
35545 vmode = mode;
35546 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
35547 if (!VECTOR_MODE_P (mode))
35548 {
35549 /* We need to generate a scalar mode mask in this case. */
35550 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
35551 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
35552 mask = gen_reg_rtx (mode);
35553 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
35554 }
35555 emit_insn (gen_rtx_SET (VOIDmode, xa,
35556 gen_rtx_AND (mode, op0, mask)));
35557
35558 if (smask)
35559 *smask = mask;
35560
35561 return xa;
35562 }
35563
35564 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
35565 swapping the operands if SWAP_OPERANDS is true. The expanded
35566 code is a forward jump to a newly created label in case the
35567 comparison is true. The generated label rtx is returned. */
35568 static rtx
35569 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
35570 bool swap_operands)
35571 {
35572 rtx label, tmp;
35573
35574 if (swap_operands)
35575 {
35576 tmp = op0;
35577 op0 = op1;
35578 op1 = tmp;
35579 }
35580
35581 label = gen_label_rtx ();
35582 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
35583 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35584 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
35585 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
35586 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
35587 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
35588 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
35589 JUMP_LABEL (tmp) = label;
35590
35591 return label;
35592 }
35593
35594 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
35595 using comparison code CODE. Operands are swapped for the comparison if
35596 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
35597 static rtx
35598 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
35599 bool swap_operands)
35600 {
35601 rtx (*insn)(rtx, rtx, rtx, rtx);
35602 enum machine_mode mode = GET_MODE (op0);
35603 rtx mask = gen_reg_rtx (mode);
35604
35605 if (swap_operands)
35606 {
35607 rtx tmp = op0;
35608 op0 = op1;
35609 op1 = tmp;
35610 }
35611
35612 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
35613
35614 emit_insn (insn (mask, op0, op1,
35615 gen_rtx_fmt_ee (code, mode, op0, op1)));
35616 return mask;
35617 }
35618
35619 /* Generate and return a rtx of mode MODE for 2**n where n is the number
35620 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
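/* For illustration: for 0.0 <= x < 2**52 (2**23 for SFmode) with the
   default round-to-nearest mode,

	t = x + TWO52;		the addition itself rounds away the fraction
	x = t - TWO52;		the subtraction is exact, leaving the rounded x

   which is why the expanders below can round to integer without a
   float->int conversion; floor/ceil/round then compensate for the
   rounding direction.  */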
35621 static rtx
35622 ix86_gen_TWO52 (enum machine_mode mode)
35623 {
35624 REAL_VALUE_TYPE TWO52r;
35625 rtx TWO52;
35626
35627 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
35628 TWO52 = const_double_from_real_value (TWO52r, mode);
35629 TWO52 = force_reg (mode, TWO52);
35630
35631 return TWO52;
35632 }
35633
35634 /* Expand SSE sequence for computing lround from OP1 storing
35635 into OP0. */
35636 void
35637 ix86_expand_lround (rtx op0, rtx op1)
35638 {
35639 /* C code for the stuff we're doing below:
35640 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
35641 return (long)tmp;
35642 */
35643 enum machine_mode mode = GET_MODE (op1);
35644 const struct real_format *fmt;
35645 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35646 rtx adj;
35647
35648 /* load nextafter (0.5, 0.0) */
35649 fmt = REAL_MODE_FORMAT (mode);
35650 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35651 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
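  /* Illustrative note: using the largest value below 0.5 rather than 0.5
     itself keeps inputs just under 0.5 from rounding up.  E.g. for the
     double x = 0.49999999999999994, x + 0.5 rounds to 1.0, while
     x + pred_half stays below 1.0 and truncates to 0.  */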
35652
35653 /* adj = copysign (0.5, op1) */
35654 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
35655 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
35656
35657 /* adj = op1 + adj */
35658 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
35659
35660 /* op0 = (imode)adj */
35661 expand_fix (op0, adj, 0);
35662 }
35663
35664 /* Expand SSE2 sequence for computing lfloor or lceil (depending on
35665 DO_FLOOR) from OP1 storing into OP0. */
35666 void
35667 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
35668 {
35669 /* C code for the stuff we're doing below (for do_floor):
35670 xi = (long)op1;
35671 xi -= (double)xi > op1 ? 1 : 0;
35672 return xi;
35673 */
35674 enum machine_mode fmode = GET_MODE (op1);
35675 enum machine_mode imode = GET_MODE (op0);
35676 rtx ireg, freg, label, tmp;
35677
35678 /* reg = (long)op1 */
35679 ireg = gen_reg_rtx (imode);
35680 expand_fix (ireg, op1, 0);
35681
35682 /* freg = (double)reg */
35683 freg = gen_reg_rtx (fmode);
35684 expand_float (freg, ireg, 0);
35685
35686 /* ireg = (freg > op1) ? ireg - 1 : ireg */
35687 label = ix86_expand_sse_compare_and_jump (UNLE,
35688 freg, op1, !do_floor);
35689 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
35690 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
35691 emit_move_insn (ireg, tmp);
35692
35693 emit_label (label);
35694 LABEL_NUSES (label) = 1;
35695
35696 emit_move_insn (op0, ireg);
35697 }
35698
35699 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
35700 result in OPERAND0. */
35701 void
35702 ix86_expand_rint (rtx operand0, rtx operand1)
35703 {
35704 /* C code for the stuff we're doing below:
35705 xa = fabs (operand1);
35706 if (!isless (xa, 2**52))
35707 return operand1;
35708 xa = xa + 2**52 - 2**52;
35709 return copysign (xa, operand1);
35710 */
35711 enum machine_mode mode = GET_MODE (operand0);
35712 rtx res, xa, label, TWO52, mask;
35713
35714 res = gen_reg_rtx (mode);
35715 emit_move_insn (res, operand1);
35716
35717 /* xa = abs (operand1) */
35718 xa = ix86_expand_sse_fabs (res, &mask);
35719
35720 /* if (!isless (xa, TWO52)) goto label; */
35721 TWO52 = ix86_gen_TWO52 (mode);
35722 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35723
35724 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35725 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35726
35727 ix86_sse_copysign_to_positive (res, xa, res, mask);
35728
35729 emit_label (label);
35730 LABEL_NUSES (label) = 1;
35731
35732 emit_move_insn (operand0, res);
35733 }
35734
35735 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35736 into OPERAND0, without relying on DImode truncation via cvttsd2siq. */
35737 void
35738 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
35739 {
35740 /* C code for the stuff we expand below.
35741 double xa = fabs (x), x2;
35742 if (!isless (xa, TWO52))
35743 return x;
35744 xa = xa + TWO52 - TWO52;
35745 x2 = copysign (xa, x);
35746 Compensate. Floor:
35747 if (x2 > x)
35748 x2 -= 1;
35749 Compensate. Ceil:
35750 if (x2 < x)
35751 x2 -= -1;
35752 return x2;
35753 */
35754 enum machine_mode mode = GET_MODE (operand0);
35755 rtx xa, TWO52, tmp, label, one, res, mask;
35756
35757 TWO52 = ix86_gen_TWO52 (mode);
35758
35759 /* Temporary for holding the result, initialized to the input
35760 operand to ease control flow. */
35761 res = gen_reg_rtx (mode);
35762 emit_move_insn (res, operand1);
35763
35764 /* xa = abs (operand1) */
35765 xa = ix86_expand_sse_fabs (res, &mask);
35766
35767 /* if (!isless (xa, TWO52)) goto label; */
35768 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35769
35770 /* xa = xa + TWO52 - TWO52; */
35771 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35772 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35773
35774 /* xa = copysign (xa, operand1) */
35775 ix86_sse_copysign_to_positive (xa, xa, res, mask);
35776
35777 /* generate 1.0 or -1.0 */
35778 one = force_reg (mode,
35779 const_double_from_real_value (do_floor
35780 ? dconst1 : dconstm1, mode));
35781
35782 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35783 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35784 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35785 gen_rtx_AND (mode, one, tmp)));
35786 /* We always need to subtract here to preserve signed zero. */
35787 tmp = expand_simple_binop (mode, MINUS,
35788 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35789 emit_move_insn (res, tmp);
35790
35791 emit_label (label);
35792 LABEL_NUSES (label) = 1;
35793
35794 emit_move_insn (operand0, res);
35795 }
35796
35797 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35798 into OPERAND0. */
35799 void
35800 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
35801 {
35802 /* C code for the stuff we expand below.
35803 double xa = fabs (x), x2;
35804 if (!isless (xa, TWO52))
35805 return x;
35806 x2 = (double)(long)x;
35807 Compensate. Floor:
35808 if (x2 > x)
35809 x2 -= 1;
35810 Compensate. Ceil:
35811 if (x2 < x)
35812 x2 += 1;
35813 if (HONOR_SIGNED_ZEROS (mode))
35814 return copysign (x2, x);
35815 return x2;
35816 */
35817 enum machine_mode mode = GET_MODE (operand0);
35818 rtx xa, xi, TWO52, tmp, label, one, res, mask;
35819
35820 TWO52 = ix86_gen_TWO52 (mode);
35821
35822 /* Temporary for holding the result, initialized to the input
35823 operand to ease control flow. */
35824 res = gen_reg_rtx (mode);
35825 emit_move_insn (res, operand1);
35826
35827 /* xa = abs (operand1) */
35828 xa = ix86_expand_sse_fabs (res, &mask);
35829
35830 /* if (!isless (xa, TWO52)) goto label; */
35831 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35832
35833 /* xa = (double)(long)x */
35834 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35835 expand_fix (xi, res, 0);
35836 expand_float (xa, xi, 0);
35837
35838 /* generate 1.0 */
35839 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35840
35841 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35842 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35843 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35844 gen_rtx_AND (mode, one, tmp)));
35845 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
35846 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35847 emit_move_insn (res, tmp);
35848
35849 if (HONOR_SIGNED_ZEROS (mode))
35850 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35851
35852 emit_label (label);
35853 LABEL_NUSES (label) = 1;
35854
35855 emit_move_insn (operand0, res);
35856 }
35857
35858 /* Expand SSE sequence for computing round from OPERAND1 storing
35859 into OPERAND0. Sequence that works without relying on DImode truncation
35860 via cvttsd2siq that is only available on 64bit targets. */
35861 void
35862 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
35863 {
35864 /* C code for the stuff we expand below.
35865 double xa = fabs (x), xa2, x2;
35866 if (!isless (xa, TWO52))
35867 return x;
35868 Using the absolute value and copying back sign makes
35869 -0.0 -> -0.0 correct.
35870 xa2 = xa + TWO52 - TWO52;
35871 Compensate.
35872 dxa = xa2 - xa;
35873 if (dxa <= -0.5)
35874 xa2 += 1;
35875 else if (dxa > 0.5)
35876 xa2 -= 1;
35877 x2 = copysign (xa2, x);
35878 return x2;
35879 */
35880 enum machine_mode mode = GET_MODE (operand0);
35881 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
35882
35883 TWO52 = ix86_gen_TWO52 (mode);
35884
35885 /* Temporary for holding the result, initialized to the input
35886 operand to ease control flow. */
35887 res = gen_reg_rtx (mode);
35888 emit_move_insn (res, operand1);
35889
35890 /* xa = abs (operand1) */
35891 xa = ix86_expand_sse_fabs (res, &mask);
35892
35893 /* if (!isless (xa, TWO52)) goto label; */
35894 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35895
35896 /* xa2 = xa + TWO52 - TWO52; */
35897 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35898 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
35899
35900 /* dxa = xa2 - xa; */
35901 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
35902
35903 /* generate 0.5, 1.0 and -0.5 */
35904 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
35905 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
35906 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
35907 0, OPTAB_DIRECT);
35908
35909 /* Compensate. */
35910 tmp = gen_reg_rtx (mode);
35911 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
35912 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
35913 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35914 gen_rtx_AND (mode, one, tmp)));
35915 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35916 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
35917 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
35918 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35919 gen_rtx_AND (mode, one, tmp)));
35920 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35921
35922 /* res = copysign (xa2, operand1) */
35923 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
35924
35925 emit_label (label);
35926 LABEL_NUSES (label) = 1;
35927
35928 emit_move_insn (operand0, res);
35929 }
35930
35931 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35932 into OPERAND0. */
35933 void
35934 ix86_expand_trunc (rtx operand0, rtx operand1)
35935 {
35936 /* C code for SSE variant we expand below.
35937 double xa = fabs (x), x2;
35938 if (!isless (xa, TWO52))
35939 return x;
35940 x2 = (double)(long)x;
35941 if (HONOR_SIGNED_ZEROS (mode))
35942 return copysign (x2, x);
35943 return x2;
35944 */
35945 enum machine_mode mode = GET_MODE (operand0);
35946 rtx xa, xi, TWO52, label, res, mask;
35947
35948 TWO52 = ix86_gen_TWO52 (mode);
35949
35950 /* Temporary for holding the result, initialized to the input
35951 operand to ease control flow. */
35952 res = gen_reg_rtx (mode);
35953 emit_move_insn (res, operand1);
35954
35955 /* xa = abs (operand1) */
35956 xa = ix86_expand_sse_fabs (res, &mask);
35957
35958 /* if (!isless (xa, TWO52)) goto label; */
35959 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35960
35961 /* x = (double)(long)x */
35962 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35963 expand_fix (xi, res, 0);
35964 expand_float (res, xi, 0);
35965
35966 if (HONOR_SIGNED_ZEROS (mode))
35967 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35968
35969 emit_label (label);
35970 LABEL_NUSES (label) = 1;
35971
35972 emit_move_insn (operand0, res);
35973 }
35974
35975 /* Expand SSE sequence for computing trunc from OPERAND1 storing into
35976 OPERAND0, without relying on DImode truncation via cvttsd2siq. */
35977 void
35978 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
35979 {
35980 enum machine_mode mode = GET_MODE (operand0);
35981 rtx xa, mask, TWO52, label, one, res, smask, tmp;
35982
35983 /* C code for SSE variant we expand below.
35984 double xa = fabs (x), x2;
35985 if (!isless (xa, TWO52))
35986 return x;
35987 xa2 = xa + TWO52 - TWO52;
35988 Compensate:
35989 if (xa2 > xa)
35990 xa2 -= 1.0;
35991 x2 = copysign (xa2, x);
35992 return x2;
35993 */
35994
35995 TWO52 = ix86_gen_TWO52 (mode);
35996
35997 /* Temporary for holding the result, initialized to the input
35998 operand to ease control flow. */
35999 res = gen_reg_rtx (mode);
36000 emit_move_insn (res, operand1);
36001
36002 /* xa = abs (operand1) */
36003 xa = ix86_expand_sse_fabs (res, &smask);
36004
36005 /* if (!isless (xa, TWO52)) goto label; */
36006 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36007
36008 /* res = xa + TWO52 - TWO52; */
36009 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
36010 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
36011 emit_move_insn (res, tmp);
36012
36013 /* generate 1.0 */
36014 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
36015
36016 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
36017 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
36018 emit_insn (gen_rtx_SET (VOIDmode, mask,
36019 gen_rtx_AND (mode, mask, one)));
36020 tmp = expand_simple_binop (mode, MINUS,
36021 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
36022 emit_move_insn (res, tmp);
36023
36024 /* res = copysign (res, operand1) */
36025 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
36026
36027 emit_label (label);
36028 LABEL_NUSES (label) = 1;
36029
36030 emit_move_insn (operand0, res);
36031 }
36032
36033 /* Expand SSE sequence for computing round from OPERAND1 storing
36034 into OPERAND0. */
36035 void
36036 ix86_expand_round (rtx operand0, rtx operand1)
36037 {
36038 /* C code for the stuff we're doing below:
36039 double xa = fabs (x);
36040 if (!isless (xa, TWO52))
36041 return x;
36042 xa = (double)(long)(xa + nextafter (0.5, 0.0));
36043 return copysign (xa, x);
36044 */
36045 enum machine_mode mode = GET_MODE (operand0);
36046 rtx res, TWO52, xa, label, xi, half, mask;
36047 const struct real_format *fmt;
36048 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
36049
36050 /* Temporary for holding the result, initialized to the input
36051 operand to ease control flow. */
36052 res = gen_reg_rtx (mode);
36053 emit_move_insn (res, operand1);
36054
36055 TWO52 = ix86_gen_TWO52 (mode);
36056 xa = ix86_expand_sse_fabs (res, &mask);
36057 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36058
36059 /* load nextafter (0.5, 0.0) */
36060 fmt = REAL_MODE_FORMAT (mode);
36061 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
36062 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
36063
36064 /* xa = xa + 0.5 */
36065 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
36066 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
36067
36068 /* xa = (double)(int64_t)xa */
36069 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
36070 expand_fix (xi, xa, 0);
36071 expand_float (xa, xi, 0);
36072
36073 /* res = copysign (xa, operand1) */
36074 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
36075
36076 emit_label (label);
36077 LABEL_NUSES (label) = 1;
36078
36079 emit_move_insn (operand0, res);
36080 }
36081
36082 /* Expand SSE sequence for computing round
36083 from OP1 storing into OP0 using sse4 round insn. */
36084 void
36085 ix86_expand_round_sse4 (rtx op0, rtx op1)
36086 {
36087 enum machine_mode mode = GET_MODE (op0);
36088 rtx e1, e2, res, half;
36089 const struct real_format *fmt;
36090 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
36091 rtx (*gen_copysign) (rtx, rtx, rtx);
36092 rtx (*gen_round) (rtx, rtx, rtx);
36093
36094 switch (mode)
36095 {
36096 case SFmode:
36097 gen_copysign = gen_copysignsf3;
36098 gen_round = gen_sse4_1_roundsf2;
36099 break;
36100 case DFmode:
36101 gen_copysign = gen_copysigndf3;
36102 gen_round = gen_sse4_1_rounddf2;
36103 break;
36104 default:
36105 gcc_unreachable ();
36106 }
36107
36108 /* round (a) = trunc (a + copysign (0.5, a)) */
36109
36110 /* load nextafter (0.5, 0.0) */
36111 fmt = REAL_MODE_FORMAT (mode);
36112 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
36113 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
36114 half = const_double_from_real_value (pred_half, mode);
36115
36116 /* e1 = copysign (0.5, op1) */
36117 e1 = gen_reg_rtx (mode);
36118 emit_insn (gen_copysign (e1, half, op1));
36119
36120 /* e2 = op1 + e1 */
36121 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
36122
36123 /* res = trunc (e2) */
36124 res = gen_reg_rtx (mode);
36125 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
36126
36127 emit_move_insn (op0, res);
36128 }
36129 \f
36130
36131 /* Table of valid machine attributes. */
36132 static const struct attribute_spec ix86_attribute_table[] =
36133 {
36134 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
36135 affects_type_identity } */
36136 /* Stdcall attribute says callee is responsible for popping arguments
36137 if they are not variable. */
36138 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36139 true },
36140 /* Fastcall attribute says callee is responsible for popping arguments
36141 if they are not variable. */
36142 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36143 true },
36144 /* Thiscall attribute says callee is responsible for popping arguments
36145 if they are not variable. */
36146 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36147 true },
36148 /* Cdecl attribute says the callee is a normal C declaration */
36149 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36150 true },
36151 /* Regparm attribute specifies how many integer arguments are to be
36152 passed in registers. */
36153 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
36154 true },
36155 /* Sseregparm attribute says we are using x86_64 calling conventions
36156 for FP arguments. */
36157 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36158 true },
36159 /* The transactional memory builtins are implicitly regparm or fastcall
36160 depending on the ABI. Override the generic do-nothing attribute that
36161 these builtins were declared with. */
36162 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
36163 true },
36164 /* force_align_arg_pointer says this function realigns the stack at entry. */
36165 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
36166 false, true, true, ix86_handle_cconv_attribute, false },
36167 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
36168 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
36169 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
36170 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
36171 false },
36172 #endif
36173 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
36174 false },
36175 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
36176 false },
36177 #ifdef SUBTARGET_ATTRIBUTE_TABLE
36178 SUBTARGET_ATTRIBUTE_TABLE,
36179 #endif
36180 /* ms_abi and sysv_abi calling convention function attributes. */
36181 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
36182 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
36183 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
36184 false },
36185 { "callee_pop_aggregate_return", 1, 1, false, true, true,
36186 ix86_handle_callee_pop_aggregate_return, true },
36187 /* End element. */
36188 { NULL, 0, 0, false, false, false, NULL, false }
36189 };
36190
36191 /* Implement targetm.vectorize.builtin_vectorization_cost. */
36192 static int
36193 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
36194 tree vectype,
36195 int misalign ATTRIBUTE_UNUSED)
36196 {
36197 unsigned elements;
36198
36199 switch (type_of_cost)
36200 {
36201 case scalar_stmt:
36202 return ix86_cost->scalar_stmt_cost;
36203
36204 case scalar_load:
36205 return ix86_cost->scalar_load_cost;
36206
36207 case scalar_store:
36208 return ix86_cost->scalar_store_cost;
36209
36210 case vector_stmt:
36211 return ix86_cost->vec_stmt_cost;
36212
36213 case vector_load:
36214 return ix86_cost->vec_align_load_cost;
36215
36216 case vector_store:
36217 return ix86_cost->vec_store_cost;
36218
36219 case vec_to_scalar:
36220 return ix86_cost->vec_to_scalar_cost;
36221
36222 case scalar_to_vec:
36223 return ix86_cost->scalar_to_vec_cost;
36224
36225 case unaligned_load:
36226 case unaligned_store:
36227 return ix86_cost->vec_unalign_load_cost;
36228
36229 case cond_branch_taken:
36230 return ix86_cost->cond_taken_branch_cost;
36231
36232 case cond_branch_not_taken:
36233 return ix86_cost->cond_not_taken_branch_cost;
36234
36235 case vec_perm:
36236 case vec_promote_demote:
36237 return ix86_cost->vec_stmt_cost;
36238
36239 case vec_construct:
36240 elements = TYPE_VECTOR_SUBPARTS (vectype);
36241 return elements / 2 + 1;
36242
36243 default:
36244 gcc_unreachable ();
36245 }
36246 }
36247
36248 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
36249 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
36250 insn every time. */
36251
36252 static GTY(()) rtx vselect_insn;
36253
36254 /* Initialize vselect_insn. */
36255
36256 static void
36257 init_vselect_insn (void)
36258 {
36259 unsigned i;
36260 rtx x;
36261
36262 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
36263 for (i = 0; i < MAX_VECT_LEN; ++i)
36264 XVECEXP (x, 0, i) = const0_rtx;
36265 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
36266 const0_rtx), x);
36267 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
36268 start_sequence ();
36269 vselect_insn = emit_insn (x);
36270 end_sequence ();
36271 }
36272
36273 /* Construct (set target (vec_select op0 (parallel perm))) and
36274 return true if that's a valid instruction in the active ISA. */
36275
36276 static bool
36277 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
36278 unsigned nelt, bool testing_p)
36279 {
36280 unsigned int i;
36281 rtx x, save_vconcat;
36282 int icode;
36283
36284 if (vselect_insn == NULL_RTX)
36285 init_vselect_insn ();
36286
36287 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
36288 PUT_NUM_ELEM (XVEC (x, 0), nelt);
36289 for (i = 0; i < nelt; ++i)
36290 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
36291 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
36292 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
36293 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
36294 SET_DEST (PATTERN (vselect_insn)) = target;
36295 icode = recog_memoized (vselect_insn);
36296
36297 if (icode >= 0 && !testing_p)
36298 emit_insn (copy_rtx (PATTERN (vselect_insn)));
36299
36300 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
36301 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
36302 INSN_CODE (vselect_insn) = -1;
36303
36304 return icode >= 0;
36305 }
36306
36307 /* Similar, but generate a vec_concat from op0 and op1 as well. */
36308
36309 static bool
36310 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
36311 const unsigned char *perm, unsigned nelt,
36312 bool testing_p)
36313 {
36314 enum machine_mode v2mode;
36315 rtx x;
36316 bool ok;
36317
36318 if (vselect_insn == NULL_RTX)
36319 init_vselect_insn ();
36320
36321 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
36322 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
36323 PUT_MODE (x, v2mode);
36324 XEXP (x, 0) = op0;
36325 XEXP (x, 1) = op1;
36326 ok = expand_vselect (target, x, perm, nelt, testing_p);
36327 XEXP (x, 0) = const0_rtx;
36328 XEXP (x, 1) = const0_rtx;
36329 return ok;
36330 }
36331
36332 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36333 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
36334
36335 static bool
36336 expand_vec_perm_blend (struct expand_vec_perm_d *d)
36337 {
36338 enum machine_mode vmode = d->vmode;
36339 unsigned i, mask, nelt = d->nelt;
36340 rtx target, op0, op1, x;
36341 rtx rperm[32], vperm;
36342
36343 if (d->one_operand_p)
36344 return false;
36345 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
36346 ;
36347 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
36348 ;
36349 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
36350 ;
36351 else
36352 return false;
36353
36354 /* This is a blend, not a permute. Elements must stay in their
36355 respective lanes. */
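  /* For example, with V4SFmode the permutation { 0, 5, 2, 7 } is a valid
     blend (each result element comes from position i of either op0 or op1,
     giving mask 0b1010), whereas { 1, 5, 2, 7 } is not, because the first
     result element would have to come from op0 element 1.  */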
36356 for (i = 0; i < nelt; ++i)
36357 {
36358 unsigned e = d->perm[i];
36359 if (!(e == i || e == i + nelt))
36360 return false;
36361 }
36362
36363 if (d->testing_p)
36364 return true;
36365
36366 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
36367 decision should be extracted elsewhere, so that we only try that
36368 sequence once all budget==3 options have been tried. */
36369 target = d->target;
36370 op0 = d->op0;
36371 op1 = d->op1;
36372 mask = 0;
36373
36374 switch (vmode)
36375 {
36376 case V4DFmode:
36377 case V8SFmode:
36378 case V2DFmode:
36379 case V4SFmode:
36380 case V8HImode:
36381 case V8SImode:
36382 for (i = 0; i < nelt; ++i)
36383 mask |= (d->perm[i] >= nelt) << i;
36384 break;
36385
36386 case V2DImode:
36387 for (i = 0; i < 2; ++i)
36388 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
36389 vmode = V8HImode;
36390 goto do_subreg;
36391
36392 case V4SImode:
36393 for (i = 0; i < 4; ++i)
36394 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
36395 vmode = V8HImode;
36396 goto do_subreg;
36397
36398 case V16QImode:
36399 /* See if bytes move in pairs so we can use pblendw with
36400 an immediate argument, rather than pblendvb with a vector
36401 argument. */
36402 for (i = 0; i < 16; i += 2)
36403 if (d->perm[i] + 1 != d->perm[i + 1])
36404 {
36405 use_pblendvb:
36406 for (i = 0; i < nelt; ++i)
36407 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
36408
36409 finish_pblendvb:
36410 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
36411 vperm = force_reg (vmode, vperm);
36412
36413 if (GET_MODE_SIZE (vmode) == 16)
36414 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
36415 else
36416 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
36417 return true;
36418 }
36419
36420 for (i = 0; i < 8; ++i)
36421 mask |= (d->perm[i * 2] >= 16) << i;
36422 vmode = V8HImode;
36423 /* FALLTHRU */
36424
36425 do_subreg:
36426 target = gen_lowpart (vmode, target);
36427 op0 = gen_lowpart (vmode, op0);
36428 op1 = gen_lowpart (vmode, op1);
36429 break;
36430
36431 case V32QImode:
36432 /* See if bytes move in pairs. If not, vpblendvb must be used. */
36433 for (i = 0; i < 32; i += 2)
36434 if (d->perm[i] + 1 != d->perm[i + 1])
36435 goto use_pblendvb;
36436 /* See if bytes move in quadruplets. If yes, vpblendd
36437 with immediate can be used. */
36438 for (i = 0; i < 32; i += 4)
36439 if (d->perm[i] + 2 != d->perm[i + 2])
36440 break;
36441 if (i < 32)
36442 {
36443 /* See if bytes move the same in both lanes. If yes,
36444 vpblendw with immediate can be used. */
36445 for (i = 0; i < 16; i += 2)
36446 if (d->perm[i] + 16 != d->perm[i + 16])
36447 goto use_pblendvb;
36448
36449 /* Use vpblendw. */
36450 for (i = 0; i < 16; ++i)
36451 mask |= (d->perm[i * 2] >= 32) << i;
36452 vmode = V16HImode;
36453 goto do_subreg;
36454 }
36455
36456 /* Use vpblendd. */
36457 for (i = 0; i < 8; ++i)
36458 mask |= (d->perm[i * 4] >= 32) << i;
36459 vmode = V8SImode;
36460 goto do_subreg;
36461
36462 case V16HImode:
36463 /* See if words move in pairs. If yes, vpblendd can be used. */
36464 for (i = 0; i < 16; i += 2)
36465 if (d->perm[i] + 1 != d->perm[i + 1])
36466 break;
36467 if (i < 16)
36468 {
36469 /* See if words move the same in both lanes. If not,
36470 vpblendvb must be used. */
36471 for (i = 0; i < 8; i++)
36472 if (d->perm[i] + 8 != d->perm[i + 8])
36473 {
36474 /* Use vpblendvb. */
36475 for (i = 0; i < 32; ++i)
36476 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
36477
36478 vmode = V32QImode;
36479 nelt = 32;
36480 target = gen_lowpart (vmode, target);
36481 op0 = gen_lowpart (vmode, op0);
36482 op1 = gen_lowpart (vmode, op1);
36483 goto finish_pblendvb;
36484 }
36485
36486 /* Use vpblendw. */
36487 for (i = 0; i < 16; ++i)
36488 mask |= (d->perm[i] >= 16) << i;
36489 break;
36490 }
36491
36492 /* Use vpblendd. */
36493 for (i = 0; i < 8; ++i)
36494 mask |= (d->perm[i * 2] >= 16) << i;
36495 vmode = V8SImode;
36496 goto do_subreg;
36497
36498 case V4DImode:
36499 /* Use vpblendd. */
36500 for (i = 0; i < 4; ++i)
36501 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
36502 vmode = V8SImode;
36503 goto do_subreg;
36504
36505 default:
36506 gcc_unreachable ();
36507 }
36508
36509 /* This matches five different patterns with the different modes. */
36510 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
36511 x = gen_rtx_SET (VOIDmode, target, x);
36512 emit_insn (x);
36513
36514 return true;
36515 }
36516
36517 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36518 in terms of the variable form of vpermilps.
36519
36520 Note that we will have already failed the immediate input vpermilps,
36521 which requires that the high and low part shuffle be identical; the
36522 variable form doesn't require that. */
36523
36524 static bool
36525 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
36526 {
36527 rtx rperm[8], vperm;
36528 unsigned i;
36529
36530 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
36531 return false;
36532
36533 /* We can only permute within the 128-bit lane. */
36534 for (i = 0; i < 8; ++i)
36535 {
36536 unsigned e = d->perm[i];
36537 if (i < 4 ? e >= 4 : e < 4)
36538 return false;
36539 }
36540
36541 if (d->testing_p)
36542 return true;
36543
36544 for (i = 0; i < 8; ++i)
36545 {
36546 unsigned e = d->perm[i];
36547
36548 /* Within each 128-bit lane, the elements of op0 are numbered
36549 from 0 and the elements of op1 are numbered from 4. */
36550 if (e >= 8 + 4)
36551 e -= 8;
36552 else if (e >= 4)
36553 e -= 4;
36554
36555 rperm[i] = GEN_INT (e);
36556 }
36557
36558 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
36559 vperm = force_reg (V8SImode, vperm);
36560 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
36561
36562 return true;
36563 }
36564
36565 /* Return true if permutation D can be performed as VMODE permutation
36566 instead. */
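/* E.g. a V16QImode permutation can be performed as a V4SImode one when
   every aligned group of four byte indices is consecutive and starts on a
   multiple of four: { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 }
   qualifies, while { 1, 2, 3, 4, ... } does not, since its first chunk is
   not aligned.  */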
36567
36568 static bool
36569 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
36570 {
36571 unsigned int i, j, chunk;
36572
36573 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
36574 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
36575 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
36576 return false;
36577
36578 if (GET_MODE_NUNITS (vmode) >= d->nelt)
36579 return true;
36580
36581 chunk = d->nelt / GET_MODE_NUNITS (vmode);
36582 for (i = 0; i < d->nelt; i += chunk)
36583 if (d->perm[i] & (chunk - 1))
36584 return false;
36585 else
36586 for (j = 1; j < chunk; ++j)
36587 if (d->perm[i] + j != d->perm[i + j])
36588 return false;
36589
36590 return true;
36591 }
36592
36593 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36594 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
36595
36596 static bool
36597 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
36598 {
36599 unsigned i, nelt, eltsz, mask;
36600 unsigned char perm[32];
36601 enum machine_mode vmode = V16QImode;
36602 rtx rperm[32], vperm, target, op0, op1;
36603
36604 nelt = d->nelt;
36605
36606 if (!d->one_operand_p)
36607 {
36608 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
36609 {
36610 if (TARGET_AVX2
36611 && valid_perm_using_mode_p (V2TImode, d))
36612 {
36613 if (d->testing_p)
36614 return true;
36615
36616 /* Use vperm2i128 insn. The pattern uses
36617 V4DImode instead of V2TImode. */
36618 target = gen_lowpart (V4DImode, d->target);
36619 op0 = gen_lowpart (V4DImode, d->op0);
36620 op1 = gen_lowpart (V4DImode, d->op1);
36621 rperm[0]
36622 = GEN_INT ((d->perm[0] / (nelt / 2))
36623 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
36624 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
36625 return true;
36626 }
36627 return false;
36628 }
36629 }
36630 else
36631 {
36632 if (GET_MODE_SIZE (d->vmode) == 16)
36633 {
36634 if (!TARGET_SSSE3)
36635 return false;
36636 }
36637 else if (GET_MODE_SIZE (d->vmode) == 32)
36638 {
36639 if (!TARGET_AVX2)
36640 return false;
36641
36642 /* V4DImode should be already handled through
36643 expand_vselect by vpermq instruction. */
36644 gcc_assert (d->vmode != V4DImode);
36645
36646 vmode = V32QImode;
36647 if (d->vmode == V8SImode
36648 || d->vmode == V16HImode
36649 || d->vmode == V32QImode)
36650 {
36651 /* First see if vpermq can be used for
36652 V8SImode/V16HImode/V32QImode. */
36653 if (valid_perm_using_mode_p (V4DImode, d))
36654 {
36655 for (i = 0; i < 4; i++)
36656 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
36657 if (d->testing_p)
36658 return true;
36659 return expand_vselect (gen_lowpart (V4DImode, d->target),
36660 gen_lowpart (V4DImode, d->op0),
36661 perm, 4, false);
36662 }
36663
36664 /* Next see if vpermd can be used. */
36665 if (valid_perm_using_mode_p (V8SImode, d))
36666 vmode = V8SImode;
36667 }
36668 /* Or if vpermps can be used. */
36669 else if (d->vmode == V8SFmode)
36670 vmode = V8SImode;
36671
36672 if (vmode == V32QImode)
36673 {
36674 /* vpshufb only works within 128-bit lanes; it cannot
36675 shuffle bytes between the lanes. */
36676 for (i = 0; i < nelt; ++i)
36677 if ((d->perm[i] ^ i) & (nelt / 2))
36678 return false;
36679 }
36680 }
36681 else
36682 return false;
36683 }
36684
36685 if (d->testing_p)
36686 return true;
36687
36688 if (vmode == V8SImode)
36689 for (i = 0; i < 8; ++i)
36690 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
36691 else
36692 {
36693 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36694 if (!d->one_operand_p)
36695 mask = 2 * nelt - 1;
36696 else if (vmode == V16QImode)
36697 mask = nelt - 1;
36698 else
36699 mask = nelt / 2 - 1;
36700
36701 for (i = 0; i < nelt; ++i)
36702 {
36703 unsigned j, e = d->perm[i] & mask;
36704 for (j = 0; j < eltsz; ++j)
36705 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
36706 }
36707 }
36708
36709 vperm = gen_rtx_CONST_VECTOR (vmode,
36710 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
36711 vperm = force_reg (vmode, vperm);
36712
36713 target = gen_lowpart (vmode, d->target);
36714 op0 = gen_lowpart (vmode, d->op0);
36715 if (d->one_operand_p)
36716 {
36717 if (vmode == V16QImode)
36718 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
36719 else if (vmode == V32QImode)
36720 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
36721 else if (vmode == V8SFmode)
36722 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
36723 else
36724 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
36725 }
36726 else
36727 {
36728 op1 = gen_lowpart (vmode, d->op1);
36729 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
36730 }
36731
36732 return true;
36733 }
36734
36735 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
36736 in a single instruction. */
36737
36738 static bool
36739 expand_vec_perm_1 (struct expand_vec_perm_d *d)
36740 {
36741 unsigned i, nelt = d->nelt;
36742 unsigned char perm2[MAX_VECT_LEN];
36743
36744 /* Check plain VEC_SELECT first, because AVX has instructions that could
36745 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
36746 input where SEL+CONCAT may not. */
36747 if (d->one_operand_p)
36748 {
36749 int mask = nelt - 1;
36750 bool identity_perm = true;
36751 bool broadcast_perm = true;
36752
36753 for (i = 0; i < nelt; i++)
36754 {
36755 perm2[i] = d->perm[i] & mask;
36756 if (perm2[i] != i)
36757 identity_perm = false;
36758 if (perm2[i])
36759 broadcast_perm = false;
36760 }
36761
36762 if (identity_perm)
36763 {
36764 if (!d->testing_p)
36765 emit_move_insn (d->target, d->op0);
36766 return true;
36767 }
36768 else if (broadcast_perm && TARGET_AVX2)
36769 {
36770 /* Use vpbroadcast{b,w,d}. */
36771 rtx (*gen) (rtx, rtx) = NULL;
36772 switch (d->vmode)
36773 {
36774 case V32QImode:
36775 gen = gen_avx2_pbroadcastv32qi_1;
36776 break;
36777 case V16HImode:
36778 gen = gen_avx2_pbroadcastv16hi_1;
36779 break;
36780 case V8SImode:
36781 gen = gen_avx2_pbroadcastv8si_1;
36782 break;
36783 case V16QImode:
36784 gen = gen_avx2_pbroadcastv16qi;
36785 break;
36786 case V8HImode:
36787 gen = gen_avx2_pbroadcastv8hi;
36788 break;
36789 case V8SFmode:
36790 gen = gen_avx2_vec_dupv8sf_1;
36791 break;
36792 /* For other modes prefer other shuffles this function creates. */
36793 default: break;
36794 }
36795 if (gen != NULL)
36796 {
36797 if (!d->testing_p)
36798 emit_insn (gen (d->target, d->op0));
36799 return true;
36800 }
36801 }
36802
36803 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
36804 return true;
36805
36806 /* There are plenty of patterns in sse.md that are written for
36807 SEL+CONCAT and are not replicated for a single op. Perhaps
36808 that should be changed, to avoid the nastiness here. */
36809
36810 /* Recognize interleave style patterns, which means incrementing
36811 every other permutation operand. */
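      /* For illustration: a one-operand V4SImode permutation { 0, 0, 1, 1 }
	 becomes { 0, 4, 1, 5 } on the (op0, op0) concatenation, which can
	 match the punpckldq (interleave-low) pattern.  */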
36812 for (i = 0; i < nelt; i += 2)
36813 {
36814 perm2[i] = d->perm[i] & mask;
36815 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
36816 }
36817 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
36818 d->testing_p))
36819 return true;
36820
36821 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
36822 if (nelt >= 4)
36823 {
36824 for (i = 0; i < nelt; i += 4)
36825 {
36826 perm2[i + 0] = d->perm[i + 0] & mask;
36827 perm2[i + 1] = d->perm[i + 1] & mask;
36828 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
36829 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
36830 }
36831
36832 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
36833 d->testing_p))
36834 return true;
36835 }
36836 }
36837
36838 /* Finally, try the fully general two operand permute. */
36839 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
36840 d->testing_p))
36841 return true;
36842
36843 /* Recognize interleave style patterns with reversed operands. */
36844 if (!d->one_operand_p)
36845 {
36846 for (i = 0; i < nelt; ++i)
36847 {
36848 unsigned e = d->perm[i];
36849 if (e >= nelt)
36850 e -= nelt;
36851 else
36852 e += nelt;
36853 perm2[i] = e;
36854 }
36855
36856 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
36857 d->testing_p))
36858 return true;
36859 }
36860
36861 /* Try the SSE4.1 blend variable merge instructions. */
36862 if (expand_vec_perm_blend (d))
36863 return true;
36864
36865 /* Try one of the AVX vpermil variable permutations. */
36866 if (expand_vec_perm_vpermil (d))
36867 return true;
36868
36869 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
36870 vpshufb, vpermd, vpermps or vpermq variable permutation. */
36871 if (expand_vec_perm_pshufb (d))
36872 return true;
36873
36874 return false;
36875 }
36876
36877 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
36878 in terms of a pair of pshuflw + pshufhw instructions. */
36879
36880 static bool
36881 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
36882 {
36883 unsigned char perm2[MAX_VECT_LEN];
36884 unsigned i;
36885 bool ok;
36886
36887 if (d->vmode != V8HImode || !d->one_operand_p)
36888 return false;
36889
36890 /* The two permutations only operate in 64-bit lanes. */
36891 for (i = 0; i < 4; ++i)
36892 if (d->perm[i] >= 4)
36893 return false;
36894 for (i = 4; i < 8; ++i)
36895 if (d->perm[i] < 4)
36896 return false;
36897
36898 if (d->testing_p)
36899 return true;
36900
36901 /* Emit the pshuflw. */
36902 memcpy (perm2, d->perm, 4);
36903 for (i = 4; i < 8; ++i)
36904 perm2[i] = i;
36905 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
36906 gcc_assert (ok);
36907
36908 /* Emit the pshufhw. */
36909 memcpy (perm2 + 4, d->perm + 4, 4);
36910 for (i = 0; i < 4; ++i)
36911 perm2[i] = i;
36912 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
36913 gcc_assert (ok);
36914
36915 return true;
36916 }
36917
36918 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
36919 the permutation using the SSSE3 palignr instruction. This succeeds
36920 when all of the elements in PERM fit within one vector and we merely
36921 need to shift them down so that a single vector permutation has a
36922 chance to succeed. */
36923
36924 static bool
36925 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
36926 {
36927 unsigned i, nelt = d->nelt;
36928 unsigned min, max;
36929 bool in_order, ok;
36930 rtx shift;
36931
36932 /* Even with AVX, palignr only operates on 128-bit vectors. */
36933 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36934 return false;
36935
36936 min = nelt, max = 0;
36937 for (i = 0; i < nelt; ++i)
36938 {
36939 unsigned e = d->perm[i];
36940 if (e < min)
36941 min = e;
36942 if (e > max)
36943 max = e;
36944 }
36945 if (min == 0 || max - min >= nelt)
36946 return false;
36947
36948 /* Given that we have SSSE3, we know we'll be able to implement the
36949 single operand permutation after the palignr with pshufb. */
36950 if (d->testing_p)
36951 return true;
36952
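 /* palignr shifts the op1:op0 concatenation right by MIN elements, so
 every requested element ends up within a single vector; the remaining
 single-operand permutation is handled by expand_vec_perm_1 below. */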
36953 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
36954 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
36955 gen_lowpart (TImode, d->op1),
36956 gen_lowpart (TImode, d->op0), shift));
36957
36958 d->op0 = d->op1 = d->target;
36959 d->one_operand_p = true;
36960
36961 in_order = true;
36962 for (i = 0; i < nelt; ++i)
36963 {
36964 unsigned e = d->perm[i] - min;
36965 if (e != i)
36966 in_order = false;
36967 d->perm[i] = e;
36968 }
36969
36970 /* Test for the degenerate case where the alignment by itself
36971 produces the desired permutation. */
36972 if (in_order)
36973 return true;
36974
36975 ok = expand_vec_perm_1 (d);
36976 gcc_assert (ok);
36977
36978 return ok;
36979 }
36980
36981 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
36982
36983 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
36984 a two vector permutation into a single vector permutation by using
36985 an interleave operation to merge the vectors. */
36986
36987 static bool
36988 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
36989 {
36990 struct expand_vec_perm_d dremap, dfinal;
36991 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
36992 unsigned HOST_WIDE_INT contents;
36993 unsigned char remap[2 * MAX_VECT_LEN];
36994 rtx seq;
36995 bool ok, same_halves = false;
36996
36997 if (GET_MODE_SIZE (d->vmode) == 16)
36998 {
36999 if (d->one_operand_p)
37000 return false;
37001 }
37002 else if (GET_MODE_SIZE (d->vmode) == 32)
37003 {
37004 if (!TARGET_AVX)
37005 return false;
37006 /* For 32-byte modes, allow this even when d->one_operand_p.
37007 The lack of cross-lane shuffling in some instructions
37008 might prevent a single insn shuffle. */
37009 dfinal = *d;
37010 dfinal.testing_p = true;
37011 /* If expand_vec_perm_interleave3 can expand this into
37012 a 3 insn sequence, give up and let it be expanded that
37013 way instead. While that is one insn longer, it doesn't
37014 need a memory operand, and in the common case where both
37015 the interleave low and interleave high permutations with
37016 the same operands are adjacent, it needs only 4 insns
37017 for both after CSE. */
37018 if (expand_vec_perm_interleave3 (&dfinal))
37019 return false;
37020 }
37021 else
37022 return false;
37023
37024 /* Examine from whence the elements come. */
37025 contents = 0;
37026 for (i = 0; i < nelt; ++i)
37027 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
37028
37029 memset (remap, 0xff, sizeof (remap));
37030 dremap = *d;
37031
37032 if (GET_MODE_SIZE (d->vmode) == 16)
37033 {
37034 unsigned HOST_WIDE_INT h1, h2, h3, h4;
37035
37036 /* Split the two input vectors into 4 halves. */
37037 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
37038 h2 = h1 << nelt2;
37039 h3 = h2 << nelt2;
37040 h4 = h3 << nelt2;
37041
37042 /* If the elements are from the low halves, use interleave low, and
37043 similarly for interleave high. If the elements are from mismatched halves, we
37044 can use shufps for V4SF/V4SI or do a DImode shuffle. */
37045 if ((contents & (h1 | h3)) == contents)
37046 {
37047 /* punpckl* */
37048 for (i = 0; i < nelt2; ++i)
37049 {
37050 remap[i] = i * 2;
37051 remap[i + nelt] = i * 2 + 1;
37052 dremap.perm[i * 2] = i;
37053 dremap.perm[i * 2 + 1] = i + nelt;
37054 }
37055 if (!TARGET_SSE2 && d->vmode == V4SImode)
37056 dremap.vmode = V4SFmode;
37057 }
37058 else if ((contents & (h2 | h4)) == contents)
37059 {
37060 /* punpckh* */
37061 for (i = 0; i < nelt2; ++i)
37062 {
37063 remap[i + nelt2] = i * 2;
37064 remap[i + nelt + nelt2] = i * 2 + 1;
37065 dremap.perm[i * 2] = i + nelt2;
37066 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
37067 }
37068 if (!TARGET_SSE2 && d->vmode == V4SImode)
37069 dremap.vmode = V4SFmode;
37070 }
37071 else if ((contents & (h1 | h4)) == contents)
37072 {
37073 /* shufps */
37074 for (i = 0; i < nelt2; ++i)
37075 {
37076 remap[i] = i;
37077 remap[i + nelt + nelt2] = i + nelt2;
37078 dremap.perm[i] = i;
37079 dremap.perm[i + nelt2] = i + nelt + nelt2;
37080 }
37081 if (nelt != 4)
37082 {
37083 /* shufpd */
37084 dremap.vmode = V2DImode;
37085 dremap.nelt = 2;
37086 dremap.perm[0] = 0;
37087 dremap.perm[1] = 3;
37088 }
37089 }
37090 else if ((contents & (h2 | h3)) == contents)
37091 {
37092 /* shufps */
37093 for (i = 0; i < nelt2; ++i)
37094 {
37095 remap[i + nelt2] = i;
37096 remap[i + nelt] = i + nelt2;
37097 dremap.perm[i] = i + nelt2;
37098 dremap.perm[i + nelt2] = i + nelt;
37099 }
37100 if (nelt != 4)
37101 {
37102 /* shufpd */
37103 dremap.vmode = V2DImode;
37104 dremap.nelt = 2;
37105 dremap.perm[0] = 1;
37106 dremap.perm[1] = 2;
37107 }
37108 }
37109 else
37110 return false;
37111 }
37112 else
37113 {
37114 unsigned int nelt4 = nelt / 4, nzcnt = 0;
37115 unsigned HOST_WIDE_INT q[8];
37116 unsigned int nonzero_halves[4];
37117
37118 /* Split the two input vectors into 8 quarters. */
37119 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
37120 for (i = 1; i < 8; ++i)
37121 q[i] = q[0] << (nelt4 * i);
37122 for (i = 0; i < 4; ++i)
37123 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
37124 {
37125 nonzero_halves[nzcnt] = i;
37126 ++nzcnt;
37127 }
37128
37129 if (nzcnt == 1)
37130 {
37131 gcc_assert (d->one_operand_p);
37132 nonzero_halves[1] = nonzero_halves[0];
37133 same_halves = true;
37134 }
37135 else if (d->one_operand_p)
37136 {
37137 gcc_assert (nonzero_halves[0] == 0);
37138 gcc_assert (nonzero_halves[1] == 1);
37139 }
37140
37141 if (nzcnt <= 2)
37142 {
37143 if (d->perm[0] / nelt2 == nonzero_halves[1])
37144 {
37145 /* Attempt to increase the likelihood that the dfinal
37146 shuffle will be intra-lane. */
37147 char tmph = nonzero_halves[0];
37148 nonzero_halves[0] = nonzero_halves[1];
37149 nonzero_halves[1] = tmph;
37150 }
37151
37152 /* vperm2f128 or vperm2i128. */
37153 for (i = 0; i < nelt2; ++i)
37154 {
37155 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
37156 remap[i + nonzero_halves[0] * nelt2] = i;
37157 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
37158 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
37159 }
37160
37161 if (d->vmode != V8SFmode
37162 && d->vmode != V4DFmode
37163 && d->vmode != V8SImode)
37164 {
37165 dremap.vmode = V8SImode;
37166 dremap.nelt = 8;
37167 for (i = 0; i < 4; ++i)
37168 {
37169 dremap.perm[i] = i + nonzero_halves[0] * 4;
37170 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
37171 }
37172 }
37173 }
37174 else if (d->one_operand_p)
37175 return false;
37176 else if (TARGET_AVX2
37177 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
37178 {
37179 /* vpunpckl* */
37180 for (i = 0; i < nelt4; ++i)
37181 {
37182 remap[i] = i * 2;
37183 remap[i + nelt] = i * 2 + 1;
37184 remap[i + nelt2] = i * 2 + nelt2;
37185 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
37186 dremap.perm[i * 2] = i;
37187 dremap.perm[i * 2 + 1] = i + nelt;
37188 dremap.perm[i * 2 + nelt2] = i + nelt2;
37189 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
37190 }
37191 }
37192 else if (TARGET_AVX2
37193 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
37194 {
37195 /* vpunpckh* */
37196 for (i = 0; i < nelt4; ++i)
37197 {
37198 remap[i + nelt4] = i * 2;
37199 remap[i + nelt + nelt4] = i * 2 + 1;
37200 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
37201 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
37202 dremap.perm[i * 2] = i + nelt4;
37203 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
37204 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
37205 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
37206 }
37207 }
37208 else
37209 return false;
37210 }
37211
37212 /* Use the remapping array set up above to move the elements from their
37213 swizzled locations into their final destinations. */
37214 dfinal = *d;
37215 for (i = 0; i < nelt; ++i)
37216 {
37217 unsigned e = remap[d->perm[i]];
37218 gcc_assert (e < nelt);
37219 /* If same_halves is true, both halves of the remapped vector are the
37220 same. Avoid cross-lane accesses if possible. */
37221 if (same_halves && i >= nelt2)
37222 {
37223 gcc_assert (e < nelt2);
37224 dfinal.perm[i] = e + nelt2;
37225 }
37226 else
37227 dfinal.perm[i] = e;
37228 }
37229 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
37230 dfinal.op1 = dfinal.op0;
37231 dfinal.one_operand_p = true;
37232 dremap.target = dfinal.op0;
37233
37234 /* Test if the final remap can be done with a single insn. For V4SFmode or
37235 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
37236 start_sequence ();
37237 ok = expand_vec_perm_1 (&dfinal);
37238 seq = get_insns ();
37239 end_sequence ();
37240
37241 if (!ok)
37242 return false;
37243
37244 if (d->testing_p)
37245 return true;
37246
37247 if (dremap.vmode != dfinal.vmode)
37248 {
37249 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
37250 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
37251 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
37252 }
37253
37254 ok = expand_vec_perm_1 (&dremap);
37255 gcc_assert (ok);
37256
37257 emit_insn (seq);
37258 return true;
37259 }
37260
37261 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
37262 a single vector cross-lane permutation into vpermq followed
37263 by any of the single insn permutations. */
37264
37265 static bool
37266 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
37267 {
37268 struct expand_vec_perm_d dremap, dfinal;
37269 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
37270 unsigned contents[2];
37271 bool ok;
37272
37273 if (!(TARGET_AVX2
37274 && (d->vmode == V32QImode || d->vmode == V16HImode)
37275 && d->one_operand_p))
37276 return false;
37277
37278 contents[0] = 0;
37279 contents[1] = 0;
37280 for (i = 0; i < nelt2; ++i)
37281 {
37282 contents[0] |= 1u << (d->perm[i] / nelt4);
37283 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
37284 }
37285
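 /* The shuffle applied after the vpermq (dfinal) cannot cross 128-bit
 lanes, so each half of the result may only draw from elements that the
 vpermq brings into that half: at most two of the four 64-bit quarters
 of the source. */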
37286 for (i = 0; i < 2; ++i)
37287 {
37288 unsigned int cnt = 0;
37289 for (j = 0; j < 4; ++j)
37290 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
37291 return false;
37292 }
37293
37294 if (d->testing_p)
37295 return true;
37296
37297 dremap = *d;
37298 dremap.vmode = V4DImode;
37299 dremap.nelt = 4;
37300 dremap.target = gen_reg_rtx (V4DImode);
37301 dremap.op0 = gen_lowpart (V4DImode, d->op0);
37302 dremap.op1 = dremap.op0;
37303 dremap.one_operand_p = true;
37304 for (i = 0; i < 2; ++i)
37305 {
37306 unsigned int cnt = 0;
37307 for (j = 0; j < 4; ++j)
37308 if ((contents[i] & (1u << j)) != 0)
37309 dremap.perm[2 * i + cnt++] = j;
37310 for (; cnt < 2; ++cnt)
37311 dremap.perm[2 * i + cnt] = 0;
37312 }
37313
37314 dfinal = *d;
37315 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
37316 dfinal.op1 = dfinal.op0;
37317 dfinal.one_operand_p = true;
37318 for (i = 0, j = 0; i < nelt; ++i)
37319 {
37320 if (i == nelt2)
37321 j = 2;
37322 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
37323 if ((d->perm[i] / nelt4) == dremap.perm[j])
37324 ;
37325 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
37326 dfinal.perm[i] |= nelt4;
37327 else
37328 gcc_unreachable ();
37329 }
37330
37331 ok = expand_vec_perm_1 (&dremap);
37332 gcc_assert (ok);
37333
37334 ok = expand_vec_perm_1 (&dfinal);
37335 gcc_assert (ok);
37336
37337 return true;
37338 }
37339
37340 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
37341 a vector permutation using two instructions, vperm2f128 resp.
37342 vperm2i128 followed by any single in-lane permutation. */
37343
37344 static bool
37345 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
37346 {
37347 struct expand_vec_perm_d dfirst, dsecond;
37348 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
37349 bool ok;
37350
37351 if (!TARGET_AVX
37352 || GET_MODE_SIZE (d->vmode) != 32
37353 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
37354 return false;
37355
37356 dsecond = *d;
37357 dsecond.one_operand_p = false;
37358 dsecond.testing_p = true;
37359
37360 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
37361 immediate. For perm < 16 the second permutation uses
37362 d->op0 as first operand, for perm >= 16 it uses d->op1
37363 as first operand. The second operand is the result of
37364 vperm2[fi]128. */
37365 for (perm = 0; perm < 32; perm++)
37366 {
37367 /* Ignore permutations which do not move anything cross-lane. */
37368 if (perm < 16)
37369 {
37370 /* The second shuffle for e.g. V4DFmode has
37371 0123 and ABCD operands.
37372 Ignore AB23, as 23 is already in the second lane
37373 of the first operand. */
37374 if ((perm & 0xc) == (1 << 2)) continue;
37375 /* And 01CD, as 01 is in the first lane of the first
37376 operand. */
37377 if ((perm & 3) == 0) continue;
37378 /* And 4567, as then the vperm2[fi]128 doesn't change
37379 anything on the original 4567 second operand. */
37380 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
37381 }
37382 else
37383 {
37384 /* The second shuffle for e.g. V4DFmode has
37385 4567 and ABCD operands.
37386 Ignore AB67, as 67 is already in the second lane
37387 of the first operand. */
37388 if ((perm & 0xc) == (3 << 2)) continue;
37389 /* And 45CD, as 45 is in the first lane of the first
37390 operand. */
37391 if ((perm & 3) == 2) continue;
37392 /* And 0123, as then the vperm2[fi]128 doesn't change
37393 anything on the original 0123 first operand. */
37394 if ((perm & 0xf) == (1 << 2)) continue;
37395 }
37396
37397 for (i = 0; i < nelt; i++)
37398 {
37399 j = d->perm[i] / nelt2;
37400 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
37401 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
37402 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
37403 dsecond.perm[i] = d->perm[i] & (nelt - 1);
37404 else
37405 break;
37406 }
37407
37408 if (i == nelt)
37409 {
37410 start_sequence ();
37411 ok = expand_vec_perm_1 (&dsecond);
37412 end_sequence ();
37413 }
37414 else
37415 ok = false;
37416
37417 if (ok)
37418 {
37419 if (d->testing_p)
37420 return true;
37421
37422 /* Found a usable second shuffle. dfirst will be
37423 vperm2f128 on d->op0 and d->op1. */
37424 dsecond.testing_p = false;
37425 dfirst = *d;
37426 dfirst.target = gen_reg_rtx (d->vmode);
37427 for (i = 0; i < nelt; i++)
37428 dfirst.perm[i] = (i & (nelt2 - 1))
37429 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
37430
37431 ok = expand_vec_perm_1 (&dfirst);
37432 gcc_assert (ok);
37433
37434 /* And dsecond is some single insn shuffle, taking
37435 d->op0 and result of vperm2f128 (if perm < 16) or
37436 d->op1 and result of vperm2f128 (otherwise). */
37437 dsecond.op1 = dfirst.target;
37438 if (perm >= 16)
37439 dsecond.op0 = dfirst.op1;
37440
37441 ok = expand_vec_perm_1 (&dsecond);
37442 gcc_assert (ok);
37443
37444 return true;
37445 }
37446
37447 /* For one operand, the only useful vperm2f128 permutation is 0x01 (lane swap). */
37448 if (d->one_operand_p)
37449 return false;
37450 }
37451
37452 return false;
37453 }
37454
37455 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
37456 a two vector permutation using 2 intra-lane interleave insns
37457 and cross-lane shuffle for 32-byte vectors. */
37458
37459 static bool
37460 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
37461 {
37462 unsigned i, nelt;
37463 rtx (*gen) (rtx, rtx, rtx);
37464
37465 if (d->one_operand_p)
37466 return false;
37467 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
37468 ;
37469 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
37470 ;
37471 else
37472 return false;
37473
37474 nelt = d->nelt;
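 /* The permutation must pair element K of op0 with element K of op1 in
 adjacent result slots, starting at K = 0 (interleave low) or
 K = NELT/2 (interleave high). */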
37475 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
37476 return false;
37477 for (i = 0; i < nelt; i += 2)
37478 if (d->perm[i] != d->perm[0] + i / 2
37479 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
37480 return false;
37481
37482 if (d->testing_p)
37483 return true;
37484
37485 switch (d->vmode)
37486 {
37487 case V32QImode:
37488 if (d->perm[0])
37489 gen = gen_vec_interleave_highv32qi;
37490 else
37491 gen = gen_vec_interleave_lowv32qi;
37492 break;
37493 case V16HImode:
37494 if (d->perm[0])
37495 gen = gen_vec_interleave_highv16hi;
37496 else
37497 gen = gen_vec_interleave_lowv16hi;
37498 break;
37499 case V8SImode:
37500 if (d->perm[0])
37501 gen = gen_vec_interleave_highv8si;
37502 else
37503 gen = gen_vec_interleave_lowv8si;
37504 break;
37505 case V4DImode:
37506 if (d->perm[0])
37507 gen = gen_vec_interleave_highv4di;
37508 else
37509 gen = gen_vec_interleave_lowv4di;
37510 break;
37511 case V8SFmode:
37512 if (d->perm[0])
37513 gen = gen_vec_interleave_highv8sf;
37514 else
37515 gen = gen_vec_interleave_lowv8sf;
37516 break;
37517 case V4DFmode:
37518 if (d->perm[0])
37519 gen = gen_vec_interleave_highv4df;
37520 else
37521 gen = gen_vec_interleave_lowv4df;
37522 break;
37523 default:
37524 gcc_unreachable ();
37525 }
37526
37527 emit_insn (gen (d->target, d->op0, d->op1));
37528 return true;
37529 }
37530
37531 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
37532 a single vector permutation using a single intra-lane vector
37533 permutation, vperm2f128 swapping the lanes and vblend* insn blending
37534 the non-swapped and swapped vectors together. */
37535
37536 static bool
37537 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
37538 {
37539 struct expand_vec_perm_d dfirst, dsecond;
37540 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
37541 rtx seq;
37542 bool ok;
37543 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
37544
37545 if (!TARGET_AVX
37546 || TARGET_AVX2
37547 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
37548 || !d->one_operand_p)
37549 return false;
37550
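 /* Build dfirst as an intra-lane permutation: each requested element is
 placed at its destination offset within the 128-bit lane it already
 lives in. MSK records the destination positions that must instead be
 taken from the lane-swapped copy (dsecond) by the blend below. */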
37551 dfirst = *d;
37552 for (i = 0; i < nelt; i++)
37553 dfirst.perm[i] = 0xff;
37554 for (i = 0, msk = 0; i < nelt; i++)
37555 {
37556 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
37557 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
37558 return false;
37559 dfirst.perm[j] = d->perm[i];
37560 if (j != i)
37561 msk |= (1 << i);
37562 }
37563 for (i = 0; i < nelt; i++)
37564 if (dfirst.perm[i] == 0xff)
37565 dfirst.perm[i] = i;
37566
37567 if (!d->testing_p)
37568 dfirst.target = gen_reg_rtx (dfirst.vmode);
37569
37570 start_sequence ();
37571 ok = expand_vec_perm_1 (&dfirst);
37572 seq = get_insns ();
37573 end_sequence ();
37574
37575 if (!ok)
37576 return false;
37577
37578 if (d->testing_p)
37579 return true;
37580
37581 emit_insn (seq);
37582
37583 dsecond = *d;
37584 dsecond.op0 = dfirst.target;
37585 dsecond.op1 = dfirst.target;
37586 dsecond.one_operand_p = true;
37587 dsecond.target = gen_reg_rtx (dsecond.vmode);
37588 for (i = 0; i < nelt; i++)
37589 dsecond.perm[i] = i ^ nelt2;
37590
37591 ok = expand_vec_perm_1 (&dsecond);
37592 gcc_assert (ok);
37593
37594 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
37595 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
37596 return true;
37597 }
37598
37599 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
37600 permutation using two vperm2f128, followed by a vshufpd insn blending
37601 the two vectors together. */
37602
37603 static bool
37604 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
37605 {
37606 struct expand_vec_perm_d dfirst, dsecond, dthird;
37607 bool ok;
37608
37609 if (!TARGET_AVX || (d->vmode != V4DFmode))
37610 return false;
37611
37612 if (d->testing_p)
37613 return true;
37614
37615 dfirst = *d;
37616 dsecond = *d;
37617 dthird = *d;
37618
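 /* dfirst gathers the 128-bit lanes holding the elements for result
 positions 0 and 2, dsecond the lanes for positions 1 and 3; dthird
 then picks the required double from each lane, which matches vshufpd. */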
37619 dfirst.perm[0] = (d->perm[0] & ~1);
37620 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
37621 dfirst.perm[2] = (d->perm[2] & ~1);
37622 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
37623 dsecond.perm[0] = (d->perm[1] & ~1);
37624 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
37625 dsecond.perm[2] = (d->perm[3] & ~1);
37626 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
37627 dthird.perm[0] = (d->perm[0] % 2);
37628 dthird.perm[1] = (d->perm[1] % 2) + 4;
37629 dthird.perm[2] = (d->perm[2] % 2) + 2;
37630 dthird.perm[3] = (d->perm[3] % 2) + 6;
37631
37632 dfirst.target = gen_reg_rtx (dfirst.vmode);
37633 dsecond.target = gen_reg_rtx (dsecond.vmode);
37634 dthird.op0 = dfirst.target;
37635 dthird.op1 = dsecond.target;
37636 dthird.one_operand_p = false;
37637
37638 canonicalize_perm (&dfirst);
37639 canonicalize_perm (&dsecond);
37640
37641 ok = expand_vec_perm_1 (&dfirst)
37642 && expand_vec_perm_1 (&dsecond)
37643 && expand_vec_perm_1 (&dthird);
37644
37645 gcc_assert (ok);
37646
37647 return true;
37648 }
37649
37650 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
37651 permutation with two pshufb insns and an ior. We should have already
37652 failed all two instruction sequences. */
37653
37654 static bool
37655 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
37656 {
37657 rtx rperm[2][16], vperm, l, h, op, m128;
37658 unsigned int i, nelt, eltsz;
37659
37660 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
37661 return false;
37662 gcc_assert (!d->one_operand_p);
37663
37664 nelt = d->nelt;
37665 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37666
37667 /* Generate two permutation masks. If the required element is within
37668 the given vector it is shuffled into the proper lane. If the required
37669 element is in the other vector, force a zero into the lane by setting
37670 bit 7 in the permutation mask. */
37671 m128 = GEN_INT (-128);
37672 for (i = 0; i < nelt; ++i)
37673 {
37674 unsigned j, e = d->perm[i];
37675 unsigned which = (e >= nelt);
37676 if (e >= nelt)
37677 e -= nelt;
37678
37679 for (j = 0; j < eltsz; ++j)
37680 {
37681 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
37682 rperm[1-which][i*eltsz + j] = m128;
37683 }
37684 }
37685
37686 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
37687 vperm = force_reg (V16QImode, vperm);
37688
37689 l = gen_reg_rtx (V16QImode);
37690 op = gen_lowpart (V16QImode, d->op0);
37691 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
37692
37693 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
37694 vperm = force_reg (V16QImode, vperm);
37695
37696 h = gen_reg_rtx (V16QImode);
37697 op = gen_lowpart (V16QImode, d->op1);
37698 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
37699
37700 op = gen_lowpart (V16QImode, d->target);
37701 emit_insn (gen_iorv16qi3 (op, l, h));
37702
37703 return true;
37704 }
37705
37706 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
37707 with two vpshufb insns, vpermq and vpor. We should have already failed
37708 all two or three instruction sequences. */
37709
37710 static bool
37711 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
37712 {
37713 rtx rperm[2][32], vperm, l, h, hp, op, m128;
37714 unsigned int i, nelt, eltsz;
37715
37716 if (!TARGET_AVX2
37717 || !d->one_operand_p
37718 || (d->vmode != V32QImode && d->vmode != V16HImode))
37719 return false;
37720
37721 if (d->testing_p)
37722 return true;
37723
37724 nelt = d->nelt;
37725 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37726
37727 /* Generate two permutation masks. If the required element is within
37728 the same lane, it is shuffled in. If the required element is from the
37729 other lane, force a zero by setting bit 7 in the permutation mask.
37730 The other mask has a non-negative element wherever the element is
37731 requested from the other lane, but it is also moved to the other lane,
37732 so that the result of vpshufb can have the two V2TImode halves
37733 swapped. */
37734 m128 = GEN_INT (-128);
37735 for (i = 0; i < nelt; ++i)
37736 {
37737 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37738 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
37739
37740 for (j = 0; j < eltsz; ++j)
37741 {
37742 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
37743 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
37744 }
37745 }
37746
37747 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
37748 vperm = force_reg (V32QImode, vperm);
37749
37750 h = gen_reg_rtx (V32QImode);
37751 op = gen_lowpart (V32QImode, d->op0);
37752 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
37753
37754 /* Swap the 128-bit lanes of h into hp. */
37755 hp = gen_reg_rtx (V4DImode);
37756 op = gen_lowpart (V4DImode, h);
37757 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
37758 const1_rtx));
37759
37760 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
37761 vperm = force_reg (V32QImode, vperm);
37762
37763 l = gen_reg_rtx (V32QImode);
37764 op = gen_lowpart (V32QImode, d->op0);
37765 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
37766
37767 op = gen_lowpart (V32QImode, d->target);
37768 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
37769
37770 return true;
37771 }
37772
37773 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
37774 and extract-odd permutations of two V32QImode or V16HImode operands
37775 with two vpshufb insns, vpor and vpermq. We should have already
37776 failed all two or three instruction sequences. */
37777
37778 static bool
37779 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
37780 {
37781 rtx rperm[2][32], vperm, l, h, ior, op, m128;
37782 unsigned int i, nelt, eltsz;
37783
37784 if (!TARGET_AVX2
37785 || d->one_operand_p
37786 || (d->vmode != V32QImode && d->vmode != V16HImode))
37787 return false;
37788
37789 for (i = 0; i < d->nelt; ++i)
37790 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
37791 return false;
37792
37793 if (d->testing_p)
37794 return true;
37795
37796 nelt = d->nelt;
37797 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37798
37799 /* Generate two permutation masks. In the first permutation mask
37800 the first quarter will contain indexes for the first half
37801 of the op0, the second quarter will contain bit 7 set, third quarter
37802 will contain indexes for the second half of the op0 and the
37803 last quarter bit 7 set. In the second permutation mask
37804 the first quarter will contain bit 7 set, the second quarter
37805 indexes for the first half of the op1, the third quarter bit 7 set
37806 and last quarter indexes for the second half of the op1.
37807 I.e. the first mask e.g. for V32QImode extract even will be:
37808 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
37809 (all values masked with 0xf except for -128) and second mask
37810 for extract even will be
37811 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
37812 m128 = GEN_INT (-128);
37813 for (i = 0; i < nelt; ++i)
37814 {
37815 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37816 unsigned which = d->perm[i] >= nelt;
37817 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
37818
37819 for (j = 0; j < eltsz; ++j)
37820 {
37821 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
37822 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
37823 }
37824 }
37825
37826 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
37827 vperm = force_reg (V32QImode, vperm);
37828
37829 l = gen_reg_rtx (V32QImode);
37830 op = gen_lowpart (V32QImode, d->op0);
37831 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
37832
37833 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
37834 vperm = force_reg (V32QImode, vperm);
37835
37836 h = gen_reg_rtx (V32QImode);
37837 op = gen_lowpart (V32QImode, d->op1);
37838 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
37839
37840 ior = gen_reg_rtx (V32QImode);
37841 emit_insn (gen_iorv32qi3 (ior, l, h));
37842
37843 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
37844 op = gen_lowpart (V4DImode, d->target);
37845 ior = gen_lowpart (V4DImode, ior);
37846 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
37847 const1_rtx, GEN_INT (3)));
37848
37849 return true;
37850 }
37851
37852 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
37853 and extract-odd permutations. */
37854
37855 static bool
37856 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
37857 {
37858 rtx t1, t2, t3;
37859
37860 switch (d->vmode)
37861 {
37862 case V4DFmode:
37863 t1 = gen_reg_rtx (V4DFmode);
37864 t2 = gen_reg_rtx (V4DFmode);
37865
37866 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
37867 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
37868 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
37869
37870 /* Now an unpck[lh]pd will produce the result required. */
37871 if (odd)
37872 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
37873 else
37874 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
37875 emit_insn (t3);
37876 break;
37877
37878 case V8SFmode:
37879 {
37880 int mask = odd ? 0xdd : 0x88;
37881
37882 t1 = gen_reg_rtx (V8SFmode);
37883 t2 = gen_reg_rtx (V8SFmode);
37884 t3 = gen_reg_rtx (V8SFmode);
37885
37886 /* Shuffle within the 128-bit lanes to produce:
37887 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
37888 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
37889 GEN_INT (mask)));
37890
37891 /* Shuffle the lanes around to produce:
37892 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
37893 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
37894 GEN_INT (0x3)));
37895
37896 /* Shuffle within the 128-bit lanes to produce:
37897 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
37898 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
37899
37900 /* Shuffle within the 128-bit lanes to produce:
37901 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
37902 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
37903
37904 /* Shuffle the lanes around to produce:
37905 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
37906 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
37907 GEN_INT (0x20)));
37908 }
37909 break;
37910
37911 case V2DFmode:
37912 case V4SFmode:
37913 case V2DImode:
37914 case V4SImode:
37915 /* These are always directly implementable by expand_vec_perm_1. */
37916 gcc_unreachable ();
37917
37918 case V8HImode:
37919 if (TARGET_SSSE3)
37920 return expand_vec_perm_pshufb2 (d);
37921 else
37922 {
37923 /* We need 2*log2(N)-1 operations to achieve odd/even
37924 with interleave. */
37925 t1 = gen_reg_rtx (V8HImode);
37926 t2 = gen_reg_rtx (V8HImode);
37927 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
37928 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
37929 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
37930 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
37931 if (odd)
37932 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
37933 else
37934 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
37935 emit_insn (t3);
37936 }
37937 break;
37938
37939 case V16QImode:
37940 if (TARGET_SSSE3)
37941 return expand_vec_perm_pshufb2 (d);
37942 else
37943 {
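 /* As in the V8HImode case, achieve odd/even extraction with
 interleaves: 2*log2(16)-1 = 7 operations. */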
37944 t1 = gen_reg_rtx (V16QImode);
37945 t2 = gen_reg_rtx (V16QImode);
37946 t3 = gen_reg_rtx (V16QImode);
37947 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
37948 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
37949 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
37950 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
37951 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
37952 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
37953 if (odd)
37954 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
37955 else
37956 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
37957 emit_insn (t3);
37958 }
37959 break;
37960
37961 case V16HImode:
37962 case V32QImode:
37963 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
37964
37965 case V4DImode:
37966 if (!TARGET_AVX2)
37967 {
37968 struct expand_vec_perm_d d_copy = *d;
37969 d_copy.vmode = V4DFmode;
37970 d_copy.target = gen_lowpart (V4DFmode, d->target);
37971 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
37972 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
37973 return expand_vec_perm_even_odd_1 (&d_copy, odd);
37974 }
37975
37976 t1 = gen_reg_rtx (V4DImode);
37977 t2 = gen_reg_rtx (V4DImode);
37978
37979 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
37980 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
37981 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
37982
37983 /* Now a vpunpck[lh]qdq will produce the result required. */
37984 if (odd)
37985 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
37986 else
37987 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
37988 emit_insn (t3);
37989 break;
37990
37991 case V8SImode:
37992 if (!TARGET_AVX2)
37993 {
37994 struct expand_vec_perm_d d_copy = *d;
37995 d_copy.vmode = V8SFmode;
37996 d_copy.target = gen_lowpart (V8SFmode, d->target);
37997 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
37998 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
37999 return expand_vec_perm_even_odd_1 (&d_copy, odd);
38000 }
38001
38002 t1 = gen_reg_rtx (V8SImode);
38003 t2 = gen_reg_rtx (V8SImode);
38004
38005 /* Shuffle the lanes around into
38006 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
38007 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
38008 gen_lowpart (V4DImode, d->op0),
38009 gen_lowpart (V4DImode, d->op1),
38010 GEN_INT (0x20)));
38011 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
38012 gen_lowpart (V4DImode, d->op0),
38013 gen_lowpart (V4DImode, d->op1),
38014 GEN_INT (0x31)));
38015
38016 /* Swap the 2nd and 3rd position in each lane into
38017 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
38018 emit_insn (gen_avx2_pshufdv3 (t1, t1,
38019 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
38020 emit_insn (gen_avx2_pshufdv3 (t2, t2,
38021 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
38022
38023 /* Now a vpunpck[lh]qdq will produce
38024 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
38025 if (odd)
38026 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
38027 gen_lowpart (V4DImode, t1),
38028 gen_lowpart (V4DImode, t2));
38029 else
38030 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
38031 gen_lowpart (V4DImode, t1),
38032 gen_lowpart (V4DImode, t2));
38033 emit_insn (t3);
38034 break;
38035
38036 default:
38037 gcc_unreachable ();
38038 }
38039
38040 return true;
38041 }
38042
38043 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
38044 extract-even and extract-odd permutations. */
38045
38046 static bool
38047 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
38048 {
38049 unsigned i, odd, nelt = d->nelt;
38050
38051 odd = d->perm[0];
38052 if (odd != 0 && odd != 1)
38053 return false;
38054
38055 for (i = 1; i < nelt; ++i)
38056 if (d->perm[i] != 2 * i + odd)
38057 return false;
38058
38059 return expand_vec_perm_even_odd_1 (d, odd);
38060 }
38061
38062 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
38063 permutations. We assume that expand_vec_perm_1 has already failed. */
38064
38065 static bool
38066 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
38067 {
38068 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
38069 enum machine_mode vmode = d->vmode;
38070 unsigned char perm2[4];
38071 rtx op0 = d->op0;
38072 bool ok;
38073
38074 switch (vmode)
38075 {
38076 case V4DFmode:
38077 case V8SFmode:
38078 /* These are special-cased in sse.md so that we can optionally
38079 use the vbroadcast instruction. They expand to two insns
38080 if the input happens to be in a register. */
38081 gcc_unreachable ();
38082
38083 case V2DFmode:
38084 case V2DImode:
38085 case V4SFmode:
38086 case V4SImode:
38087 /* These are always implementable using standard shuffle patterns. */
38088 gcc_unreachable ();
38089
38090 case V8HImode:
38091 case V16QImode:
38092 /* These can be implemented via interleave. We save one insn by
38093 stopping once we have promoted to V4SImode and then using pshufd. */
38094 do
38095 {
38096 rtx dest;
38097 rtx (*gen) (rtx, rtx, rtx)
38098 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
38099 : gen_vec_interleave_lowv8hi;
38100
38101 if (elt >= nelt2)
38102 {
38103 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
38104 : gen_vec_interleave_highv8hi;
38105 elt -= nelt2;
38106 }
38107 nelt2 /= 2;
38108
38109 dest = gen_reg_rtx (vmode);
38110 emit_insn (gen (dest, op0, op0));
38111 vmode = get_mode_wider_vector (vmode);
38112 op0 = gen_lowpart (vmode, dest);
38113 }
38114 while (vmode != V4SImode);
38115
38116 memset (perm2, elt, 4);
38117 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
38118 d->testing_p);
38119 gcc_assert (ok);
38120 return true;
38121
38122 case V32QImode:
38123 case V16HImode:
38124 case V8SImode:
38125 case V4DImode:
38126 /* For AVX2, broadcasts of the first element should already have been
38127 handled by expand_vec_perm_1 via vpbroadcast* or vpermq. */
38128 gcc_assert (!TARGET_AVX2 || d->perm[0]);
38129 return false;
38130
38131 default:
38132 gcc_unreachable ();
38133 }
38134 }
38135
38136 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
38137 broadcast permutations. */
38138
38139 static bool
38140 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
38141 {
38142 unsigned i, elt, nelt = d->nelt;
38143
38144 if (!d->one_operand_p)
38145 return false;
38146
38147 elt = d->perm[0];
38148 for (i = 1; i < nelt; ++i)
38149 if (d->perm[i] != elt)
38150 return false;
38151
38152 return expand_vec_perm_broadcast_1 (d);
38153 }
38154
38155 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
38156 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
38157 all the shorter instruction sequences. */
38158
38159 static bool
38160 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
38161 {
38162 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
38163 unsigned int i, nelt, eltsz;
38164 bool used[4];
38165
38166 if (!TARGET_AVX2
38167 || d->one_operand_p
38168 || (d->vmode != V32QImode && d->vmode != V16HImode))
38169 return false;
38170
38171 if (d->testing_p)
38172 return true;
38173
38174 nelt = d->nelt;
38175 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38176
38177 /* Generate 4 permutation masks. If the required element is within
38178 the same lane, it is shuffled in. If the required element is from the
38179 other lane, force a zero by setting bit 7 in the permutation mask.
38180 The other mask has a non-negative element wherever the element is
38181 requested from the other lane, but it is also moved to the other lane,
38182 so that the result of vpshufb can have the two V2TImode halves
38183 swapped. */
38184 m128 = GEN_INT (-128);
38185 for (i = 0; i < 32; ++i)
38186 {
38187 rperm[0][i] = m128;
38188 rperm[1][i] = m128;
38189 rperm[2][i] = m128;
38190 rperm[3][i] = m128;
38191 }
38192 used[0] = false;
38193 used[1] = false;
38194 used[2] = false;
38195 used[3] = false;
38196 for (i = 0; i < nelt; ++i)
38197 {
38198 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
38199 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
38200 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
38201
38202 for (j = 0; j < eltsz; ++j)
38203 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
38204 used[which] = true;
38205 }
38206
38207 for (i = 0; i < 2; ++i)
38208 {
38209 if (!used[2 * i + 1])
38210 {
38211 h[i] = NULL_RTX;
38212 continue;
38213 }
38214 vperm = gen_rtx_CONST_VECTOR (V32QImode,
38215 gen_rtvec_v (32, rperm[2 * i + 1]));
38216 vperm = force_reg (V32QImode, vperm);
38217 h[i] = gen_reg_rtx (V32QImode);
38218 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
38219 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
38220 }
38221
38222 /* Swap the 128-bit lanes of h[X]. */
38223 for (i = 0; i < 2; ++i)
38224 {
38225 if (h[i] == NULL_RTX)
38226 continue;
38227 op = gen_reg_rtx (V4DImode);
38228 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
38229 const2_rtx, GEN_INT (3), const0_rtx,
38230 const1_rtx));
38231 h[i] = gen_lowpart (V32QImode, op);
38232 }
38233
38234 for (i = 0; i < 2; ++i)
38235 {
38236 if (!used[2 * i])
38237 {
38238 l[i] = NULL_RTX;
38239 continue;
38240 }
38241 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
38242 vperm = force_reg (V32QImode, vperm);
38243 l[i] = gen_reg_rtx (V32QImode);
38244 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
38245 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
38246 }
38247
38248 for (i = 0; i < 2; ++i)
38249 {
38250 if (h[i] && l[i])
38251 {
38252 op = gen_reg_rtx (V32QImode);
38253 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
38254 l[i] = op;
38255 }
38256 else if (h[i])
38257 l[i] = h[i];
38258 }
38259
38260 gcc_assert (l[0] && l[1]);
38261 op = gen_lowpart (V32QImode, d->target);
38262 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
38263 return true;
38264 }
38265
38266 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
38267 With all of the interface bits taken care of, perform the expansion
38268 in D and return true on success. */
38269
38270 static bool
38271 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
38272 {
38273 /* Try a single instruction expansion. */
38274 if (expand_vec_perm_1 (d))
38275 return true;
38276
38277 /* Try sequences of two instructions. */
38278
38279 if (expand_vec_perm_pshuflw_pshufhw (d))
38280 return true;
38281
38282 if (expand_vec_perm_palignr (d))
38283 return true;
38284
38285 if (expand_vec_perm_interleave2 (d))
38286 return true;
38287
38288 if (expand_vec_perm_broadcast (d))
38289 return true;
38290
38291 if (expand_vec_perm_vpermq_perm_1 (d))
38292 return true;
38293
38294 if (expand_vec_perm_vperm2f128 (d))
38295 return true;
38296
38297 /* Try sequences of three instructions. */
38298
38299 if (expand_vec_perm_2vperm2f128_vshuf (d))
38300 return true;
38301
38302 if (expand_vec_perm_pshufb2 (d))
38303 return true;
38304
38305 if (expand_vec_perm_interleave3 (d))
38306 return true;
38307
38308 if (expand_vec_perm_vperm2f128_vblend (d))
38309 return true;
38310
38311 /* Try sequences of four instructions. */
38312
38313 if (expand_vec_perm_vpshufb2_vpermq (d))
38314 return true;
38315
38316 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
38317 return true;
38318
38319 /* ??? Look for narrow permutations whose element orderings would
38320 allow the promotion to a wider mode. */
38321
38322 /* ??? Look for sequences of interleave or a wider permute that place
38323 the data into the correct lanes for a half-vector shuffle like
38324 pshuf[lh]w or vpermilps. */
38325
38326 /* ??? Look for sequences of interleave that produce the desired results.
38327 The combinatorics of punpck[lh] get pretty ugly... */
38328
38329 if (expand_vec_perm_even_odd (d))
38330 return true;
38331
38332 /* Even longer sequences. */
38333 if (expand_vec_perm_vpshufb4_vpermq2 (d))
38334 return true;
38335
38336 return false;
38337 }
38338
38339 /* If a permutation only uses one operand, make it clear. Returns true
38340 if the permutation references both operands. */
38341
38342 static bool
38343 canonicalize_perm (struct expand_vec_perm_d *d)
38344 {
38345 int i, which, nelt = d->nelt;
38346
38347 for (i = which = 0; i < nelt; ++i)
38348 which |= (d->perm[i] < nelt ? 1 : 2);
38349
38350 d->one_operand_p = true;
38351 switch (which)
38352 {
38353 default:
38354 gcc_unreachable();
38355
38356 case 3:
38357 if (!rtx_equal_p (d->op0, d->op1))
38358 {
38359 d->one_operand_p = false;
38360 break;
38361 }
38362 /* The elements of PERM do not suggest that only the first operand
38363 is used, but both operands are identical. Allow easier matching
38364 of the permutation by folding the permutation into the single
38365 input vector. */
38366 /* FALLTHRU */
38367
38368 case 2:
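 /* Every index refers to the second operand (or both operands are
 identical, via the fall through above); reduce the indices to the
 0 .. NELT-1 range and permute OP1 as the single input. */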
38369 for (i = 0; i < nelt; ++i)
38370 d->perm[i] &= nelt - 1;
38371 d->op0 = d->op1;
38372 break;
38373
38374 case 1:
38375 d->op1 = d->op0;
38376 break;
38377 }
38378
38379 return (which == 3);
38380 }
38381
38382 bool
38383 ix86_expand_vec_perm_const (rtx operands[4])
38384 {
38385 struct expand_vec_perm_d d;
38386 unsigned char perm[MAX_VECT_LEN];
38387 int i, nelt;
38388 bool two_args;
38389 rtx sel;
38390
38391 d.target = operands[0];
38392 d.op0 = operands[1];
38393 d.op1 = operands[2];
38394 sel = operands[3];
38395
38396 d.vmode = GET_MODE (d.target);
38397 gcc_assert (VECTOR_MODE_P (d.vmode));
38398 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
38399 d.testing_p = false;
38400
38401 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
38402 gcc_assert (XVECLEN (sel, 0) == nelt);
38403 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
38404
38405 for (i = 0; i < nelt; ++i)
38406 {
38407 rtx e = XVECEXP (sel, 0, i);
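 /* Selector elements are interpreted modulo 2 * NELT. */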
38408 int ei = INTVAL (e) & (2 * nelt - 1);
38409 d.perm[i] = ei;
38410 perm[i] = ei;
38411 }
38412
38413 two_args = canonicalize_perm (&d);
38414
38415 if (ix86_expand_vec_perm_const_1 (&d))
38416 return true;
38417
38418 /* If the selector says both arguments are needed, but the operands are the
38419 same, the above tried to expand with one_operand_p and flattened selector.
38420 If that didn't work, retry without one_operand_p; we succeeded with that
38421 during testing. */
38422 if (two_args && d.one_operand_p)
38423 {
38424 d.one_operand_p = false;
38425 memcpy (d.perm, perm, sizeof (perm));
38426 return ix86_expand_vec_perm_const_1 (&d);
38427 }
38428
38429 return false;
38430 }
38431
38432 /* Implement targetm.vectorize.vec_perm_const_ok. */
38433
38434 static bool
38435 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
38436 const unsigned char *sel)
38437 {
38438 struct expand_vec_perm_d d;
38439 unsigned int i, nelt, which;
38440 bool ret;
38441
38442 d.vmode = vmode;
38443 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
38444 d.testing_p = true;
38445
38446 /* Given sufficient ISA support we can just return true here
38447 for selected vector modes. */
38448 if (GET_MODE_SIZE (d.vmode) == 16)
38449 {
38450 /* All implementable with a single vpperm insn. */
38451 if (TARGET_XOP)
38452 return true;
38453 /* All implementable with 2 pshufb + 1 ior. */
38454 if (TARGET_SSSE3)
38455 return true;
38456 /* All implementable with shufpd or unpck[lh]pd. */
38457 if (d.nelt == 2)
38458 return true;
38459 }
38460
38461 /* Extract the values from the vector CST into the permutation
38462 array in D. */
38463 memcpy (d.perm, sel, nelt);
38464 for (i = which = 0; i < nelt; ++i)
38465 {
38466 unsigned char e = d.perm[i];
38467 gcc_assert (e < 2 * nelt);
38468 which |= (e < nelt ? 1 : 2);
38469 }
38470
38471 /* For all elements from second vector, fold the elements to first. */
38472 if (which == 2)
38473 for (i = 0; i < nelt; ++i)
38474 d.perm[i] -= nelt;
38475
38476 /* Check whether the mask can be applied to the vector type. */
38477 d.one_operand_p = (which != 3);
38478
38479 /* Implementable with shufps or pshufd. */
38480 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
38481 return true;
38482
38483 /* Otherwise we have to go through the motions and see if we can
38484 figure out how to generate the requested permutation. */
38485 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
38486 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
38487 if (!d.one_operand_p)
38488 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
38489
38490 start_sequence ();
38491 ret = ix86_expand_vec_perm_const_1 (&d);
38492 end_sequence ();
38493
38494 return ret;
38495 }
38496
38497 void
38498 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
38499 {
38500 struct expand_vec_perm_d d;
38501 unsigned i, nelt;
38502
38503 d.target = targ;
38504 d.op0 = op0;
38505 d.op1 = op1;
38506 d.vmode = GET_MODE (targ);
38507 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
38508 d.one_operand_p = false;
38509 d.testing_p = false;
38510
38511 for (i = 0; i < nelt; ++i)
38512 d.perm[i] = i * 2 + odd;
38513
38514 /* We'll either be able to implement the permutation directly... */
38515 if (expand_vec_perm_1 (&d))
38516 return;
38517
38518 /* ... or we use the special-case patterns. */
38519 expand_vec_perm_even_odd_1 (&d, odd);
38520 }
38521
38522 /* Expand a vector operation CODE for a V*QImode in terms of the
38523 same operation on V*HImode. */
38524
38525 void
38526 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
38527 {
38528 enum machine_mode qimode = GET_MODE (dest);
38529 enum machine_mode himode;
38530 rtx (*gen_il) (rtx, rtx, rtx);
38531 rtx (*gen_ih) (rtx, rtx, rtx);
38532 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
38533 struct expand_vec_perm_d d;
38534 bool ok, full_interleave;
38535 bool uns_p = false;
38536 int i;
38537
38538 switch (qimode)
38539 {
38540 case V16QImode:
38541 himode = V8HImode;
38542 gen_il = gen_vec_interleave_lowv16qi;
38543 gen_ih = gen_vec_interleave_highv16qi;
38544 break;
38545 case V32QImode:
38546 himode = V16HImode;
38547 gen_il = gen_avx2_interleave_lowv32qi;
38548 gen_ih = gen_avx2_interleave_highv32qi;
38549 break;
38550 default:
38551 gcc_unreachable ();
38552 }
38553
38554 op2_l = op2_h = op2;
38555 switch (code)
38556 {
38557 case MULT:
38558 /* Unpack data such that we've got a source byte in each low byte of
38559 each word. We don't care what goes into the high byte of each word.
38560 Rather than trying to get zero in there, it is most convenient to let
38561 it be a copy of the low byte. */
38562 op2_l = gen_reg_rtx (qimode);
38563 op2_h = gen_reg_rtx (qimode);
38564 emit_insn (gen_il (op2_l, op2, op2));
38565 emit_insn (gen_ih (op2_h, op2, op2));
38566 /* FALLTHRU */
38567
38568 op1_l = gen_reg_rtx (qimode);
38569 op1_h = gen_reg_rtx (qimode);
38570 emit_insn (gen_il (op1_l, op1, op1));
38571 emit_insn (gen_ih (op1_h, op1, op1));
38572 full_interleave = qimode == V16QImode;
38573 break;
38574
38575 case ASHIFT:
38576 case LSHIFTRT:
38577 uns_p = true;
38578 /* FALLTHRU */
38579 case ASHIFTRT:
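 /* Widen each QImode element to a HImode word: zero extension for the
 logical shifts, sign extension for the arithmetic shift right. */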
38580 op1_l = gen_reg_rtx (himode);
38581 op1_h = gen_reg_rtx (himode);
38582 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
38583 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
38584 full_interleave = true;
38585 break;
38586 default:
38587 gcc_unreachable ();
38588 }
38589
38590 /* Perform the operation. */
38591 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
38592 1, OPTAB_DIRECT);
38593 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
38594 1, OPTAB_DIRECT);
38595 gcc_assert (res_l && res_h);
38596
38597 /* Merge the data back into the right place. */
38598 d.target = dest;
38599 d.op0 = gen_lowpart (qimode, res_l);
38600 d.op1 = gen_lowpart (qimode, res_h);
38601 d.vmode = qimode;
38602 d.nelt = GET_MODE_NUNITS (qimode);
38603 d.one_operand_p = false;
38604 d.testing_p = false;
38605
38606 if (full_interleave)
38607 {
38608 /* For SSE2, we used a full interleave, so the desired
38609 results are in the even elements. */
38610 for (i = 0; i < 32; ++i)
38611 d.perm[i] = i * 2;
38612 }
38613 else
38614 {
38615 /* For AVX, the interleave used above was not cross-lane. So the
38616 extraction is the even elements, but with the second and third quarters
38617 swapped. Happily, that is even one insn shorter than plain even extraction. */
38618 for (i = 0; i < 32; ++i)
38619 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
38620 }
38621
38622 ok = ix86_expand_vec_perm_const_1 (&d);
38623 gcc_assert (ok);
38624
38625 set_unique_reg_note (get_last_insn (), REG_EQUAL,
38626 gen_rtx_fmt_ee (code, qimode, op1, op2));
38627 }
38628
38629 void
38630 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
38631 {
38632 rtx op1_m1, op1_m2;
38633 rtx op2_m1, op2_m2;
38634 rtx res_1, res_2;
38635
38636 /* Shift both input vectors down one element, so that elements 3
38637 and 1 are now in the slots for elements 2 and 0. For K8, at
38638 least, this is faster than using a shuffle. */
38639 op1_m1 = op1 = force_reg (V4SImode, op1);
38640 op1_m2 = gen_reg_rtx (V4SImode);
38641 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op1_m2),
38642 gen_lowpart (V1TImode, op1),
38643 GEN_INT (32)));
38644
38645 if (GET_CODE (op2) == CONST_VECTOR)
38646 {
38647 rtvec v;
38648
38649 /* Constant propagate the vector shift, leaving the don't-care
38650 vector elements as zero. */
38651 v = rtvec_alloc (4);
38652 RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 0);
38653 RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 2);
38654 RTVEC_ELT (v, 1) = const0_rtx;
38655 RTVEC_ELT (v, 3) = const0_rtx;
38656 op2_m1 = gen_rtx_CONST_VECTOR (V4SImode, v);
38657 op2_m1 = force_reg (V4SImode, op2_m1);
38658
38659 v = rtvec_alloc (4);
38660 RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 1);
38661 RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 3);
38662 RTVEC_ELT (v, 1) = const0_rtx;
38663 RTVEC_ELT (v, 3) = const0_rtx;
38664 op2_m2 = gen_rtx_CONST_VECTOR (V4SImode, v);
38665 op2_m2 = force_reg (V4SImode, op2_m2);
38666 }
38667 else
38668 {
38669 op2_m1 = op2 = force_reg (V4SImode, op2);
38670 op2_m2 = gen_reg_rtx (V4SImode);
38671 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op2_m2),
38672 gen_lowpart (V1TImode, op2),
38673 GEN_INT (32)));
38674 }
38675
38676 /* Widening multiply of elements 0+2, and 1+3. */
38677 res_1 = gen_reg_rtx (V4SImode);
38678 res_2 = gen_reg_rtx (V4SImode);
38679 emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_1),
38680 op1_m1, op2_m1));
38681 emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_2),
38682 op1_m2, op2_m2));
38683
38684 /* Move the results in element 2 down to element 1; we don't care
38685 what goes in elements 2 and 3. Then we can merge the parts
38686 back together with an interleave.
38687
38688 Note that two other sequences were tried:
38689 (1) Use interleaves at the start instead of psrldq, which allows
38690 us to use a single shufps to merge things back at the end.
38691 (2) Use shufps here to combine the two vectors, then pshufd to
38692 put the elements in the correct order.
38693 In both cases the cost of the reformatting stall was too high
38694 and the overall sequence slower. */
38695
38696 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
38697 const0_rtx, const0_rtx));
38698 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
38699 const0_rtx, const0_rtx));
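/* At this point, viewed as V4SImode and writing pK for the low 32 bits
   of op1[K] * op2[K]: res_1 = { p0, p2, X, X } and res_2 = { p1, p3, X, X }
   (X = don't care), so the low interleave below yields { p0, p1, p2, p3 }.  */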
38700 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
38701
38702 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
38703 }
38704
38705 /* Expand an insert into a vector register through a pinsr insn.
38706 Return true if successful. */
38707
38708 bool
38709 ix86_expand_pinsr (rtx *operands)
38710 {
38711 rtx dst = operands[0];
38712 rtx src = operands[3];
38713
38714 unsigned int size = INTVAL (operands[1]);
38715 unsigned int pos = INTVAL (operands[2]);
38716
38717 if (GET_CODE (dst) == SUBREG)
38718 {
38719 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
38720 dst = SUBREG_REG (dst);
38721 }
38722
38723 if (GET_CODE (src) == SUBREG)
38724 src = SUBREG_REG (src);
38725
38726 switch (GET_MODE (dst))
38727 {
38728 case V16QImode:
38729 case V8HImode:
38730 case V4SImode:
38731 case V2DImode:
38732 {
38733 enum machine_mode srcmode, dstmode;
38734 rtx (*pinsr)(rtx, rtx, rtx, rtx);
38735
38736 srcmode = mode_for_size (size, MODE_INT, 0);
38737
38738 switch (srcmode)
38739 {
38740 case QImode:
38741 if (!TARGET_SSE4_1)
38742 return false;
38743 dstmode = V16QImode;
38744 pinsr = gen_sse4_1_pinsrb;
38745 break;
38746
38747 case HImode:
38748 if (!TARGET_SSE2)
38749 return false;
38750 dstmode = V8HImode;
38751 pinsr = gen_sse2_pinsrw;
38752 break;
38753
38754 case SImode:
38755 if (!TARGET_SSE4_1)
38756 return false;
38757 dstmode = V4SImode;
38758 pinsr = gen_sse4_1_pinsrd;
38759 break;
38760
38761 case DImode:
38762 gcc_assert (TARGET_64BIT);
38763 if (!TARGET_SSE4_1)
38764 return false;
38765 dstmode = V2DImode;
38766 pinsr = gen_sse4_1_pinsrq;
38767 break;
38768
38769 default:
38770 return false;
38771 }
38772
38773 dst = gen_lowpart (dstmode, dst);
38774 src = gen_lowpart (srcmode, src);
38775
38776 pos /= size;
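/* For illustration: inserting a HImode value at bit position 32 of a
   V8HImode destination has size = 16 and pos = 32, so pos becomes 2 here
   and the pinsr expander below is handed the immediate 1 << 2.  */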
38777
38778 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
38779 return true;
38780 }
38781
38782 default:
38783 return false;
38784 }
38785 }
38786 \f
38787 /* This function returns the calling-ABI-specific va_list type node
38788 appropriate for FNDECL. */
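/* For example, on x86_64 GNU/Linux a function declared with
   __attribute__((ms_abi)) gets ms_va_list_type_node, while the default
   is sysv_va_list_type_node.  */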
38789
38790 static tree
38791 ix86_fn_abi_va_list (tree fndecl)
38792 {
38793 if (!TARGET_64BIT)
38794 return va_list_type_node;
38795 gcc_assert (fndecl != NULL_TREE);
38796
38797 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
38798 return ms_va_list_type_node;
38799 else
38800 return sysv_va_list_type_node;
38801 }
38802
38803 /* Returns the canonical va_list type specified by TYPE. If there
38804 is no valid TYPE provided, it returns NULL_TREE. */
38805
38806 static tree
38807 ix86_canonical_va_list_type (tree type)
38808 {
38809 tree wtype, htype;
38810
38811 /* Resolve references and pointers to va_list type. */
38812 if (TREE_CODE (type) == MEM_REF)
38813 type = TREE_TYPE (type);
38814 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
38815 type = TREE_TYPE (type);
38816 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
38817 type = TREE_TYPE (type);
38818
38819 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
38820 {
38821 wtype = va_list_type_node;
38822 gcc_assert (wtype != NULL_TREE);
38823 htype = type;
38824 if (TREE_CODE (wtype) == ARRAY_TYPE)
38825 {
38826 /* If va_list is an array type, the argument may have decayed
38827 to a pointer type, e.g. by being passed to another function.
38828 In that case, unwrap both types so that we can compare the
38829 underlying records. */
38830 if (TREE_CODE (htype) == ARRAY_TYPE
38831 || POINTER_TYPE_P (htype))
38832 {
38833 wtype = TREE_TYPE (wtype);
38834 htype = TREE_TYPE (htype);
38835 }
38836 }
38837 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
38838 return va_list_type_node;
38839 wtype = sysv_va_list_type_node;
38840 gcc_assert (wtype != NULL_TREE);
38841 htype = type;
38842 if (TREE_CODE (wtype) == ARRAY_TYPE)
38843 {
38844 /* If va_list is an array type, the argument may have decayed
38845 to a pointer type, e.g. by being passed to another function.
38846 In that case, unwrap both types so that we can compare the
38847 underlying records. */
38848 if (TREE_CODE (htype) == ARRAY_TYPE
38849 || POINTER_TYPE_P (htype))
38850 {
38851 wtype = TREE_TYPE (wtype);
38852 htype = TREE_TYPE (htype);
38853 }
38854 }
38855 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
38856 return sysv_va_list_type_node;
38857 wtype = ms_va_list_type_node;
38858 gcc_assert (wtype != NULL_TREE);
38859 htype = type;
38860 if (TREE_CODE (wtype) == ARRAY_TYPE)
38861 {
38862 /* If va_list is an array type, the argument may have decayed
38863 to a pointer type, e.g. by being passed to another function.
38864 In that case, unwrap both types so that we can compare the
38865 underlying records. */
38866 if (TREE_CODE (htype) == ARRAY_TYPE
38867 || POINTER_TYPE_P (htype))
38868 {
38869 wtype = TREE_TYPE (wtype);
38870 htype = TREE_TYPE (htype);
38871 }
38872 }
38873 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
38874 return ms_va_list_type_node;
38875 return NULL_TREE;
38876 }
38877 return std_canonical_va_list_type (type);
38878 }
38879
38880 /* Iterate through the target-specific builtin types for va_list.
38881 IDX denotes the iterator, *PTREE is set to the result type of
38882 the va_list builtin, and *PNAME to its name.
38883 Returns zero if there is no element for this index, otherwise
38884 IDX should be increased upon the next call.
38885 Note, do not iterate a base builtin's name like __builtin_va_list.
38886 Used from c_common_nodes_and_builtins. */
38887
38888 static int
38889 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
38890 {
38891 if (TARGET_64BIT)
38892 {
38893 switch (idx)
38894 {
38895 default:
38896 break;
38897
38898 case 0:
38899 *ptree = ms_va_list_type_node;
38900 *pname = "__builtin_ms_va_list";
38901 return 1;
38902
38903 case 1:
38904 *ptree = sysv_va_list_type_node;
38905 *pname = "__builtin_sysv_va_list";
38906 return 1;
38907 }
38908 }
38909
38910 return 0;
38911 }
38912
38913 #undef TARGET_SCHED_DISPATCH
38914 #define TARGET_SCHED_DISPATCH has_dispatch
38915 #undef TARGET_SCHED_DISPATCH_DO
38916 #define TARGET_SCHED_DISPATCH_DO do_dispatch
38917 #undef TARGET_SCHED_REASSOCIATION_WIDTH
38918 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
38919 #undef TARGET_SCHED_REORDER
38920 #define TARGET_SCHED_REORDER ix86_sched_reorder
38921
38922 /* The size of the dispatch window is the total number of bytes of
38923 object code allowed in a window. */
38924 #define DISPATCH_WINDOW_SIZE 16
38925
38926 /* Number of dispatch windows considered for scheduling. */
38927 #define MAX_DISPATCH_WINDOWS 3
38928
38929 /* Maximum number of instructions in a window. */
38930 #define MAX_INSN 4
38931
38932 /* Maximum number of immediate operands in a window. */
38933 #define MAX_IMM 4
38934
38935 /* Maximum number of immediate bits allowed in a window. */
38936 #define MAX_IMM_SIZE 128
38937
38938 /* Maximum number of 32 bit immediates allowed in a window. */
38939 #define MAX_IMM_32 4
38940
38941 /* Maximum number of 64 bit immediates allowed in a window. */
38942 #define MAX_IMM_64 2
38943
38944 /* Maximum total of loads or prefetches allowed in a window. */
38945 #define MAX_LOAD 2
38946
38947 /* Maximum total of stores allowed in a window. */
38948 #define MAX_STORE 1
38949
38950 #undef BIG
38951 #define BIG 100
38952
38953
38954 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
38955 enum dispatch_group {
38956 disp_no_group = 0,
38957 disp_load,
38958 disp_store,
38959 disp_load_store,
38960 disp_prefetch,
38961 disp_imm,
38962 disp_imm_32,
38963 disp_imm_64,
38964 disp_branch,
38965 disp_cmp,
38966 disp_jcc,
38967 disp_last
38968 };
38969
38970 /* Maximum number of instructions from each group allowed in a dispatch
38971 window. The array is indexed by the dispatch_group enum. BIG (100)
38972 is used for groups whose count does not limit what fits in a dispatch
38973 window, but which still need an entry in the table for other
38974 reasons. */
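/* For reference, assuming the dispatch_group enum order above: disp_load: 2,
   disp_store: 1, disp_load_store: 1, disp_prefetch: 2, disp_imm: 4,
   disp_imm_32: 4, disp_imm_64: 2, disp_branch: 1, disp_cmp: BIG,
   disp_jcc: BIG.  */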
38975 static unsigned int num_allowable_groups[disp_last] = {
38976 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
38977 };
38978
38979 char group_name[disp_last + 1][16] = {
38980 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
38981 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
38982 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
38983 };
38984
38985 /* Instruction path. */
38986 enum insn_path {
38987 no_path = 0,
38988 path_single, /* Single micro op. */
38989 path_double, /* Double micro op. */
38990 path_multi, /* Instructions with more than 2 micro ops. */
38991 last_path
38992 };
38993
38994 /* sched_insn_info describes one instruction scheduled into a dispatch
38995 window: the insn itself, its dispatch group and path, and its byte
38996 length and immediate-operand size.
38997
38998 Windows are allocated for each basic block and are linked
38999 together. */
39000 typedef struct sched_insn_info_s {
39001 rtx insn;
39002 enum dispatch_group group;
39003 enum insn_path path;
39004 int byte_len;
39005 int imm_bytes;
39006 } sched_insn_info;
39007
39008 /* Linked list of dispatch windows. This is a two-way list of
39009 dispatch windows of a basic block. It contains information about
39010 the number of uops in the window and the total number of
39011 instructions and of bytes in the object code for this dispatch
39012 window. */
39013 typedef struct dispatch_windows_s {
39014 int num_insn; /* Number of insn in the window. */
39015 int num_uops; /* Number of uops in the window. */
39016 int window_size; /* Number of bytes in the window. */
39017 int window_num; /* Window number, either 0 or 1. */
39018 int num_imm; /* Number of immediate operands in the window. */
39019 int num_imm_32; /* Number of 32 bit immediates in the window. */
39020 int num_imm_64; /* Number of 64 bit immediates in the window. */
39021 int imm_size; /* Total size of immediates in the window. */
39022 int num_loads; /* Total memory loads in the window. */
39023 int num_stores; /* Total memory stores in the window. */
39024 int violation; /* Violation exists in window. */
39025 sched_insn_info *window; /* Pointer to the window. */
39026 struct dispatch_windows_s *next;
39027 struct dispatch_windows_s *prev;
39028 } dispatch_windows;
39029
39030 /* Immediate values used in an insn. */
39031 typedef struct imm_info_s
39032 {
39033 int imm;
39034 int imm32;
39035 int imm64;
39036 } imm_info;
39037
39038 static dispatch_windows *dispatch_window_list;
39039 static dispatch_windows *dispatch_window_list1;
39040
39041 /* Get the memory-related dispatch group of insn. */
39042
39043 static enum dispatch_group
39044 get_mem_group (rtx insn)
39045 {
39046 enum attr_memory memory;
39047
39048 if (INSN_CODE (insn) < 0)
39049 return disp_no_group;
39050 memory = get_attr_memory (insn);
39051 if (memory == MEMORY_STORE)
39052 return disp_store;
39053
39054 if (memory == MEMORY_LOAD)
39055 return disp_load;
39056
39057 if (memory == MEMORY_BOTH)
39058 return disp_load_store;
39059
39060 return disp_no_group;
39061 }
39062
39063 /* Return true if insn is a compare instruction. */
39064
39065 static bool
39066 is_cmp (rtx insn)
39067 {
39068 enum attr_type type;
39069
39070 type = get_attr_type (insn);
39071 return (type == TYPE_TEST
39072 || type == TYPE_ICMP
39073 || type == TYPE_FCMP
39074 || GET_CODE (PATTERN (insn)) == COMPARE);
39075 }
39076
39077 /* Return true if a dispatch violation was encountered. */
39078
39079 static bool
39080 dispatch_violation (void)
39081 {
39082 if (dispatch_window_list->next)
39083 return dispatch_window_list->next->violation;
39084 return dispatch_window_list->violation;
39085 }
39086
39087 /* Return true if insn is a branch instruction. */
39088
39089 static bool
39090 is_branch (rtx insn)
39091 {
39092 return (CALL_P (insn) || JUMP_P (insn));
39093 }
39094
39095 /* Return true if insn is a prefetch instruction. */
39096
39097 static bool
39098 is_prefetch (rtx insn)
39099 {
39100 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
39101 }
39102
39103 /* This function initializes a dispatch window and the list container holding a
39104 pointer to the window. */
39105
39106 static void
39107 init_window (int window_num)
39108 {
39109 int i;
39110 dispatch_windows *new_list;
39111
39112 if (window_num == 0)
39113 new_list = dispatch_window_list;
39114 else
39115 new_list = dispatch_window_list1;
39116
39117 new_list->num_insn = 0;
39118 new_list->num_uops = 0;
39119 new_list->window_size = 0;
39120 new_list->next = NULL;
39121 new_list->prev = NULL;
39122 new_list->window_num = window_num;
39123 new_list->num_imm = 0;
39124 new_list->num_imm_32 = 0;
39125 new_list->num_imm_64 = 0;
39126 new_list->imm_size = 0;
39127 new_list->num_loads = 0;
39128 new_list->num_stores = 0;
39129 new_list->violation = false;
39130
39131 for (i = 0; i < MAX_INSN; i++)
39132 {
39133 new_list->window[i].insn = NULL;
39134 new_list->window[i].group = disp_no_group;
39135 new_list->window[i].path = no_path;
39136 new_list->window[i].byte_len = 0;
39137 new_list->window[i].imm_bytes = 0;
39138 }
39139 return;
39140 }
39141
39142 /* This function allocates and initializes a dispatch window and the
39143 list container holding a pointer to the window. */
39144
39145 static dispatch_windows *
39146 allocate_window (void)
39147 {
39148 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
39149 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
39150
39151 return new_list;
39152 }
39153
39154 /* This routine initializes the dispatch scheduling information. It
39155 initiates building dispatch scheduler tables and constructs the
39156 first dispatch window. */
39157
39158 static void
39159 init_dispatch_sched (void)
39160 {
39161 /* Allocate a dispatch list and a window. */
39162 dispatch_window_list = allocate_window ();
39163 dispatch_window_list1 = allocate_window ();
39164 init_window (0);
39165 init_window (1);
39166 }
39167
39168 /* This function returns true if a branch is detected. End of a basic block
39169 does not have to be a branch, but here we assume only branches end a
39170 window. */
39171
39172 static bool
39173 is_end_basic_block (enum dispatch_group group)
39174 {
39175 return group == disp_branch;
39176 }
39177
39178 /* This function is called when the end of a window processing is reached. */
39179
39180 static void
39181 process_end_window (void)
39182 {
39183 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
39184 if (dispatch_window_list->next)
39185 {
39186 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
39187 gcc_assert (dispatch_window_list->window_size
39188 + dispatch_window_list1->window_size <= 48);
39189 init_window (1);
39190 }
39191 init_window (0);
39192 }
39193
39194 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
39195 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
39196 for 48 bytes of instructions. Note that these windows are not dispatch
39197 windows whose sizes are DISPATCH_WINDOW_SIZE. */
39198
39199 static dispatch_windows *
39200 allocate_next_window (int window_num)
39201 {
39202 if (window_num == 0)
39203 {
39204 if (dispatch_window_list->next)
39205 init_window (1);
39206 init_window (0);
39207 return dispatch_window_list;
39208 }
39209
39210 dispatch_window_list->next = dispatch_window_list1;
39211 dispatch_window_list1->prev = dispatch_window_list;
39212
39213 return dispatch_window_list1;
39214 }
39215
39216 /* Helper for find_constant: increment the immediate operand counts in IMM_VALUES for the constant *IN_RTX. */
39217
39218 static int
39219 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
39220 {
39221 if (*in_rtx == 0)
39222 return 0;
39223
39224 switch ( GET_CODE (*in_rtx))
39225 {
39226 case CONST:
39227 case SYMBOL_REF:
39228 case CONST_INT:
39229 (imm_values->imm)++;
39230 if (x86_64_immediate_operand (*in_rtx, SImode))
39231 (imm_values->imm32)++;
39232 else
39233 (imm_values->imm64)++;
39234 break;
39235
39236 case CONST_DOUBLE:
39237 (imm_values->imm)++;
39238 (imm_values->imm64)++;
39239 break;
39240
39241 case CODE_LABEL:
39242 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
39243 {
39244 (imm_values->imm)++;
39245 (imm_values->imm32)++;
39246 }
39247 break;
39248
39249 default:
39250 break;
39251 }
39252
39253 return 0;
39254 }
39255
39256 /* Compute number of immediate operands of an instruction. */
39257
39258 static void
39259 find_constant (rtx in_rtx, imm_info *imm_values)
39260 {
39261 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
39262 (rtx_function) find_constant_1, (void *) imm_values);
39263 }
39264
39265 /* Return the total size of the immediate operands of an instruction, along
39266 with the number of such operands. It initializes its parameters to zero
39267 before calling FIND_CONSTANT.
39268 INSN is the input instruction. IMM is the total number of immediates.
39269 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
39270 bit immediates. */
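/* For example, an insn with one 32-bit and one 64-bit immediate gives
   *IMM = 2, *IMM32 = 1, *IMM64 = 1 and a return value of 1*4 + 1*8 = 12.  */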
39271
39272 static int
39273 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
39274 {
39275 imm_info imm_values = {0, 0, 0};
39276
39277 find_constant (insn, &imm_values);
39278 *imm = imm_values.imm;
39279 *imm32 = imm_values.imm32;
39280 *imm64 = imm_values.imm64;
39281 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
39282 }
39283
39284 /* Return true if at least one operand of an instruction is an
39285 immediate. */
39286
39287 static bool
39288 has_immediate (rtx insn)
39289 {
39290 int num_imm_operand;
39291 int num_imm32_operand;
39292 int num_imm64_operand;
39293
39294 if (insn)
39295 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
39296 &num_imm64_operand);
39297 return false;
39298 }
39299
39300 /* Return the dispatch path (single, double or multi micro-op) of an instruction. */
39301
39302 static enum insn_path
39303 get_insn_path (rtx insn)
39304 {
39305 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
39306
39307 if ((int)path == 0)
39308 return path_single;
39309
39310 if ((int)path == 1)
39311 return path_double;
39312
39313 return path_multi;
39314 }
39315
39316 /* Return insn dispatch group. */
39317
39318 static enum dispatch_group
39319 get_insn_group (rtx insn)
39320 {
39321 enum dispatch_group group = get_mem_group (insn);
39322 if (group)
39323 return group;
39324
39325 if (is_branch (insn))
39326 return disp_branch;
39327
39328 if (is_cmp (insn))
39329 return disp_cmp;
39330
39331 if (has_immediate (insn))
39332 return disp_imm;
39333
39334 if (is_prefetch (insn))
39335 return disp_prefetch;
39336
39337 return disp_no_group;
39338 }
39339
39340 /* Count number of GROUP restricted instructions in a dispatch
39341 window WINDOW_LIST. */
39342
39343 static int
39344 count_num_restricted (rtx insn, dispatch_windows *window_list)
39345 {
39346 enum dispatch_group group = get_insn_group (insn);
39347 int imm_size;
39348 int num_imm_operand;
39349 int num_imm32_operand;
39350 int num_imm64_operand;
39351
39352 if (group == disp_no_group)
39353 return 0;
39354
39355 if (group == disp_imm)
39356 {
39357 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
39358 &num_imm64_operand);
39359 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
39360 || num_imm_operand + window_list->num_imm > MAX_IMM
39361 || (num_imm32_operand > 0
39362 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
39363 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
39364 || (num_imm64_operand > 0
39365 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
39366 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
39367 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
39368 && num_imm64_operand > 0
39369 && ((window_list->num_imm_64 > 0
39370 && window_list->num_insn >= 2)
39371 || window_list->num_insn >= 3)))
39372 return BIG;
39373
39374 return 1;
39375 }
39376
39377 if ((group == disp_load_store
39378 && (window_list->num_loads >= MAX_LOAD
39379 || window_list->num_stores >= MAX_STORE))
39380 || ((group == disp_load
39381 || group == disp_prefetch)
39382 && window_list->num_loads >= MAX_LOAD)
39383 || (group == disp_store
39384 && window_list->num_stores >= MAX_STORE))
39385 return BIG;
39386
39387 return 1;
39388 }
39389
39390 /* This function returns true if insn satisfies dispatch rules on the
39391 last window scheduled. */
39392
39393 static bool
39394 fits_dispatch_window (rtx insn)
39395 {
39396 dispatch_windows *window_list = dispatch_window_list;
39397 dispatch_windows *window_list_next = dispatch_window_list->next;
39398 unsigned int num_restrict;
39399 enum dispatch_group group = get_insn_group (insn);
39400 enum insn_path path = get_insn_path (insn);
39401 int sum;
39402
39403 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
39404 instructions should be given the lowest priority in the
39405 scheduling process in the Haifa scheduler to make sure they will be
39406 scheduled in the same dispatch window as the reference to them. */
39407 if (group == disp_jcc || group == disp_cmp)
39408 return false;
39409
39410 /* Check nonrestricted. */
39411 if (group == disp_no_group || group == disp_branch)
39412 return true;
39413
39414 /* Get last dispatch window. */
39415 if (window_list_next)
39416 window_list = window_list_next;
39417
39418 if (window_list->window_num == 1)
39419 {
39420 sum = window_list->prev->window_size + window_list->window_size;
39421
39422 if (sum == 32
39423 || (min_insn_size (insn) + sum) >= 48)
39424 /* Window 1 is full. Go for next window. */
39425 return true;
39426 }
39427
39428 num_restrict = count_num_restricted (insn, window_list);
39429
39430 if (num_restrict > num_allowable_groups[group])
39431 return false;
39432
39433 /* See if it fits in the first window. */
39434 if (window_list->window_num == 0)
39435 {
39436 /* The first window should have only single and double path
39437 uops. */
39438 if (path == path_double
39439 && (window_list->num_uops + 2) > MAX_INSN)
39440 return false;
39441 else if (path != path_single)
39442 return false;
39443 }
39444 return true;
39445 }
39446
39447 /* Add an instruction INSN with NUM_UOPS micro-operations to the
39448 dispatch window WINDOW_LIST. */
39449
39450 static void
39451 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
39452 {
39453 int byte_len = min_insn_size (insn);
39454 int num_insn = window_list->num_insn;
39455 int imm_size;
39456 sched_insn_info *window = window_list->window;
39457 enum dispatch_group group = get_insn_group (insn);
39458 enum insn_path path = get_insn_path (insn);
39459 int num_imm_operand;
39460 int num_imm32_operand;
39461 int num_imm64_operand;
39462
39463 if (!window_list->violation && group != disp_cmp
39464 && !fits_dispatch_window (insn))
39465 window_list->violation = true;
39466
39467 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
39468 &num_imm64_operand);
39469
39470 /* Initialize window with new instruction. */
39471 window[num_insn].insn = insn;
39472 window[num_insn].byte_len = byte_len;
39473 window[num_insn].group = group;
39474 window[num_insn].path = path;
39475 window[num_insn].imm_bytes = imm_size;
39476
39477 window_list->window_size += byte_len;
39478 window_list->num_insn = num_insn + 1;
39479 window_list->num_uops = window_list->num_uops + num_uops;
39480 window_list->imm_size += imm_size;
39481 window_list->num_imm += num_imm_operand;
39482 window_list->num_imm_32 += num_imm32_operand;
39483 window_list->num_imm_64 += num_imm64_operand;
39484
39485 if (group == disp_store)
39486 window_list->num_stores += 1;
39487 else if (group == disp_load
39488 || group == disp_prefetch)
39489 window_list->num_loads += 1;
39490 else if (group == disp_load_store)
39491 {
39492 window_list->num_stores += 1;
39493 window_list->num_loads += 1;
39494 }
39495 }
39496
39497 /* Adds a scheduled instruction, INSN, to the current dispatch window.
39498 If the total bytes of instructions or the number of instructions in
39499 the window exceeds the allowable limit, it allocates a new window. */
39500
39501 static void
39502 add_to_dispatch_window (rtx insn)
39503 {
39504 int byte_len;
39505 dispatch_windows *window_list;
39506 dispatch_windows *next_list;
39507 dispatch_windows *window0_list;
39508 enum insn_path path;
39509 enum dispatch_group insn_group;
39510 bool insn_fits;
39511 int num_insn;
39512 int num_uops;
39513 int window_num;
39514 int insn_num_uops;
39515 int sum;
39516
39517 if (INSN_CODE (insn) < 0)
39518 return;
39519
39520 byte_len = min_insn_size (insn);
39521 window_list = dispatch_window_list;
39522 next_list = window_list->next;
39523 path = get_insn_path (insn);
39524 insn_group = get_insn_group (insn);
39525
39526 /* Get the last dispatch window. */
39527 if (next_list)
39528 window_list = dispatch_window_list->next;
39529
39530 if (path == path_single)
39531 insn_num_uops = 1;
39532 else if (path == path_double)
39533 insn_num_uops = 2;
39534 else
39535 insn_num_uops = (int) path;
39536
39537 /* If the current window is full, get a new window.
39538 Window number zero is full if MAX_INSN uops are scheduled in it.
39539 Window number one is full if window zero's bytes plus window
39540 one's bytes equal 32, or if adding the bytes of the new instruction
39541 brings the total to 48 or more, or if it already has MAX_INSN
39542 instructions in it. */
39543 num_insn = window_list->num_insn;
39544 num_uops = window_list->num_uops;
39545 window_num = window_list->window_num;
39546 insn_fits = fits_dispatch_window (insn);
39547
39548 if (num_insn >= MAX_INSN
39549 || num_uops + insn_num_uops > MAX_INSN
39550 || !(insn_fits))
39551 {
39552 window_num = ~window_num & 1;
39553 window_list = allocate_next_window (window_num);
39554 }
39555
39556 if (window_num == 0)
39557 {
39558 add_insn_window (insn, window_list, insn_num_uops);
39559 if (window_list->num_insn >= MAX_INSN
39560 && insn_group == disp_branch)
39561 {
39562 process_end_window ();
39563 return;
39564 }
39565 }
39566 else if (window_num == 1)
39567 {
39568 window0_list = window_list->prev;
39569 sum = window0_list->window_size + window_list->window_size;
39570 if (sum == 32
39571 || (byte_len + sum) >= 48)
39572 {
39573 process_end_window ();
39574 window_list = dispatch_window_list;
39575 }
39576
39577 add_insn_window (insn, window_list, insn_num_uops);
39578 }
39579 else
39580 gcc_unreachable ();
39581
39582 if (is_end_basic_block (insn_group))
39583 {
39584 /* End of basic block is reached; do end-basic-block processing. */
39585 process_end_window ();
39586 return;
39587 }
39588 }
39589
39590 /* Print the dispatch window, WINDOW_NUM, to FILE. */
39591
39592 DEBUG_FUNCTION static void
39593 debug_dispatch_window_file (FILE *file, int window_num)
39594 {
39595 dispatch_windows *list;
39596 int i;
39597
39598 if (window_num == 0)
39599 list = dispatch_window_list;
39600 else
39601 list = dispatch_window_list1;
39602
39603 fprintf (file, "Window #%d:\n", list->window_num);
39604 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
39605 list->num_insn, list->num_uops, list->window_size);
39606 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
39607 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
39608
39609 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
39610 list->num_stores);
39611 fprintf (file, " insn info:\n");
39612
39613 for (i = 0; i < MAX_INSN; i++)
39614 {
39615 if (!list->window[i].insn)
39616 break;
39617 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
39618 i, group_name[list->window[i].group],
39619 i, (void *)list->window[i].insn,
39620 i, list->window[i].path,
39621 i, list->window[i].byte_len,
39622 i, list->window[i].imm_bytes);
39623 }
39624 }
39625
39626 /* Print to stdout a dispatch window. */
39627
39628 DEBUG_FUNCTION void
39629 debug_dispatch_window (int window_num)
39630 {
39631 debug_dispatch_window_file (stdout, window_num);
39632 }
39633
39634 /* Print INSN dispatch information to FILE. */
39635
39636 DEBUG_FUNCTION static void
39637 debug_insn_dispatch_info_file (FILE *file, rtx insn)
39638 {
39639 int byte_len;
39640 enum insn_path path;
39641 enum dispatch_group group;
39642 int imm_size;
39643 int num_imm_operand;
39644 int num_imm32_operand;
39645 int num_imm64_operand;
39646
39647 if (INSN_CODE (insn) < 0)
39648 return;
39649
39650 byte_len = min_insn_size (insn);
39651 path = get_insn_path (insn);
39652 group = get_insn_group (insn);
39653 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
39654 &num_imm64_operand);
39655
39656 fprintf (file, " insn info:\n");
39657 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
39658 group_name[group], path, byte_len);
39659 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
39660 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
39661 }
39662
39663 /* Print to STDERR the status of the ready list with respect to
39664 dispatch windows. */
39665
39666 DEBUG_FUNCTION void
39667 debug_ready_dispatch (void)
39668 {
39669 int i;
39670 int no_ready = number_in_ready ();
39671
39672 fprintf (stdout, "Number of ready: %d\n", no_ready);
39673
39674 for (i = 0; i < no_ready; i++)
39675 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
39676 }
39677
39678 /* This routine is the driver of the dispatch scheduler. */
39679
39680 static void
39681 do_dispatch (rtx insn, int mode)
39682 {
39683 if (mode == DISPATCH_INIT)
39684 init_dispatch_sched ();
39685 else if (mode == ADD_TO_DISPATCH_WINDOW)
39686 add_to_dispatch_window (insn);
39687 }
39688
39689 /* Return TRUE if Dispatch Scheduling is supported. */
39690
39691 static bool
39692 has_dispatch (rtx insn, int action)
39693 {
39694 if ((TARGET_BDVER1 || TARGET_BDVER2)
39695 && flag_dispatch_scheduler)
39696 switch (action)
39697 {
39698 default:
39699 return false;
39700
39701 case IS_DISPATCH_ON:
39702 return true;
39703 break;
39704
39705 case IS_CMP:
39706 return is_cmp (insn);
39707
39708 case DISPATCH_VIOLATION:
39709 return dispatch_violation ();
39710
39711 case FITS_DISPATCH_WINDOW:
39712 return fits_dispatch_window (insn);
39713 }
39714
39715 return false;
39716 }
39717
39718 /* Implementation of the reassociation_width target hook, used by the
39719 reassoc phase to identify the parallelism level of a reassociated
39720 tree. The statement's tree_code is passed in OPC. The type of the
39721 arguments is passed in MODE.
39722
39723 Currently parallel reassociation is enabled for Atom
39724 processors only and we set reassociation width to be 2
39725 because Atom may issue up to 2 instructions per cycle.
39726
39727 Return value should be fixed if parallel reassociation is
39728 enabled for other processors. */
39729
39730 static int
39731 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
39732 enum machine_mode mode)
39733 {
39734 int res = 1;
39735
39736 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
39737 res = 2;
39738 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
39739 res = 2;
39740
39741 return res;
39742 }
39743
39744 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
39745 place emms and femms instructions. */
39746
39747 static enum machine_mode
39748 ix86_preferred_simd_mode (enum machine_mode mode)
39749 {
39750 if (!TARGET_SSE)
39751 return word_mode;
39752
39753 switch (mode)
39754 {
39755 case QImode:
39756 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
39757 case HImode:
39758 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
39759 case SImode:
39760 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
39761 case DImode:
39762 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
39763
39764 case SFmode:
39765 if (TARGET_AVX && !TARGET_PREFER_AVX128)
39766 return V8SFmode;
39767 else
39768 return V4SFmode;
39769
39770 case DFmode:
39771 if (!TARGET_VECTORIZE_DOUBLE)
39772 return word_mode;
39773 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
39774 return V4DFmode;
39775 else if (TARGET_SSE2)
39776 return V2DFmode;
39777 /* FALLTHRU */
39778
39779 default:
39780 return word_mode;
39781 }
39782 }
39783
39784 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
39785 vectors. */
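/* The value returned by this hook is treated as a bit mask of the vector
   byte sizes to try, so 32 | 16 means both 32-byte and 16-byte vectors.  */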
39786
39787 static unsigned int
39788 ix86_autovectorize_vector_sizes (void)
39789 {
39790 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
39791 }
39792
39793 /* Validate target specific memory model bits in VAL. */
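/* For example, combining MEMMODEL_RELAXED with IX86_HLE_ACQUIRE is
   diagnosed below and rewritten to MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE.  */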
39794
39795 static unsigned HOST_WIDE_INT
39796 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
39797 {
39798 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
39799 unsigned HOST_WIDE_INT strong;
39800
39801 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
39802 |MEMMODEL_MASK)
39803 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
39804 {
39805 warning (OPT_Winvalid_memory_model,
39806 "Unknown architecture specific memory model");
39807 return MEMMODEL_SEQ_CST;
39808 }
39809 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
39810 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
39811 {
39812 warning (OPT_Winvalid_memory_model,
39813 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
39814 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
39815 }
39816 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
39817 {
39818 warning (OPT_Winvalid_memory_model,
39819 "HLE_RELEASE not used with RELEASE or stronger memory model");
39820 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
39821 }
39822 return val;
39823 }
39824
39825 /* Initialize the GCC target structure. */
39826 #undef TARGET_RETURN_IN_MEMORY
39827 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
39828
39829 #undef TARGET_LEGITIMIZE_ADDRESS
39830 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
39831
39832 #undef TARGET_ATTRIBUTE_TABLE
39833 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
39834 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
39835 # undef TARGET_MERGE_DECL_ATTRIBUTES
39836 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
39837 #endif
39838
39839 #undef TARGET_COMP_TYPE_ATTRIBUTES
39840 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
39841
39842 #undef TARGET_INIT_BUILTINS
39843 #define TARGET_INIT_BUILTINS ix86_init_builtins
39844 #undef TARGET_BUILTIN_DECL
39845 #define TARGET_BUILTIN_DECL ix86_builtin_decl
39846 #undef TARGET_EXPAND_BUILTIN
39847 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
39848
39849 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
39850 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
39851 ix86_builtin_vectorized_function
39852
39853 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
39854 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
39855
39856 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
39857 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
39858
39859 #undef TARGET_VECTORIZE_BUILTIN_GATHER
39860 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
39861
39862 #undef TARGET_BUILTIN_RECIPROCAL
39863 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
39864
39865 #undef TARGET_ASM_FUNCTION_EPILOGUE
39866 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
39867
39868 #undef TARGET_ENCODE_SECTION_INFO
39869 #ifndef SUBTARGET_ENCODE_SECTION_INFO
39870 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
39871 #else
39872 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
39873 #endif
39874
39875 #undef TARGET_ASM_OPEN_PAREN
39876 #define TARGET_ASM_OPEN_PAREN ""
39877 #undef TARGET_ASM_CLOSE_PAREN
39878 #define TARGET_ASM_CLOSE_PAREN ""
39879
39880 #undef TARGET_ASM_BYTE_OP
39881 #define TARGET_ASM_BYTE_OP ASM_BYTE
39882
39883 #undef TARGET_ASM_ALIGNED_HI_OP
39884 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
39885 #undef TARGET_ASM_ALIGNED_SI_OP
39886 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
39887 #ifdef ASM_QUAD
39888 #undef TARGET_ASM_ALIGNED_DI_OP
39889 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
39890 #endif
39891
39892 #undef TARGET_PROFILE_BEFORE_PROLOGUE
39893 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
39894
39895 #undef TARGET_ASM_UNALIGNED_HI_OP
39896 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
39897 #undef TARGET_ASM_UNALIGNED_SI_OP
39898 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
39899 #undef TARGET_ASM_UNALIGNED_DI_OP
39900 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
39901
39902 #undef TARGET_PRINT_OPERAND
39903 #define TARGET_PRINT_OPERAND ix86_print_operand
39904 #undef TARGET_PRINT_OPERAND_ADDRESS
39905 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
39906 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
39907 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
39908 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
39909 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
39910
39911 #undef TARGET_SCHED_INIT_GLOBAL
39912 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
39913 #undef TARGET_SCHED_ADJUST_COST
39914 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
39915 #undef TARGET_SCHED_ISSUE_RATE
39916 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
39917 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
39918 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
39919 ia32_multipass_dfa_lookahead
39920
39921 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
39922 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
39923
39924 #undef TARGET_MEMMODEL_CHECK
39925 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
39926
39927 #ifdef HAVE_AS_TLS
39928 #undef TARGET_HAVE_TLS
39929 #define TARGET_HAVE_TLS true
39930 #endif
39931 #undef TARGET_CANNOT_FORCE_CONST_MEM
39932 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
39933 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
39934 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
39935
39936 #undef TARGET_DELEGITIMIZE_ADDRESS
39937 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
39938
39939 #undef TARGET_MS_BITFIELD_LAYOUT_P
39940 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
39941
39942 #if TARGET_MACHO
39943 #undef TARGET_BINDS_LOCAL_P
39944 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
39945 #endif
39946 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
39947 #undef TARGET_BINDS_LOCAL_P
39948 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
39949 #endif
39950
39951 #undef TARGET_ASM_OUTPUT_MI_THUNK
39952 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
39953 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
39954 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
39955
39956 #undef TARGET_ASM_FILE_START
39957 #define TARGET_ASM_FILE_START x86_file_start
39958
39959 #undef TARGET_OPTION_OVERRIDE
39960 #define TARGET_OPTION_OVERRIDE ix86_option_override
39961
39962 #undef TARGET_REGISTER_MOVE_COST
39963 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
39964 #undef TARGET_MEMORY_MOVE_COST
39965 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
39966 #undef TARGET_RTX_COSTS
39967 #define TARGET_RTX_COSTS ix86_rtx_costs
39968 #undef TARGET_ADDRESS_COST
39969 #define TARGET_ADDRESS_COST ix86_address_cost
39970
39971 #undef TARGET_FIXED_CONDITION_CODE_REGS
39972 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
39973 #undef TARGET_CC_MODES_COMPATIBLE
39974 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
39975
39976 #undef TARGET_MACHINE_DEPENDENT_REORG
39977 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
39978
39979 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
39980 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
39981
39982 #undef TARGET_BUILD_BUILTIN_VA_LIST
39983 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
39984
39985 #undef TARGET_FOLD_BUILTIN
39986 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
39987
39988 #undef TARGET_ENUM_VA_LIST_P
39989 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
39990
39991 #undef TARGET_FN_ABI_VA_LIST
39992 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
39993
39994 #undef TARGET_CANONICAL_VA_LIST_TYPE
39995 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
39996
39997 #undef TARGET_EXPAND_BUILTIN_VA_START
39998 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
39999
40000 #undef TARGET_MD_ASM_CLOBBERS
40001 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
40002
40003 #undef TARGET_PROMOTE_PROTOTYPES
40004 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
40005 #undef TARGET_STRUCT_VALUE_RTX
40006 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
40007 #undef TARGET_SETUP_INCOMING_VARARGS
40008 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
40009 #undef TARGET_MUST_PASS_IN_STACK
40010 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
40011 #undef TARGET_FUNCTION_ARG_ADVANCE
40012 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
40013 #undef TARGET_FUNCTION_ARG
40014 #define TARGET_FUNCTION_ARG ix86_function_arg
40015 #undef TARGET_FUNCTION_ARG_BOUNDARY
40016 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
40017 #undef TARGET_PASS_BY_REFERENCE
40018 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
40019 #undef TARGET_INTERNAL_ARG_POINTER
40020 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
40021 #undef TARGET_UPDATE_STACK_BOUNDARY
40022 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
40023 #undef TARGET_GET_DRAP_RTX
40024 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
40025 #undef TARGET_STRICT_ARGUMENT_NAMING
40026 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
40027 #undef TARGET_STATIC_CHAIN
40028 #define TARGET_STATIC_CHAIN ix86_static_chain
40029 #undef TARGET_TRAMPOLINE_INIT
40030 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
40031 #undef TARGET_RETURN_POPS_ARGS
40032 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
40033
40034 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
40035 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
40036
40037 #undef TARGET_SCALAR_MODE_SUPPORTED_P
40038 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
40039
40040 #undef TARGET_VECTOR_MODE_SUPPORTED_P
40041 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
40042
40043 #undef TARGET_C_MODE_FOR_SUFFIX
40044 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
40045
40046 #ifdef HAVE_AS_TLS
40047 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
40048 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
40049 #endif
40050
40051 #ifdef SUBTARGET_INSERT_ATTRIBUTES
40052 #undef TARGET_INSERT_ATTRIBUTES
40053 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
40054 #endif
40055
40056 #undef TARGET_MANGLE_TYPE
40057 #define TARGET_MANGLE_TYPE ix86_mangle_type
40058
40059 #if !TARGET_MACHO
40060 #undef TARGET_STACK_PROTECT_FAIL
40061 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
40062 #endif
40063
40064 #undef TARGET_FUNCTION_VALUE
40065 #define TARGET_FUNCTION_VALUE ix86_function_value
40066
40067 #undef TARGET_FUNCTION_VALUE_REGNO_P
40068 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
40069
40070 #undef TARGET_PROMOTE_FUNCTION_MODE
40071 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
40072
40073 #undef TARGET_SECONDARY_RELOAD
40074 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
40075
40076 #undef TARGET_CLASS_MAX_NREGS
40077 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
40078
40079 #undef TARGET_PREFERRED_RELOAD_CLASS
40080 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
40081 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
40082 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
40083 #undef TARGET_CLASS_LIKELY_SPILLED_P
40084 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
40085
40086 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
40087 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
40088 ix86_builtin_vectorization_cost
40089 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
40090 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
40091 ix86_vectorize_vec_perm_const_ok
40092 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
40093 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
40094 ix86_preferred_simd_mode
40095 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
40096 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
40097 ix86_autovectorize_vector_sizes
40098
40099 #undef TARGET_SET_CURRENT_FUNCTION
40100 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
40101
40102 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
40103 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
40104
40105 #undef TARGET_OPTION_SAVE
40106 #define TARGET_OPTION_SAVE ix86_function_specific_save
40107
40108 #undef TARGET_OPTION_RESTORE
40109 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
40110
40111 #undef TARGET_OPTION_PRINT
40112 #define TARGET_OPTION_PRINT ix86_function_specific_print
40113
40114 #undef TARGET_CAN_INLINE_P
40115 #define TARGET_CAN_INLINE_P ix86_can_inline_p
40116
40117 #undef TARGET_EXPAND_TO_RTL_HOOK
40118 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
40119
40120 #undef TARGET_LEGITIMATE_ADDRESS_P
40121 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
40122
40123 #undef TARGET_LEGITIMATE_CONSTANT_P
40124 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
40125
40126 #undef TARGET_FRAME_POINTER_REQUIRED
40127 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
40128
40129 #undef TARGET_CAN_ELIMINATE
40130 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
40131
40132 #undef TARGET_EXTRA_LIVE_ON_ENTRY
40133 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
40134
40135 #undef TARGET_ASM_CODE_END
40136 #define TARGET_ASM_CODE_END ix86_code_end
40137
40138 #undef TARGET_CONDITIONAL_REGISTER_USAGE
40139 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
40140
40141 #if TARGET_MACHO
40142 #undef TARGET_INIT_LIBFUNCS
40143 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
40144 #endif
40145
40146 struct gcc_target targetm = TARGET_INITIALIZER;
40147 \f
40148 #include "gt-i386.h"