1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
4 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "reload.h"
51 #include "cgraph.h"
52 #include "gimple.h"
53 #include "dwarf2.h"
54 #include "df.h"
55 #include "tm-constrs.h"
56 #include "params.h"
57 #include "cselib.h"
58 #include "debug.h"
59 #include "sched-int.h"
60 #include "sbitmap.h"
61 #include "fibheap.h"
62 #include "opts.h"
63 #include "diagnostic.h"
64
65 enum upper_128bits_state
66 {
67 unknown = 0,
68 unused,
69 used
70 };
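/* The vzeroupper pass below (move_or_delete_vzeroupper and its helpers)
   solves a simple forward data-flow problem over the CFG: for each basic
   block it tracks whether the upper 128bits of the AVX registers are known
   to be live, known to be clear, or unknown at block exit, and uses that
   information to delete redundant vzeroupper insns or to sink them to just
   before jump and call insns.  */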
71
72 typedef struct block_info_def
73 {
74 /* State of the upper 128bits of AVX registers at exit. */
75 enum upper_128bits_state state;
76 /* TRUE if state of the upper 128bits of AVX registers is unchanged
77 in this block. */
78 bool unchanged;
79 /* TRUE if block has been processed. */
80 bool processed;
81 /* TRUE if block has been scanned. */
82 bool scanned;
83 /* Previous state of the upper 128bits of AVX registers at entry. */
84 enum upper_128bits_state prev;
85 } *block_info;
86
87 #define BLOCK_INFO(B) ((block_info) (B)->aux)
88
89 enum call_avx256_state
90 {
91 /* Callee returns 256bit AVX register. */
92 callee_return_avx256 = -1,
93 /* Callee returns and passes 256bit AVX register. */
94 callee_return_pass_avx256,
95 /* Callee passes 256bit AVX register. */
96 callee_pass_avx256,
97 /* Callee neither returns nor passes a 256bit AVX register, or no
98 256bit AVX register in the function return. */
99 call_no_avx256,
100 /* vzeroupper intrinsic. */
101 vzeroupper_intrinsic
102 };
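/* One of the values above is stored as the operand of each
   UNSPECV_VZEROUPPER pattern and is read back with
   INTVAL (XVECEXP (pat, 0, 0)) in move_or_delete_vzeroupper_2 below.  */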
103
104 /* Check if a 256bit AVX register is referenced in stores. */
105
106 static void
107 check_avx256_stores (rtx dest, const_rtx set, void *data)
108 {
109 if ((REG_P (dest)
110 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
111 || (GET_CODE (set) == SET
112 && REG_P (SET_SRC (set))
113 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
114 {
115 enum upper_128bits_state *state
116 = (enum upper_128bits_state *) data;
117 *state = used;
118 }
119 }
120
121 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
122 in basic block BB. Delete it if upper 128bit AVX registers are
123 unused. If it isn't deleted, move it to just before a jump or call insn.
124
125 STATE is state of the upper 128bits of AVX registers at entry. */
126
127 static void
128 move_or_delete_vzeroupper_2 (basic_block bb,
129 enum upper_128bits_state state)
130 {
131 rtx insn, bb_end;
132 rtx vzeroupper_insn = NULL_RTX;
133 rtx pat;
134 int avx256;
135 bool unchanged;
136
137 if (BLOCK_INFO (bb)->unchanged)
138 {
139 if (dump_file)
140 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
141 bb->index, state);
142
143 BLOCK_INFO (bb)->state = state;
144 return;
145 }
146
147 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
148 {
149 if (dump_file)
150 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
151 bb->index, BLOCK_INFO (bb)->state);
152 return;
153 }
154
155 BLOCK_INFO (bb)->prev = state;
156
157 if (dump_file)
158 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
159 bb->index, state);
160
161 unchanged = true;
162
163 /* BB_END changes when it is deleted. */
164 bb_end = BB_END (bb);
165 insn = BB_HEAD (bb);
166 while (insn != bb_end)
167 {
168 insn = NEXT_INSN (insn);
169
170 if (!NONDEBUG_INSN_P (insn))
171 continue;
172
173 /* Move vzeroupper before jump/call. */
174 if (JUMP_P (insn) || CALL_P (insn))
175 {
176 if (!vzeroupper_insn)
177 continue;
178
179 if (PREV_INSN (insn) != vzeroupper_insn)
180 {
181 if (dump_file)
182 {
183 fprintf (dump_file, "Move vzeroupper after:\n");
184 print_rtl_single (dump_file, PREV_INSN (insn));
185 fprintf (dump_file, "before:\n");
186 print_rtl_single (dump_file, insn);
187 }
188 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
189 PREV_INSN (insn));
190 }
191 vzeroupper_insn = NULL_RTX;
192 continue;
193 }
194
195 pat = PATTERN (insn);
196
197 /* Check insn for vzeroupper intrinsic. */
198 if (GET_CODE (pat) == UNSPEC_VOLATILE
199 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
200 {
201 if (dump_file)
202 {
203 /* Found vzeroupper intrinsic. */
204 fprintf (dump_file, "Found vzeroupper:\n");
205 print_rtl_single (dump_file, insn);
206 }
207 }
208 else
209 {
210 /* Check insn for vzeroall intrinsic. */
211 if (GET_CODE (pat) == PARALLEL
212 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
213 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
214 {
215 state = unused;
216 unchanged = false;
217
218 /* Delete pending vzeroupper insertion. */
219 if (vzeroupper_insn)
220 {
221 delete_insn (vzeroupper_insn);
222 vzeroupper_insn = NULL_RTX;
223 }
224 }
225 else if (state != used)
226 {
227 note_stores (pat, check_avx256_stores, &state);
228 if (state == used)
229 unchanged = false;
230 }
231 continue;
232 }
233
234 /* Process vzeroupper intrinsic. */
235 avx256 = INTVAL (XVECEXP (pat, 0, 0));
236
237 if (state == unused)
238 {
239 /* Since the upper 128bits are cleared, the callee cannot be
240 passing a 256bit AVX register. We only need to check whether
241 the callee returns a 256bit AVX register. */
242 if (avx256 == callee_return_avx256)
243 {
244 state = used;
245 unchanged = false;
246 }
247
248 /* Remove unnecessary vzeroupper since upper 128bits are
249 cleared. */
250 if (dump_file)
251 {
252 fprintf (dump_file, "Delete redundant vzeroupper:\n");
253 print_rtl_single (dump_file, insn);
254 }
255 delete_insn (insn);
256 }
257 else
258 {
259 /* Set state to UNUSED if callee doesn't return 256bit AVX
260 register. */
261 if (avx256 != callee_return_pass_avx256)
262 state = unused;
263
264 if (avx256 == callee_return_pass_avx256
265 || avx256 == callee_pass_avx256)
266 {
267 /* Must remove vzeroupper since the callee is passed a 256bit
268 AVX register. */
269 if (dump_file)
270 {
271 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
272 print_rtl_single (dump_file, insn);
273 }
274 delete_insn (insn);
275 }
276 else
277 {
278 vzeroupper_insn = insn;
279 unchanged = false;
280 }
281 }
282 }
283
284 BLOCK_INFO (bb)->state = state;
285 BLOCK_INFO (bb)->unchanged = unchanged;
286 BLOCK_INFO (bb)->scanned = true;
287
288 if (dump_file)
289 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
290 bb->index, unchanged ? "unchanged" : "changed",
291 state);
292 }
293
294 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
295 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
296 as UNUSED if UNKNOWN_IS_UNUSED is true. Return TRUE if the exit
297 state is changed. */
298
299 static bool
300 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
301 {
302 edge e;
303 edge_iterator ei;
304 enum upper_128bits_state state, old_state, new_state;
305 bool seen_unknown;
306
307 if (dump_file)
308 fprintf (dump_file, " Process [bb %i]: status: %d\n",
309 block->index, BLOCK_INFO (block)->processed);
310
311 if (BLOCK_INFO (block)->processed)
312 return false;
313
314 state = unused;
315
316 /* Check all predecessor edges of this block. */
317 seen_unknown = false;
318 FOR_EACH_EDGE (e, ei, block->preds)
319 {
320 if (e->src == block)
321 continue;
322 switch (BLOCK_INFO (e->src)->state)
323 {
324 case unknown:
325 if (!unknown_is_unused)
326 seen_unknown = true;
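          /* FALLTHRU - an unknown predecessor is otherwise handled
             like an unused one here.  */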
327 case unused:
328 break;
329 case used:
330 state = used;
331 goto done;
332 }
333 }
334
335 if (seen_unknown)
336 state = unknown;
337
338 done:
339 old_state = BLOCK_INFO (block)->state;
340 move_or_delete_vzeroupper_2 (block, state);
341 new_state = BLOCK_INFO (block)->state;
342
343 if (state != unknown || new_state == used)
344 BLOCK_INFO (block)->processed = true;
345
346 /* Need to rescan if the upper 128bits of AVX registers are changed
347 to USED at exit. */
348 if (new_state != old_state)
349 {
350 if (new_state == used)
351 cfun->machine->rescan_vzeroupper_p = 1;
352 return true;
353 }
354 else
355 return false;
356 }
357
358 /* Go through the instruction stream looking for vzeroupper. Delete
359 it if upper 128bit AVX registers are unused. If it isn't deleted,
360 move it to just before a jump or call insn. */
361
362 static void
363 move_or_delete_vzeroupper (void)
364 {
365 edge e;
366 edge_iterator ei;
367 basic_block bb;
368 fibheap_t worklist, pending, fibheap_swap;
369 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
370 int *bb_order;
371 int *rc_order;
372 int i;
373
374 /* Set up block info for each basic block. */
375 alloc_aux_for_blocks (sizeof (struct block_info_def));
376
377 /* Process outgoing edges of entry point. */
378 if (dump_file)
379 fprintf (dump_file, "Process outgoing edges of entry point\n");
380
381 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
382 {
383 move_or_delete_vzeroupper_2 (e->dest,
384 cfun->machine->caller_pass_avx256_p
385 ? used : unused);
386 BLOCK_INFO (e->dest)->processed = true;
387 }
388
389 /* Compute reverse completion order of depth first search of the CFG
390 so that the data-flow runs faster. */
391 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
392 bb_order = XNEWVEC (int, last_basic_block);
393 pre_and_rev_post_order_compute (NULL, rc_order, false);
394 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
395 bb_order[rc_order[i]] = i;
396 free (rc_order);
397
398 worklist = fibheap_new ();
399 pending = fibheap_new ();
400 visited = sbitmap_alloc (last_basic_block);
401 in_worklist = sbitmap_alloc (last_basic_block);
402 in_pending = sbitmap_alloc (last_basic_block);
403 sbitmap_zero (in_worklist);
404
405 /* Don't check outgoing edges of entry point. */
406 sbitmap_ones (in_pending);
407 FOR_EACH_BB (bb)
408 if (BLOCK_INFO (bb)->processed)
409 RESET_BIT (in_pending, bb->index);
410 else
411 {
412 move_or_delete_vzeroupper_1 (bb, false);
413 fibheap_insert (pending, bb_order[bb->index], bb);
414 }
415
416 if (dump_file)
417 fprintf (dump_file, "Check remaining basic blocks\n");
418
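  /* Iterate: PENDING holds the blocks whose entry state may still change.
     Each round swaps PENDING into WORKLIST, drains it in the reverse
     completion order computed above, and requeues successors whose
     predecessors' exit state changed, until no block requires a
     vzeroupper rescan.  */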
419 while (!fibheap_empty (pending))
420 {
421 fibheap_swap = pending;
422 pending = worklist;
423 worklist = fibheap_swap;
424 sbitmap_swap = in_pending;
425 in_pending = in_worklist;
426 in_worklist = sbitmap_swap;
427
428 sbitmap_zero (visited);
429
430 cfun->machine->rescan_vzeroupper_p = 0;
431
432 while (!fibheap_empty (worklist))
433 {
434 bb = (basic_block) fibheap_extract_min (worklist);
435 RESET_BIT (in_worklist, bb->index);
436 gcc_assert (!TEST_BIT (visited, bb->index));
437 if (!TEST_BIT (visited, bb->index))
438 {
439 edge_iterator ei;
440
441 SET_BIT (visited, bb->index);
442
443 if (move_or_delete_vzeroupper_1 (bb, false))
444 FOR_EACH_EDGE (e, ei, bb->succs)
445 {
446 if (e->dest == EXIT_BLOCK_PTR
447 || BLOCK_INFO (e->dest)->processed)
448 continue;
449
450 if (TEST_BIT (visited, e->dest->index))
451 {
452 if (!TEST_BIT (in_pending, e->dest->index))
453 {
454 /* Send E->DEST to next round. */
455 SET_BIT (in_pending, e->dest->index);
456 fibheap_insert (pending,
457 bb_order[e->dest->index],
458 e->dest);
459 }
460 }
461 else if (!TEST_BIT (in_worklist, e->dest->index))
462 {
463 /* Add E->DEST to current round. */
464 SET_BIT (in_worklist, e->dest->index);
465 fibheap_insert (worklist, bb_order[e->dest->index],
466 e->dest);
467 }
468 }
469 }
470 }
471
472 if (!cfun->machine->rescan_vzeroupper_p)
473 break;
474 }
475
476 free (bb_order);
477 fibheap_delete (worklist);
478 fibheap_delete (pending);
479 sbitmap_free (visited);
480 sbitmap_free (in_worklist);
481 sbitmap_free (in_pending);
482
483 if (dump_file)
484 fprintf (dump_file, "Process remaining basic blocks\n");
485
486 FOR_EACH_BB (bb)
487 move_or_delete_vzeroupper_1 (bb, true);
488
489 free_aux_for_blocks ();
490 }
491
492 static rtx legitimize_dllimport_symbol (rtx, bool);
493
494 #ifndef CHECK_STACK_LIMIT
495 #define CHECK_STACK_LIMIT (-1)
496 #endif
497
498 /* Return index of given mode in mult and division cost tables. */
499 #define MODE_INDEX(mode) \
500 ((mode) == QImode ? 0 \
501 : (mode) == HImode ? 1 \
502 : (mode) == SImode ? 2 \
503 : (mode) == DImode ? 3 \
504 : 4)
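/* For example, MODE_INDEX (SImode) is 2, the SImode slot of the multiply
   and divide cost arrays in the processor_costs tables below; any mode
   wider than DImode falls into the "other" slot 4.  */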
505
506 /* Processor costs (relative to an add) */
507 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
508 #define COSTS_N_BYTES(N) ((N) * 2)
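/* For instance, COSTS_N_BYTES (3) == 6 while COSTS_N_INSNS (1) == 4, so a
   3 byte instruction is costed as one and a half 2 byte additions when
   optimizing for size.  */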
509
510 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
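/* DUMMY_STRINGOP_ALGS fills the second (64-bit) memcpy/memset strategy
   slot for processors that never execute 64-bit code.  In the strategy
   tables below the leading field is the algorithm used when the block
   size is unknown, and each {max, alg} pair selects ALG for known sizes
   up to MAX bytes, with -1 marking the final, unbounded entry.  */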
511
512 const
513 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
514 COSTS_N_BYTES (2), /* cost of an add instruction */
515 COSTS_N_BYTES (3), /* cost of a lea instruction */
516 COSTS_N_BYTES (2), /* variable shift costs */
517 COSTS_N_BYTES (3), /* constant shift costs */
518 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
519 COSTS_N_BYTES (3), /* HI */
520 COSTS_N_BYTES (3), /* SI */
521 COSTS_N_BYTES (3), /* DI */
522 COSTS_N_BYTES (5)}, /* other */
523 0, /* cost of multiply per each bit set */
524 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
525 COSTS_N_BYTES (3), /* HI */
526 COSTS_N_BYTES (3), /* SI */
527 COSTS_N_BYTES (3), /* DI */
528 COSTS_N_BYTES (5)}, /* other */
529 COSTS_N_BYTES (3), /* cost of movsx */
530 COSTS_N_BYTES (3), /* cost of movzx */
531 0, /* "large" insn */
532 2, /* MOVE_RATIO */
533 2, /* cost for loading QImode using movzbl */
534 {2, 2, 2}, /* cost of loading integer registers
535 in QImode, HImode and SImode.
536 Relative to reg-reg move (2). */
537 {2, 2, 2}, /* cost of storing integer registers */
538 2, /* cost of reg,reg fld/fst */
539 {2, 2, 2}, /* cost of loading fp registers
540 in SFmode, DFmode and XFmode */
541 {2, 2, 2}, /* cost of storing fp registers
542 in SFmode, DFmode and XFmode */
543 3, /* cost of moving MMX register */
544 {3, 3}, /* cost of loading MMX registers
545 in SImode and DImode */
546 {3, 3}, /* cost of storing MMX registers
547 in SImode and DImode */
548 3, /* cost of moving SSE register */
549 {3, 3, 3}, /* cost of loading SSE registers
550 in SImode, DImode and TImode */
551 {3, 3, 3}, /* cost of storing SSE registers
552 in SImode, DImode and TImode */
553 3, /* MMX or SSE register to integer */
554 0, /* size of l1 cache */
555 0, /* size of l2 cache */
556 0, /* size of prefetch block */
557 0, /* number of parallel prefetches */
558 2, /* Branch cost */
559 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
560 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
561 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
562 COSTS_N_BYTES (2), /* cost of FABS instruction. */
563 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
564 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
565 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
566 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
567 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
568 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
569 1, /* scalar_stmt_cost. */
570 1, /* scalar load_cost. */
571 1, /* scalar_store_cost. */
572 1, /* vec_stmt_cost. */
573 1, /* vec_to_scalar_cost. */
574 1, /* scalar_to_vec_cost. */
575 1, /* vec_align_load_cost. */
576 1, /* vec_unalign_load_cost. */
577 1, /* vec_store_cost. */
578 1, /* cond_taken_branch_cost. */
579 1, /* cond_not_taken_branch_cost. */
580 };
581
582 /* Processor costs (relative to an add) */
583 static const
584 struct processor_costs i386_cost = { /* 386 specific costs */
585 COSTS_N_INSNS (1), /* cost of an add instruction */
586 COSTS_N_INSNS (1), /* cost of a lea instruction */
587 COSTS_N_INSNS (3), /* variable shift costs */
588 COSTS_N_INSNS (2), /* constant shift costs */
589 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
590 COSTS_N_INSNS (6), /* HI */
591 COSTS_N_INSNS (6), /* SI */
592 COSTS_N_INSNS (6), /* DI */
593 COSTS_N_INSNS (6)}, /* other */
594 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
595 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
596 COSTS_N_INSNS (23), /* HI */
597 COSTS_N_INSNS (23), /* SI */
598 COSTS_N_INSNS (23), /* DI */
599 COSTS_N_INSNS (23)}, /* other */
600 COSTS_N_INSNS (3), /* cost of movsx */
601 COSTS_N_INSNS (2), /* cost of movzx */
602 15, /* "large" insn */
603 3, /* MOVE_RATIO */
604 4, /* cost for loading QImode using movzbl */
605 {2, 4, 2}, /* cost of loading integer registers
606 in QImode, HImode and SImode.
607 Relative to reg-reg move (2). */
608 {2, 4, 2}, /* cost of storing integer registers */
609 2, /* cost of reg,reg fld/fst */
610 {8, 8, 8}, /* cost of loading fp registers
611 in SFmode, DFmode and XFmode */
612 {8, 8, 8}, /* cost of storing fp registers
613 in SFmode, DFmode and XFmode */
614 2, /* cost of moving MMX register */
615 {4, 8}, /* cost of loading MMX registers
616 in SImode and DImode */
617 {4, 8}, /* cost of storing MMX registers
618 in SImode and DImode */
619 2, /* cost of moving SSE register */
620 {4, 8, 16}, /* cost of loading SSE registers
621 in SImode, DImode and TImode */
622 {4, 8, 16}, /* cost of storing SSE registers
623 in SImode, DImode and TImode */
624 3, /* MMX or SSE register to integer */
625 0, /* size of l1 cache */
626 0, /* size of l2 cache */
627 0, /* size of prefetch block */
628 0, /* number of parallel prefetches */
629 1, /* Branch cost */
630 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
631 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
632 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
633 COSTS_N_INSNS (22), /* cost of FABS instruction. */
634 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
635 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
636 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
637 DUMMY_STRINGOP_ALGS},
638 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
639 DUMMY_STRINGOP_ALGS},
640 1, /* scalar_stmt_cost. */
641 1, /* scalar load_cost. */
642 1, /* scalar_store_cost. */
643 1, /* vec_stmt_cost. */
644 1, /* vec_to_scalar_cost. */
645 1, /* scalar_to_vec_cost. */
646 1, /* vec_align_load_cost. */
647 2, /* vec_unalign_load_cost. */
648 1, /* vec_store_cost. */
649 3, /* cond_taken_branch_cost. */
650 1, /* cond_not_taken_branch_cost. */
651 };
652
653 static const
654 struct processor_costs i486_cost = { /* 486 specific costs */
655 COSTS_N_INSNS (1), /* cost of an add instruction */
656 COSTS_N_INSNS (1), /* cost of a lea instruction */
657 COSTS_N_INSNS (3), /* variable shift costs */
658 COSTS_N_INSNS (2), /* constant shift costs */
659 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
660 COSTS_N_INSNS (12), /* HI */
661 COSTS_N_INSNS (12), /* SI */
662 COSTS_N_INSNS (12), /* DI */
663 COSTS_N_INSNS (12)}, /* other */
664 1, /* cost of multiply per each bit set */
665 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
666 COSTS_N_INSNS (40), /* HI */
667 COSTS_N_INSNS (40), /* SI */
668 COSTS_N_INSNS (40), /* DI */
669 COSTS_N_INSNS (40)}, /* other */
670 COSTS_N_INSNS (3), /* cost of movsx */
671 COSTS_N_INSNS (2), /* cost of movzx */
672 15, /* "large" insn */
673 3, /* MOVE_RATIO */
674 4, /* cost for loading QImode using movzbl */
675 {2, 4, 2}, /* cost of loading integer registers
676 in QImode, HImode and SImode.
677 Relative to reg-reg move (2). */
678 {2, 4, 2}, /* cost of storing integer registers */
679 2, /* cost of reg,reg fld/fst */
680 {8, 8, 8}, /* cost of loading fp registers
681 in SFmode, DFmode and XFmode */
682 {8, 8, 8}, /* cost of storing fp registers
683 in SFmode, DFmode and XFmode */
684 2, /* cost of moving MMX register */
685 {4, 8}, /* cost of loading MMX registers
686 in SImode and DImode */
687 {4, 8}, /* cost of storing MMX registers
688 in SImode and DImode */
689 2, /* cost of moving SSE register */
690 {4, 8, 16}, /* cost of loading SSE registers
691 in SImode, DImode and TImode */
692 {4, 8, 16}, /* cost of storing SSE registers
693 in SImode, DImode and TImode */
694 3, /* MMX or SSE register to integer */
695 4, /* size of l1 cache. 486 has 8kB cache
696 shared for code and data, so 4kB is
697 not really precise. */
698 4, /* size of l2 cache */
699 0, /* size of prefetch block */
700 0, /* number of parallel prefetches */
701 1, /* Branch cost */
702 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
703 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
704 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
705 COSTS_N_INSNS (3), /* cost of FABS instruction. */
706 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
707 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
708 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
709 DUMMY_STRINGOP_ALGS},
710 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
711 DUMMY_STRINGOP_ALGS},
712 1, /* scalar_stmt_cost. */
713 1, /* scalar load_cost. */
714 1, /* scalar_store_cost. */
715 1, /* vec_stmt_cost. */
716 1, /* vec_to_scalar_cost. */
717 1, /* scalar_to_vec_cost. */
718 1, /* vec_align_load_cost. */
719 2, /* vec_unalign_load_cost. */
720 1, /* vec_store_cost. */
721 3, /* cond_taken_branch_cost. */
722 1, /* cond_not_taken_branch_cost. */
723 };
724
725 static const
726 struct processor_costs pentium_cost = {
727 COSTS_N_INSNS (1), /* cost of an add instruction */
728 COSTS_N_INSNS (1), /* cost of a lea instruction */
729 COSTS_N_INSNS (4), /* variable shift costs */
730 COSTS_N_INSNS (1), /* constant shift costs */
731 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
732 COSTS_N_INSNS (11), /* HI */
733 COSTS_N_INSNS (11), /* SI */
734 COSTS_N_INSNS (11), /* DI */
735 COSTS_N_INSNS (11)}, /* other */
736 0, /* cost of multiply per each bit set */
737 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
738 COSTS_N_INSNS (25), /* HI */
739 COSTS_N_INSNS (25), /* SI */
740 COSTS_N_INSNS (25), /* DI */
741 COSTS_N_INSNS (25)}, /* other */
742 COSTS_N_INSNS (3), /* cost of movsx */
743 COSTS_N_INSNS (2), /* cost of movzx */
744 8, /* "large" insn */
745 6, /* MOVE_RATIO */
746 6, /* cost for loading QImode using movzbl */
747 {2, 4, 2}, /* cost of loading integer registers
748 in QImode, HImode and SImode.
749 Relative to reg-reg move (2). */
750 {2, 4, 2}, /* cost of storing integer registers */
751 2, /* cost of reg,reg fld/fst */
752 {2, 2, 6}, /* cost of loading fp registers
753 in SFmode, DFmode and XFmode */
754 {4, 4, 6}, /* cost of storing fp registers
755 in SFmode, DFmode and XFmode */
756 8, /* cost of moving MMX register */
757 {8, 8}, /* cost of loading MMX registers
758 in SImode and DImode */
759 {8, 8}, /* cost of storing MMX registers
760 in SImode and DImode */
761 2, /* cost of moving SSE register */
762 {4, 8, 16}, /* cost of loading SSE registers
763 in SImode, DImode and TImode */
764 {4, 8, 16}, /* cost of storing SSE registers
765 in SImode, DImode and TImode */
766 3, /* MMX or SSE register to integer */
767 8, /* size of l1 cache. */
768 8, /* size of l2 cache */
769 0, /* size of prefetch block */
770 0, /* number of parallel prefetches */
771 2, /* Branch cost */
772 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
773 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
774 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
775 COSTS_N_INSNS (1), /* cost of FABS instruction. */
776 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
777 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
778 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
779 DUMMY_STRINGOP_ALGS},
780 {{libcall, {{-1, rep_prefix_4_byte}}},
781 DUMMY_STRINGOP_ALGS},
782 1, /* scalar_stmt_cost. */
783 1, /* scalar load_cost. */
784 1, /* scalar_store_cost. */
785 1, /* vec_stmt_cost. */
786 1, /* vec_to_scalar_cost. */
787 1, /* scalar_to_vec_cost. */
788 1, /* vec_align_load_cost. */
789 2, /* vec_unalign_load_cost. */
790 1, /* vec_store_cost. */
791 3, /* cond_taken_branch_cost. */
792 1, /* cond_not_taken_branch_cost. */
793 };
794
795 static const
796 struct processor_costs pentiumpro_cost = {
797 COSTS_N_INSNS (1), /* cost of an add instruction */
798 COSTS_N_INSNS (1), /* cost of a lea instruction */
799 COSTS_N_INSNS (1), /* variable shift costs */
800 COSTS_N_INSNS (1), /* constant shift costs */
801 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
802 COSTS_N_INSNS (4), /* HI */
803 COSTS_N_INSNS (4), /* SI */
804 COSTS_N_INSNS (4), /* DI */
805 COSTS_N_INSNS (4)}, /* other */
806 0, /* cost of multiply per each bit set */
807 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
808 COSTS_N_INSNS (17), /* HI */
809 COSTS_N_INSNS (17), /* SI */
810 COSTS_N_INSNS (17), /* DI */
811 COSTS_N_INSNS (17)}, /* other */
812 COSTS_N_INSNS (1), /* cost of movsx */
813 COSTS_N_INSNS (1), /* cost of movzx */
814 8, /* "large" insn */
815 6, /* MOVE_RATIO */
816 2, /* cost for loading QImode using movzbl */
817 {4, 4, 4}, /* cost of loading integer registers
818 in QImode, HImode and SImode.
819 Relative to reg-reg move (2). */
820 {2, 2, 2}, /* cost of storing integer registers */
821 2, /* cost of reg,reg fld/fst */
822 {2, 2, 6}, /* cost of loading fp registers
823 in SFmode, DFmode and XFmode */
824 {4, 4, 6}, /* cost of storing fp registers
825 in SFmode, DFmode and XFmode */
826 2, /* cost of moving MMX register */
827 {2, 2}, /* cost of loading MMX registers
828 in SImode and DImode */
829 {2, 2}, /* cost of storing MMX registers
830 in SImode and DImode */
831 2, /* cost of moving SSE register */
832 {2, 2, 8}, /* cost of loading SSE registers
833 in SImode, DImode and TImode */
834 {2, 2, 8}, /* cost of storing SSE registers
835 in SImode, DImode and TImode */
836 3, /* MMX or SSE register to integer */
837 8, /* size of l1 cache. */
838 256, /* size of l2 cache */
839 32, /* size of prefetch block */
840 6, /* number of parallel prefetches */
841 2, /* Branch cost */
842 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
843 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
844 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
845 COSTS_N_INSNS (2), /* cost of FABS instruction. */
846 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
847 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
848 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
849 (we ensure the alignment). For small blocks an inline loop is still a
850 noticeable win; for bigger blocks either rep movsl or rep movsb is the
851 way to go. Rep movsb apparently has a more expensive startup time in the
852 CPU, but after 4K the difference is down in the noise. */
853 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
854 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
855 DUMMY_STRINGOP_ALGS},
856 {{rep_prefix_4_byte, {{1024, unrolled_loop},
857 {8192, rep_prefix_4_byte}, {-1, libcall}}},
858 DUMMY_STRINGOP_ALGS},
859 1, /* scalar_stmt_cost. */
860 1, /* scalar load_cost. */
861 1, /* scalar_store_cost. */
862 1, /* vec_stmt_cost. */
863 1, /* vec_to_scalar_cost. */
864 1, /* scalar_to_vec_cost. */
865 1, /* vec_align_load_cost. */
866 2, /* vec_unalign_load_cost. */
867 1, /* vec_store_cost. */
868 3, /* cond_taken_branch_cost. */
869 1, /* cond_not_taken_branch_cost. */
870 };
871
872 static const
873 struct processor_costs geode_cost = {
874 COSTS_N_INSNS (1), /* cost of an add instruction */
875 COSTS_N_INSNS (1), /* cost of a lea instruction */
876 COSTS_N_INSNS (2), /* variable shift costs */
877 COSTS_N_INSNS (1), /* constant shift costs */
878 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
879 COSTS_N_INSNS (4), /* HI */
880 COSTS_N_INSNS (7), /* SI */
881 COSTS_N_INSNS (7), /* DI */
882 COSTS_N_INSNS (7)}, /* other */
883 0, /* cost of multiply per each bit set */
884 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
885 COSTS_N_INSNS (23), /* HI */
886 COSTS_N_INSNS (39), /* SI */
887 COSTS_N_INSNS (39), /* DI */
888 COSTS_N_INSNS (39)}, /* other */
889 COSTS_N_INSNS (1), /* cost of movsx */
890 COSTS_N_INSNS (1), /* cost of movzx */
891 8, /* "large" insn */
892 4, /* MOVE_RATIO */
893 1, /* cost for loading QImode using movzbl */
894 {1, 1, 1}, /* cost of loading integer registers
895 in QImode, HImode and SImode.
896 Relative to reg-reg move (2). */
897 {1, 1, 1}, /* cost of storing integer registers */
898 1, /* cost of reg,reg fld/fst */
899 {1, 1, 1}, /* cost of loading fp registers
900 in SFmode, DFmode and XFmode */
901 {4, 6, 6}, /* cost of storing fp registers
902 in SFmode, DFmode and XFmode */
903
904 1, /* cost of moving MMX register */
905 {1, 1}, /* cost of loading MMX registers
906 in SImode and DImode */
907 {1, 1}, /* cost of storing MMX registers
908 in SImode and DImode */
909 1, /* cost of moving SSE register */
910 {1, 1, 1}, /* cost of loading SSE registers
911 in SImode, DImode and TImode */
912 {1, 1, 1}, /* cost of storing SSE registers
913 in SImode, DImode and TImode */
914 1, /* MMX or SSE register to integer */
915 64, /* size of l1 cache. */
916 128, /* size of l2 cache. */
917 32, /* size of prefetch block */
918 1, /* number of parallel prefetches */
919 1, /* Branch cost */
920 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
921 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
922 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
923 COSTS_N_INSNS (1), /* cost of FABS instruction. */
924 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
925 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
926 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
927 DUMMY_STRINGOP_ALGS},
928 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
929 DUMMY_STRINGOP_ALGS},
930 1, /* scalar_stmt_cost. */
931 1, /* scalar load_cost. */
932 1, /* scalar_store_cost. */
933 1, /* vec_stmt_cost. */
934 1, /* vec_to_scalar_cost. */
935 1, /* scalar_to_vec_cost. */
936 1, /* vec_align_load_cost. */
937 2, /* vec_unalign_load_cost. */
938 1, /* vec_store_cost. */
939 3, /* cond_taken_branch_cost. */
940 1, /* cond_not_taken_branch_cost. */
941 };
942
943 static const
944 struct processor_costs k6_cost = {
945 COSTS_N_INSNS (1), /* cost of an add instruction */
946 COSTS_N_INSNS (2), /* cost of a lea instruction */
947 COSTS_N_INSNS (1), /* variable shift costs */
948 COSTS_N_INSNS (1), /* constant shift costs */
949 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
950 COSTS_N_INSNS (3), /* HI */
951 COSTS_N_INSNS (3), /* SI */
952 COSTS_N_INSNS (3), /* DI */
953 COSTS_N_INSNS (3)}, /* other */
954 0, /* cost of multiply per each bit set */
955 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
956 COSTS_N_INSNS (18), /* HI */
957 COSTS_N_INSNS (18), /* SI */
958 COSTS_N_INSNS (18), /* DI */
959 COSTS_N_INSNS (18)}, /* other */
960 COSTS_N_INSNS (2), /* cost of movsx */
961 COSTS_N_INSNS (2), /* cost of movzx */
962 8, /* "large" insn */
963 4, /* MOVE_RATIO */
964 3, /* cost for loading QImode using movzbl */
965 {4, 5, 4}, /* cost of loading integer registers
966 in QImode, HImode and SImode.
967 Relative to reg-reg move (2). */
968 {2, 3, 2}, /* cost of storing integer registers */
969 4, /* cost of reg,reg fld/fst */
970 {6, 6, 6}, /* cost of loading fp registers
971 in SFmode, DFmode and XFmode */
972 {4, 4, 4}, /* cost of storing fp registers
973 in SFmode, DFmode and XFmode */
974 2, /* cost of moving MMX register */
975 {2, 2}, /* cost of loading MMX registers
976 in SImode and DImode */
977 {2, 2}, /* cost of storing MMX registers
978 in SImode and DImode */
979 2, /* cost of moving SSE register */
980 {2, 2, 8}, /* cost of loading SSE registers
981 in SImode, DImode and TImode */
982 {2, 2, 8}, /* cost of storing SSE registers
983 in SImode, DImode and TImode */
984 6, /* MMX or SSE register to integer */
985 32, /* size of l1 cache. */
986 32, /* size of l2 cache. Some models
987 have integrated l2 cache, but
988 optimizing for k6 is not important
989 enough to worry about that. */
990 32, /* size of prefetch block */
991 1, /* number of parallel prefetches */
992 1, /* Branch cost */
993 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
994 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
995 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
996 COSTS_N_INSNS (2), /* cost of FABS instruction. */
997 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
998 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
999 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1000 DUMMY_STRINGOP_ALGS},
1001 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1002 DUMMY_STRINGOP_ALGS},
1003 1, /* scalar_stmt_cost. */
1004 1, /* scalar load_cost. */
1005 1, /* scalar_store_cost. */
1006 1, /* vec_stmt_cost. */
1007 1, /* vec_to_scalar_cost. */
1008 1, /* scalar_to_vec_cost. */
1009 1, /* vec_align_load_cost. */
1010 2, /* vec_unalign_load_cost. */
1011 1, /* vec_store_cost. */
1012 3, /* cond_taken_branch_cost. */
1013 1, /* cond_not_taken_branch_cost. */
1014 };
1015
1016 static const
1017 struct processor_costs athlon_cost = {
1018 COSTS_N_INSNS (1), /* cost of an add instruction */
1019 COSTS_N_INSNS (2), /* cost of a lea instruction */
1020 COSTS_N_INSNS (1), /* variable shift costs */
1021 COSTS_N_INSNS (1), /* constant shift costs */
1022 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1023 COSTS_N_INSNS (5), /* HI */
1024 COSTS_N_INSNS (5), /* SI */
1025 COSTS_N_INSNS (5), /* DI */
1026 COSTS_N_INSNS (5)}, /* other */
1027 0, /* cost of multiply per each bit set */
1028 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1029 COSTS_N_INSNS (26), /* HI */
1030 COSTS_N_INSNS (42), /* SI */
1031 COSTS_N_INSNS (74), /* DI */
1032 COSTS_N_INSNS (74)}, /* other */
1033 COSTS_N_INSNS (1), /* cost of movsx */
1034 COSTS_N_INSNS (1), /* cost of movzx */
1035 8, /* "large" insn */
1036 9, /* MOVE_RATIO */
1037 4, /* cost for loading QImode using movzbl */
1038 {3, 4, 3}, /* cost of loading integer registers
1039 in QImode, HImode and SImode.
1040 Relative to reg-reg move (2). */
1041 {3, 4, 3}, /* cost of storing integer registers */
1042 4, /* cost of reg,reg fld/fst */
1043 {4, 4, 12}, /* cost of loading fp registers
1044 in SFmode, DFmode and XFmode */
1045 {6, 6, 8}, /* cost of storing fp registers
1046 in SFmode, DFmode and XFmode */
1047 2, /* cost of moving MMX register */
1048 {4, 4}, /* cost of loading MMX registers
1049 in SImode and DImode */
1050 {4, 4}, /* cost of storing MMX registers
1051 in SImode and DImode */
1052 2, /* cost of moving SSE register */
1053 {4, 4, 6}, /* cost of loading SSE registers
1054 in SImode, DImode and TImode */
1055 {4, 4, 5}, /* cost of storing SSE registers
1056 in SImode, DImode and TImode */
1057 5, /* MMX or SSE register to integer */
1058 64, /* size of l1 cache. */
1059 256, /* size of l2 cache. */
1060 64, /* size of prefetch block */
1061 6, /* number of parallel prefetches */
1062 5, /* Branch cost */
1063 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1064 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1065 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1066 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1067 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1068 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1069 /* For some reason, Athlon deals better with the REP prefix (relative to loops)
1070 compared to K8. Alignment becomes important after 8 bytes for memcpy and
1071 128 bytes for memset. */
1072 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1073 DUMMY_STRINGOP_ALGS},
1074 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1075 DUMMY_STRINGOP_ALGS},
1076 1, /* scalar_stmt_cost. */
1077 1, /* scalar load_cost. */
1078 1, /* scalar_store_cost. */
1079 1, /* vec_stmt_cost. */
1080 1, /* vec_to_scalar_cost. */
1081 1, /* scalar_to_vec_cost. */
1082 1, /* vec_align_load_cost. */
1083 2, /* vec_unalign_load_cost. */
1084 1, /* vec_store_cost. */
1085 3, /* cond_taken_branch_cost. */
1086 1, /* cond_not_taken_branch_cost. */
1087 };
1088
1089 static const
1090 struct processor_costs k8_cost = {
1091 COSTS_N_INSNS (1), /* cost of an add instruction */
1092 COSTS_N_INSNS (2), /* cost of a lea instruction */
1093 COSTS_N_INSNS (1), /* variable shift costs */
1094 COSTS_N_INSNS (1), /* constant shift costs */
1095 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1096 COSTS_N_INSNS (4), /* HI */
1097 COSTS_N_INSNS (3), /* SI */
1098 COSTS_N_INSNS (4), /* DI */
1099 COSTS_N_INSNS (5)}, /* other */
1100 0, /* cost of multiply per each bit set */
1101 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1102 COSTS_N_INSNS (26), /* HI */
1103 COSTS_N_INSNS (42), /* SI */
1104 COSTS_N_INSNS (74), /* DI */
1105 COSTS_N_INSNS (74)}, /* other */
1106 COSTS_N_INSNS (1), /* cost of movsx */
1107 COSTS_N_INSNS (1), /* cost of movzx */
1108 8, /* "large" insn */
1109 9, /* MOVE_RATIO */
1110 4, /* cost for loading QImode using movzbl */
1111 {3, 4, 3}, /* cost of loading integer registers
1112 in QImode, HImode and SImode.
1113 Relative to reg-reg move (2). */
1114 {3, 4, 3}, /* cost of storing integer registers */
1115 4, /* cost of reg,reg fld/fst */
1116 {4, 4, 12}, /* cost of loading fp registers
1117 in SFmode, DFmode and XFmode */
1118 {6, 6, 8}, /* cost of storing fp registers
1119 in SFmode, DFmode and XFmode */
1120 2, /* cost of moving MMX register */
1121 {3, 3}, /* cost of loading MMX registers
1122 in SImode and DImode */
1123 {4, 4}, /* cost of storing MMX registers
1124 in SImode and DImode */
1125 2, /* cost of moving SSE register */
1126 {4, 3, 6}, /* cost of loading SSE registers
1127 in SImode, DImode and TImode */
1128 {4, 4, 5}, /* cost of storing SSE registers
1129 in SImode, DImode and TImode */
1130 5, /* MMX or SSE register to integer */
1131 64, /* size of l1 cache. */
1132 512, /* size of l2 cache. */
1133 64, /* size of prefetch block */
1134 /* New AMD processors never drop prefetches; if they cannot be performed
1135 immediately, they are queued. We set number of simultaneous prefetches
1136 to a large constant to reflect this (it probably is not a good idea not
1137 to limit number of prefetches at all, as their execution also takes some
1138 time). */
1139 100, /* number of parallel prefetches */
1140 3, /* Branch cost */
1141 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1142 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1143 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1144 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1145 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1146 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1147 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
1148 small blocks it is better to use a loop. For large blocks, a libcall can
1149 do nontemporal accesses and beat inline code considerably. */
1150 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1151 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1152 {{libcall, {{8, loop}, {24, unrolled_loop},
1153 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1154 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1155 4, /* scalar_stmt_cost. */
1156 2, /* scalar load_cost. */
1157 2, /* scalar_store_cost. */
1158 5, /* vec_stmt_cost. */
1159 0, /* vec_to_scalar_cost. */
1160 2, /* scalar_to_vec_cost. */
1161 2, /* vec_align_load_cost. */
1162 3, /* vec_unalign_load_cost. */
1163 3, /* vec_store_cost. */
1164 3, /* cond_taken_branch_cost. */
1165 2, /* cond_not_taken_branch_cost. */
1166 };
1167
1168 struct processor_costs amdfam10_cost = {
1169 COSTS_N_INSNS (1), /* cost of an add instruction */
1170 COSTS_N_INSNS (2), /* cost of a lea instruction */
1171 COSTS_N_INSNS (1), /* variable shift costs */
1172 COSTS_N_INSNS (1), /* constant shift costs */
1173 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1174 COSTS_N_INSNS (4), /* HI */
1175 COSTS_N_INSNS (3), /* SI */
1176 COSTS_N_INSNS (4), /* DI */
1177 COSTS_N_INSNS (5)}, /* other */
1178 0, /* cost of multiply per each bit set */
1179 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1180 COSTS_N_INSNS (35), /* HI */
1181 COSTS_N_INSNS (51), /* SI */
1182 COSTS_N_INSNS (83), /* DI */
1183 COSTS_N_INSNS (83)}, /* other */
1184 COSTS_N_INSNS (1), /* cost of movsx */
1185 COSTS_N_INSNS (1), /* cost of movzx */
1186 8, /* "large" insn */
1187 9, /* MOVE_RATIO */
1188 4, /* cost for loading QImode using movzbl */
1189 {3, 4, 3}, /* cost of loading integer registers
1190 in QImode, HImode and SImode.
1191 Relative to reg-reg move (2). */
1192 {3, 4, 3}, /* cost of storing integer registers */
1193 4, /* cost of reg,reg fld/fst */
1194 {4, 4, 12}, /* cost of loading fp registers
1195 in SFmode, DFmode and XFmode */
1196 {6, 6, 8}, /* cost of storing fp registers
1197 in SFmode, DFmode and XFmode */
1198 2, /* cost of moving MMX register */
1199 {3, 3}, /* cost of loading MMX registers
1200 in SImode and DImode */
1201 {4, 4}, /* cost of storing MMX registers
1202 in SImode and DImode */
1203 2, /* cost of moving SSE register */
1204 {4, 4, 3}, /* cost of loading SSE registers
1205 in SImode, DImode and TImode */
1206 {4, 4, 5}, /* cost of storing SSE registers
1207 in SImode, DImode and TImode */
1208 3, /* MMX or SSE register to integer */
1209 /* On K8:
1210 MOVD reg64, xmmreg Double FSTORE 4
1211 MOVD reg32, xmmreg Double FSTORE 4
1212 On AMDFAM10:
1213 MOVD reg64, xmmreg Double FADD 3
1214 1/1 1/1
1215 MOVD reg32, xmmreg Double FADD 3
1216 1/1 1/1 */
1217 64, /* size of l1 cache. */
1218 512, /* size of l2 cache. */
1219 64, /* size of prefetch block */
1220 /* New AMD processors never drop prefetches; if they cannot be performed
1221 immediately, they are queued. We set number of simultaneous prefetches
1222 to a large constant to reflect this (it probably is not a good idea not
1223 to limit number of prefetches at all, as their execution also takes some
1224 time). */
1225 100, /* number of parallel prefetches */
1226 2, /* Branch cost */
1227 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1228 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1229 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1230 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1231 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1232 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1233
1234 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
1235 very small blocks it is better to use a loop. For large blocks, a libcall can
1236 do nontemporal accesses and beat inline code considerably. */
1237 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1238 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1239 {{libcall, {{8, loop}, {24, unrolled_loop},
1240 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1241 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1242 4, /* scalar_stmt_cost. */
1243 2, /* scalar load_cost. */
1244 2, /* scalar_store_cost. */
1245 6, /* vec_stmt_cost. */
1246 0, /* vec_to_scalar_cost. */
1247 2, /* scalar_to_vec_cost. */
1248 2, /* vec_align_load_cost. */
1249 2, /* vec_unalign_load_cost. */
1250 2, /* vec_store_cost. */
1251 2, /* cond_taken_branch_cost. */
1252 1, /* cond_not_taken_branch_cost. */
1253 };
1254
1255 struct processor_costs bdver1_cost = {
1256 COSTS_N_INSNS (1), /* cost of an add instruction */
1257 COSTS_N_INSNS (1), /* cost of a lea instruction */
1258 COSTS_N_INSNS (1), /* variable shift costs */
1259 COSTS_N_INSNS (1), /* constant shift costs */
1260 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1261 COSTS_N_INSNS (4), /* HI */
1262 COSTS_N_INSNS (4), /* SI */
1263 COSTS_N_INSNS (6), /* DI */
1264 COSTS_N_INSNS (6)}, /* other */
1265 0, /* cost of multiply per each bit set */
1266 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1267 COSTS_N_INSNS (35), /* HI */
1268 COSTS_N_INSNS (51), /* SI */
1269 COSTS_N_INSNS (83), /* DI */
1270 COSTS_N_INSNS (83)}, /* other */
1271 COSTS_N_INSNS (1), /* cost of movsx */
1272 COSTS_N_INSNS (1), /* cost of movzx */
1273 8, /* "large" insn */
1274 9, /* MOVE_RATIO */
1275 4, /* cost for loading QImode using movzbl */
1276 {5, 5, 4}, /* cost of loading integer registers
1277 in QImode, HImode and SImode.
1278 Relative to reg-reg move (2). */
1279 {4, 4, 4}, /* cost of storing integer registers */
1280 2, /* cost of reg,reg fld/fst */
1281 {5, 5, 12}, /* cost of loading fp registers
1282 in SFmode, DFmode and XFmode */
1283 {4, 4, 8}, /* cost of storing fp registers
1284 in SFmode, DFmode and XFmode */
1285 2, /* cost of moving MMX register */
1286 {4, 4}, /* cost of loading MMX registers
1287 in SImode and DImode */
1288 {4, 4}, /* cost of storing MMX registers
1289 in SImode and DImode */
1290 2, /* cost of moving SSE register */
1291 {4, 4, 4}, /* cost of loading SSE registers
1292 in SImode, DImode and TImode */
1293 {4, 4, 4}, /* cost of storing SSE registers
1294 in SImode, DImode and TImode */
1295 2, /* MMX or SSE register to integer */
1296 /* On K8:
1297 MOVD reg64, xmmreg Double FSTORE 4
1298 MOVD reg32, xmmreg Double FSTORE 4
1299 On AMDFAM10:
1300 MOVD reg64, xmmreg Double FADD 3
1301 1/1 1/1
1302 MOVD reg32, xmmreg Double FADD 3
1303 1/1 1/1 */
1304 16, /* size of l1 cache. */
1305 2048, /* size of l2 cache. */
1306 64, /* size of prefetch block */
1307 /* New AMD processors never drop prefetches; if they cannot be performed
1308 immediately, they are queued. We set number of simultaneous prefetches
1309 to a large constant to reflect this (it probably is not a good idea not
1310 to limit number of prefetches at all, as their execution also takes some
1311 time). */
1312 100, /* number of parallel prefetches */
1313 2, /* Branch cost */
1314 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1315 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1316 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1317 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1318 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1319 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1320
1321 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
1322 very small blocks it is better to use a loop. For large blocks, a libcall
1323 can do nontemporal accesses and beat inline code considerably. */
1324 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1325 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1326 {{libcall, {{8, loop}, {24, unrolled_loop},
1327 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1328 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1329 6, /* scalar_stmt_cost. */
1330 4, /* scalar load_cost. */
1331 4, /* scalar_store_cost. */
1332 6, /* vec_stmt_cost. */
1333 0, /* vec_to_scalar_cost. */
1334 2, /* scalar_to_vec_cost. */
1335 4, /* vec_align_load_cost. */
1336 4, /* vec_unalign_load_cost. */
1337 4, /* vec_store_cost. */
1338 2, /* cond_taken_branch_cost. */
1339 1, /* cond_not_taken_branch_cost. */
1340 };
1341
1342 struct processor_costs bdver2_cost = {
1343 COSTS_N_INSNS (1), /* cost of an add instruction */
1344 COSTS_N_INSNS (1), /* cost of a lea instruction */
1345 COSTS_N_INSNS (1), /* variable shift costs */
1346 COSTS_N_INSNS (1), /* constant shift costs */
1347 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1348 COSTS_N_INSNS (4), /* HI */
1349 COSTS_N_INSNS (4), /* SI */
1350 COSTS_N_INSNS (6), /* DI */
1351 COSTS_N_INSNS (6)}, /* other */
1352 0, /* cost of multiply per each bit set */
1353 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1354 COSTS_N_INSNS (35), /* HI */
1355 COSTS_N_INSNS (51), /* SI */
1356 COSTS_N_INSNS (83), /* DI */
1357 COSTS_N_INSNS (83)}, /* other */
1358 COSTS_N_INSNS (1), /* cost of movsx */
1359 COSTS_N_INSNS (1), /* cost of movzx */
1360 8, /* "large" insn */
1361 9, /* MOVE_RATIO */
1362 4, /* cost for loading QImode using movzbl */
1363 {5, 5, 4}, /* cost of loading integer registers
1364 in QImode, HImode and SImode.
1365 Relative to reg-reg move (2). */
1366 {4, 4, 4}, /* cost of storing integer registers */
1367 2, /* cost of reg,reg fld/fst */
1368 {5, 5, 12}, /* cost of loading fp registers
1369 in SFmode, DFmode and XFmode */
1370 {4, 4, 8}, /* cost of storing fp registers
1371 in SFmode, DFmode and XFmode */
1372 2, /* cost of moving MMX register */
1373 {4, 4}, /* cost of loading MMX registers
1374 in SImode and DImode */
1375 {4, 4}, /* cost of storing MMX registers
1376 in SImode and DImode */
1377 2, /* cost of moving SSE register */
1378 {4, 4, 4}, /* cost of loading SSE registers
1379 in SImode, DImode and TImode */
1380 {4, 4, 4}, /* cost of storing SSE registers
1381 in SImode, DImode and TImode */
1382 2, /* MMX or SSE register to integer */
1383 /* On K8:
1384 MOVD reg64, xmmreg Double FSTORE 4
1385 MOVD reg32, xmmreg Double FSTORE 4
1386 On AMDFAM10:
1387 MOVD reg64, xmmreg Double FADD 3
1388 1/1 1/1
1389 MOVD reg32, xmmreg Double FADD 3
1390 1/1 1/1 */
1391 16, /* size of l1 cache. */
1392 2048, /* size of l2 cache. */
1393 64, /* size of prefetch block */
1394 /* New AMD processors never drop prefetches; if they cannot be performed
1395 immediately, they are queued. We set number of simultaneous prefetches
1396 to a large constant to reflect this (it probably is not a good idea not
1397 to limit number of prefetches at all, as their execution also takes some
1398 time). */
1399 100, /* number of parallel prefetches */
1400 2, /* Branch cost */
1401 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1402 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1403 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1404 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1405 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1406 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1407
1408 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1409 very small blocks it is better to use a loop. For large blocks, a libcall
1410 can do nontemporal accesses and beat inline code considerably. */
1411 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1412 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1413 {{libcall, {{8, loop}, {24, unrolled_loop},
1414 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1415 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1416 6, /* scalar_stmt_cost. */
1417 4, /* scalar load_cost. */
1418 4, /* scalar_store_cost. */
1419 6, /* vec_stmt_cost. */
1420 0, /* vec_to_scalar_cost. */
1421 2, /* scalar_to_vec_cost. */
1422 4, /* vec_align_load_cost. */
1423 4, /* vec_unalign_load_cost. */
1424 4, /* vec_store_cost. */
1425 2, /* cond_taken_branch_cost. */
1426 1, /* cond_not_taken_branch_cost. */
1427 };
1428
1429 struct processor_costs btver1_cost = {
1430 COSTS_N_INSNS (1), /* cost of an add instruction */
1431 COSTS_N_INSNS (2), /* cost of a lea instruction */
1432 COSTS_N_INSNS (1), /* variable shift costs */
1433 COSTS_N_INSNS (1), /* constant shift costs */
1434 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1435 COSTS_N_INSNS (4), /* HI */
1436 COSTS_N_INSNS (3), /* SI */
1437 COSTS_N_INSNS (4), /* DI */
1438 COSTS_N_INSNS (5)}, /* other */
1439 0, /* cost of multiply per each bit set */
1440 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1441 COSTS_N_INSNS (35), /* HI */
1442 COSTS_N_INSNS (51), /* SI */
1443 COSTS_N_INSNS (83), /* DI */
1444 COSTS_N_INSNS (83)}, /* other */
1445 COSTS_N_INSNS (1), /* cost of movsx */
1446 COSTS_N_INSNS (1), /* cost of movzx */
1447 8, /* "large" insn */
1448 9, /* MOVE_RATIO */
1449 4, /* cost for loading QImode using movzbl */
1450 {3, 4, 3}, /* cost of loading integer registers
1451 in QImode, HImode and SImode.
1452 Relative to reg-reg move (2). */
1453 {3, 4, 3}, /* cost of storing integer registers */
1454 4, /* cost of reg,reg fld/fst */
1455 {4, 4, 12}, /* cost of loading fp registers
1456 in SFmode, DFmode and XFmode */
1457 {6, 6, 8}, /* cost of storing fp registers
1458 in SFmode, DFmode and XFmode */
1459 2, /* cost of moving MMX register */
1460 {3, 3}, /* cost of loading MMX registers
1461 in SImode and DImode */
1462 {4, 4}, /* cost of storing MMX registers
1463 in SImode and DImode */
1464 2, /* cost of moving SSE register */
1465 {4, 4, 3}, /* cost of loading SSE registers
1466 in SImode, DImode and TImode */
1467 {4, 4, 5}, /* cost of storing SSE registers
1468 in SImode, DImode and TImode */
1469 3, /* MMX or SSE register to integer */
1470 /* On K8:
1471 MOVD reg64, xmmreg Double FSTORE 4
1472 MOVD reg32, xmmreg Double FSTORE 4
1473 On AMDFAM10:
1474 MOVD reg64, xmmreg Double FADD 3
1475 1/1 1/1
1476 MOVD reg32, xmmreg Double FADD 3
1477 1/1 1/1 */
1478 32, /* size of l1 cache. */
1479 512, /* size of l2 cache. */
1480 64, /* size of prefetch block */
1481 100, /* number of parallel prefetches */
1482 2, /* Branch cost */
1483 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1484 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1485 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1486 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1487 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1488 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1489
1490 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1491 very small blocks it is better to use a loop. For large blocks, a libcall can
1492 do nontemporal accesses and beat inline code considerably. */
1493 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1494 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1495 {{libcall, {{8, loop}, {24, unrolled_loop},
1496 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1497 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1498 4, /* scalar_stmt_cost. */
1499 2, /* scalar load_cost. */
1500 2, /* scalar_store_cost. */
1501 6, /* vec_stmt_cost. */
1502 0, /* vec_to_scalar_cost. */
1503 2, /* scalar_to_vec_cost. */
1504 2, /* vec_align_load_cost. */
1505 2, /* vec_unalign_load_cost. */
1506 2, /* vec_store_cost. */
1507 2, /* cond_taken_branch_cost. */
1508 1, /* cond_not_taken_branch_cost. */
1509 };
1510
1511 static const
1512 struct processor_costs pentium4_cost = {
1513 COSTS_N_INSNS (1), /* cost of an add instruction */
1514 COSTS_N_INSNS (3), /* cost of a lea instruction */
1515 COSTS_N_INSNS (4), /* variable shift costs */
1516 COSTS_N_INSNS (4), /* constant shift costs */
1517 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1518 COSTS_N_INSNS (15), /* HI */
1519 COSTS_N_INSNS (15), /* SI */
1520 COSTS_N_INSNS (15), /* DI */
1521 COSTS_N_INSNS (15)}, /* other */
1522 0, /* cost of multiply per each bit set */
1523 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1524 COSTS_N_INSNS (56), /* HI */
1525 COSTS_N_INSNS (56), /* SI */
1526 COSTS_N_INSNS (56), /* DI */
1527 COSTS_N_INSNS (56)}, /* other */
1528 COSTS_N_INSNS (1), /* cost of movsx */
1529 COSTS_N_INSNS (1), /* cost of movzx */
1530 16, /* "large" insn */
1531 6, /* MOVE_RATIO */
1532 2, /* cost for loading QImode using movzbl */
1533 {4, 5, 4}, /* cost of loading integer registers
1534 in QImode, HImode and SImode.
1535 Relative to reg-reg move (2). */
1536 {2, 3, 2}, /* cost of storing integer registers */
1537 2, /* cost of reg,reg fld/fst */
1538 {2, 2, 6}, /* cost of loading fp registers
1539 in SFmode, DFmode and XFmode */
1540 {4, 4, 6}, /* cost of storing fp registers
1541 in SFmode, DFmode and XFmode */
1542 2, /* cost of moving MMX register */
1543 {2, 2}, /* cost of loading MMX registers
1544 in SImode and DImode */
1545 {2, 2}, /* cost of storing MMX registers
1546 in SImode and DImode */
1547 12, /* cost of moving SSE register */
1548 {12, 12, 12}, /* cost of loading SSE registers
1549 in SImode, DImode and TImode */
1550 {2, 2, 8}, /* cost of storing SSE registers
1551 in SImode, DImode and TImode */
1552 10, /* MMX or SSE register to integer */
1553 8, /* size of l1 cache. */
1554 256, /* size of l2 cache. */
1555 64, /* size of prefetch block */
1556 6, /* number of parallel prefetches */
1557 2, /* Branch cost */
1558 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1559 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1560 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1561 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1562 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1563 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1564 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1565 DUMMY_STRINGOP_ALGS},
1566 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1567 {-1, libcall}}},
1568 DUMMY_STRINGOP_ALGS},
1569 1, /* scalar_stmt_cost. */
1570 1, /* scalar load_cost. */
1571 1, /* scalar_store_cost. */
1572 1, /* vec_stmt_cost. */
1573 1, /* vec_to_scalar_cost. */
1574 1, /* scalar_to_vec_cost. */
1575 1, /* vec_align_load_cost. */
1576 2, /* vec_unalign_load_cost. */
1577 1, /* vec_store_cost. */
1578 3, /* cond_taken_branch_cost. */
1579 1, /* cond_not_taken_branch_cost. */
1580 };
1581
1582 static const
1583 struct processor_costs nocona_cost = {
1584 COSTS_N_INSNS (1), /* cost of an add instruction */
1585 COSTS_N_INSNS (1), /* cost of a lea instruction */
1586 COSTS_N_INSNS (1), /* variable shift costs */
1587 COSTS_N_INSNS (1), /* constant shift costs */
1588 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1589 COSTS_N_INSNS (10), /* HI */
1590 COSTS_N_INSNS (10), /* SI */
1591 COSTS_N_INSNS (10), /* DI */
1592 COSTS_N_INSNS (10)}, /* other */
1593 0, /* cost of multiply per each bit set */
1594 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1595 COSTS_N_INSNS (66), /* HI */
1596 COSTS_N_INSNS (66), /* SI */
1597 COSTS_N_INSNS (66), /* DI */
1598 COSTS_N_INSNS (66)}, /* other */
1599 COSTS_N_INSNS (1), /* cost of movsx */
1600 COSTS_N_INSNS (1), /* cost of movzx */
1601 16, /* "large" insn */
1602 17, /* MOVE_RATIO */
1603 4, /* cost for loading QImode using movzbl */
1604 {4, 4, 4}, /* cost of loading integer registers
1605 in QImode, HImode and SImode.
1606 Relative to reg-reg move (2). */
1607 {4, 4, 4}, /* cost of storing integer registers */
1608 3, /* cost of reg,reg fld/fst */
1609 {12, 12, 12}, /* cost of loading fp registers
1610 in SFmode, DFmode and XFmode */
1611 {4, 4, 4}, /* cost of storing fp registers
1612 in SFmode, DFmode and XFmode */
1613 6, /* cost of moving MMX register */
1614 {12, 12}, /* cost of loading MMX registers
1615 in SImode and DImode */
1616 {12, 12}, /* cost of storing MMX registers
1617 in SImode and DImode */
1618 6, /* cost of moving SSE register */
1619 {12, 12, 12}, /* cost of loading SSE registers
1620 in SImode, DImode and TImode */
1621 {12, 12, 12}, /* cost of storing SSE registers
1622 in SImode, DImode and TImode */
1623 8, /* MMX or SSE register to integer */
1624 8, /* size of l1 cache. */
1625 1024, /* size of l2 cache. */
1626 128, /* size of prefetch block */
1627 8, /* number of parallel prefetches */
1628 1, /* Branch cost */
1629 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1630 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1631 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1632 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1633 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1634 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1635 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1636 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1637 {100000, unrolled_loop}, {-1, libcall}}}},
1638 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1639 {-1, libcall}}},
1640 {libcall, {{24, loop}, {64, unrolled_loop},
1641 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1642 1, /* scalar_stmt_cost. */
1643 1, /* scalar load_cost. */
1644 1, /* scalar_store_cost. */
1645 1, /* vec_stmt_cost. */
1646 1, /* vec_to_scalar_cost. */
1647 1, /* scalar_to_vec_cost. */
1648 1, /* vec_align_load_cost. */
1649 2, /* vec_unalign_load_cost. */
1650 1, /* vec_store_cost. */
1651 3, /* cond_taken_branch_cost. */
1652 1, /* cond_not_taken_branch_cost. */
1653 };
1654
1655 static const
1656 struct processor_costs atom_cost = {
1657 COSTS_N_INSNS (1), /* cost of an add instruction */
1658 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1659 COSTS_N_INSNS (1), /* variable shift costs */
1660 COSTS_N_INSNS (1), /* constant shift costs */
1661 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1662 COSTS_N_INSNS (4), /* HI */
1663 COSTS_N_INSNS (3), /* SI */
1664 COSTS_N_INSNS (4), /* DI */
1665 COSTS_N_INSNS (2)}, /* other */
1666 0, /* cost of multiply per each bit set */
1667 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1668 COSTS_N_INSNS (26), /* HI */
1669 COSTS_N_INSNS (42), /* SI */
1670 COSTS_N_INSNS (74), /* DI */
1671 COSTS_N_INSNS (74)}, /* other */
1672 COSTS_N_INSNS (1), /* cost of movsx */
1673 COSTS_N_INSNS (1), /* cost of movzx */
1674 8, /* "large" insn */
1675 17, /* MOVE_RATIO */
1676 4, /* cost for loading QImode using movzbl */
1677 {4, 4, 4}, /* cost of loading integer registers
1678 in QImode, HImode and SImode.
1679 Relative to reg-reg move (2). */
1680 {4, 4, 4}, /* cost of storing integer registers */
1681 4, /* cost of reg,reg fld/fst */
1682 {12, 12, 12}, /* cost of loading fp registers
1683 in SFmode, DFmode and XFmode */
1684 {6, 6, 8}, /* cost of storing fp registers
1685 in SFmode, DFmode and XFmode */
1686 2, /* cost of moving MMX register */
1687 {8, 8}, /* cost of loading MMX registers
1688 in SImode and DImode */
1689 {8, 8}, /* cost of storing MMX registers
1690 in SImode and DImode */
1691 2, /* cost of moving SSE register */
1692 {8, 8, 8}, /* cost of loading SSE registers
1693 in SImode, DImode and TImode */
1694 {8, 8, 8}, /* cost of storing SSE registers
1695 in SImode, DImode and TImode */
1696 5, /* MMX or SSE register to integer */
1697 32, /* size of l1 cache. */
1698 256, /* size of l2 cache. */
1699 64, /* size of prefetch block */
1700 6, /* number of parallel prefetches */
1701 3, /* Branch cost */
1702 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1703 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1704 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1705 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1706 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1707 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1708 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1709 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1710 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1711 {{libcall, {{8, loop}, {15, unrolled_loop},
1712 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1713 {libcall, {{24, loop}, {32, unrolled_loop},
1714 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1715 1, /* scalar_stmt_cost. */
1716 1, /* scalar load_cost. */
1717 1, /* scalar_store_cost. */
1718 1, /* vec_stmt_cost. */
1719 1, /* vec_to_scalar_cost. */
1720 1, /* scalar_to_vec_cost. */
1721 1, /* vec_align_load_cost. */
1722 2, /* vec_unalign_load_cost. */
1723 1, /* vec_store_cost. */
1724 3, /* cond_taken_branch_cost. */
1725 1, /* cond_not_taken_branch_cost. */
1726 };
1727
1728 /* Generic64 should produce code tuned for Nocona and K8. */
1729 static const
1730 struct processor_costs generic64_cost = {
1731 COSTS_N_INSNS (1), /* cost of an add instruction */
1732 /* On all chips taken into consideration, lea takes 2 cycles or more. With
1733 this cost, however, our current implementation of synth_mult results in
1734 the use of unnecessary temporary registers, causing a regression on several
1735 SPECfp benchmarks. */
1736 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1737 COSTS_N_INSNS (1), /* variable shift costs */
1738 COSTS_N_INSNS (1), /* constant shift costs */
1739 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1740 COSTS_N_INSNS (4), /* HI */
1741 COSTS_N_INSNS (3), /* SI */
1742 COSTS_N_INSNS (4), /* DI */
1743 COSTS_N_INSNS (2)}, /* other */
1744 0, /* cost of multiply per each bit set */
1745 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1746 COSTS_N_INSNS (26), /* HI */
1747 COSTS_N_INSNS (42), /* SI */
1748 COSTS_N_INSNS (74), /* DI */
1749 COSTS_N_INSNS (74)}, /* other */
1750 COSTS_N_INSNS (1), /* cost of movsx */
1751 COSTS_N_INSNS (1), /* cost of movzx */
1752 8, /* "large" insn */
1753 17, /* MOVE_RATIO */
1754 4, /* cost for loading QImode using movzbl */
1755 {4, 4, 4}, /* cost of loading integer registers
1756 in QImode, HImode and SImode.
1757 Relative to reg-reg move (2). */
1758 {4, 4, 4}, /* cost of storing integer registers */
1759 4, /* cost of reg,reg fld/fst */
1760 {12, 12, 12}, /* cost of loading fp registers
1761 in SFmode, DFmode and XFmode */
1762 {6, 6, 8}, /* cost of storing fp registers
1763 in SFmode, DFmode and XFmode */
1764 2, /* cost of moving MMX register */
1765 {8, 8}, /* cost of loading MMX registers
1766 in SImode and DImode */
1767 {8, 8}, /* cost of storing MMX registers
1768 in SImode and DImode */
1769 2, /* cost of moving SSE register */
1770 {8, 8, 8}, /* cost of loading SSE registers
1771 in SImode, DImode and TImode */
1772 {8, 8, 8}, /* cost of storing SSE registers
1773 in SImode, DImode and TImode */
1774 5, /* MMX or SSE register to integer */
1775 32, /* size of l1 cache. */
1776 512, /* size of l2 cache. */
1777 64, /* size of prefetch block */
1778 6, /* number of parallel prefetches */
1779 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1780 value is increased to the perhaps more appropriate value of 5. */
1781 3, /* Branch cost */
1782 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1783 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1784 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1785 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1786 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1787 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1788 {DUMMY_STRINGOP_ALGS,
1789 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1790 {DUMMY_STRINGOP_ALGS,
1791 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1792 1, /* scalar_stmt_cost. */
1793 1, /* scalar load_cost. */
1794 1, /* scalar_store_cost. */
1795 1, /* vec_stmt_cost. */
1796 1, /* vec_to_scalar_cost. */
1797 1, /* scalar_to_vec_cost. */
1798 1, /* vec_align_load_cost. */
1799 2, /* vec_unalign_load_cost. */
1800 1, /* vec_store_cost. */
1801 3, /* cond_taken_branch_cost. */
1802 1, /* cond_not_taken_branch_cost. */
1803 };
1804
1805 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1806 Athlon and K8. */
1807 static const
1808 struct processor_costs generic32_cost = {
1809 COSTS_N_INSNS (1), /* cost of an add instruction */
1810 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1811 COSTS_N_INSNS (1), /* variable shift costs */
1812 COSTS_N_INSNS (1), /* constant shift costs */
1813 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1814 COSTS_N_INSNS (4), /* HI */
1815 COSTS_N_INSNS (3), /* SI */
1816 COSTS_N_INSNS (4), /* DI */
1817 COSTS_N_INSNS (2)}, /* other */
1818 0, /* cost of multiply per each bit set */
1819 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1820 COSTS_N_INSNS (26), /* HI */
1821 COSTS_N_INSNS (42), /* SI */
1822 COSTS_N_INSNS (74), /* DI */
1823 COSTS_N_INSNS (74)}, /* other */
1824 COSTS_N_INSNS (1), /* cost of movsx */
1825 COSTS_N_INSNS (1), /* cost of movzx */
1826 8, /* "large" insn */
1827 17, /* MOVE_RATIO */
1828 4, /* cost for loading QImode using movzbl */
1829 {4, 4, 4}, /* cost of loading integer registers
1830 in QImode, HImode and SImode.
1831 Relative to reg-reg move (2). */
1832 {4, 4, 4}, /* cost of storing integer registers */
1833 4, /* cost of reg,reg fld/fst */
1834 {12, 12, 12}, /* cost of loading fp registers
1835 in SFmode, DFmode and XFmode */
1836 {6, 6, 8}, /* cost of storing fp registers
1837 in SFmode, DFmode and XFmode */
1838 2, /* cost of moving MMX register */
1839 {8, 8}, /* cost of loading MMX registers
1840 in SImode and DImode */
1841 {8, 8}, /* cost of storing MMX registers
1842 in SImode and DImode */
1843 2, /* cost of moving SSE register */
1844 {8, 8, 8}, /* cost of loading SSE registers
1845 in SImode, DImode and TImode */
1846 {8, 8, 8}, /* cost of storing SSE registers
1847 in SImode, DImode and TImode */
1848 5, /* MMX or SSE register to integer */
1849 32, /* size of l1 cache. */
1850 256, /* size of l2 cache. */
1851 64, /* size of prefetch block */
1852 6, /* number of parallel prefetches */
1853 3, /* Branch cost */
1854 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1855 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1856 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1857 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1858 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1859 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1860 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1861 DUMMY_STRINGOP_ALGS},
1862 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1863 DUMMY_STRINGOP_ALGS},
1864 1, /* scalar_stmt_cost. */
1865 1, /* scalar load_cost. */
1866 1, /* scalar_store_cost. */
1867 1, /* vec_stmt_cost. */
1868 1, /* vec_to_scalar_cost. */
1869 1, /* scalar_to_vec_cost. */
1870 1, /* vec_align_load_cost. */
1871 2, /* vec_unalign_load_cost. */
1872 1, /* vec_store_cost. */
1873 3, /* cond_taken_branch_cost. */
1874 1, /* cond_not_taken_branch_cost. */
1875 };
1876
1877 const struct processor_costs *ix86_cost = &pentium_cost;
1878
1879 /* Processor feature/optimization bitmasks. */
1880 #define m_386 (1<<PROCESSOR_I386)
1881 #define m_486 (1<<PROCESSOR_I486)
1882 #define m_PENT (1<<PROCESSOR_PENTIUM)
1883 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1884 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1885 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1886 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1887 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1888 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1889 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1890 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1891 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1892 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1893 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1894 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1895 #define m_ATOM (1<<PROCESSOR_ATOM)
1896
1897 #define m_GEODE (1<<PROCESSOR_GEODE)
1898 #define m_K6 (1<<PROCESSOR_K6)
1899 #define m_K6_GEODE (m_K6 | m_GEODE)
1900 #define m_K8 (1<<PROCESSOR_K8)
1901 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1902 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1903 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1904 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1905 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1906 #define m_BDVER (m_BDVER1 | m_BDVER2)
1907 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1908 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1)
1909
1910 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1911 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1912
1913 /* Generic instruction choice should be a common subset of the supported CPUs
1914 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1915 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1916
1917 /* Feature tests against the various tunings. */
1918 unsigned char ix86_tune_features[X86_TUNE_LAST];
1919
1920 /* Feature tests against the various tunings used to create ix86_tune_features
1921 based on the processor mask. */
1922 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1923 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1924 negatively, so enabling it for Generic64 seems like a good code size
1925 tradeoff. We can't enable it for 32bit generic because it does not
1926 work well with PPro based chips. */
1927 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1928
1929 /* X86_TUNE_PUSH_MEMORY */
1930 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1931
1932 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1933 m_486 | m_PENT,
1934
1935 /* X86_TUNE_UNROLL_STRLEN */
1936 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1937
1938 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were added to the P4 based
1939 on simulation results, but once the P4 shipped, no performance benefit
1940 was observed from branch hints. They also increase code size.
1941 As a result, icc never generates branch hints. */
1942 0,
1943
1944 /* X86_TUNE_DOUBLE_WITH_ADD */
1945 ~m_386,
1946
1947 /* X86_TUNE_USE_SAHF */
1948 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC,
1949
1950 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1951 partial dependencies. */
1952 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1953
1954 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1955 register stalls on the Generic32 compilation setting as well. However,
1956 in the current implementation the partial register stalls are not eliminated
1957 very well - they can be introduced via subregs synthesized by combine
1958 and can happen in caller/callee saving sequences. Because this option
1959 pays back little on PPro based chips and is in conflict with partial reg
1960 dependencies used by Athlon/P4 based chips, it is better to leave it off
1961 for generic32 for now. */
1962 m_PPRO,
1963
1964 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1965 m_CORE2I7 | m_GENERIC,
1966
1967 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
1968 on 16-bit immediate moves into memory on Core2 and Corei7. */
1969 m_CORE2I7 | m_GENERIC,
1970
1971 /* X86_TUNE_USE_HIMODE_FIOP */
1972 m_386 | m_486 | m_K6_GEODE,
1973
1974 /* X86_TUNE_USE_SIMODE_FIOP */
1975 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1976
1977 /* X86_TUNE_USE_MOV0 */
1978 m_K6,
1979
1980 /* X86_TUNE_USE_CLTD */
1981 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1982
1983 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1984 m_PENT4,
1985
1986 /* X86_TUNE_SPLIT_LONG_MOVES */
1987 m_PPRO,
1988
1989 /* X86_TUNE_READ_MODIFY_WRITE */
1990 ~m_PENT,
1991
1992 /* X86_TUNE_READ_MODIFY */
1993 ~(m_PENT | m_PPRO),
1994
1995 /* X86_TUNE_PROMOTE_QIMODE */
1996 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1997
1998 /* X86_TUNE_FAST_PREFIX */
1999 ~(m_386 | m_486 | m_PENT),
2000
2001 /* X86_TUNE_SINGLE_STRINGOP */
2002 m_386 | m_P4_NOCONA,
2003
2004 /* X86_TUNE_QIMODE_MATH */
2005 ~0,
2006
2007 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2008 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
2009 might be considered for Generic32 if our scheme for avoiding partial
2010 stalls was more effective. */
2011 ~m_PPRO,
2012
2013 /* X86_TUNE_PROMOTE_QI_REGS */
2014 0,
2015
2016 /* X86_TUNE_PROMOTE_HI_REGS */
2017 m_PPRO,
2018
2019 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2020 over esp addition. */
2021 m_386 | m_486 | m_PENT | m_PPRO,
2022
2023 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2024 over esp addition. */
2025 m_PENT,
2026
2027 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2028 over esp subtraction. */
2029 m_386 | m_486 | m_PENT | m_K6_GEODE,
2030
2031 /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
2032 over esp subtraction. */
2033 m_PENT | m_K6_GEODE,
2034
2035 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2036 for DFmode copies */
2037 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2038
2039 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2040 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2041
2042 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2043 conflict between PPro/Pentium4 based chips that treat 128bit
2044 SSE registers as single units and K8 based chips that divide SSE
2045 registers into two 64bit halves. This knob promotes all store destinations
2046 to be 128bit to allow register renaming on 128bit SSE units, but usually
2047 results in one extra micro-op on 64bit SSE units. Experimental results
2048 show that disabling this option on P4 brings over a 20% SPECfp regression,
2049 while enabling it on K8 brings roughly a 2.4% regression that can be partly
2050 masked by careful scheduling of moves. */
2051 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2052
2053 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2054 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER1,
2055
2056 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2057 m_COREI7 | m_BDVER,
2058
2059 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2060 m_BDVER,
2061
2062 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2063 are resolved on SSE register parts instead of whole registers, so we may
2064 maintain just the lower part of scalar values in the proper format, leaving the
2065 upper part undefined. */
2066 m_ATHLON_K8,
2067
2068 /* X86_TUNE_SSE_TYPELESS_STORES */
2069 m_AMD_MULTIPLE,
2070
2071 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2072 m_PPRO | m_P4_NOCONA,
2073
2074 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2075 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2076
2077 /* X86_TUNE_PROLOGUE_USING_MOVE */
2078 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2079
2080 /* X86_TUNE_EPILOGUE_USING_MOVE */
2081 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2082
2083 /* X86_TUNE_SHIFT1 */
2084 ~m_486,
2085
2086 /* X86_TUNE_USE_FFREEP */
2087 m_AMD_MULTIPLE,
2088
2089 /* X86_TUNE_INTER_UNIT_MOVES */
2090 ~(m_AMD_MULTIPLE | m_GENERIC),
2091
2092 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2093 ~(m_AMDFAM10 | m_BDVER),
2094
2095 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2096 than 4 branch instructions in the 16 byte window. */
2097 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2098
2099 /* X86_TUNE_SCHEDULE */
2100 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2101
2102 /* X86_TUNE_USE_BT */
2103 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2104
2105 /* X86_TUNE_USE_INCDEC */
2106 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2107
2108 /* X86_TUNE_PAD_RETURNS */
2109 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2110
2111 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
2112 m_ATOM,
2113
2114 /* X86_TUNE_EXT_80387_CONSTANTS */
2115 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2116
2117 /* X86_TUNE_SHORTEN_X87_SSE */
2118 ~m_K8,
2119
2120 /* X86_TUNE_AVOID_VECTOR_DECODE */
2121 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2122
2123 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2124 and SImode multiply, but the 386 and 486 do HImode multiply faster. */
2125 ~(m_386 | m_486),
2126
2127 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2128 vector path on AMD machines. */
2129 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2130
2131 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2132 machines. */
2133 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2134
2135 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2136 than a MOV. */
2137 m_PENT,
2138
2139 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2140 but one byte longer. */
2141 m_PENT,
2142
2143 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
2144 operand that cannot be represented using a modRM byte. The XOR
2145 replacement is long decoded, so this split helps here as well. */
2146 m_K6,
2147
2148 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2149 from FP to FP. */
2150 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2151
2152 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2153 from integer to FP. */
2154 m_AMDFAM10,
2155
2156 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2157 with a subsequent conditional jump instruction into a single
2158 compare-and-branch uop. */
2159 m_BDVER,
2160
2161 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2162 will impact LEA instruction selection. */
2163 m_ATOM,
2164
2165 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2166 instructions. */
2167 ~m_ATOM,
2168
2169 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2170 at -O3. For the moment, the prefetching seems badly tuned for Intel
2171 chips. */
2172 m_K6_GEODE | m_AMD_MULTIPLE,
2173
2174 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2175 the auto-vectorizer. */
2176 m_BDVER,
2177
2178 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2179 during reassociation of integer computation. */
2180 m_ATOM,
2181
2182 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2183 during reassociation of fp computation. */
2184 m_ATOM
2185 };
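/* Illustrative sketch, not part of the build: how the bitmask table above
   is reduced to the per-feature boolean array ix86_tune_features.  The
   selected tuning target contributes a single m_* bit, and a feature is
   enabled iff its mask contains that bit.  This mirrors, in simplified
   form, the loop in ix86_option_override_internal further below.  */
#if 0
static void
example_set_tune_features (enum processor_type tune)
{
  unsigned int tune_mask = 1u << tune;
  unsigned int i;

  for (i = 0; i < X86_TUNE_LAST; ++i)
    ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & tune_mask);
}
#endif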
2186
2187 /* Feature tests against the various architecture variations. */
2188 unsigned char ix86_arch_features[X86_ARCH_LAST];
2189
2190 /* Feature tests against the various architecture variations, used to create
2191 ix86_arch_features based on the processor mask. */
2192 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2193 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2194 ~(m_386 | m_486 | m_PENT | m_K6),
2195
2196 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2197 ~m_386,
2198
2199 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2200 ~(m_386 | m_486),
2201
2202 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2203 ~m_386,
2204
2205 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2206 ~m_386,
2207 };
2208
2209 static const unsigned int x86_accumulate_outgoing_args
2210 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2211
2212 static const unsigned int x86_arch_always_fancy_math_387
2213 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2214
2215 static const unsigned int x86_avx256_split_unaligned_load
2216 = m_COREI7 | m_GENERIC;
2217
2218 static const unsigned int x86_avx256_split_unaligned_store
2219 = m_COREI7 | m_BDVER | m_GENERIC;
2220
2221 /* In case the average insn count for a single function invocation is
2222 lower than this constant, emit fast (but longer) prologue and
2223 epilogue code. */
2224 #define FAST_PROLOGUE_INSN_COUNT 20
2225
2226 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
2227 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2228 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2229 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2230
2231 /* Array of the smallest class containing reg number REGNO, indexed by
2232 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2233
2234 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2235 {
2236 /* ax, dx, cx, bx */
2237 AREG, DREG, CREG, BREG,
2238 /* si, di, bp, sp */
2239 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2240 /* FP registers */
2241 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2242 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2243 /* arg pointer */
2244 NON_Q_REGS,
2245 /* flags, fpsr, fpcr, frame */
2246 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2247 /* SSE registers */
2248 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2249 SSE_REGS, SSE_REGS,
2250 /* MMX registers */
2251 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2252 MMX_REGS, MMX_REGS,
2253 /* REX registers */
2254 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2255 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2256 /* SSE REX registers */
2257 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2258 SSE_REGS, SSE_REGS,
2259 };
2260
2261 /* The "default" register map used in 32bit mode. */
2262
2263 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2264 {
2265 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2266 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2267 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2268 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2269 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2270 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2271 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2272 };
2273
2274 /* The "default" register map used in 64bit mode. */
2275
2276 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2277 {
2278 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2279 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2280 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2281 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2282 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2283 8,9,10,11,12,13,14,15, /* extended integer registers */
2284 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2285 };
2286
2287 /* Define the register numbers to be used in Dwarf debugging information.
2288 The SVR4 reference port C compiler uses the following register numbers
2289 in its Dwarf output code:
2290 0 for %eax (gcc regno = 0)
2291 1 for %ecx (gcc regno = 2)
2292 2 for %edx (gcc regno = 1)
2293 3 for %ebx (gcc regno = 3)
2294 4 for %esp (gcc regno = 7)
2295 5 for %ebp (gcc regno = 6)
2296 6 for %esi (gcc regno = 4)
2297 7 for %edi (gcc regno = 5)
2298 The following three DWARF register numbers are never generated by
2299 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2300 believes these numbers have these meanings.
2301 8 for %eip (no gcc equivalent)
2302 9 for %eflags (gcc regno = 17)
2303 10 for %trapno (no gcc equivalent)
2304 It is not at all clear how we should number the FP stack registers
2305 for the x86 architecture. If the version of SDB on x86/svr4 were
2306 a bit less brain dead with respect to floating-point then we would
2307 have a precedent to follow with respect to DWARF register numbers
2308 for x86 FP registers, but the SDB on x86/svr4 is so completely
2309 broken with respect to FP registers that it is hardly worth thinking
2310 of it as something to strive for compatibility with.
2311 The version of x86/svr4 SDB I have at the moment does (partially)
2312 seem to believe that DWARF register number 11 is associated with
2313 the x86 register %st(0), but that's about all. Higher DWARF
2314 register numbers don't seem to be associated with anything in
2315 particular, and even for DWARF regno 11, SDB only seems to under-
2316 stand that it should say that a variable lives in %st(0) (when
2317 asked via an `=' command) if we said it was in DWARF regno 11,
2318 but SDB still prints garbage when asked for the value of the
2319 variable in question (via a `/' command).
2320 (Also note that the labels SDB prints for various FP stack regs
2321 when doing an `x' command are all wrong.)
2322 Note that these problems generally don't affect the native SVR4
2323 C compiler because it doesn't allow the use of -O with -g and
2324 because when it is *not* optimizing, it allocates a memory
2325 location for each floating-point variable, and the memory
2326 location is what gets described in the DWARF AT_location
2327 attribute for the variable in question.
2328 Regardless of the severe mental illness of the x86/svr4 SDB, we
2329 do something sensible here and we use the following DWARF
2330 register numbers. Note that these are all stack-top-relative
2331 numbers.
2332 11 for %st(0) (gcc regno = 8)
2333 12 for %st(1) (gcc regno = 9)
2334 13 for %st(2) (gcc regno = 10)
2335 14 for %st(3) (gcc regno = 11)
2336 15 for %st(4) (gcc regno = 12)
2337 16 for %st(5) (gcc regno = 13)
2338 17 for %st(6) (gcc regno = 14)
2339 18 for %st(7) (gcc regno = 15)
2340 */
2341 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2342 {
2343 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2344 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2345 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2346 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2347 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2348 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2349 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2350 };
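/* Illustrative sketch, not part of the build: how the register maps above
   are consumed.  Debug output asks for the external (DWARF/DBX) number of
   a GCC hard register and indexes the map that matches the target; the
   helper below is a simplified assumption of what the DBX_REGISTER_NUMBER
   style macros in i386.h reduce to.  */
#if 0
static int
example_debugger_regno (int gcc_regno, bool use_svr4_numbering)
{
  if (use_svr4_numbering)
    /* E.g. gcc regno 1 (%edx) yields DWARF regno 2, per the comment above.  */
    return svr4_dbx_register_map[gcc_regno];
  return TARGET_64BIT ? dbx64_register_map[gcc_regno]
		      : dbx_register_map[gcc_regno];
}
#endif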
2351
2352 /* Define parameter passing and return registers. */
2353
2354 static int const x86_64_int_parameter_registers[6] =
2355 {
2356 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2357 };
2358
2359 static int const x86_64_ms_abi_int_parameter_registers[4] =
2360 {
2361 CX_REG, DX_REG, R8_REG, R9_REG
2362 };
2363
2364 static int const x86_64_int_return_registers[4] =
2365 {
2366 AX_REG, DX_REG, DI_REG, SI_REG
2367 };
2368
2369 /* Define the structure for the machine field in struct function. */
2370
2371 struct GTY(()) stack_local_entry {
2372 unsigned short mode;
2373 unsigned short n;
2374 rtx rtl;
2375 struct stack_local_entry *next;
2376 };
2377
2378 /* Structure describing stack frame layout.
2379 Stack grows downward:
2380
2381 [arguments]
2382 <- ARG_POINTER
2383 saved pc
2384
2385 saved static chain if ix86_static_chain_on_stack
2386
2387 saved frame pointer if frame_pointer_needed
2388 <- HARD_FRAME_POINTER
2389 [saved regs]
2390 <- regs_save_offset
2391 [padding0]
2392
2393 [saved SSE regs]
2394 <- sse_regs_save_offset
2395 [padding1] |
2396 | <- FRAME_POINTER
2397 [va_arg registers] |
2398 |
2399 [frame] |
2400 |
2401 [padding2] | = to_allocate
2402 <- STACK_POINTER
2403 */
2404 struct ix86_frame
2405 {
2406 int nsseregs;
2407 int nregs;
2408 int va_arg_size;
2409 int red_zone_size;
2410 int outgoing_arguments_size;
2411
2412 /* The offsets relative to ARG_POINTER. */
2413 HOST_WIDE_INT frame_pointer_offset;
2414 HOST_WIDE_INT hard_frame_pointer_offset;
2415 HOST_WIDE_INT stack_pointer_offset;
2416 HOST_WIDE_INT hfp_save_offset;
2417 HOST_WIDE_INT reg_save_offset;
2418 HOST_WIDE_INT sse_reg_save_offset;
2419
2420 /* When save_regs_using_mov is set, emit prologue using
2421 move instead of push instructions. */
2422 bool save_regs_using_mov;
2423 };
2424
2425 /* Which CPU we are scheduling for. */
2426 enum attr_cpu ix86_schedule;
2427
2428 /* Which CPU we are optimizing for. */
2429 enum processor_type ix86_tune;
2430
2431 /* Which instruction set architecture to use. */
2432 enum processor_type ix86_arch;
2433
2434 /* True if the SSE prefetch instruction is not a NOP. */
2435 int x86_prefetch_sse;
2436
2437 /* -mstackrealign option */
2438 static const char ix86_force_align_arg_pointer_string[]
2439 = "force_align_arg_pointer";
2440
2441 static rtx (*ix86_gen_leave) (void);
2442 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2443 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2444 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2445 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2446 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2447 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2448 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2449 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2450 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2451 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2452 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2453
2454 /* Preferred alignment for stack boundary in bits. */
2455 unsigned int ix86_preferred_stack_boundary;
2456
2457 /* Alignment for incoming stack boundary in bits specified at
2458 command line. */
2459 static unsigned int ix86_user_incoming_stack_boundary;
2460
2461 /* Default alignment for incoming stack boundary in bits. */
2462 static unsigned int ix86_default_incoming_stack_boundary;
2463
2464 /* Alignment for incoming stack boundary in bits. */
2465 unsigned int ix86_incoming_stack_boundary;
2466
2467 /* Calling-ABI-specific va_list type nodes. */
2468 static GTY(()) tree sysv_va_list_type_node;
2469 static GTY(()) tree ms_va_list_type_node;
2470
2471 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2472 char internal_label_prefix[16];
2473 int internal_label_prefix_len;
2474
2475 /* Fence to use after loop using movnt. */
2476 tree x86_mfence;
2477
2478 /* Register class used for passing a given 64bit part of the argument.
2479 These represent classes as documented by the psABI, with the exception
2480 of the SSESF and SSEDF classes, which are basically the SSE class except that
2481 gcc will use an SF or DFmode move instead of DImode to avoid reformatting penalties.
2482 
2483 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2484 whenever possible (the upper half contains padding); see the worked example after the enum. */
2485 enum x86_64_reg_class
2486 {
2487 X86_64_NO_CLASS,
2488 X86_64_INTEGER_CLASS,
2489 X86_64_INTEGERSI_CLASS,
2490 X86_64_SSE_CLASS,
2491 X86_64_SSESF_CLASS,
2492 X86_64_SSEDF_CLASS,
2493 X86_64_SSEUP_CLASS,
2494 X86_64_X87_CLASS,
2495 X86_64_X87UP_CLASS,
2496 X86_64_COMPLEX_X87_CLASS,
2497 X86_64_MEMORY_CLASS
2498 };
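/* Worked example (an illustration only, not part of the build): under the
   x86-64 psABI the aggregate below spans two eightbytes.  The first (the
   double) is classified X86_64_SSEDF_CLASS and is passed in an SSE register
   as a DFmode value; the second (the int plus four bytes of padding) is
   classified X86_64_INTEGERSI_CLASS and is passed in a general register via
   a cheap SImode move, as described in the comment before the enum.  */
#if 0
struct example_arg { double d; int i; };
#endif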
2499
2500 #define MAX_CLASSES 4
2501
2502 /* Table of constants used by fldpi, fldln2, etc.... */
2503 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2504 static bool ext_80387_constants_init = 0;
2505
2506 \f
2507 static struct machine_function * ix86_init_machine_status (void);
2508 static rtx ix86_function_value (const_tree, const_tree, bool);
2509 static bool ix86_function_value_regno_p (const unsigned int);
2510 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2511 const_tree);
2512 static rtx ix86_static_chain (const_tree, bool);
2513 static int ix86_function_regparm (const_tree, const_tree);
2514 static void ix86_compute_frame_layout (struct ix86_frame *);
2515 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2516 rtx, rtx, int);
2517 static void ix86_add_new_builtins (HOST_WIDE_INT);
2518 static tree ix86_canonical_va_list_type (tree);
2519 static void predict_jump (int);
2520 static unsigned int split_stack_prologue_scratch_regno (void);
2521 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2522
2523 enum ix86_function_specific_strings
2524 {
2525 IX86_FUNCTION_SPECIFIC_ARCH,
2526 IX86_FUNCTION_SPECIFIC_TUNE,
2527 IX86_FUNCTION_SPECIFIC_MAX
2528 };
2529
2530 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2531 const char *, enum fpmath_unit, bool);
2532 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2533 static void ix86_function_specific_save (struct cl_target_option *);
2534 static void ix86_function_specific_restore (struct cl_target_option *);
2535 static void ix86_function_specific_print (FILE *, int,
2536 struct cl_target_option *);
2537 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2538 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2539 struct gcc_options *);
2540 static bool ix86_can_inline_p (tree, tree);
2541 static void ix86_set_current_function (tree);
2542 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2543
2544 static enum calling_abi ix86_function_abi (const_tree);
2545
2546 \f
2547 #ifndef SUBTARGET32_DEFAULT_CPU
2548 #define SUBTARGET32_DEFAULT_CPU "i386"
2549 #endif
2550
2551 /* The svr4 ABI for the i386 says that records and unions are returned
2552 in memory. */
2553 #ifndef DEFAULT_PCC_STRUCT_RETURN
2554 #define DEFAULT_PCC_STRUCT_RETURN 1
2555 #endif
2556
2557 /* Whether -mtune= or -march= were specified */
2558 static int ix86_tune_defaulted;
2559 static int ix86_arch_specified;
2560
2561 /* Vectorization library interface and handlers. */
2562 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2563
2564 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2565 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2566
2567 /* Processor target table, indexed by processor number */
2568 struct ptt
2569 {
2570 const struct processor_costs *cost; /* Processor costs */
2571 const int align_loop; /* Default alignments. */
2572 const int align_loop_max_skip;
2573 const int align_jump;
2574 const int align_jump_max_skip;
2575 const int align_func;
2576 };
2577
2578 static const struct ptt processor_target_table[PROCESSOR_max] =
2579 {
2580 {&i386_cost, 4, 3, 4, 3, 4},
2581 {&i486_cost, 16, 15, 16, 15, 16},
2582 {&pentium_cost, 16, 7, 16, 7, 16},
2583 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2584 {&geode_cost, 0, 0, 0, 0, 0},
2585 {&k6_cost, 32, 7, 32, 7, 32},
2586 {&athlon_cost, 16, 7, 16, 7, 16},
2587 {&pentium4_cost, 0, 0, 0, 0, 0},
2588 {&k8_cost, 16, 7, 16, 7, 16},
2589 {&nocona_cost, 0, 0, 0, 0, 0},
2590 /* Core 2 32-bit. */
2591 {&generic32_cost, 16, 10, 16, 10, 16},
2592 /* Core 2 64-bit. */
2593 {&generic64_cost, 16, 10, 16, 10, 16},
2594 /* Core i7 32-bit. */
2595 {&generic32_cost, 16, 10, 16, 10, 16},
2596 /* Core i7 64-bit. */
2597 {&generic64_cost, 16, 10, 16, 10, 16},
2598 {&generic32_cost, 16, 7, 16, 7, 16},
2599 {&generic64_cost, 16, 10, 16, 10, 16},
2600 {&amdfam10_cost, 32, 24, 32, 7, 32},
2601 {&bdver1_cost, 32, 24, 32, 7, 32},
2602 {&bdver2_cost, 32, 24, 32, 7, 32},
2603 {&btver1_cost, 32, 24, 32, 7, 32},
2604 {&atom_cost, 16, 15, 16, 7, 16}
2605 };
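/* Illustrative sketch, not part of the build: how processor_target_table is
   consumed.  When the user did not request explicit alignments, the
   per-processor defaults above are filled in for the selected tuning target.
   This mirrors, in simplified form, code in ix86_option_override_internal
   further below.  */
#if 0
static void
example_apply_default_alignments (enum processor_type tune)
{
  if (align_loops == 0)
    align_loops = processor_target_table[tune].align_loop;
  if (align_jumps == 0)
    align_jumps = processor_target_table[tune].align_jump;
  if (align_functions == 0)
    align_functions = processor_target_table[tune].align_func;
}
#endif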
2606
2607 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2608 {
2609 "generic",
2610 "i386",
2611 "i486",
2612 "pentium",
2613 "pentium-mmx",
2614 "pentiumpro",
2615 "pentium2",
2616 "pentium3",
2617 "pentium4",
2618 "pentium-m",
2619 "prescott",
2620 "nocona",
2621 "core2",
2622 "corei7",
2623 "atom",
2624 "geode",
2625 "k6",
2626 "k6-2",
2627 "k6-3",
2628 "athlon",
2629 "athlon-4",
2630 "k8",
2631 "amdfam10",
2632 "bdver1",
2633 "bdver2",
2634 "btver1"
2635 };
2636 \f
2637 /* Return true if a red-zone is in use. */
2638
2639 static inline bool
2640 ix86_using_red_zone (void)
2641 {
2642 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2643 }
2644 \f
2645 /* Return a string that documents the current -m options. The caller is
2646 responsible for freeing the string. */
2647
2648 static char *
2649 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2650 const char *tune, enum fpmath_unit fpmath,
2651 bool add_nl_p)
2652 {
2653 struct ix86_target_opts
2654 {
2655 const char *option; /* option string */
2656 HOST_WIDE_INT mask; /* isa mask options */
2657 };
2658
2659 /* This table is ordered so that options like -msse4.2 that imply
2660 preceding options will match those first. */
2661 static struct ix86_target_opts isa_opts[] =
2662 {
2663 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2664 { "-mfma", OPTION_MASK_ISA_FMA },
2665 { "-mxop", OPTION_MASK_ISA_XOP },
2666 { "-mlwp", OPTION_MASK_ISA_LWP },
2667 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2668 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2669 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2670 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2671 { "-msse3", OPTION_MASK_ISA_SSE3 },
2672 { "-msse2", OPTION_MASK_ISA_SSE2 },
2673 { "-msse", OPTION_MASK_ISA_SSE },
2674 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2675 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2676 { "-mmmx", OPTION_MASK_ISA_MMX },
2677 { "-mabm", OPTION_MASK_ISA_ABM },
2678 { "-mbmi", OPTION_MASK_ISA_BMI },
2679 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2680 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2681 { "-mhle", OPTION_MASK_ISA_HLE },
2682 { "-mtbm", OPTION_MASK_ISA_TBM },
2683 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2684 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2685 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2686 { "-maes", OPTION_MASK_ISA_AES },
2687 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2688 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2689 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2690 { "-mf16c", OPTION_MASK_ISA_F16C },
2691 { "-mrtm", OPTION_MASK_ISA_RTM },
2692 };
2693
2694 /* Flag options. */
2695 static struct ix86_target_opts flag_opts[] =
2696 {
2697 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2698 { "-m80387", MASK_80387 },
2699 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2700 { "-malign-double", MASK_ALIGN_DOUBLE },
2701 { "-mcld", MASK_CLD },
2702 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2703 { "-mieee-fp", MASK_IEEE_FP },
2704 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2705 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2706 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2707 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2708 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2709 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2710 { "-mno-red-zone", MASK_NO_RED_ZONE },
2711 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2712 { "-mrecip", MASK_RECIP },
2713 { "-mrtd", MASK_RTD },
2714 { "-msseregparm", MASK_SSEREGPARM },
2715 { "-mstack-arg-probe", MASK_STACK_PROBE },
2716 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2717 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2718 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2719 { "-mvzeroupper", MASK_VZEROUPPER },
2720 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2721 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2722 { "-mprefer-avx128", MASK_PREFER_AVX128},
2723 };
2724
2725 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2726
2727 char isa_other[40];
2728 char target_other[40];
2729 unsigned num = 0;
2730 unsigned i, j;
2731 char *ret;
2732 char *ptr;
2733 size_t len;
2734 size_t line_len;
2735 size_t sep_len;
2736 const char *abi;
2737
2738 memset (opts, '\0', sizeof (opts));
2739
2740 /* Add -march= option. */
2741 if (arch)
2742 {
2743 opts[num][0] = "-march=";
2744 opts[num++][1] = arch;
2745 }
2746
2747 /* Add -mtune= option. */
2748 if (tune)
2749 {
2750 opts[num][0] = "-mtune=";
2751 opts[num++][1] = tune;
2752 }
2753
2754 /* Add -m32/-m64/-mx32. */
2755 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2756 {
2757 if ((isa & OPTION_MASK_ABI_64) != 0)
2758 abi = "-m64";
2759 else
2760 abi = "-mx32";
2761 isa &= ~ (OPTION_MASK_ISA_64BIT
2762 | OPTION_MASK_ABI_64
2763 | OPTION_MASK_ABI_X32);
2764 }
2765 else
2766 abi = "-m32";
2767 opts[num++][0] = abi;
2768
2769 /* Pick out the options in isa options. */
2770 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2771 {
2772 if ((isa & isa_opts[i].mask) != 0)
2773 {
2774 opts[num++][0] = isa_opts[i].option;
2775 isa &= ~ isa_opts[i].mask;
2776 }
2777 }
2778
2779 if (isa && add_nl_p)
2780 {
2781 opts[num++][0] = isa_other;
2782 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2783 isa);
2784 }
2785
2786 /* Add flag options. */
2787 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2788 {
2789 if ((flags & flag_opts[i].mask) != 0)
2790 {
2791 opts[num++][0] = flag_opts[i].option;
2792 flags &= ~ flag_opts[i].mask;
2793 }
2794 }
2795
2796 if (flags && add_nl_p)
2797 {
2798 opts[num++][0] = target_other;
2799 sprintf (target_other, "(other flags: %#x)", flags);
2800 }
2801
2802 /* Add -fpmath= option. */
2803 if (fpmath)
2804 {
2805 opts[num][0] = "-mfpmath=";
2806 switch ((int) fpmath)
2807 {
2808 case FPMATH_387:
2809 opts[num++][1] = "387";
2810 break;
2811
2812 case FPMATH_SSE:
2813 opts[num++][1] = "sse";
2814 break;
2815
2816 case FPMATH_387 | FPMATH_SSE:
2817 opts[num++][1] = "sse+387";
2818 break;
2819
2820 default:
2821 gcc_unreachable ();
2822 }
2823 }
2824
2825 /* Any options? */
2826 if (num == 0)
2827 return NULL;
2828
2829 gcc_assert (num < ARRAY_SIZE (opts));
2830
2831 /* Size the string. */
2832 len = 0;
2833 sep_len = (add_nl_p) ? 3 : 1;
2834 for (i = 0; i < num; i++)
2835 {
2836 len += sep_len;
2837 for (j = 0; j < 2; j++)
2838 if (opts[i][j])
2839 len += strlen (opts[i][j]);
2840 }
2841
2842 /* Build the string. */
2843 ret = ptr = (char *) xmalloc (len);
2844 line_len = 0;
2845
2846 for (i = 0; i < num; i++)
2847 {
2848 size_t len2[2];
2849
2850 for (j = 0; j < 2; j++)
2851 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2852
2853 if (i != 0)
2854 {
2855 *ptr++ = ' ';
2856 line_len++;
2857
2858 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2859 {
2860 *ptr++ = '\\';
2861 *ptr++ = '\n';
2862 line_len = 0;
2863 }
2864 }
2865
2866 for (j = 0; j < 2; j++)
2867 if (opts[i][j])
2868 {
2869 memcpy (ptr, opts[i][j], len2[j]);
2870 ptr += len2[j];
2871 line_len += len2[j];
2872 }
2873 }
2874
2875 *ptr = '\0';
2876 gcc_assert (ret + len >= ptr);
2877
2878 return ret;
2879 }
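/* For example (an illustration derived from the code above, with assumed
   argument values, not an exhaustive specification):
     ix86_target_string (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE,
			 MASK_80387, "k8", NULL, FPMATH_SSE, false)
   yields a string of the form
     "-march=k8 -m32 -msse2 -msse -m80387 -mfpmath=sse"
   with -march=/-mtune= first, then the ABI flag, then the ISA and flag
   options in table order, and -mfpmath= last.  */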
2880
2881 /* Return true if profiling code should be emitted before the
2882 prologue; otherwise return false.
2883 Note: for x86, the "hotfix" (hotpatching) case is covered by -mfentry, which emits the profiling call before the prologue. */
2884 static bool
2885 ix86_profile_before_prologue (void)
2886 {
2887 return flag_fentry != 0;
2888 }
2889
2890 /* Function that is callable from the debugger to print the current
2891 options. */
2892 void
2893 ix86_debug_options (void)
2894 {
2895 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2896 ix86_arch_string, ix86_tune_string,
2897 ix86_fpmath, true);
2898
2899 if (opts)
2900 {
2901 fprintf (stderr, "%s\n\n", opts);
2902 free (opts);
2903 }
2904 else
2905 fputs ("<no options>\n\n", stderr);
2906
2907 return;
2908 }
2909 \f
2910 /* Override various settings based on options. If MAIN_ARGS_P, the
2911 options are from the command line, otherwise they are from
2912 attributes. */
2913
2914 static void
2915 ix86_option_override_internal (bool main_args_p)
2916 {
2917 int i;
2918 unsigned int ix86_arch_mask, ix86_tune_mask;
2919 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2920 const char *prefix;
2921 const char *suffix;
2922 const char *sw;
2923
2924 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2925 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2926 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2927 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2928 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2929 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2930 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2931 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2932 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2933 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2934 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2935 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2936 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2937 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2938 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2939 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2940 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2941 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2942 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2943 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2944 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2945 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2946 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2947 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2948 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2949 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2950 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2951 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2952 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2953 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2954 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2955 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2956 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2957 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
2958 /* if this reaches 64, need to widen struct pta flags below */
2959
2960 static struct pta
2961 {
2962 const char *const name; /* processor name or nickname. */
2963 const enum processor_type processor;
2964 const enum attr_cpu schedule;
2965 const unsigned HOST_WIDE_INT flags;
2966 }
2967 const processor_alias_table[] =
2968 {
2969 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2970 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2971 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2972 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2973 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2974 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2975 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2976 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2977 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2978 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2979 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2980 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
2981 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2982 PTA_MMX | PTA_SSE},
2983 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2984 PTA_MMX | PTA_SSE},
2985 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2986 PTA_MMX | PTA_SSE | PTA_SSE2},
2987 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2988 PTA_MMX | PTA_SSE | PTA_SSE2},
2989 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2990 PTA_MMX | PTA_SSE | PTA_SSE2},
2991 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2992 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2993 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2994 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2995 | PTA_CX16 | PTA_NO_SAHF},
2996 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
2997 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2998 | PTA_SSSE3 | PTA_CX16},
2999 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
3000 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3001 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16},
3002 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
3003 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3004 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3005 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
3006 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
3007 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3008 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3009 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3010 | PTA_RDRND | PTA_F16C},
3011 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
3012 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3013 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
3014 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3015 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
3016 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE},
3017 {"atom", PROCESSOR_ATOM, CPU_ATOM,
3018 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3019 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
3020 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3021 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3022 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3023 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3024 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3025 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3026 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3027 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3028 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3029 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3030 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3031 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3032 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3033 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3034 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3035 {"x86-64", PROCESSOR_K8, CPU_K8,
3036 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3037 {"k8", PROCESSOR_K8, CPU_K8,
3038 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3039 | PTA_SSE2 | PTA_NO_SAHF},
3040 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3041 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3042 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3043 {"opteron", PROCESSOR_K8, CPU_K8,
3044 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3045 | PTA_SSE2 | PTA_NO_SAHF},
3046 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3047 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3048 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3049 {"athlon64", PROCESSOR_K8, CPU_K8,
3050 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3051 | PTA_SSE2 | PTA_NO_SAHF},
3052 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3053 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3054 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3055 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3056 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3057 | PTA_SSE2 | PTA_NO_SAHF},
3058 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3059 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3060 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3061 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3062 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3063 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3064 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3065 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3066 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3067 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3068 | PTA_XOP | PTA_LWP},
3069 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3070 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3071 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3072 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3073 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3074 | PTA_FMA},
3075 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3076 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3077 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16},
3078 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3079 PTA_HLE /* flags are only used for -march switch. */ },
3080 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3081 PTA_64BIT
3082 | PTA_HLE /* flags are only used for -march switch. */ },
3083 };
3084
3085 /* -mrecip options. */
3086 static struct
3087 {
3088 const char *string; /* option name */
3089 unsigned int mask; /* mask bits to set */
3090 }
3091 const recip_options[] =
3092 {
3093 { "all", RECIP_MASK_ALL },
3094 { "none", RECIP_MASK_NONE },
3095 { "div", RECIP_MASK_DIV },
3096 { "sqrt", RECIP_MASK_SQRT },
3097 { "vec-div", RECIP_MASK_VEC_DIV },
3098 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3099 };
3100
3101 int const pta_size = ARRAY_SIZE (processor_alias_table);
3102
3103 /* Set up prefix/suffix so the error messages refer to either the command
3104 line argument, or the attribute(target). */
3105 if (main_args_p)
3106 {
3107 prefix = "-m";
3108 suffix = "";
3109 sw = "switch";
3110 }
3111 else
3112 {
3113 prefix = "option(\"";
3114 suffix = "\")";
3115 sw = "attribute";
3116 }
3117
3118 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3119 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3120 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
3121 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3122 #ifdef TARGET_BI_ARCH
3123 else
3124 {
3125 #if TARGET_BI_ARCH == 1
3126 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3127 is on and OPTION_MASK_ABI_X32 is off. We turn off
3128 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3129 -mx32. */
3130 if (TARGET_X32)
3131 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3132 #else
3133 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3134 on and OPTION_MASK_ABI_64 is off. We turn off
3135 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3136 -m64. */
3137 if (TARGET_LP64)
3138 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3139 #endif
3140 }
3141 #endif
3142
3143 if (TARGET_X32)
3144 {
3145 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3146 OPTION_MASK_ABI_64 for TARGET_X32. */
3147 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3148 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3149 }
3150 else if (TARGET_LP64)
3151 {
3152 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3153 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3154 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3155 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3156 }
3157
3158 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3159 SUBTARGET_OVERRIDE_OPTIONS;
3160 #endif
3161
3162 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3163 SUBSUBTARGET_OVERRIDE_OPTIONS;
3164 #endif
3165
3166 /* -fPIC is the default for x86_64. */
3167 if (TARGET_MACHO && TARGET_64BIT)
3168 flag_pic = 2;
3169
3170 /* Need to check -mtune=generic first. */
3171 if (ix86_tune_string)
3172 {
3173 if (!strcmp (ix86_tune_string, "generic")
3174 || !strcmp (ix86_tune_string, "i686")
3175 /* As special support for cross compilers we read -mtune=native
3176 as -mtune=generic. With native compilers we won't see the
3177 -mtune=native, as it was changed by the driver. */
3178 || !strcmp (ix86_tune_string, "native"))
3179 {
3180 if (TARGET_64BIT)
3181 ix86_tune_string = "generic64";
3182 else
3183 ix86_tune_string = "generic32";
3184 }
3185 /* If this call is for setting the option attribute, allow the
3186 generic32/generic64 that was previously set. */
3187 else if (!main_args_p
3188 && (!strcmp (ix86_tune_string, "generic32")
3189 || !strcmp (ix86_tune_string, "generic64")))
3190 ;
3191 else if (!strncmp (ix86_tune_string, "generic", 7))
3192 error ("bad value (%s) for %stune=%s %s",
3193 ix86_tune_string, prefix, suffix, sw);
3194 else if (!strcmp (ix86_tune_string, "x86-64"))
3195 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3196 "%stune=k8%s or %stune=generic%s instead as appropriate",
3197 prefix, suffix, prefix, suffix, prefix, suffix);
3198 }
3199 else
3200 {
3201 if (ix86_arch_string)
3202 ix86_tune_string = ix86_arch_string;
3203 if (!ix86_tune_string)
3204 {
3205 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3206 ix86_tune_defaulted = 1;
3207 }
3208
3209 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3210 need to use a sensible tune option. */
3211 if (!strcmp (ix86_tune_string, "generic")
3212 || !strcmp (ix86_tune_string, "x86-64")
3213 || !strcmp (ix86_tune_string, "i686"))
3214 {
3215 if (TARGET_64BIT)
3216 ix86_tune_string = "generic64";
3217 else
3218 ix86_tune_string = "generic32";
3219 }
3220 }
3221
3222 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3223 {
3224 /* rep; movq isn't available in 32-bit code. */
3225 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3226 ix86_stringop_alg = no_stringop;
3227 }
3228
3229 if (!ix86_arch_string)
3230 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3231 else
3232 ix86_arch_specified = 1;
3233
3234 if (global_options_set.x_ix86_pmode)
3235 {
3236 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3237 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3238 error ("address mode %qs not supported in the %s bit mode",
3239 TARGET_64BIT ? "short" : "long",
3240 TARGET_64BIT ? "64" : "32");
3241 }
3242 else
3243 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3244
3245 if (!global_options_set.x_ix86_abi)
3246 ix86_abi = DEFAULT_ABI;
3247
3248 if (global_options_set.x_ix86_cmodel)
3249 {
3250 switch (ix86_cmodel)
3251 {
3252 case CM_SMALL:
3253 case CM_SMALL_PIC:
3254 if (flag_pic)
3255 ix86_cmodel = CM_SMALL_PIC;
3256 if (!TARGET_64BIT)
3257 error ("code model %qs not supported in the %s bit mode",
3258 "small", "32");
3259 break;
3260
3261 case CM_MEDIUM:
3262 case CM_MEDIUM_PIC:
3263 if (flag_pic)
3264 ix86_cmodel = CM_MEDIUM_PIC;
3265 if (!TARGET_64BIT)
3266 error ("code model %qs not supported in the %s bit mode",
3267 "medium", "32");
3268 else if (TARGET_X32)
3269 error ("code model %qs not supported in x32 mode",
3270 "medium");
3271 break;
3272
3273 case CM_LARGE:
3274 case CM_LARGE_PIC:
3275 if (flag_pic)
3276 ix86_cmodel = CM_LARGE_PIC;
3277 if (!TARGET_64BIT)
3278 error ("code model %qs not supported in the %s bit mode",
3279 "large", "32");
3280 else if (TARGET_X32)
3281 error ("code model %qs not supported in x32 mode",
3282 "medium");
3283 break;
3284
3285 case CM_32:
3286 if (flag_pic)
3287 error ("code model %s does not support PIC mode", "32");
3288 if (TARGET_64BIT)
3289 error ("code model %qs not supported in the %s bit mode",
3290 "32", "64");
3291 break;
3292
3293 case CM_KERNEL:
3294 if (flag_pic)
3295 {
3296 error ("code model %s does not support PIC mode", "kernel");
3297 ix86_cmodel = CM_32;
3298 }
3299 if (!TARGET_64BIT)
3300 error ("code model %qs not supported in the %s bit mode",
3301 "kernel", "32");
3302 break;
3303
3304 default:
3305 gcc_unreachable ();
3306 }
3307 }
3308 else
3309 {
3310 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3311 use of rip-relative addressing. This eliminates fixups that
3312 would otherwise be needed if this object is to be placed in a
3313 DLL, and is essentially just as efficient as direct addressing. */
3314 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3315 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3316 else if (TARGET_64BIT)
3317 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3318 else
3319 ix86_cmodel = CM_32;
3320 }
3321 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3322 {
3323 error ("-masm=intel not supported in this configuration");
3324 ix86_asm_dialect = ASM_ATT;
3325 }
3326 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3327 sorry ("%i-bit mode not compiled in",
3328 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3329
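/* Look up the -march= CPU in the alias table and turn on each ISA flag
   it implies, unless that ISA was explicitly enabled or disabled on the
   command line (tracked in ix86_isa_flags_explicit).  */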
3330 for (i = 0; i < pta_size; i++)
3331 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3332 {
3333 ix86_schedule = processor_alias_table[i].schedule;
3334 ix86_arch = processor_alias_table[i].processor;
3335 /* Default cpu tuning to the architecture. */
3336 ix86_tune = ix86_arch;
3337
3338 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3339 error ("CPU you selected does not support x86-64 "
3340 "instruction set");
3341
3342 if (processor_alias_table[i].flags & PTA_MMX
3343 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3344 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3345 if (processor_alias_table[i].flags & PTA_3DNOW
3346 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3347 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3348 if (processor_alias_table[i].flags & PTA_3DNOW_A
3349 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3350 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3351 if (processor_alias_table[i].flags & PTA_SSE
3352 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3353 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3354 if (processor_alias_table[i].flags & PTA_SSE2
3355 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3356 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3357 if (processor_alias_table[i].flags & PTA_SSE3
3358 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3359 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3360 if (processor_alias_table[i].flags & PTA_SSSE3
3361 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3362 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3363 if (processor_alias_table[i].flags & PTA_SSE4_1
3364 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3365 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3366 if (processor_alias_table[i].flags & PTA_SSE4_2
3367 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3368 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3369 if (processor_alias_table[i].flags & PTA_AVX
3370 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3371 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3372 if (processor_alias_table[i].flags & PTA_AVX2
3373 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3374 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3375 if (processor_alias_table[i].flags & PTA_FMA
3376 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3377 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3378 if (processor_alias_table[i].flags & PTA_SSE4A
3379 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3380 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3381 if (processor_alias_table[i].flags & PTA_FMA4
3382 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3383 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3384 if (processor_alias_table[i].flags & PTA_XOP
3385 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3386 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3387 if (processor_alias_table[i].flags & PTA_LWP
3388 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3389 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3390 if (processor_alias_table[i].flags & PTA_ABM
3391 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3392 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3393 if (processor_alias_table[i].flags & PTA_BMI
3394 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3395 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3396 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3397 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3398 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3399 if (processor_alias_table[i].flags & PTA_TBM
3400 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3401 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3402 if (processor_alias_table[i].flags & PTA_BMI2
3403 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3404 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3405 if (processor_alias_table[i].flags & PTA_CX16
3406 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3407 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3408 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3409 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3410 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3411 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3412 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3413 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3414 if (processor_alias_table[i].flags & PTA_MOVBE
3415 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3416 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3417 if (processor_alias_table[i].flags & PTA_AES
3418 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3419 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3420 if (processor_alias_table[i].flags & PTA_PCLMUL
3421 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3422 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3423 if (processor_alias_table[i].flags & PTA_FSGSBASE
3424 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3425 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3426 if (processor_alias_table[i].flags & PTA_RDRND
3427 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3428 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3429 if (processor_alias_table[i].flags & PTA_F16C
3430 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3431 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3432 if (processor_alias_table[i].flags & PTA_RTM
3433 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3434 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3435 if (processor_alias_table[i].flags & PTA_HLE
3436 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3437 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3438 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3439 x86_prefetch_sse = true;
3440
3441 break;
3442 }
3443
3444 if (!strcmp (ix86_arch_string, "generic"))
3445 error ("generic CPU can be used only for %stune=%s %s",
3446 prefix, suffix, sw);
3447 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3448 error ("bad value (%s) for %sarch=%s %s",
3449 ix86_arch_string, prefix, suffix, sw);
3450
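/* Derive the boolean ix86_arch_features[] table from the architecture
   selected above.  */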
3451 ix86_arch_mask = 1u << ix86_arch;
3452 for (i = 0; i < X86_ARCH_LAST; ++i)
3453 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3454
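/* Now look up the -mtune= CPU in the alias table to pick the scheduling
   and tuning model; if that choice is incompatible with the current
   32-bit/64-bit mode, fall back to a suitable one.  */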
3455 for (i = 0; i < pta_size; i++)
3456 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3457 {
3458 ix86_schedule = processor_alias_table[i].schedule;
3459 ix86_tune = processor_alias_table[i].processor;
3460 if (TARGET_64BIT)
3461 {
3462 if (!(processor_alias_table[i].flags & PTA_64BIT))
3463 {
3464 if (ix86_tune_defaulted)
3465 {
3466 ix86_tune_string = "x86-64";
3467 for (i = 0; i < pta_size; i++)
3468 if (! strcmp (ix86_tune_string,
3469 processor_alias_table[i].name))
3470 break;
3471 ix86_schedule = processor_alias_table[i].schedule;
3472 ix86_tune = processor_alias_table[i].processor;
3473 }
3474 else
3475 error ("CPU you selected does not support x86-64 "
3476 "instruction set");
3477 }
3478 }
3479 else
3480 {
3481 /* Adjust tuning when compiling for 32-bit ABI. */
3482 switch (ix86_tune)
3483 {
3484 case PROCESSOR_GENERIC64:
3485 ix86_tune = PROCESSOR_GENERIC32;
3486 ix86_schedule = CPU_PENTIUMPRO;
3487 break;
3488
3489 case PROCESSOR_CORE2_64:
3490 ix86_tune = PROCESSOR_CORE2_32;
3491 break;
3492
3493 case PROCESSOR_COREI7_64:
3494 ix86_tune = PROCESSOR_COREI7_32;
3495 break;
3496
3497 default:
3498 break;
3499 }
3500 }
3501 /* Intel CPUs have always interpreted SSE prefetch instructions as
3502 NOPs; so, we can enable SSE prefetch instructions even when
3503 -mtune (rather than -march) points us to a processor that has them.
3504 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3505 higher processors. */
3506 if (TARGET_CMOV
3507 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3508 x86_prefetch_sse = true;
3509 break;
3510 }
3511
3512 if (ix86_tune_specified && i == pta_size)
3513 error ("bad value (%s) for %stune=%s %s",
3514 ix86_tune_string, prefix, suffix, sw);
3515
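/* Derive the boolean ix86_tune_features[] table from the tuning model
   selected above.  */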
3516 ix86_tune_mask = 1u << ix86_tune;
3517 for (i = 0; i < X86_TUNE_LAST; ++i)
3518 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3519
3520 #ifndef USE_IX86_FRAME_POINTER
3521 #define USE_IX86_FRAME_POINTER 0
3522 #endif
3523
3524 #ifndef USE_X86_64_FRAME_POINTER
3525 #define USE_X86_64_FRAME_POINTER 0
3526 #endif
3527
3528 /* Set the default values for switches whose default depends on TARGET_64BIT
3529 in case they weren't overwritten by command line options. */
3530 if (TARGET_64BIT)
3531 {
3532 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3533 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3534 if (flag_asynchronous_unwind_tables == 2)
3535 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3536 if (flag_pcc_struct_return == 2)
3537 flag_pcc_struct_return = 0;
3538 }
3539 else
3540 {
3541 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3542 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3543 if (flag_asynchronous_unwind_tables == 2)
3544 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3545 if (flag_pcc_struct_return == 2)
3546 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3547 }
3548
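/* Pick the cost tables: the size-oriented table for -Os, otherwise the
   per-processor tuning costs.  */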
3549 if (optimize_size)
3550 ix86_cost = &ix86_size_cost;
3551 else
3552 ix86_cost = processor_target_table[ix86_tune].cost;
3553
3554 /* Arrange to set up i386_stack_locals for all functions. */
3555 init_machine_status = ix86_init_machine_status;
3556
3557 /* Validate -mregparm= value. */
3558 if (global_options_set.x_ix86_regparm)
3559 {
3560 if (TARGET_64BIT)
3561 warning (0, "-mregparm is ignored in 64-bit mode");
3562 if (ix86_regparm > REGPARM_MAX)
3563 {
3564 error ("-mregparm=%d is not between 0 and %d",
3565 ix86_regparm, REGPARM_MAX);
3566 ix86_regparm = 0;
3567 }
3568 }
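/* The 64-bit calling conventions always pass the leading arguments in
   registers, so use the maximum regparm value in 64-bit mode.  */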
3569 if (TARGET_64BIT)
3570 ix86_regparm = REGPARM_MAX;
3571
3572 /* Default align_* from the processor table. */
3573 if (align_loops == 0)
3574 {
3575 align_loops = processor_target_table[ix86_tune].align_loop;
3576 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3577 }
3578 if (align_jumps == 0)
3579 {
3580 align_jumps = processor_target_table[ix86_tune].align_jump;
3581 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3582 }
3583 if (align_functions == 0)
3584 {
3585 align_functions = processor_target_table[ix86_tune].align_func;
3586 }
3587
3588 /* Provide default for -mbranch-cost= value. */
3589 if (!global_options_set.x_ix86_branch_cost)
3590 ix86_branch_cost = ix86_cost->branch_cost;
3591
3592 if (TARGET_64BIT)
3593 {
3594 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3595
3596 /* Enable by default the SSE and MMX builtins. Do allow the user to
3597 explicitly disable any of these. In particular, disabling SSE and
3598 MMX for kernel code is extremely useful. */
3599 if (!ix86_arch_specified)
3600 ix86_isa_flags
3601 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3602 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3603
3604 if (TARGET_RTD)
3605 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3606 }
3607 else
3608 {
3609 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3610
3611 if (!ix86_arch_specified)
3612 ix86_isa_flags
3613 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3614
3615 /* The i386 ABI does not specify a red zone.  It still makes sense to use
3616 one when the programmer takes care to keep the stack from being destroyed. */
3617 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3618 target_flags |= MASK_NO_RED_ZONE;
3619 }
3620
3621 /* Keep nonleaf frame pointers. */
3622 if (flag_omit_frame_pointer)
3623 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3624 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3625 flag_omit_frame_pointer = 1;
3626
3627 /* If we're doing fast math, we don't care about comparison order
3628 wrt NaNs. This lets us use a shorter comparison sequence. */
3629 if (flag_finite_math_only)
3630 target_flags &= ~MASK_IEEE_FP;
3631
3632 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3633 since the insns won't need emulation. */
3634 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3635 target_flags &= ~MASK_NO_FANCY_MATH_387;
3636
3637 /* Likewise, if the target doesn't have a 387, or we've specified
3638 software floating point, don't use 387 inline intrinsics. */
3639 if (!TARGET_80387)
3640 target_flags |= MASK_NO_FANCY_MATH_387;
3641
3642 /* Turn on MMX builtins for -msse. */
3643 if (TARGET_SSE)
3644 {
3645 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3646 x86_prefetch_sse = true;
3647 }
3648
3649 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3650 if (TARGET_SSE4_2 || TARGET_ABM)
3651 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3652
3653 /* Turn on lzcnt instruction for -mabm. */
3654 if (TARGET_ABM)
3655 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3656
3657 /* Validate -mpreferred-stack-boundary= value or default it to
3658 PREFERRED_STACK_BOUNDARY_DEFAULT. */
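/* The option argument is the log2 of the boundary in bytes; for example,
   -mpreferred-stack-boundary=4 yields a 16-byte (128-bit) boundary, as
   computed below by (1 << arg) * BITS_PER_UNIT.  */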
3659 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3660 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3661 {
3662 int min = (TARGET_64BIT ? 4 : 2);
3663 int max = (TARGET_SEH ? 4 : 12);
3664
3665 if (ix86_preferred_stack_boundary_arg < min
3666 || ix86_preferred_stack_boundary_arg > max)
3667 {
3668 if (min == max)
3669 error ("-mpreferred-stack-boundary is not supported "
3670 "for this target");
3671 else
3672 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3673 ix86_preferred_stack_boundary_arg, min, max);
3674 }
3675 else
3676 ix86_preferred_stack_boundary
3677 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3678 }
3679
3680 /* Set the default value for -mstackrealign. */
3681 if (ix86_force_align_arg_pointer == -1)
3682 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3683
3684 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3685
3686 /* Validate -mincoming-stack-boundary= value or default it to
3687 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3688 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3689 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3690 {
3691 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3692 || ix86_incoming_stack_boundary_arg > 12)
3693 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3694 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3695 else
3696 {
3697 ix86_user_incoming_stack_boundary
3698 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3699 ix86_incoming_stack_boundary
3700 = ix86_user_incoming_stack_boundary;
3701 }
3702 }
3703
3704 /* Accept -msseregparm only if at least SSE support is enabled. */
3705 if (TARGET_SSEREGPARM
3706 && ! TARGET_SSE)
3707 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3708
3709 if (global_options_set.x_ix86_fpmath)
3710 {
3711 if (ix86_fpmath & FPMATH_SSE)
3712 {
3713 if (!TARGET_SSE)
3714 {
3715 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3716 ix86_fpmath = FPMATH_387;
3717 }
3718 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3719 {
3720 warning (0, "387 instruction set disabled, using SSE arithmetics");
3721 ix86_fpmath = FPMATH_SSE;
3722 }
3723 }
3724 }
3725 else
3726 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3727
3728 /* If the i387 is disabled, then do not return values in it. */
3729 if (!TARGET_80387)
3730 target_flags &= ~MASK_FLOAT_RETURNS;
3731
3732 /* Use external vectorized library in vectorizing intrinsics. */
3733 if (global_options_set.x_ix86_veclibabi_type)
3734 switch (ix86_veclibabi_type)
3735 {
3736 case ix86_veclibabi_type_svml:
3737 ix86_veclib_handler = ix86_veclibabi_svml;
3738 break;
3739
3740 case ix86_veclibabi_type_acml:
3741 ix86_veclib_handler = ix86_veclibabi_acml;
3742 break;
3743
3744 default:
3745 gcc_unreachable ();
3746 }
3747
3748 if ((!USE_IX86_FRAME_POINTER
3749 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3750 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3751 && !optimize_size)
3752 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3753
3754 /* ??? Unwind info is not correct around the CFG unless either a frame
3755 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3756 unwind info generation to be aware of the CFG and propagating states
3757 around edges. */
3758 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3759 || flag_exceptions || flag_non_call_exceptions)
3760 && flag_omit_frame_pointer
3761 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3762 {
3763 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3764 warning (0, "unwind tables currently require either a frame pointer "
3765 "or %saccumulate-outgoing-args%s for correctness",
3766 prefix, suffix);
3767 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3768 }
3769
3770 /* If stack probes are required, the space used for large function
3771 arguments on the stack must also be probed, so enable
3772 -maccumulate-outgoing-args so this happens in the prologue. */
3773 if (TARGET_STACK_PROBE
3774 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3775 {
3776 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3777 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3778 "for correctness", prefix, suffix);
3779 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3780 }
3781
3782 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3783 {
3784 char *p;
3785 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3786 p = strchr (internal_label_prefix, 'X');
3787 internal_label_prefix_len = p - internal_label_prefix;
3788 *p = '\0';
3789 }
3790
3791 /* When a scheduling description is not available, disable the scheduler
3792 pass so it won't slow down the compilation and make x87 code slower. */
3793 if (!TARGET_SCHEDULE)
3794 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3795
3796 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3797 ix86_cost->simultaneous_prefetches,
3798 global_options.x_param_values,
3799 global_options_set.x_param_values);
3800 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ix86_cost->prefetch_block,
3801 global_options.x_param_values,
3802 global_options_set.x_param_values);
3803 maybe_set_param_value (PARAM_L1_CACHE_SIZE, ix86_cost->l1_cache_size,
3804 global_options.x_param_values,
3805 global_options_set.x_param_values);
3806 maybe_set_param_value (PARAM_L2_CACHE_SIZE, ix86_cost->l2_cache_size,
3807 global_options.x_param_values,
3808 global_options_set.x_param_values);
3809
3810 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3811 if (flag_prefetch_loop_arrays < 0
3812 && HAVE_prefetch
3813 && optimize >= 3
3814 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3815 flag_prefetch_loop_arrays = 1;
3816
3817 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3818 can be optimized to ap = __builtin_next_arg (0). */
3819 if (!TARGET_64BIT && !flag_split_stack)
3820 targetm.expand_builtin_va_start = NULL;
3821
3822 if (TARGET_64BIT)
3823 {
3824 ix86_gen_leave = gen_leave_rex64;
3825 if (Pmode == DImode)
3826 {
3827 ix86_gen_monitor = gen_sse3_monitor64_di;
3828 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3829 ix86_gen_tls_local_dynamic_base_64
3830 = gen_tls_local_dynamic_base_64_di;
3831 }
3832 else
3833 {
3834 ix86_gen_monitor = gen_sse3_monitor64_si;
3835 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3836 ix86_gen_tls_local_dynamic_base_64
3837 = gen_tls_local_dynamic_base_64_si;
3838 }
3839 }
3840 else
3841 {
3842 ix86_gen_leave = gen_leave;
3843 ix86_gen_monitor = gen_sse3_monitor;
3844 }
3845
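/* Select the Pmode-dependent RTL generator functions used for stack
   manipulation and pointer-sized arithmetic.  */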
3846 if (Pmode == DImode)
3847 {
3848 ix86_gen_add3 = gen_adddi3;
3849 ix86_gen_sub3 = gen_subdi3;
3850 ix86_gen_sub3_carry = gen_subdi3_carry;
3851 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3852 ix86_gen_andsp = gen_anddi3;
3853 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3854 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3855 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3856 }
3857 else
3858 {
3859 ix86_gen_add3 = gen_addsi3;
3860 ix86_gen_sub3 = gen_subsi3;
3861 ix86_gen_sub3_carry = gen_subsi3_carry;
3862 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3863 ix86_gen_andsp = gen_andsi3;
3864 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3865 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3866 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3867 }
3868
3869 #ifdef USE_IX86_CLD
3870 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3871 if (!TARGET_64BIT)
3872 target_flags |= MASK_CLD & ~target_flags_explicit;
3873 #endif
3874
3875 if (!TARGET_64BIT && flag_pic)
3876 {
3877 if (flag_fentry > 0)
3878 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3879 "with -fpic");
3880 flag_fentry = 0;
3881 }
3882 else if (TARGET_SEH)
3883 {
3884 if (flag_fentry == 0)
3885 sorry ("-mno-fentry isn%'t compatible with SEH");
3886 flag_fentry = 1;
3887 }
3888 else if (flag_fentry < 0)
3889 {
3890 #if defined(PROFILE_BEFORE_PROLOGUE)
3891 flag_fentry = 1;
3892 #else
3893 flag_fentry = 0;
3894 #endif
3895 }
3896
3897 if (TARGET_AVX)
3898 {
3899 /* When not optimizing for size, enable the vzeroupper optimization for
3900 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3901 AVX unaligned loads/stores. */
3902 if (!optimize_size)
3903 {
3904 if (flag_expensive_optimizations
3905 && !(target_flags_explicit & MASK_VZEROUPPER))
3906 target_flags |= MASK_VZEROUPPER;
3907 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3908 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3909 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3910 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3911 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3912 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3913 /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */
3914 if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
3915 target_flags |= MASK_PREFER_AVX128;
3916 }
3917 }
3918 else
3919 {
3920 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3921 target_flags &= ~MASK_VZEROUPPER;
3922 }
3923
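/* Parse the comma-separated -mrecip= argument against recip_options[];
   an item prefixed with '!' clears the corresponding mask bits instead
   of setting them, and "default" stands for all of them.  For example
   (illustrative), -mrecip=all,!sqrt enables every reciprocal
   approximation except the scalar square root.  */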
3924 if (ix86_recip_name)
3925 {
3926 char *p = ASTRDUP (ix86_recip_name);
3927 char *q;
3928 unsigned int mask, i;
3929 bool invert;
3930
3931 while ((q = strtok (p, ",")) != NULL)
3932 {
3933 p = NULL;
3934 if (*q == '!')
3935 {
3936 invert = true;
3937 q++;
3938 }
3939 else
3940 invert = false;
3941
3942 if (!strcmp (q, "default"))
3943 mask = RECIP_MASK_ALL;
3944 else
3945 {
3946 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3947 if (!strcmp (q, recip_options[i].string))
3948 {
3949 mask = recip_options[i].mask;
3950 break;
3951 }
3952
3953 if (i == ARRAY_SIZE (recip_options))
3954 {
3955 error ("unknown option for -mrecip=%s", q);
3956 invert = false;
3957 mask = RECIP_MASK_NONE;
3958 }
3959 }
3960
3961 recip_mask_explicit |= mask;
3962 if (invert)
3963 recip_mask &= ~mask;
3964 else
3965 recip_mask |= mask;
3966 }
3967 }
3968
3969 if (TARGET_RECIP)
3970 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3971 else if (target_flags_explicit & MASK_RECIP)
3972 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
3973
3974 /* Save the initial options in case the user does function specific
3975 options. */
3976 if (main_args_p)
3977 target_option_default_node = target_option_current_node
3978 = build_target_option_node ();
3979 }
3980
3981 /* Return TRUE if VAL is passed in register with 256bit AVX modes. */
3982
3983 static bool
3984 function_pass_avx256_p (const_rtx val)
3985 {
3986 if (!val)
3987 return false;
3988
3989 if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
3990 return true;
3991
3992 if (GET_CODE (val) == PARALLEL)
3993 {
3994 int i;
3995 rtx r;
3996
3997 for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
3998 {
3999 r = XVECEXP (val, 0, i);
4000 if (GET_CODE (r) == EXPR_LIST
4001 && XEXP (r, 0)
4002 && REG_P (XEXP (r, 0))
4003 && (GET_MODE (XEXP (r, 0)) == OImode
4004 || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
4005 return true;
4006 }
4007 }
4008
4009 return false;
4010 }
4011
4012 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4013
4014 static void
4015 ix86_option_override (void)
4016 {
4017 ix86_option_override_internal (true);
4018 }
4019
4020 /* Update register usage after having seen the compiler flags. */
4021
4022 static void
4023 ix86_conditional_register_usage (void)
4024 {
4025 int i;
4026 unsigned int j;
4027
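/* Entries greater than 1 in fixed_regs/call_used_regs are conditional:
   2 means fixed/call-used only in 32-bit mode, 3 only in 64-bit mode;
   resolve them to plain 0/1 here.  */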
4028 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4029 {
4030 if (fixed_regs[i] > 1)
4031 fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2));
4032 if (call_used_regs[i] > 1)
4033 call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2));
4034 }
4035
4036 /* The PIC register, if it exists, is fixed. */
4037 j = PIC_OFFSET_TABLE_REGNUM;
4038 if (j != INVALID_REGNUM)
4039 fixed_regs[j] = call_used_regs[j] = 1;
4040
4041 /* The 64-bit MS_ABI changes the set of call-used registers. */
4042 if (TARGET_64BIT_MS_ABI)
4043 {
4044 call_used_regs[SI_REG] = 0;
4045 call_used_regs[DI_REG] = 0;
4046 call_used_regs[XMM6_REG] = 0;
4047 call_used_regs[XMM7_REG] = 0;
4048 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4049 call_used_regs[i] = 0;
4050 }
4051
4052 /* The default setting of CLOBBERED_REGS is for 32-bit; add in the
4053 other call-clobbered regs for 64-bit. */
4054 if (TARGET_64BIT)
4055 {
4056 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4057
4058 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4059 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4060 && call_used_regs[i])
4061 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4062 }
4063
4064 /* If MMX is disabled, squash the registers. */
4065 if (! TARGET_MMX)
4066 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4067 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4068 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4069
4070 /* If SSE is disabled, squash the registers. */
4071 if (! TARGET_SSE)
4072 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4073 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4074 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4075
4076 /* If the FPU is disabled, squash the registers. */
4077 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4078 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4079 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4080 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4081
4082 /* If 32-bit, squash the 64-bit registers. */
4083 if (! TARGET_64BIT)
4084 {
4085 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4086 reg_names[i] = "";
4087 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4088 reg_names[i] = "";
4089 }
4090 }
4091
4092 \f
4093 /* Save the current options. */
4094
4095 static void
4096 ix86_function_specific_save (struct cl_target_option *ptr)
4097 {
4098 ptr->arch = ix86_arch;
4099 ptr->schedule = ix86_schedule;
4100 ptr->tune = ix86_tune;
4101 ptr->branch_cost = ix86_branch_cost;
4102 ptr->tune_defaulted = ix86_tune_defaulted;
4103 ptr->arch_specified = ix86_arch_specified;
4104 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4105 ptr->ix86_target_flags_explicit = target_flags_explicit;
4106 ptr->x_recip_mask_explicit = recip_mask_explicit;
4107
4108 /* The fields are char but the variables are not; make sure the
4109 values fit in the fields. */
4110 gcc_assert (ptr->arch == ix86_arch);
4111 gcc_assert (ptr->schedule == ix86_schedule);
4112 gcc_assert (ptr->tune == ix86_tune);
4113 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4114 }
4115
4116 /* Restore the current options. */
4117
4118 static void
4119 ix86_function_specific_restore (struct cl_target_option *ptr)
4120 {
4121 enum processor_type old_tune = ix86_tune;
4122 enum processor_type old_arch = ix86_arch;
4123 unsigned int ix86_arch_mask, ix86_tune_mask;
4124 int i;
4125
4126 ix86_arch = (enum processor_type) ptr->arch;
4127 ix86_schedule = (enum attr_cpu) ptr->schedule;
4128 ix86_tune = (enum processor_type) ptr->tune;
4129 ix86_branch_cost = ptr->branch_cost;
4130 ix86_tune_defaulted = ptr->tune_defaulted;
4131 ix86_arch_specified = ptr->arch_specified;
4132 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4133 target_flags_explicit = ptr->ix86_target_flags_explicit;
4134 recip_mask_explicit = ptr->x_recip_mask_explicit;
4135
4136 /* Recreate the arch feature tests if the arch changed. */
4137 if (old_arch != ix86_arch)
4138 {
4139 ix86_arch_mask = 1u << ix86_arch;
4140 for (i = 0; i < X86_ARCH_LAST; ++i)
4141 ix86_arch_features[i]
4142 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4143 }
4144
4145 /* Recreate the tune optimization tests. */
4146 if (old_tune != ix86_tune)
4147 {
4148 ix86_tune_mask = 1u << ix86_tune;
4149 for (i = 0; i < X86_TUNE_LAST; ++i)
4150 ix86_tune_features[i]
4151 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4152 }
4153 }
4154
4155 /* Print the current options. */
4156
4157 static void
4158 ix86_function_specific_print (FILE *file, int indent,
4159 struct cl_target_option *ptr)
4160 {
4161 char *target_string
4162 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4163 NULL, NULL, ptr->x_ix86_fpmath, false);
4164
4165 fprintf (file, "%*sarch = %d (%s)\n",
4166 indent, "",
4167 ptr->arch,
4168 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4169 ? cpu_names[ptr->arch]
4170 : "<unknown>"));
4171
4172 fprintf (file, "%*stune = %d (%s)\n",
4173 indent, "",
4174 ptr->tune,
4175 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4176 ? cpu_names[ptr->tune]
4177 : "<unknown>"));
4178
4179 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4180
4181 if (target_string)
4182 {
4183 fprintf (file, "%*s%s\n", indent, "", target_string);
4184 free (target_string);
4185 }
4186 }
4187
4188 \f
4189 /* Inner function to process the attribute((target(...))), take an argument and
4190 set the current options from the argument. If we have a list, recursively go
4191 over the list. */
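/* The strings handled here come from uses such as (illustrative only):
     __attribute__((target("sse4.2,no-avx,fpmath=sse,arch=atom")))
     int foo (void);
   Any entry in the attrs[] table below may appear; ISA and flag names
   may be negated with a "no-" prefix.  */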
4192
4193 static bool
4194 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4195 struct gcc_options *enum_opts_set)
4196 {
4197 char *next_optstr;
4198 bool ret = true;
4199
4200 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4201 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4202 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4203 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4204 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4205
4206 enum ix86_opt_type
4207 {
4208 ix86_opt_unknown,
4209 ix86_opt_yes,
4210 ix86_opt_no,
4211 ix86_opt_str,
4212 ix86_opt_enum,
4213 ix86_opt_isa
4214 };
4215
4216 static const struct
4217 {
4218 const char *string;
4219 size_t len;
4220 enum ix86_opt_type type;
4221 int opt;
4222 int mask;
4223 } attrs[] = {
4224 /* isa options */
4225 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4226 IX86_ATTR_ISA ("abm", OPT_mabm),
4227 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4228 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4229 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4230 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4231 IX86_ATTR_ISA ("aes", OPT_maes),
4232 IX86_ATTR_ISA ("avx", OPT_mavx),
4233 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4234 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4235 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4236 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4237 IX86_ATTR_ISA ("sse", OPT_msse),
4238 IX86_ATTR_ISA ("sse2", OPT_msse2),
4239 IX86_ATTR_ISA ("sse3", OPT_msse3),
4240 IX86_ATTR_ISA ("sse4", OPT_msse4),
4241 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4242 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4243 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4244 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4245 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4246 IX86_ATTR_ISA ("fma", OPT_mfma),
4247 IX86_ATTR_ISA ("xop", OPT_mxop),
4248 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4249 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4250 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4251 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4252 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4253 IX86_ATTR_ISA ("hle", OPT_mhle),
4254
4255 /* enum options */
4256 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4257
4258 /* string options */
4259 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4260 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4261
4262 /* flag options */
4263 IX86_ATTR_YES ("cld",
4264 OPT_mcld,
4265 MASK_CLD),
4266
4267 IX86_ATTR_NO ("fancy-math-387",
4268 OPT_mfancy_math_387,
4269 MASK_NO_FANCY_MATH_387),
4270
4271 IX86_ATTR_YES ("ieee-fp",
4272 OPT_mieee_fp,
4273 MASK_IEEE_FP),
4274
4275 IX86_ATTR_YES ("inline-all-stringops",
4276 OPT_minline_all_stringops,
4277 MASK_INLINE_ALL_STRINGOPS),
4278
4279 IX86_ATTR_YES ("inline-stringops-dynamically",
4280 OPT_minline_stringops_dynamically,
4281 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4282
4283 IX86_ATTR_NO ("align-stringops",
4284 OPT_mno_align_stringops,
4285 MASK_NO_ALIGN_STRINGOPS),
4286
4287 IX86_ATTR_YES ("recip",
4288 OPT_mrecip,
4289 MASK_RECIP),
4290
4291 };
4292
4293 /* If this is a list, recurse to get the options. */
4294 if (TREE_CODE (args) == TREE_LIST)
4295 {
4296 bool ret = true;
4297
4298 for (; args; args = TREE_CHAIN (args))
4299 if (TREE_VALUE (args)
4300 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4301 p_strings, enum_opts_set))
4302 ret = false;
4303
4304 return ret;
4305 }
4306
4307 else if (TREE_CODE (args) != STRING_CST)
4308 gcc_unreachable ();
4309
4310 /* Handle multiple arguments separated by commas. */
4311 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4312
4313 while (next_optstr && *next_optstr != '\0')
4314 {
4315 char *p = next_optstr;
4316 char *orig_p = p;
4317 char *comma = strchr (next_optstr, ',');
4318 const char *opt_string;
4319 size_t len, opt_len;
4320 int opt;
4321 bool opt_set_p;
4322 char ch;
4323 unsigned i;
4324 enum ix86_opt_type type = ix86_opt_unknown;
4325 int mask = 0;
4326
4327 if (comma)
4328 {
4329 *comma = '\0';
4330 len = comma - next_optstr;
4331 next_optstr = comma + 1;
4332 }
4333 else
4334 {
4335 len = strlen (p);
4336 next_optstr = NULL;
4337 }
4338
4339 /* Recognize no-xxx. */
4340 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4341 {
4342 opt_set_p = false;
4343 p += 3;
4344 len -= 3;
4345 }
4346 else
4347 opt_set_p = true;
4348
4349 /* Find the option. */
4350 ch = *p;
4351 opt = N_OPTS;
4352 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4353 {
4354 type = attrs[i].type;
4355 opt_len = attrs[i].len;
4356 if (ch == attrs[i].string[0]
4357 && ((type != ix86_opt_str && type != ix86_opt_enum)
4358 ? len == opt_len
4359 : len > opt_len)
4360 && memcmp (p, attrs[i].string, opt_len) == 0)
4361 {
4362 opt = attrs[i].opt;
4363 mask = attrs[i].mask;
4364 opt_string = attrs[i].string;
4365 break;
4366 }
4367 }
4368
4369 /* Process the option. */
4370 if (opt == N_OPTS)
4371 {
4372 error ("attribute(target(\"%s\")) is unknown", orig_p);
4373 ret = false;
4374 }
4375
4376 else if (type == ix86_opt_isa)
4377 {
4378 struct cl_decoded_option decoded;
4379
4380 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4381 ix86_handle_option (&global_options, &global_options_set,
4382 &decoded, input_location);
4383 }
4384
4385 else if (type == ix86_opt_yes || type == ix86_opt_no)
4386 {
4387 if (type == ix86_opt_no)
4388 opt_set_p = !opt_set_p;
4389
4390 if (opt_set_p)
4391 target_flags |= mask;
4392 else
4393 target_flags &= ~mask;
4394 }
4395
4396 else if (type == ix86_opt_str)
4397 {
4398 if (p_strings[opt])
4399 {
4400 error ("option(\"%s\") was already specified", opt_string);
4401 ret = false;
4402 }
4403 else
4404 p_strings[opt] = xstrdup (p + opt_len);
4405 }
4406
4407 else if (type == ix86_opt_enum)
4408 {
4409 bool arg_ok;
4410 int value;
4411
4412 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4413 if (arg_ok)
4414 set_option (&global_options, enum_opts_set, opt, value,
4415 p + opt_len, DK_UNSPECIFIED, input_location,
4416 global_dc);
4417 else
4418 {
4419 error ("attribute(target(\"%s\")) is unknown", orig_p);
4420 ret = false;
4421 }
4422 }
4423
4424 else
4425 gcc_unreachable ();
4426 }
4427
4428 return ret;
4429 }
4430
4431 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4432
4433 tree
4434 ix86_valid_target_attribute_tree (tree args)
4435 {
4436 const char *orig_arch_string = ix86_arch_string;
4437 const char *orig_tune_string = ix86_tune_string;
4438 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4439 int orig_tune_defaulted = ix86_tune_defaulted;
4440 int orig_arch_specified = ix86_arch_specified;
4441 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4442 tree t = NULL_TREE;
4443 int i;
4444 struct cl_target_option *def
4445 = TREE_TARGET_OPTION (target_option_default_node);
4446 struct gcc_options enum_opts_set;
4447
4448 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4449
4450 /* Process each of the options on the chain. */
4451 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4452 &enum_opts_set))
4453 return NULL_TREE;
4454
4455 /* If the changed options are different from the default, rerun
4456 ix86_option_override_internal, and then save the options away.
4457 The string options are attribute options, and will be undone
4458 when we copy the save structure. */
4459 if (ix86_isa_flags != def->x_ix86_isa_flags
4460 || target_flags != def->x_target_flags
4461 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4462 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4463 || enum_opts_set.x_ix86_fpmath)
4464 {
4465 /* If we are using the default tune= or arch=, undo the string assigned,
4466 and use the default. */
4467 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4468 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4469 else if (!orig_arch_specified)
4470 ix86_arch_string = NULL;
4471
4472 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4473 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4474 else if (orig_tune_defaulted)
4475 ix86_tune_string = NULL;
4476
4477 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4478 if (enum_opts_set.x_ix86_fpmath)
4479 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4480 else if (!TARGET_64BIT && TARGET_SSE)
4481 {
4482 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4483 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4484 }
4485
4486 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4487 ix86_option_override_internal (false);
4488
4489 /* Add any builtin functions with the new isa if any. */
4490 ix86_add_new_builtins (ix86_isa_flags);
4491
4492 /* Save the current options unless we are validating options for
4493 #pragma. */
4494 t = build_target_option_node ();
4495
4496 ix86_arch_string = orig_arch_string;
4497 ix86_tune_string = orig_tune_string;
4498 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4499
4500 /* Free up memory allocated to hold the strings */
4501 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4502 free (option_strings[i]);
4503 }
4504
4505 return t;
4506 }
4507
4508 /* Hook to validate attribute((target("string"))). */
4509
4510 static bool
4511 ix86_valid_target_attribute_p (tree fndecl,
4512 tree ARG_UNUSED (name),
4513 tree args,
4514 int ARG_UNUSED (flags))
4515 {
4516 struct cl_target_option cur_target;
4517 bool ret = true;
4518 tree old_optimize = build_optimization_node ();
4519 tree new_target, new_optimize;
4520 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4521
4522 /* If the function changed the optimization levels as well as setting target
4523 options, start with the optimizations specified. */
4524 if (func_optimize && func_optimize != old_optimize)
4525 cl_optimization_restore (&global_options,
4526 TREE_OPTIMIZATION (func_optimize));
4527
4528 /* The target attributes may also change some optimization flags, so update
4529 the optimization options if necessary. */
4530 cl_target_option_save (&cur_target, &global_options);
4531 new_target = ix86_valid_target_attribute_tree (args);
4532 new_optimize = build_optimization_node ();
4533
4534 if (!new_target)
4535 ret = false;
4536
4537 else if (fndecl)
4538 {
4539 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4540
4541 if (old_optimize != new_optimize)
4542 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4543 }
4544
4545 cl_target_option_restore (&global_options, &cur_target);
4546
4547 if (old_optimize != new_optimize)
4548 cl_optimization_restore (&global_options,
4549 TREE_OPTIMIZATION (old_optimize));
4550
4551 return ret;
4552 }
4553
4554 \f
4555 /* Hook to determine if one function can safely inline another. */
4556
4557 static bool
4558 ix86_can_inline_p (tree caller, tree callee)
4559 {
4560 bool ret = false;
4561 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4562 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4563
4564 /* If callee has no option attributes, then it is ok to inline. */
4565 if (!callee_tree)
4566 ret = true;
4567
4568 /* If caller has no option attributes, but callee does then it is not ok to
4569 inline. */
4570 else if (!caller_tree)
4571 ret = false;
4572
4573 else
4574 {
4575 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4576 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4577
4578 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
4579 function can inline an SSE2 function but an SSE2 function can't inline
4580 an SSE4 function. */
4581 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4582 != callee_opts->x_ix86_isa_flags)
4583 ret = false;
4584
4585 /* See if we have the same non-isa options. */
4586 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4587 ret = false;
4588
4589 /* See if arch, tune, etc. are the same. */
4590 else if (caller_opts->arch != callee_opts->arch)
4591 ret = false;
4592
4593 else if (caller_opts->tune != callee_opts->tune)
4594 ret = false;
4595
4596 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4597 ret = false;
4598
4599 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4600 ret = false;
4601
4602 else
4603 ret = true;
4604 }
4605
4606 return ret;
4607 }
4608
4609 \f
4610 /* Remember the last target of ix86_set_current_function. */
4611 static GTY(()) tree ix86_previous_fndecl;
4612
4613 /* Establish appropriate back-end context for processing the function
4614 FNDECL. The argument might be NULL to indicate processing at top
4615 level, outside of any function scope. */
4616 static void
4617 ix86_set_current_function (tree fndecl)
4618 {
4619 /* Only change the context if the function changes. This hook is called
4620 several times in the course of compiling a function, and we don't want to
4621 slow things down too much or call target_reinit when it isn't safe. */
4622 if (fndecl && fndecl != ix86_previous_fndecl)
4623 {
4624 tree old_tree = (ix86_previous_fndecl
4625 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4626 : NULL_TREE);
4627
4628 tree new_tree = (fndecl
4629 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4630 : NULL_TREE);
4631
4632 ix86_previous_fndecl = fndecl;
4633 if (old_tree == new_tree)
4634 ;
4635
4636 else if (new_tree)
4637 {
4638 cl_target_option_restore (&global_options,
4639 TREE_TARGET_OPTION (new_tree));
4640 target_reinit ();
4641 }
4642
4643 else if (old_tree)
4644 {
4645 struct cl_target_option *def
4646 = TREE_TARGET_OPTION (target_option_current_node);
4647
4648 cl_target_option_restore (&global_options, def);
4649 target_reinit ();
4650 }
4651 }
4652 }
4653
4654 \f
4655 /* Return true if this goes in large data/bss. */
4656
4657 static bool
4658 ix86_in_large_data_p (tree exp)
4659 {
4660 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4661 return false;
4662
4663 /* Functions are never large data. */
4664 if (TREE_CODE (exp) == FUNCTION_DECL)
4665 return false;
4666
4667 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4668 {
4669 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4670 if (strcmp (section, ".ldata") == 0
4671 || strcmp (section, ".lbss") == 0)
4672 return true;
4673 return false;
4674 }
4675 else
4676 {
4677 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4678
4679 /* If this is an incomplete type with size 0, then we can't put it
4680 in data because it might be too big when completed. */
4681 if (!size || size > ix86_section_threshold)
4682 return true;
4683 }
4684
4685 return false;
4686 }
4687
4688 /* Switch to the appropriate section for output of DECL.
4689 DECL is either a `VAR_DECL' node or a constant of some sort.
4690 RELOC indicates whether forming the initial value of DECL requires
4691 link-time relocations. */
4692
4693 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4694 ATTRIBUTE_UNUSED;
4695
4696 static section *
4697 x86_64_elf_select_section (tree decl, int reloc,
4698 unsigned HOST_WIDE_INT align)
4699 {
4700 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4701 && ix86_in_large_data_p (decl))
4702 {
4703 const char *sname = NULL;
4704 unsigned int flags = SECTION_WRITE;
4705 switch (categorize_decl_for_section (decl, reloc))
4706 {
4707 case SECCAT_DATA:
4708 sname = ".ldata";
4709 break;
4710 case SECCAT_DATA_REL:
4711 sname = ".ldata.rel";
4712 break;
4713 case SECCAT_DATA_REL_LOCAL:
4714 sname = ".ldata.rel.local";
4715 break;
4716 case SECCAT_DATA_REL_RO:
4717 sname = ".ldata.rel.ro";
4718 break;
4719 case SECCAT_DATA_REL_RO_LOCAL:
4720 sname = ".ldata.rel.ro.local";
4721 break;
4722 case SECCAT_BSS:
4723 sname = ".lbss";
4724 flags |= SECTION_BSS;
4725 break;
4726 case SECCAT_RODATA:
4727 case SECCAT_RODATA_MERGE_STR:
4728 case SECCAT_RODATA_MERGE_STR_INIT:
4729 case SECCAT_RODATA_MERGE_CONST:
4730 sname = ".lrodata";
4731 flags = 0;
4732 break;
4733 case SECCAT_SRODATA:
4734 case SECCAT_SDATA:
4735 case SECCAT_SBSS:
4736 gcc_unreachable ();
4737 case SECCAT_TEXT:
4738 case SECCAT_TDATA:
4739 case SECCAT_TBSS:
4740 /* We don't split these for the medium model. Place them into
4741 default sections and hope for the best. */
4742 break;
4743 }
4744 if (sname)
4745 {
4746 /* We might get called with string constants, but get_named_section
4747 doesn't like them as they are not DECLs. Also, we need to set
4748 flags in that case. */
4749 if (!DECL_P (decl))
4750 return get_section (sname, flags, NULL);
4751 return get_named_section (decl, sname, reloc);
4752 }
4753 }
4754 return default_elf_select_section (decl, reloc, align);
4755 }
4756
4757 /* Build up a unique section name, expressed as a
4758 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4759 RELOC indicates whether the initial value of EXP requires
4760 link-time relocations. */
4761
4762 static void ATTRIBUTE_UNUSED
4763 x86_64_elf_unique_section (tree decl, int reloc)
4764 {
4765 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4766 && ix86_in_large_data_p (decl))
4767 {
4768 const char *prefix = NULL;
4769 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4770 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4771
4772 switch (categorize_decl_for_section (decl, reloc))
4773 {
4774 case SECCAT_DATA:
4775 case SECCAT_DATA_REL:
4776 case SECCAT_DATA_REL_LOCAL:
4777 case SECCAT_DATA_REL_RO:
4778 case SECCAT_DATA_REL_RO_LOCAL:
4779 prefix = one_only ? ".ld" : ".ldata";
4780 break;
4781 case SECCAT_BSS:
4782 prefix = one_only ? ".lb" : ".lbss";
4783 break;
4784 case SECCAT_RODATA:
4785 case SECCAT_RODATA_MERGE_STR:
4786 case SECCAT_RODATA_MERGE_STR_INIT:
4787 case SECCAT_RODATA_MERGE_CONST:
4788 prefix = one_only ? ".lr" : ".lrodata";
4789 break;
4790 case SECCAT_SRODATA:
4791 case SECCAT_SDATA:
4792 case SECCAT_SBSS:
4793 gcc_unreachable ();
4794 case SECCAT_TEXT:
4795 case SECCAT_TDATA:
4796 case SECCAT_TBSS:
4797 /* We don't split these for the medium model. Place them into
4798 default sections and hope for the best. */
4799 break;
4800 }
4801 if (prefix)
4802 {
4803 const char *name, *linkonce;
4804 char *string;
4805
4806 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4807 name = targetm.strip_name_encoding (name);
4808
4809 /* If we're using one_only, then there needs to be a .gnu.linkonce
4810 prefix to the section name. */
4811 linkonce = one_only ? ".gnu.linkonce" : "";
4812
4813 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4814
4815 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4816 return;
4817 }
4818 }
4819 default_unique_section (decl, reloc);
4820 }
4821
4822 #ifdef COMMON_ASM_OP
4823 /* This says how to output assembler code to declare an
4824 uninitialized external linkage data object.
4825
4826 For medium-model x86-64 we need to use the .largecomm directive for
4827 large objects. */
4828 void
4829 x86_elf_aligned_common (FILE *file,
4830 const char *name, unsigned HOST_WIDE_INT size,
4831 int align)
4832 {
4833 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4834 && size > (unsigned int)ix86_section_threshold)
4835 fputs (".largecomm\t", file);
4836 else
4837 fputs (COMMON_ASM_OP, file);
4838 assemble_name (file, name);
4839 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4840 size, align / BITS_PER_UNIT);
4841 }
4842 #endif
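
/* As an illustration (hypothetical declaration, not part of the compiler):
   with -mcmodel=medium and the default -mlarge-data-threshold, a tentative
   definition such as

     int big[100000];

   is 400000 bytes and is therefore announced with something like
   ".largecomm big,400000,32" (the alignment operand depends on
   DATA_ALIGNMENT) instead of the usual ".comm" form, so the linker can
   place it in the large data area.  */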
4843
4844 /* Utility function for targets to use in implementing
4845 ASM_OUTPUT_ALIGNED_BSS. */
4846
4847 void
4848 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4849 const char *name, unsigned HOST_WIDE_INT size,
4850 int align)
4851 {
4852 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4853 && size > (unsigned int)ix86_section_threshold)
4854 switch_to_section (get_named_section (decl, ".lbss", 0));
4855 else
4856 switch_to_section (bss_section);
4857 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4858 #ifdef ASM_DECLARE_OBJECT_NAME
4859 last_assemble_variable_decl = decl;
4860 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4861 #else
4862 /* The standard thing is to just output a label for the object. */
4863 ASM_OUTPUT_LABEL (file, name);
4864 #endif /* ASM_DECLARE_OBJECT_NAME */
4865 ASM_OUTPUT_SKIP (file, size ? size : 1);
4866 }
4867 \f
4868 /* Decide whether we must probe the stack before any space allocation
4869 on this target. It's essentially TARGET_STACK_PROBE except when
4870 -fstack-check causes the stack to be already probed differently. */
4871
4872 bool
4873 ix86_target_stack_probe (void)
4874 {
4875 /* Do not probe the stack twice if static stack checking is enabled. */
4876 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4877 return false;
4878
4879 return TARGET_STACK_PROBE;
4880 }
4881 \f
4882 /* Decide whether we can make a sibling call to a function. DECL is the
4883 declaration of the function being targeted by the call and EXP is the
4884 CALL_EXPR representing the call. */
4885
4886 static bool
4887 ix86_function_ok_for_sibcall (tree decl, tree exp)
4888 {
4889 tree type, decl_or_type;
4890 rtx a, b;
4891
4892 /* If we are generating position-independent code, we cannot sibcall
4893 optimize any indirect call, or a direct call to a global function,
4894 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4895 if (!TARGET_MACHO
4896 && !TARGET_64BIT
4897 && flag_pic
4898 && (!decl || !targetm.binds_local_p (decl)))
4899 return false;
4900
4901 /* If we need to align the outgoing stack, then sibcalling would
4902 unalign the stack, which may break the called function. */
4903 if (ix86_minimum_incoming_stack_boundary (true)
4904 < PREFERRED_STACK_BOUNDARY)
4905 return false;
4906
4907 if (decl)
4908 {
4909 decl_or_type = decl;
4910 type = TREE_TYPE (decl);
4911 }
4912 else
4913 {
4914 /* We're looking at the CALL_EXPR, we need the type of the function. */
4915 type = CALL_EXPR_FN (exp); /* pointer expression */
4916 type = TREE_TYPE (type); /* pointer type */
4917 type = TREE_TYPE (type); /* function type */
4918 decl_or_type = type;
4919 }
4920
4921 /* Check that the return value locations are the same. For example,
4922 if we are returning floats on the 80387 register stack, we cannot
4923 make a sibcall from a function that doesn't return a float to a
4924 function that does or, conversely, from a function that does return
4925 a float to a function that doesn't; the necessary stack adjustment
4926 would not be executed. This is also the place we notice
4927 differences in the return value ABI. Note that it is ok for one
4928 of the functions to have void return type as long as the return
4929 value of the other is passed in a register. */
4930 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4931 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4932 cfun->decl, false);
4933 if (STACK_REG_P (a) || STACK_REG_P (b))
4934 {
4935 if (!rtx_equal_p (a, b))
4936 return false;
4937 }
4938 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4939 {
4940 /* Disable sibcall if we need to generate vzeroupper after
4941 callee returns. */
4942 if (TARGET_VZEROUPPER
4943 && cfun->machine->callee_return_avx256_p
4944 && !cfun->machine->caller_return_avx256_p)
4945 return false;
4946 }
4947 else if (!rtx_equal_p (a, b))
4948 return false;
4949
4950 if (TARGET_64BIT)
4951 {
4952 /* The SYSV ABI has more call-clobbered registers;
4953 disallow sibcalls from MS to SYSV. */
4954 if (cfun->machine->call_abi == MS_ABI
4955 && ix86_function_type_abi (type) == SYSV_ABI)
4956 return false;
4957 }
4958 else
4959 {
4960 /* If this call is indirect, we'll need to be able to use a
4961 call-clobbered register for the address of the target function.
4962 Make sure that all such registers are not used for passing
4963 parameters. Note that DLLIMPORT functions are indirect. */
4964 if (!decl
4965 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4966 {
4967 if (ix86_function_regparm (type, NULL) >= 3)
4968 {
4969 /* ??? Need to count the actual number of registers to be used,
4970 not the possible number of registers. Fix later. */
4971 return false;
4972 }
4973 }
4974 }
4975
4976 /* Otherwise okay. That also includes certain types of indirect calls. */
4977 return true;
4978 }
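
/* For instance, on ia32 a caller that returns "double" (in %st(0)) cannot
   tail-call a function returning "int" (in %eax): the check above sees a
   stack register on one side only, so the sibcall is refused and the
   necessary FP-stack adjustment keeps being emitted by the normal call
   sequence.  */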
4979
4980 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4981 and "sseregparm" calling convention attributes;
4982 arguments as in struct attribute_spec.handler. */
4983
4984 static tree
4985 ix86_handle_cconv_attribute (tree *node, tree name,
4986 tree args,
4987 int flags ATTRIBUTE_UNUSED,
4988 bool *no_add_attrs)
4989 {
4990 if (TREE_CODE (*node) != FUNCTION_TYPE
4991 && TREE_CODE (*node) != METHOD_TYPE
4992 && TREE_CODE (*node) != FIELD_DECL
4993 && TREE_CODE (*node) != TYPE_DECL)
4994 {
4995 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4996 name);
4997 *no_add_attrs = true;
4998 return NULL_TREE;
4999 }
5000
5001 /* Can combine regparm with all attributes but fastcall and thiscall. */
5002 if (is_attribute_p ("regparm", name))
5003 {
5004 tree cst;
5005
5006 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5007 {
5008 error ("fastcall and regparm attributes are not compatible");
5009 }
5010
5011 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5012 {
5013 error ("regparam and thiscall attributes are not compatible");
5014 }
5015
5016 cst = TREE_VALUE (args);
5017 if (TREE_CODE (cst) != INTEGER_CST)
5018 {
5019 warning (OPT_Wattributes,
5020 "%qE attribute requires an integer constant argument",
5021 name);
5022 *no_add_attrs = true;
5023 }
5024 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
5025 {
5026 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
5027 name, REGPARM_MAX);
5028 *no_add_attrs = true;
5029 }
5030
5031 return NULL_TREE;
5032 }
5033
5034 if (TARGET_64BIT)
5035 {
5036 /* Do not warn when emulating the MS ABI. */
5037 if ((TREE_CODE (*node) != FUNCTION_TYPE
5038 && TREE_CODE (*node) != METHOD_TYPE)
5039 || ix86_function_type_abi (*node) != MS_ABI)
5040 warning (OPT_Wattributes, "%qE attribute ignored",
5041 name);
5042 *no_add_attrs = true;
5043 return NULL_TREE;
5044 }
5045
5046 /* Can combine fastcall with sseregparm. */
5047 if (is_attribute_p ("fastcall", name))
5048 {
5049 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5050 {
5051 error ("fastcall and cdecl attributes are not compatible");
5052 }
5053 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5054 {
5055 error ("fastcall and stdcall attributes are not compatible");
5056 }
5057 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5058 {
5059 error ("fastcall and regparm attributes are not compatible");
5060 }
5061 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5062 {
5063 error ("fastcall and thiscall attributes are not compatible");
5064 }
5065 }
5066
5067 /* Can combine stdcall with regparm and
5068 sseregparm. */
5069 else if (is_attribute_p ("stdcall", name))
5070 {
5071 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5072 {
5073 error ("stdcall and cdecl attributes are not compatible");
5074 }
5075 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5076 {
5077 error ("stdcall and fastcall attributes are not compatible");
5078 }
5079 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5080 {
5081 error ("stdcall and thiscall attributes are not compatible");
5082 }
5083 }
5084
5085 /* Can combine cdecl with regparm and sseregparm. */
5086 else if (is_attribute_p ("cdecl", name))
5087 {
5088 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5089 {
5090 error ("stdcall and cdecl attributes are not compatible");
5091 }
5092 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5093 {
5094 error ("fastcall and cdecl attributes are not compatible");
5095 }
5096 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5097 {
5098 error ("cdecl and thiscall attributes are not compatible");
5099 }
5100 }
5101 else if (is_attribute_p ("thiscall", name))
5102 {
5103 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5104 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5105 name);
5106 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5107 {
5108 error ("stdcall and thiscall attributes are not compatible");
5109 }
5110 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5111 {
5112 error ("fastcall and thiscall attributes are not compatible");
5113 }
5114 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5115 {
5116 error ("cdecl and thiscall attributes are not compatible");
5117 }
5118 }
5119
5120 /* Can combine sseregparm with all attributes. */
5121
5122 return NULL_TREE;
5123 }
5124
5125 /* The transactional memory builtins are implicitly regparm or fastcall
5126 depending on the ABI. Override the generic do-nothing attribute that
5127 these builtins were declared with, and replace it with one of the two
5128 attributes that we expect elsewhere. */
5129
5130 static tree
5131 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5132 tree args ATTRIBUTE_UNUSED,
5133 int flags ATTRIBUTE_UNUSED,
5134 bool *no_add_attrs)
5135 {
5136 tree alt;
5137
5138 /* In no case do we want to add the placeholder attribute. */
5139 *no_add_attrs = true;
5140
5141 /* The 64-bit ABI is unchanged for transactional memory. */
5142 if (TARGET_64BIT)
5143 return NULL_TREE;
5144
5145 /* ??? Is there a better way to validate 32-bit windows? We have
5146 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5147 if (CHECK_STACK_LIMIT > 0)
5148 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5149 else
5150 {
5151 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5152 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5153 }
5154 decl_attributes (node, alt, flags);
5155
5156 return NULL_TREE;
5157 }
5158
5159 /* This function determines the calling convention from TYPE. */
5160
5161 unsigned int
5162 ix86_get_callcvt (const_tree type)
5163 {
5164 unsigned int ret = 0;
5165 bool is_stdarg;
5166 tree attrs;
5167
5168 if (TARGET_64BIT)
5169 return IX86_CALLCVT_CDECL;
5170
5171 attrs = TYPE_ATTRIBUTES (type);
5172 if (attrs != NULL_TREE)
5173 {
5174 if (lookup_attribute ("cdecl", attrs))
5175 ret |= IX86_CALLCVT_CDECL;
5176 else if (lookup_attribute ("stdcall", attrs))
5177 ret |= IX86_CALLCVT_STDCALL;
5178 else if (lookup_attribute ("fastcall", attrs))
5179 ret |= IX86_CALLCVT_FASTCALL;
5180 else if (lookup_attribute ("thiscall", attrs))
5181 ret |= IX86_CALLCVT_THISCALL;
5182
5183 /* Regparm isn't allowed for thiscall and fastcall. */
5184 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5185 {
5186 if (lookup_attribute ("regparm", attrs))
5187 ret |= IX86_CALLCVT_REGPARM;
5188 if (lookup_attribute ("sseregparm", attrs))
5189 ret |= IX86_CALLCVT_SSEREGPARM;
5190 }
5191
5192 if (IX86_BASE_CALLCVT(ret) != 0)
5193 return ret;
5194 }
5195
5196 is_stdarg = stdarg_p (type);
5197 if (TARGET_RTD && !is_stdarg)
5198 return IX86_CALLCVT_STDCALL | ret;
5199
5200 if (ret != 0
5201 || is_stdarg
5202 || TREE_CODE (type) != METHOD_TYPE
5203 || ix86_function_type_abi (type) != MS_ABI)
5204 return IX86_CALLCVT_CDECL | ret;
5205
5206 return IX86_CALLCVT_THISCALL;
5207 }
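
/* Roughly: a 32-bit function type carrying __attribute__((fastcall)) yields
   IX86_CALLCVT_FASTCALL, one carrying __attribute__((stdcall)) yields
   IX86_CALLCVT_STDCALL, and a type with no convention attribute defaults to
   IX86_CALLCVT_CDECL, unless -mrtd is in effect and the function is not
   variadic, in which case IX86_CALLCVT_STDCALL is used.  */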
5208
5209 /* Return 0 if the attributes for two types are incompatible, 1 if they
5210 are compatible, and 2 if they are nearly compatible (which causes a
5211 warning to be generated). */
5212
5213 static int
5214 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5215 {
5216 unsigned int ccvt1, ccvt2;
5217
5218 if (TREE_CODE (type1) != FUNCTION_TYPE
5219 && TREE_CODE (type1) != METHOD_TYPE)
5220 return 1;
5221
5222 ccvt1 = ix86_get_callcvt (type1);
5223 ccvt2 = ix86_get_callcvt (type2);
5224 if (ccvt1 != ccvt2)
5225 return 0;
5226 if (ix86_function_regparm (type1, NULL)
5227 != ix86_function_regparm (type2, NULL))
5228 return 0;
5229
5230 return 1;
5231 }
5232 \f
5233 /* Return the regparm value for a function with the indicated TYPE and DECL.
5234 DECL may be NULL when calling function indirectly
5235 or considering a libcall. */
5236
5237 static int
5238 ix86_function_regparm (const_tree type, const_tree decl)
5239 {
5240 tree attr;
5241 int regparm;
5242 unsigned int ccvt;
5243
5244 if (TARGET_64BIT)
5245 return (ix86_function_type_abi (type) == SYSV_ABI
5246 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5247 ccvt = ix86_get_callcvt (type);
5248 regparm = ix86_regparm;
5249
5250 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5251 {
5252 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5253 if (attr)
5254 {
5255 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5256 return regparm;
5257 }
5258 }
5259 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5260 return 2;
5261 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5262 return 1;
5263
5264 /* Use register calling convention for local functions when possible. */
5265 if (decl
5266 && TREE_CODE (decl) == FUNCTION_DECL
5267 && optimize
5268 && !(profile_flag && !flag_fentry))
5269 {
5270 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5271 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5272 if (i && i->local && i->can_change_signature)
5273 {
5274 int local_regparm, globals = 0, regno;
5275
5276 /* Make sure no regparm register is taken by a
5277 fixed register variable. */
5278 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5279 if (fixed_regs[local_regparm])
5280 break;
5281
5282 /* We don't want to use regparm(3) for nested functions as
5283 these use a static chain pointer in the third argument. */
5284 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5285 local_regparm = 2;
5286
5287 /* In 32-bit mode save a register for the split stack. */
5288 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5289 local_regparm = 2;
5290
5291 /* Each fixed register usage increases register pressure,
5292 so fewer registers should be used for argument passing.
5293 This functionality can be overridden by an explicit
5294 regparm value. */
5295 for (regno = 0; regno <= DI_REG; regno++)
5296 if (fixed_regs[regno])
5297 globals++;
5298
5299 local_regparm
5300 = globals < local_regparm ? local_regparm - globals : 0;
5301
5302 if (local_regparm > regparm)
5303 regparm = local_regparm;
5304 }
5305 }
5306
5307 return regparm;
5308 }
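
/* Example (hypothetical declaration):

     __attribute__((regparm(3))) int f (int a, int b, int c);

   makes this function return 3, so A, B and C are passed in %eax, %edx and
   %ecx instead of on the stack.  */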
5309
5310 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5311 DFmode (2) arguments in SSE registers for a function with the
5312 indicated TYPE and DECL. DECL may be NULL when calling function
5313 indirectly or considering a libcall. Otherwise return 0. */
5314
5315 static int
5316 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5317 {
5318 gcc_assert (!TARGET_64BIT);
5319
5320 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5321 by the sseregparm attribute. */
5322 if (TARGET_SSEREGPARM
5323 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5324 {
5325 if (!TARGET_SSE)
5326 {
5327 if (warn)
5328 {
5329 if (decl)
5330 error ("calling %qD with attribute sseregparm without "
5331 "SSE/SSE2 enabled", decl);
5332 else
5333 error ("calling %qT with attribute sseregparm without "
5334 "SSE/SSE2 enabled", type);
5335 }
5336 return 0;
5337 }
5338
5339 return 2;
5340 }
5341
5342 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5343 (and DFmode for SSE2) arguments in SSE registers. */
5344 if (decl && TARGET_SSE_MATH && optimize
5345 && !(profile_flag && !flag_fentry))
5346 {
5347 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5348 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5349 if (i && i->local && i->can_change_signature)
5350 return TARGET_SSE2 ? 2 : 1;
5351 }
5352
5353 return 0;
5354 }
5355
5356 /* Return true if EAX is live at the start of the function. Used by
5357 ix86_expand_prologue to determine if we need special help before
5358 calling allocate_stack_worker. */
5359
5360 static bool
5361 ix86_eax_live_at_start_p (void)
5362 {
5363 /* Cheat. Don't bother working forward from ix86_function_regparm
5364 to the function type to whether an actual argument is located in
5365 eax. Instead just look at cfg info, which is still close enough
5366 to correct at this point. This gives false positives for broken
5367 functions that might use uninitialized data that happens to be
5368 allocated in eax, but who cares? */
5369 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5370 }
5371
5372 static bool
5373 ix86_keep_aggregate_return_pointer (tree fntype)
5374 {
5375 tree attr;
5376
5377 if (!TARGET_64BIT)
5378 {
5379 attr = lookup_attribute ("callee_pop_aggregate_return",
5380 TYPE_ATTRIBUTES (fntype));
5381 if (attr)
5382 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5383
5384 /* For 32-bit MS-ABI the default is to keep aggregate
5385 return pointer. */
5386 if (ix86_function_type_abi (fntype) == MS_ABI)
5387 return true;
5388 }
5389 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5390 }
5391
5392 /* Value is the number of bytes of arguments automatically
5393 popped when returning from a subroutine call.
5394 FUNDECL is the declaration node of the function (as a tree),
5395 FUNTYPE is the data type of the function (as a tree),
5396 or for a library call it is an identifier node for the subroutine name.
5397 SIZE is the number of bytes of arguments passed on the stack.
5398
5399 On the 80386, the RTD insn may be used to pop them if the number
5400 of args is fixed, but if the number is variable then the caller
5401 must pop them all. RTD can't be used for library calls now
5402 because the library is compiled with the Unix compiler.
5403 Use of RTD is a selectable option, since it is incompatible with
5404 standard Unix calling sequences. If the option is not selected,
5405 the caller must always pop the args.
5406
5407 The attribute stdcall is equivalent to RTD on a per module basis. */
5408
5409 static int
5410 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5411 {
5412 unsigned int ccvt;
5413
5414 /* None of the 64-bit ABIs pop arguments. */
5415 if (TARGET_64BIT)
5416 return 0;
5417
5418 ccvt = ix86_get_callcvt (funtype);
5419
5420 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5421 | IX86_CALLCVT_THISCALL)) != 0
5422 && ! stdarg_p (funtype))
5423 return size;
5424
5425 /* Lose any fake structure return argument if it is passed on the stack. */
5426 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5427 && !ix86_keep_aggregate_return_pointer (funtype))
5428 {
5429 int nregs = ix86_function_regparm (funtype, fundecl);
5430 if (nregs == 0)
5431 return GET_MODE_SIZE (Pmode);
5432 }
5433
5434 return 0;
5435 }
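
/* Example (ia32, non-variadic, hypothetical declaration):

     __attribute__((stdcall)) int f (int a, int b);

   Here SIZE is 8 and the callee pops its own arguments with "ret $8"; the
   caller performs no stack adjustment after the call.  */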
5436 \f
5437 /* Argument support functions. */
5438
5439 /* Return true when register may be used to pass function parameters. */
5440 bool
5441 ix86_function_arg_regno_p (int regno)
5442 {
5443 int i;
5444 const int *parm_regs;
5445
5446 if (!TARGET_64BIT)
5447 {
5448 if (TARGET_MACHO)
5449 return (regno < REGPARM_MAX
5450 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5451 else
5452 return (regno < REGPARM_MAX
5453 || (TARGET_MMX && MMX_REGNO_P (regno)
5454 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5455 || (TARGET_SSE && SSE_REGNO_P (regno)
5456 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5457 }
5458
5459 if (TARGET_MACHO)
5460 {
5461 if (SSE_REGNO_P (regno) && TARGET_SSE)
5462 return true;
5463 }
5464 else
5465 {
5466 if (TARGET_SSE && SSE_REGNO_P (regno)
5467 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5468 return true;
5469 }
5470
5471 /* TODO: The function should depend on current function ABI but
5472 builtins.c would need updating then. Therefore we use the
5473 default ABI. */
5474
5475 /* RAX is used as hidden argument to va_arg functions. */
5476 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5477 return true;
5478
5479 if (ix86_abi == MS_ABI)
5480 parm_regs = x86_64_ms_abi_int_parameter_registers;
5481 else
5482 parm_regs = x86_64_int_parameter_registers;
5483 for (i = 0; i < (ix86_abi == MS_ABI
5484 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5485 if (regno == parm_regs[i])
5486 return true;
5487 return false;
5488 }
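
/* For example, under the default 64-bit SysV ABI this returns true for
   %rdi, %rsi, %rdx, %rcx, %r8 and %r9, for the SSE argument registers and
   for %rax (the hidden register holding the SSE-register count for varargs
   calls); under the 64-bit MS ABI the integer set is only %rcx, %rdx, %r8
   and %r9.  */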
5489
5490 /* Return true if we do not know how to pass TYPE solely in registers. */
5491
5492 static bool
5493 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5494 {
5495 if (must_pass_in_stack_var_size_or_pad (mode, type))
5496 return true;
5497
5498 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5499 The layout_type routine is crafty and tries to trick us into passing
5500 currently unsupported vector types on the stack by using TImode. */
5501 return (!TARGET_64BIT && mode == TImode
5502 && type && TREE_CODE (type) != VECTOR_TYPE);
5503 }
5504
5505 /* Return the size, in bytes, of the area reserved for arguments passed
5506 in registers for the function represented by FNDECL, depending on the
5507 ABI format used. */
5508 int
5509 ix86_reg_parm_stack_space (const_tree fndecl)
5510 {
5511 enum calling_abi call_abi = SYSV_ABI;
5512 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5513 call_abi = ix86_function_abi (fndecl);
5514 else
5515 call_abi = ix86_function_type_abi (fndecl);
5516 if (TARGET_64BIT && call_abi == MS_ABI)
5517 return 32;
5518 return 0;
5519 }
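
/* E.g. for a 64-bit MS-ABI function this returns 32: the four register
   parameter "home" slots that the caller must reserve on the stack just
   above the return address, even when fewer than four arguments are
   passed.  */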
5520
5521 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5522 call ABI used. */
5523 enum calling_abi
5524 ix86_function_type_abi (const_tree fntype)
5525 {
5526 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5527 {
5528 enum calling_abi abi = ix86_abi;
5529 if (abi == SYSV_ABI)
5530 {
5531 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5532 abi = MS_ABI;
5533 }
5534 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5535 abi = SYSV_ABI;
5536 return abi;
5537 }
5538 return ix86_abi;
5539 }
5540
5541 static bool
5542 ix86_function_ms_hook_prologue (const_tree fn)
5543 {
5544 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5545 {
5546 if (decl_function_context (fn) != NULL_TREE)
5547 error_at (DECL_SOURCE_LOCATION (fn),
5548 "ms_hook_prologue is not compatible with nested function");
5549 else
5550 return true;
5551 }
5552 return false;
5553 }
5554
5555 static enum calling_abi
5556 ix86_function_abi (const_tree fndecl)
5557 {
5558 if (! fndecl)
5559 return ix86_abi;
5560 return ix86_function_type_abi (TREE_TYPE (fndecl));
5561 }
5562
5563 /* Return SYSV_ABI or MS_ABI, depending on cfun, specifying the
5564 call ABI used. */
5565 enum calling_abi
5566 ix86_cfun_abi (void)
5567 {
5568 if (! cfun)
5569 return ix86_abi;
5570 return cfun->machine->call_abi;
5571 }
5572
5573 /* Write the extra assembler code needed to declare a function properly. */
5574
5575 void
5576 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5577 tree decl)
5578 {
5579 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5580
5581 if (is_ms_hook)
5582 {
5583 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5584 unsigned int filler_cc = 0xcccccccc;
5585
5586 for (i = 0; i < filler_count; i += 4)
5587 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5588 }
5589
5590 #ifdef SUBTARGET_ASM_UNWIND_INIT
5591 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5592 #endif
5593
5594 ASM_OUTPUT_LABEL (asm_out_file, fname);
5595
5596 /* Output magic byte marker, if hot-patch attribute is set. */
5597 if (is_ms_hook)
5598 {
5599 if (TARGET_64BIT)
5600 {
5601 /* leaq [%rsp + 0], %rsp */
5602 asm_fprintf (asm_out_file, ASM_BYTE
5603 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5604 }
5605 else
5606 {
5607 /* movl.s %edi, %edi
5608 push %ebp
5609 movl.s %esp, %ebp */
5610 asm_fprintf (asm_out_file, ASM_BYTE
5611 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5612 }
5613 }
5614 }
5615
5616 /* regclass.c */
5617 extern void init_regs (void);
5618
5619 /* Implementation of the call ABI switching target hook. The call
5620 register sets specific to FNDECL are set up. See also
5621 ix86_conditional_register_usage for more details. */
5622 void
5623 ix86_call_abi_override (const_tree fndecl)
5624 {
5625 if (fndecl == NULL_TREE)
5626 cfun->machine->call_abi = ix86_abi;
5627 else
5628 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5629 }
5630
5631 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5632 Avoid the expensive re-initialization of init_regs each time we switch
5633 function context, since it is needed only during RTL expansion. */
5634 static void
5635 ix86_maybe_switch_abi (void)
5636 {
5637 if (TARGET_64BIT &&
5638 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5639 reinit_regs ();
5640 }
5641
5642 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5643 for a call to a function whose data type is FNTYPE.
5644 For a library call, FNTYPE is 0. */
5645
5646 void
5647 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5648 tree fntype, /* tree ptr for function decl */
5649 rtx libname, /* SYMBOL_REF of library name or 0 */
5650 tree fndecl,
5651 int caller)
5652 {
5653 struct cgraph_local_info *i;
5654 tree fnret_type;
5655
5656 memset (cum, 0, sizeof (*cum));
5657
5658 /* Initialize for the current callee. */
5659 if (caller)
5660 {
5661 cfun->machine->callee_pass_avx256_p = false;
5662 cfun->machine->callee_return_avx256_p = false;
5663 }
5664
5665 if (fndecl)
5666 {
5667 i = cgraph_local_info (fndecl);
5668 cum->call_abi = ix86_function_abi (fndecl);
5669 fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5670 }
5671 else
5672 {
5673 i = NULL;
5674 cum->call_abi = ix86_function_type_abi (fntype);
5675 if (fntype)
5676 fnret_type = TREE_TYPE (fntype);
5677 else
5678 fnret_type = NULL;
5679 }
5680
5681 if (TARGET_VZEROUPPER && fnret_type)
5682 {
5683 rtx fnret_value = ix86_function_value (fnret_type, fntype,
5684 false);
5685 if (function_pass_avx256_p (fnret_value))
5686 {
5687 /* The return value of this function uses 256bit AVX modes. */
5688 if (caller)
5689 cfun->machine->callee_return_avx256_p = true;
5690 else
5691 cfun->machine->caller_return_avx256_p = true;
5692 }
5693 }
5694
5695 cum->caller = caller;
5696
5697 /* Set up the number of registers to use for passing arguments. */
5698
5699 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5700 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5701 "or subtarget optimization implying it");
5702 cum->nregs = ix86_regparm;
5703 if (TARGET_64BIT)
5704 {
5705 cum->nregs = (cum->call_abi == SYSV_ABI
5706 ? X86_64_REGPARM_MAX
5707 : X86_64_MS_REGPARM_MAX);
5708 }
5709 if (TARGET_SSE)
5710 {
5711 cum->sse_nregs = SSE_REGPARM_MAX;
5712 if (TARGET_64BIT)
5713 {
5714 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5715 ? X86_64_SSE_REGPARM_MAX
5716 : X86_64_MS_SSE_REGPARM_MAX);
5717 }
5718 }
5719 if (TARGET_MMX)
5720 cum->mmx_nregs = MMX_REGPARM_MAX;
5721 cum->warn_avx = true;
5722 cum->warn_sse = true;
5723 cum->warn_mmx = true;
5724
5725 /* Because the type might mismatch between caller and callee, we need to
5726 use the actual type of the function for local calls.
5727 FIXME: cgraph_analyze can be told to actually record whether the function
5728 uses va_start, so for local functions maybe_vaarg can be made more
5729 aggressive, helping K&R code.
5730 FIXME: once the type system is fixed, we won't need this code anymore. */
5731 if (i && i->local && i->can_change_signature)
5732 fntype = TREE_TYPE (fndecl);
5733 cum->maybe_vaarg = (fntype
5734 ? (!prototype_p (fntype) || stdarg_p (fntype))
5735 : !libname);
5736
5737 if (!TARGET_64BIT)
5738 {
5739 /* If there are variable arguments, then we won't pass anything
5740 in registers in 32-bit mode. */
5741 if (stdarg_p (fntype))
5742 {
5743 cum->nregs = 0;
5744 cum->sse_nregs = 0;
5745 cum->mmx_nregs = 0;
5746 cum->warn_avx = 0;
5747 cum->warn_sse = 0;
5748 cum->warn_mmx = 0;
5749 return;
5750 }
5751
5752 /* Use ecx and edx registers if function has fastcall attribute,
5753 else look for regparm information. */
5754 if (fntype)
5755 {
5756 unsigned int ccvt = ix86_get_callcvt (fntype);
5757 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5758 {
5759 cum->nregs = 1;
5760 cum->fastcall = 1; /* Same first register as in fastcall. */
5761 }
5762 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5763 {
5764 cum->nregs = 2;
5765 cum->fastcall = 1;
5766 }
5767 else
5768 cum->nregs = ix86_function_regparm (fntype, fndecl);
5769 }
5770
5771 /* Set up the number of SSE registers used for passing SFmode
5772 and DFmode arguments. Warn for mismatching ABI. */
5773 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5774 }
5775 }
5776
5777 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5778 But in the case of vector types, it is some vector mode.
5779
5780 When we have only some of our vector isa extensions enabled, then there
5781 are some modes for which vector_mode_supported_p is false. For these
5782 modes, the generic vector support in gcc will choose some non-vector mode
5783 in order to implement the type. By computing the natural mode, we'll
5784 select the proper ABI location for the operand and not depend on whatever
5785 the middle-end decides to do with these vector types.
5786
5787 The middle-end can't deal with vector types larger than 16 bytes. In this
5788 case, we return the original mode and warn about the ABI change if CUM
5789 isn't NULL. */
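
/* For instance, with SSE disabled, an argument of the hypothetical type

     typedef int v4si __attribute__ ((vector_size (16)));

   is still given its natural mode V4SImode here (after the ABI-change
   warning), rather than whatever scalar or BLKmode layout the middle-end
   fell back to for the type.  */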
5790
5791 static enum machine_mode
5792 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5793 {
5794 enum machine_mode mode = TYPE_MODE (type);
5795
5796 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5797 {
5798 HOST_WIDE_INT size = int_size_in_bytes (type);
5799 if ((size == 8 || size == 16 || size == 32)
5800 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5801 && TYPE_VECTOR_SUBPARTS (type) > 1)
5802 {
5803 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5804
5805 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5806 mode = MIN_MODE_VECTOR_FLOAT;
5807 else
5808 mode = MIN_MODE_VECTOR_INT;
5809
5810 /* Get the mode which has this inner mode and number of units. */
5811 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5812 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5813 && GET_MODE_INNER (mode) == innermode)
5814 {
5815 if (size == 32 && !TARGET_AVX)
5816 {
5817 static bool warnedavx;
5818
5819 if (cum
5820 && !warnedavx
5821 && cum->warn_avx)
5822 {
5823 warnedavx = true;
5824 warning (0, "AVX vector argument without AVX "
5825 "enabled changes the ABI");
5826 }
5827 return TYPE_MODE (type);
5828 }
5829 else if ((size == 8 || size == 16) && !TARGET_SSE)
5830 {
5831 static bool warnedsse;
5832
5833 if (cum
5834 && !warnedsse
5835 && cum->warn_sse)
5836 {
5837 warnedsse = true;
5838 warning (0, "SSE vector argument without SSE "
5839 "enabled changes the ABI");
5840 }
5841 return mode;
5842 }
5843 else
5844 return mode;
5845 }
5846
5847 gcc_unreachable ();
5848 }
5849 }
5850
5851 return mode;
5852 }
5853
5854 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5855 this may not agree with the mode that the type system has chosen for the
5856 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5857 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5858
5859 static rtx
5860 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5861 unsigned int regno)
5862 {
5863 rtx tmp;
5864
5865 if (orig_mode != BLKmode)
5866 tmp = gen_rtx_REG (orig_mode, regno);
5867 else
5868 {
5869 tmp = gen_rtx_REG (mode, regno);
5870 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5871 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5872 }
5873
5874 return tmp;
5875 }
5876
5877 /* x86-64 register passing implementation. See the x86-64 ABI for details.
5878 The goal of this code is to classify each 8-byte chunk of the incoming
5879 argument by register class and assign registers accordingly. */
5880
5881 /* Return the union class of CLASS1 and CLASS2.
5882 See the x86-64 PS ABI for details. */
5883
5884 static enum x86_64_reg_class
5885 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5886 {
5887 /* Rule #1: If both classes are equal, this is the resulting class. */
5888 if (class1 == class2)
5889 return class1;
5890
5891 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5892 the other class. */
5893 if (class1 == X86_64_NO_CLASS)
5894 return class2;
5895 if (class2 == X86_64_NO_CLASS)
5896 return class1;
5897
5898 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5899 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5900 return X86_64_MEMORY_CLASS;
5901
5902 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5903 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5904 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5905 return X86_64_INTEGERSI_CLASS;
5906 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5907 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5908 return X86_64_INTEGER_CLASS;
5909
5910 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5911 MEMORY is used. */
5912 if (class1 == X86_64_X87_CLASS
5913 || class1 == X86_64_X87UP_CLASS
5914 || class1 == X86_64_COMPLEX_X87_CLASS
5915 || class2 == X86_64_X87_CLASS
5916 || class2 == X86_64_X87UP_CLASS
5917 || class2 == X86_64_COMPLEX_X87_CLASS)
5918 return X86_64_MEMORY_CLASS;
5919
5920 /* Rule #6: Otherwise class SSE is used. */
5921 return X86_64_SSE_CLASS;
5922 }
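
/* For example, merging X86_64_INTEGERSI_CLASS with X86_64_SSESF_CLASS
   yields X86_64_INTEGERSI_CLASS (rule #4), while merging X86_64_X87_CLASS
   with X86_64_SSE_CLASS yields X86_64_MEMORY_CLASS (rule #5).  */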
5923
5924 /* Classify the argument of type TYPE and mode MODE.
5925 CLASSES will be filled with the register class used to pass each word
5926 of the operand. The number of words is returned. In case the parameter
5927 should be passed in memory, 0 is returned. As a special case for
5928 zero-sized containers, classes[0] will be NO_CLASS and 1 is returned.
5929 
5930 BIT_OFFSET is used internally for handling records and specifies the
5931 offset in bits modulo 256 to avoid overflow cases.
5932
5933 See the x86-64 PS ABI for details.
5934 */
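
/* A worked example (64-bit SysV ABI, hypothetical type):

     struct s { double d; long l; };

   occupies two eightbytes; the first is classified X86_64_SSEDF_CLASS and
   the second X86_64_INTEGER_CLASS, so this function returns 2 and the
   struct can travel in one SSE register and one integer register.  */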
5935
5936 static int
5937 classify_argument (enum machine_mode mode, const_tree type,
5938 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5939 {
5940 HOST_WIDE_INT bytes =
5941 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5942 int words
5943 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5944
5945 /* Variable sized entities are always passed/returned in memory. */
5946 if (bytes < 0)
5947 return 0;
5948
5949 if (mode != VOIDmode
5950 && targetm.calls.must_pass_in_stack (mode, type))
5951 return 0;
5952
5953 if (type && AGGREGATE_TYPE_P (type))
5954 {
5955 int i;
5956 tree field;
5957 enum x86_64_reg_class subclasses[MAX_CLASSES];
5958
5959 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5960 if (bytes > 32)
5961 return 0;
5962
5963 for (i = 0; i < words; i++)
5964 classes[i] = X86_64_NO_CLASS;
5965
5966 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
5967 signal the memory class, so handle this as a special case. */
5968 if (!words)
5969 {
5970 classes[0] = X86_64_NO_CLASS;
5971 return 1;
5972 }
5973
5974 /* Classify each field of record and merge classes. */
5975 switch (TREE_CODE (type))
5976 {
5977 case RECORD_TYPE:
5978 /* And now merge the fields of structure. */
5979 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5980 {
5981 if (TREE_CODE (field) == FIELD_DECL)
5982 {
5983 int num;
5984
5985 if (TREE_TYPE (field) == error_mark_node)
5986 continue;
5987
5988 /* Bitfields are always classified as integer. Handle them
5989 early, since later code would consider them to be
5990 misaligned integers. */
5991 if (DECL_BIT_FIELD (field))
5992 {
5993 for (i = (int_bit_position (field)
5994 + (bit_offset % 64)) / 8 / 8;
5995 i < ((int_bit_position (field) + (bit_offset % 64))
5996 + tree_low_cst (DECL_SIZE (field), 0)
5997 + 63) / 8 / 8; i++)
5998 classes[i] =
5999 merge_classes (X86_64_INTEGER_CLASS,
6000 classes[i]);
6001 }
6002 else
6003 {
6004 int pos;
6005
6006 type = TREE_TYPE (field);
6007
6008 /* Flexible array member is ignored. */
6009 if (TYPE_MODE (type) == BLKmode
6010 && TREE_CODE (type) == ARRAY_TYPE
6011 && TYPE_SIZE (type) == NULL_TREE
6012 && TYPE_DOMAIN (type) != NULL_TREE
6013 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6014 == NULL_TREE))
6015 {
6016 static bool warned;
6017
6018 if (!warned && warn_psabi)
6019 {
6020 warned = true;
6021 inform (input_location,
6022 "the ABI of passing struct with"
6023 " a flexible array member has"
6024 " changed in GCC 4.4");
6025 }
6026 continue;
6027 }
6028 num = classify_argument (TYPE_MODE (type), type,
6029 subclasses,
6030 (int_bit_position (field)
6031 + bit_offset) % 256);
6032 if (!num)
6033 return 0;
6034 pos = (int_bit_position (field)
6035 + (bit_offset % 64)) / 8 / 8;
6036 for (i = 0; i < num && (i + pos) < words; i++)
6037 classes[i + pos] =
6038 merge_classes (subclasses[i], classes[i + pos]);
6039 }
6040 }
6041 }
6042 break;
6043
6044 case ARRAY_TYPE:
6045 /* Arrays are handled as small records. */
6046 {
6047 int num;
6048 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6049 TREE_TYPE (type), subclasses, bit_offset);
6050 if (!num)
6051 return 0;
6052
6053 /* The partial classes are now full classes. */
6054 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6055 subclasses[0] = X86_64_SSE_CLASS;
6056 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6057 && !((bit_offset % 64) == 0 && bytes == 4))
6058 subclasses[0] = X86_64_INTEGER_CLASS;
6059
6060 for (i = 0; i < words; i++)
6061 classes[i] = subclasses[i % num];
6062
6063 break;
6064 }
6065 case UNION_TYPE:
6066 case QUAL_UNION_TYPE:
6067 /* Unions are similar to RECORD_TYPE but the offset is
6068 always 0. */
6069 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6070 {
6071 if (TREE_CODE (field) == FIELD_DECL)
6072 {
6073 int num;
6074
6075 if (TREE_TYPE (field) == error_mark_node)
6076 continue;
6077
6078 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6079 TREE_TYPE (field), subclasses,
6080 bit_offset);
6081 if (!num)
6082 return 0;
6083 for (i = 0; i < num; i++)
6084 classes[i] = merge_classes (subclasses[i], classes[i]);
6085 }
6086 }
6087 break;
6088
6089 default:
6090 gcc_unreachable ();
6091 }
6092
6093 if (words > 2)
6094 {
6095 /* When the size is larger than 16 bytes, if the first class isn't
6096 X86_64_SSE_CLASS or any of the remaining classes isn't
6097 X86_64_SSEUP_CLASS, everything should be passed in
6098 memory. */
6099 if (classes[0] != X86_64_SSE_CLASS)
6100 return 0;
6101
6102 for (i = 1; i < words; i++)
6103 if (classes[i] != X86_64_SSEUP_CLASS)
6104 return 0;
6105 }
6106
6107 /* Final merger cleanup. */
6108 for (i = 0; i < words; i++)
6109 {
6110 /* If one class is MEMORY, everything should be passed in
6111 memory. */
6112 if (classes[i] == X86_64_MEMORY_CLASS)
6113 return 0;
6114
6115 /* X86_64_SSEUP_CLASS should always be preceded by
6116 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6117 if (classes[i] == X86_64_SSEUP_CLASS
6118 && classes[i - 1] != X86_64_SSE_CLASS
6119 && classes[i - 1] != X86_64_SSEUP_CLASS)
6120 {
6121 /* The first one should never be X86_64_SSEUP_CLASS. */
6122 gcc_assert (i != 0);
6123 classes[i] = X86_64_SSE_CLASS;
6124 }
6125
6126 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6127 everything should be passed in memory. */
6128 if (classes[i] == X86_64_X87UP_CLASS
6129 && (classes[i - 1] != X86_64_X87_CLASS))
6130 {
6131 static bool warned;
6132
6133 /* The first one should never be X86_64_X87UP_CLASS. */
6134 gcc_assert (i != 0);
6135 if (!warned && warn_psabi)
6136 {
6137 warned = true;
6138 inform (input_location,
6139 "the ABI of passing union with long double"
6140 " has changed in GCC 4.4");
6141 }
6142 return 0;
6143 }
6144 }
6145 return words;
6146 }
6147
6148 /* Compute the alignment needed. We align all types to natural boundaries,
6149 with the exception of XFmode, which is aligned to 64 bits. */
6150 if (mode != VOIDmode && mode != BLKmode)
6151 {
6152 int mode_alignment = GET_MODE_BITSIZE (mode);
6153
6154 if (mode == XFmode)
6155 mode_alignment = 128;
6156 else if (mode == XCmode)
6157 mode_alignment = 256;
6158 if (COMPLEX_MODE_P (mode))
6159 mode_alignment /= 2;
6160 /* Misaligned fields are always returned in memory. */
6161 if (bit_offset % mode_alignment)
6162 return 0;
6163 }
6164
6165 /* For V1xx modes, just use the base mode. */
6166 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6167 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6168 mode = GET_MODE_INNER (mode);
6169
6170 /* Classification of atomic types. */
6171 switch (mode)
6172 {
6173 case SDmode:
6174 case DDmode:
6175 classes[0] = X86_64_SSE_CLASS;
6176 return 1;
6177 case TDmode:
6178 classes[0] = X86_64_SSE_CLASS;
6179 classes[1] = X86_64_SSEUP_CLASS;
6180 return 2;
6181 case DImode:
6182 case SImode:
6183 case HImode:
6184 case QImode:
6185 case CSImode:
6186 case CHImode:
6187 case CQImode:
6188 {
6189 int size = (bit_offset % 64) + (int) GET_MODE_BITSIZE (mode);
6190
6191 if (size <= 32)
6192 {
6193 classes[0] = X86_64_INTEGERSI_CLASS;
6194 return 1;
6195 }
6196 else if (size <= 64)
6197 {
6198 classes[0] = X86_64_INTEGER_CLASS;
6199 return 1;
6200 }
6201 else if (size <= 64+32)
6202 {
6203 classes[0] = X86_64_INTEGER_CLASS;
6204 classes[1] = X86_64_INTEGERSI_CLASS;
6205 return 2;
6206 }
6207 else if (size <= 64+64)
6208 {
6209 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6210 return 2;
6211 }
6212 else
6213 gcc_unreachable ();
6214 }
6215 case CDImode:
6216 case TImode:
6217 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6218 return 2;
6219 case COImode:
6220 case OImode:
6221 /* OImode shouldn't be used directly. */
6222 gcc_unreachable ();
6223 case CTImode:
6224 return 0;
6225 case SFmode:
6226 if (!(bit_offset % 64))
6227 classes[0] = X86_64_SSESF_CLASS;
6228 else
6229 classes[0] = X86_64_SSE_CLASS;
6230 return 1;
6231 case DFmode:
6232 classes[0] = X86_64_SSEDF_CLASS;
6233 return 1;
6234 case XFmode:
6235 classes[0] = X86_64_X87_CLASS;
6236 classes[1] = X86_64_X87UP_CLASS;
6237 return 2;
6238 case TFmode:
6239 classes[0] = X86_64_SSE_CLASS;
6240 classes[1] = X86_64_SSEUP_CLASS;
6241 return 2;
6242 case SCmode:
6243 classes[0] = X86_64_SSE_CLASS;
6244 if (!(bit_offset % 64))
6245 return 1;
6246 else
6247 {
6248 static bool warned;
6249
6250 if (!warned && warn_psabi)
6251 {
6252 warned = true;
6253 inform (input_location,
6254 "the ABI of passing structure with complex float"
6255 " member has changed in GCC 4.4");
6256 }
6257 classes[1] = X86_64_SSESF_CLASS;
6258 return 2;
6259 }
6260 case DCmode:
6261 classes[0] = X86_64_SSEDF_CLASS;
6262 classes[1] = X86_64_SSEDF_CLASS;
6263 return 2;
6264 case XCmode:
6265 classes[0] = X86_64_COMPLEX_X87_CLASS;
6266 return 1;
6267 case TCmode:
6268 /* This mode is larger than 16 bytes. */
6269 return 0;
6270 case V8SFmode:
6271 case V8SImode:
6272 case V32QImode:
6273 case V16HImode:
6274 case V4DFmode:
6275 case V4DImode:
6276 classes[0] = X86_64_SSE_CLASS;
6277 classes[1] = X86_64_SSEUP_CLASS;
6278 classes[2] = X86_64_SSEUP_CLASS;
6279 classes[3] = X86_64_SSEUP_CLASS;
6280 return 4;
6281 case V4SFmode:
6282 case V4SImode:
6283 case V16QImode:
6284 case V8HImode:
6285 case V2DFmode:
6286 case V2DImode:
6287 classes[0] = X86_64_SSE_CLASS;
6288 classes[1] = X86_64_SSEUP_CLASS;
6289 return 2;
6290 case V1TImode:
6291 case V1DImode:
6292 case V2SFmode:
6293 case V2SImode:
6294 case V4HImode:
6295 case V8QImode:
6296 classes[0] = X86_64_SSE_CLASS;
6297 return 1;
6298 case BLKmode:
6299 case VOIDmode:
6300 return 0;
6301 default:
6302 gcc_assert (VECTOR_MODE_P (mode));
6303
6304 if (bytes > 16)
6305 return 0;
6306
6307 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6308
6309 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6310 classes[0] = X86_64_INTEGERSI_CLASS;
6311 else
6312 classes[0] = X86_64_INTEGER_CLASS;
6313 classes[1] = X86_64_INTEGER_CLASS;
6314 return 1 + (bytes > 8);
6315 }
6316 }
6317
6318 /* Examine the argument and set the number of registers required in each
6319 class. Return 0 iff the parameter should be passed in memory. */
6320 static int
6321 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6322 int *int_nregs, int *sse_nregs)
6323 {
6324 enum x86_64_reg_class regclass[MAX_CLASSES];
6325 int n = classify_argument (mode, type, regclass, 0);
6326
6327 *int_nregs = 0;
6328 *sse_nregs = 0;
6329 if (!n)
6330 return 0;
6331 for (n--; n >= 0; n--)
6332 switch (regclass[n])
6333 {
6334 case X86_64_INTEGER_CLASS:
6335 case X86_64_INTEGERSI_CLASS:
6336 (*int_nregs)++;
6337 break;
6338 case X86_64_SSE_CLASS:
6339 case X86_64_SSESF_CLASS:
6340 case X86_64_SSEDF_CLASS:
6341 (*sse_nregs)++;
6342 break;
6343 case X86_64_NO_CLASS:
6344 case X86_64_SSEUP_CLASS:
6345 break;
6346 case X86_64_X87_CLASS:
6347 case X86_64_X87UP_CLASS:
6348 if (!in_return)
6349 return 0;
6350 break;
6351 case X86_64_COMPLEX_X87_CLASS:
6352 return in_return ? 2 : 0;
6353 case X86_64_MEMORY_CLASS:
6354 gcc_unreachable ();
6355 }
6356 return 1;
6357 }
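
/* Continuing the struct { double d; long l; } example above, this sets
   *INT_NREGS to 1 and *SSE_NREGS to 1 and returns 1, i.e. the argument
   stays in registers as long as one integer and one SSE register remain
   available.  */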
6358
6359 /* Construct container for the argument used by GCC interface. See
6360 FUNCTION_ARG for the detailed description. */
6361
6362 static rtx
6363 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6364 const_tree type, int in_return, int nintregs, int nsseregs,
6365 const int *intreg, int sse_regno)
6366 {
6367 /* The following variables hold the static issued_error state. */
6368 static bool issued_sse_arg_error;
6369 static bool issued_sse_ret_error;
6370 static bool issued_x87_ret_error;
6371
6372 enum machine_mode tmpmode;
6373 int bytes =
6374 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6375 enum x86_64_reg_class regclass[MAX_CLASSES];
6376 int n;
6377 int i;
6378 int nexps = 0;
6379 int needed_sseregs, needed_intregs;
6380 rtx exp[MAX_CLASSES];
6381 rtx ret;
6382
6383 n = classify_argument (mode, type, regclass, 0);
6384 if (!n)
6385 return NULL;
6386 if (!examine_argument (mode, type, in_return, &needed_intregs,
6387 &needed_sseregs))
6388 return NULL;
6389 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6390 return NULL;
6391
6392 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6393 some less clueful developer tries to use floating-point anyway. */
6394 if (needed_sseregs && !TARGET_SSE)
6395 {
6396 if (in_return)
6397 {
6398 if (!issued_sse_ret_error)
6399 {
6400 error ("SSE register return with SSE disabled");
6401 issued_sse_ret_error = true;
6402 }
6403 }
6404 else if (!issued_sse_arg_error)
6405 {
6406 error ("SSE register argument with SSE disabled");
6407 issued_sse_arg_error = true;
6408 }
6409 return NULL;
6410 }
6411
6412 /* Likewise, error if the ABI requires us to return values in the
6413 x87 registers and the user specified -mno-80387. */
6414 if (!TARGET_80387 && in_return)
6415 for (i = 0; i < n; i++)
6416 if (regclass[i] == X86_64_X87_CLASS
6417 || regclass[i] == X86_64_X87UP_CLASS
6418 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6419 {
6420 if (!issued_x87_ret_error)
6421 {
6422 error ("x87 register return with x87 disabled");
6423 issued_x87_ret_error = true;
6424 }
6425 return NULL;
6426 }
6427
6428 /* First construct simple cases. Avoid SCmode, since we want to use
6429 single register to pass this type. */
6430 if (n == 1 && mode != SCmode)
6431 switch (regclass[0])
6432 {
6433 case X86_64_INTEGER_CLASS:
6434 case X86_64_INTEGERSI_CLASS:
6435 return gen_rtx_REG (mode, intreg[0]);
6436 case X86_64_SSE_CLASS:
6437 case X86_64_SSESF_CLASS:
6438 case X86_64_SSEDF_CLASS:
6439 if (mode != BLKmode)
6440 return gen_reg_or_parallel (mode, orig_mode,
6441 SSE_REGNO (sse_regno));
6442 break;
6443 case X86_64_X87_CLASS:
6444 case X86_64_COMPLEX_X87_CLASS:
6445 return gen_rtx_REG (mode, FIRST_STACK_REG);
6446 case X86_64_NO_CLASS:
6447 /* Zero sized array, struct or class. */
6448 return NULL;
6449 default:
6450 gcc_unreachable ();
6451 }
6452 if (n == 2
6453 && regclass[0] == X86_64_SSE_CLASS
6454 && regclass[1] == X86_64_SSEUP_CLASS
6455 && mode != BLKmode)
6456 return gen_reg_or_parallel (mode, orig_mode,
6457 SSE_REGNO (sse_regno));
6458 if (n == 4
6459 && regclass[0] == X86_64_SSE_CLASS
6460 && regclass[1] == X86_64_SSEUP_CLASS
6461 && regclass[2] == X86_64_SSEUP_CLASS
6462 && regclass[3] == X86_64_SSEUP_CLASS
6463 && mode != BLKmode)
6464 return gen_reg_or_parallel (mode, orig_mode,
6465 SSE_REGNO (sse_regno));
6466 if (n == 2
6467 && regclass[0] == X86_64_X87_CLASS
6468 && regclass[1] == X86_64_X87UP_CLASS)
6469 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6470
6471 if (n == 2
6472 && regclass[0] == X86_64_INTEGER_CLASS
6473 && regclass[1] == X86_64_INTEGER_CLASS
6474 && (mode == CDImode || mode == TImode || mode == TFmode)
6475 && intreg[0] + 1 == intreg[1])
6476 return gen_rtx_REG (mode, intreg[0]);
6477
6478 /* Otherwise figure out the entries of the PARALLEL. */
6479 for (i = 0; i < n; i++)
6480 {
6481 int pos;
6482
6483 switch (regclass[i])
6484 {
6485 case X86_64_NO_CLASS:
6486 break;
6487 case X86_64_INTEGER_CLASS:
6488 case X86_64_INTEGERSI_CLASS:
6489 /* Merge TImodes on aligned occasions here too. */
6490 if (i * 8 + 8 > bytes)
6491 tmpmode
6492 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6493 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6494 tmpmode = SImode;
6495 else
6496 tmpmode = DImode;
6497 /* We've requested a number of bits (e.g. 24) for which
6498 there is no integer mode. Use DImode. */
6499 if (tmpmode == BLKmode)
6500 tmpmode = DImode;
6501 exp [nexps++]
6502 = gen_rtx_EXPR_LIST (VOIDmode,
6503 gen_rtx_REG (tmpmode, *intreg),
6504 GEN_INT (i*8));
6505 intreg++;
6506 break;
6507 case X86_64_SSESF_CLASS:
6508 exp [nexps++]
6509 = gen_rtx_EXPR_LIST (VOIDmode,
6510 gen_rtx_REG (SFmode,
6511 SSE_REGNO (sse_regno)),
6512 GEN_INT (i*8));
6513 sse_regno++;
6514 break;
6515 case X86_64_SSEDF_CLASS:
6516 exp [nexps++]
6517 = gen_rtx_EXPR_LIST (VOIDmode,
6518 gen_rtx_REG (DFmode,
6519 SSE_REGNO (sse_regno)),
6520 GEN_INT (i*8));
6521 sse_regno++;
6522 break;
6523 case X86_64_SSE_CLASS:
6524 pos = i;
6525 switch (n)
6526 {
6527 case 1:
6528 tmpmode = DImode;
6529 break;
6530 case 2:
6531 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6532 {
6533 tmpmode = TImode;
6534 i++;
6535 }
6536 else
6537 tmpmode = DImode;
6538 break;
6539 case 4:
6540 gcc_assert (i == 0
6541 && regclass[1] == X86_64_SSEUP_CLASS
6542 && regclass[2] == X86_64_SSEUP_CLASS
6543 && regclass[3] == X86_64_SSEUP_CLASS);
6544 tmpmode = OImode;
6545 i += 3;
6546 break;
6547 default:
6548 gcc_unreachable ();
6549 }
6550 exp [nexps++]
6551 = gen_rtx_EXPR_LIST (VOIDmode,
6552 gen_rtx_REG (tmpmode,
6553 SSE_REGNO (sse_regno)),
6554 GEN_INT (pos*8));
6555 sse_regno++;
6556 break;
6557 default:
6558 gcc_unreachable ();
6559 }
6560 }
6561
6562 /* Empty aligned struct, union or class. */
6563 if (nexps == 0)
6564 return NULL;
6565
6566 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6567 for (i = 0; i < nexps; i++)
6568 XVECEXP (ret, 0, i) = exp [i];
6569 return ret;
6570 }
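
/* For the struct { double d; long l; } example passed as the first
   argument, the container built here is roughly

     (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                (expr_list (reg:DI di) (const_int 8))])

   i.e. D travels in %xmm0 and L in %rdi, each entry tagged with its byte
   offset within the structure.  */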
6571
6572 /* Update the data in CUM to advance over an argument of mode MODE
6573 and data type TYPE. (TYPE is null for libcalls where that information
6574 may not be available.) */
6575
6576 static void
6577 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6578 const_tree type, HOST_WIDE_INT bytes,
6579 HOST_WIDE_INT words)
6580 {
6581 switch (mode)
6582 {
6583 default:
6584 break;
6585
6586 case BLKmode:
6587 if (bytes < 0)
6588 break;
6589 /* FALLTHRU */
6590
6591 case DImode:
6592 case SImode:
6593 case HImode:
6594 case QImode:
6595 cum->words += words;
6596 cum->nregs -= words;
6597 cum->regno += words;
6598
6599 if (cum->nregs <= 0)
6600 {
6601 cum->nregs = 0;
6602 cum->regno = 0;
6603 }
6604 break;
6605
6606 case OImode:
6607 /* OImode shouldn't be used directly. */
6608 gcc_unreachable ();
6609
6610 case DFmode:
6611 if (cum->float_in_sse < 2)
6612 break;
6613 case SFmode:
6614 if (cum->float_in_sse < 1)
6615 break;
6616 /* FALLTHRU */
6617
6618 case V8SFmode:
6619 case V8SImode:
6620 case V32QImode:
6621 case V16HImode:
6622 case V4DFmode:
6623 case V4DImode:
6624 case TImode:
6625 case V16QImode:
6626 case V8HImode:
6627 case V4SImode:
6628 case V2DImode:
6629 case V4SFmode:
6630 case V2DFmode:
6631 if (!type || !AGGREGATE_TYPE_P (type))
6632 {
6633 cum->sse_words += words;
6634 cum->sse_nregs -= 1;
6635 cum->sse_regno += 1;
6636 if (cum->sse_nregs <= 0)
6637 {
6638 cum->sse_nregs = 0;
6639 cum->sse_regno = 0;
6640 }
6641 }
6642 break;
6643
6644 case V8QImode:
6645 case V4HImode:
6646 case V2SImode:
6647 case V2SFmode:
6648 case V1TImode:
6649 case V1DImode:
6650 if (!type || !AGGREGATE_TYPE_P (type))
6651 {
6652 cum->mmx_words += words;
6653 cum->mmx_nregs -= 1;
6654 cum->mmx_regno += 1;
6655 if (cum->mmx_nregs <= 0)
6656 {
6657 cum->mmx_nregs = 0;
6658 cum->mmx_regno = 0;
6659 }
6660 }
6661 break;
6662 }
6663 }
6664
6665 static void
6666 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6667 const_tree type, HOST_WIDE_INT words, bool named)
6668 {
6669 int int_nregs, sse_nregs;
6670
6671 /* Unnamed 256bit vector mode parameters are passed on stack. */
6672 if (!named && VALID_AVX256_REG_MODE (mode))
6673 return;
6674
6675 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6676 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6677 {
6678 cum->nregs -= int_nregs;
6679 cum->sse_nregs -= sse_nregs;
6680 cum->regno += int_nregs;
6681 cum->sse_regno += sse_nregs;
6682 }
6683 else
6684 {
6685 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6686 cum->words = (cum->words + align - 1) & ~(align - 1);
6687 cum->words += words;
6688 }
6689 }
6690
6691 static void
6692 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6693 HOST_WIDE_INT words)
6694 {
6695 /* Otherwise, this should be passed indirectly. */
6696 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6697
6698 cum->words += words;
6699 if (cum->nregs > 0)
6700 {
6701 cum->nregs -= 1;
6702 cum->regno += 1;
6703 }
6704 }
6705
6706 /* Update the data in CUM to advance over an argument of mode MODE and
6707 data type TYPE. (TYPE is null for libcalls where that information
6708 may not be available.) */
6709
6710 static void
6711 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6712 const_tree type, bool named)
6713 {
6714 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6715 HOST_WIDE_INT bytes, words;
6716
6717 if (mode == BLKmode)
6718 bytes = int_size_in_bytes (type);
6719 else
6720 bytes = GET_MODE_SIZE (mode);
6721 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6722
6723 if (type)
6724 mode = type_natural_mode (type, NULL);
6725
6726 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6727 function_arg_advance_ms_64 (cum, bytes, words);
6728 else if (TARGET_64BIT)
6729 function_arg_advance_64 (cum, mode, type, words, named);
6730 else
6731 function_arg_advance_32 (cum, mode, type, bytes, words);
6732 }
6733
6734 /* Define where to put the arguments to a function.
6735 Value is zero to push the argument on the stack,
6736 or a hard register in which to store the argument.
6737
6738 MODE is the argument's machine mode.
6739 TYPE is the data type of the argument (as a tree).
6740 This is null for libcalls where that information may
6741 not be available.
6742 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6743 the preceding args and about the function being called.
6744 NAMED is nonzero if this argument is a named parameter
6745 (otherwise it is an extra parameter matching an ellipsis). */
6746
6747 static rtx
6748 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6749 enum machine_mode orig_mode, const_tree type,
6750 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6751 {
6752 static bool warnedsse, warnedmmx;
6753
6754 /* Avoid the AL settings for the Unix64 ABI. */
6755 if (mode == VOIDmode)
6756 return constm1_rtx;
6757
6758 switch (mode)
6759 {
6760 default:
6761 break;
6762
6763 case BLKmode:
6764 if (bytes < 0)
6765 break;
6766 /* FALLTHRU */
6767 case DImode:
6768 case SImode:
6769 case HImode:
6770 case QImode:
6771 if (words <= cum->nregs)
6772 {
6773 int regno = cum->regno;
6774
6775 /* Fastcall allocates the first two DWORD (SImode) or
6776 smaller arguments to ECX and EDX if they are not
6777 aggregate types. */
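/* A minimal illustration (hypothetical declaration, not from this
   file): given

     void __attribute__((fastcall)) f (int a, int b, int c);

   A goes in %ecx, B in %edx and C on the stack.  */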
6778 if (cum->fastcall)
6779 {
6780 if (mode == BLKmode
6781 || mode == DImode
6782 || (type && AGGREGATE_TYPE_P (type)))
6783 break;
6784
6785 /* ECX, not EAX, is the first allocated register. */
6786 if (regno == AX_REG)
6787 regno = CX_REG;
6788 }
6789 return gen_rtx_REG (mode, regno);
6790 }
6791 break;
6792
6793 case DFmode:
6794 if (cum->float_in_sse < 2)
6795 break;
6796 case SFmode:
6797 if (cum->float_in_sse < 1)
6798 break;
6799 /* FALLTHRU */
6800 case TImode:
6801 /* In 32bit, we pass TImode in xmm registers. */
6802 case V16QImode:
6803 case V8HImode:
6804 case V4SImode:
6805 case V2DImode:
6806 case V4SFmode:
6807 case V2DFmode:
6808 if (!type || !AGGREGATE_TYPE_P (type))
6809 {
6810 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6811 {
6812 warnedsse = true;
6813 warning (0, "SSE vector argument without SSE enabled "
6814 "changes the ABI");
6815 }
6816 if (cum->sse_nregs)
6817 return gen_reg_or_parallel (mode, orig_mode,
6818 cum->sse_regno + FIRST_SSE_REG);
6819 }
6820 break;
6821
6822 case OImode:
6823 /* OImode shouldn't be used directly. */
6824 gcc_unreachable ();
6825
6826 case V8SFmode:
6827 case V8SImode:
6828 case V32QImode:
6829 case V16HImode:
6830 case V4DFmode:
6831 case V4DImode:
6832 if (!type || !AGGREGATE_TYPE_P (type))
6833 {
6834 if (cum->sse_nregs)
6835 return gen_reg_or_parallel (mode, orig_mode,
6836 cum->sse_regno + FIRST_SSE_REG);
6837 }
6838 break;
6839
6840 case V8QImode:
6841 case V4HImode:
6842 case V2SImode:
6843 case V2SFmode:
6844 case V1TImode:
6845 case V1DImode:
6846 if (!type || !AGGREGATE_TYPE_P (type))
6847 {
6848 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6849 {
6850 warnedmmx = true;
6851 warning (0, "MMX vector argument without MMX enabled "
6852 "changes the ABI");
6853 }
6854 if (cum->mmx_nregs)
6855 return gen_reg_or_parallel (mode, orig_mode,
6856 cum->mmx_regno + FIRST_MMX_REG);
6857 }
6858 break;
6859 }
6860
6861 return NULL_RTX;
6862 }
6863
6864 static rtx
6865 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6866 enum machine_mode orig_mode, const_tree type, bool named)
6867 {
6868 /* Handle the hidden AL argument, which holds the number of SSE
6869 (vector) registers used in a varargs x86-64 call. */
6870 if (mode == VOIDmode)
6871 return GEN_INT (cum->maybe_vaarg
6872 ? (cum->sse_nregs < 0
6873 ? X86_64_SSE_REGPARM_MAX
6874 : cum->sse_regno)
6875 : -1);
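  /* Illustrative sketch (the call below is hypothetical): for

       printf ("%f", 3.14);

     the SysV x86-64 ABI has the caller load %al with the number of
     vector registers used -- here 1, since the double goes in %xmm0 --
     so the callee's prologue knows whether it must spill the XMM
     argument registers into the register save area.  */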
6876
6877 switch (mode)
6878 {
6879 default:
6880 break;
6881
6882 case V8SFmode:
6883 case V8SImode:
6884 case V32QImode:
6885 case V16HImode:
6886 case V4DFmode:
6887 case V4DImode:
6888 /* Unnamed 256bit vector mode parameters are passed on the stack. */
6889 if (!named)
6890 return NULL;
6891 break;
6892 }
6893
6894 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6895 cum->sse_nregs,
6896 &x86_64_int_parameter_registers [cum->regno],
6897 cum->sse_regno);
6898 }
6899
6900 static rtx
6901 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6902 enum machine_mode orig_mode, bool named,
6903 HOST_WIDE_INT bytes)
6904 {
6905 unsigned int regno;
6906
6907 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
6908 The value -2 specifies that the current function call uses the MS ABI. */
6909 if (mode == VOIDmode)
6910 return GEN_INT (-2);
6911
6912 /* If we've run out of registers, it goes on the stack. */
6913 if (cum->nregs == 0)
6914 return NULL_RTX;
6915
6916 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6917
6918 /* Only floating point modes are passed in anything but integer regs. */
6919 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6920 {
6921 if (named)
6922 regno = cum->regno + FIRST_SSE_REG;
6923 else
6924 {
6925 rtx t1, t2;
6926
6927 /* Unnamed floating parameters are passed in both the
6928 SSE and integer registers. */
6929 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6930 t2 = gen_rtx_REG (mode, regno);
6931 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6932 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6933 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6934 }
6935 }
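  /* Illustrative only (hypothetical call): for

       printf ("%f", 3.14);

     under the Windows x64 ABI the double sits in the second argument
     slot, so the PARALLEL built above passes it in both %xmm1 and %rdx;
     the callee may fetch an unnamed floating-point argument from either
     register.  */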
6936 /* Handle aggregate types passed in registers. */
6937 if (orig_mode == BLKmode)
6938 {
6939 if (bytes > 0 && bytes <= 8)
6940 mode = (bytes > 4 ? DImode : SImode);
6941 if (mode == BLKmode)
6942 mode = DImode;
6943 }
6944
6945 return gen_reg_or_parallel (mode, orig_mode, regno);
6946 }
6947
6948 /* Return where to put the arguments to a function.
6949 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6950
6951 MODE is the argument's machine mode. TYPE is the data type of the
6952 argument. It is null for libcalls where that information may not be
6953 available. CUM gives information about the preceding args and about
6954 the function being called. NAMED is nonzero if this argument is a
6955 named parameter (otherwise it is an extra parameter matching an
6956 ellipsis). */
6957
6958 static rtx
6959 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6960 const_tree type, bool named)
6961 {
6962 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6963 enum machine_mode mode = omode;
6964 HOST_WIDE_INT bytes, words;
6965 rtx arg;
6966
6967 if (mode == BLKmode)
6968 bytes = int_size_in_bytes (type);
6969 else
6970 bytes = GET_MODE_SIZE (mode);
6971 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6972
6973 /* To simplify the code below, represent vector types with a vector mode
6974 even if MMX/SSE are not active. */
6975 if (type && TREE_CODE (type) == VECTOR_TYPE)
6976 mode = type_natural_mode (type, cum);
6977
6978 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6979 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6980 else if (TARGET_64BIT)
6981 arg = function_arg_64 (cum, mode, omode, type, named);
6982 else
6983 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6984
6985 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
6986 {
6987 /* This argument uses 256bit AVX modes. */
6988 if (cum->caller)
6989 cfun->machine->callee_pass_avx256_p = true;
6990 else
6991 cfun->machine->caller_pass_avx256_p = true;
6992 }
6993
6994 return arg;
6995 }
6996
6997 /* A C expression that indicates when an argument must be passed by
6998 reference. If nonzero for an argument, a copy of that argument is
6999 made in memory and a pointer to the argument is passed instead of
7000 the argument itself. The pointer is passed in whatever way is
7001 appropriate for passing a pointer to that type. */
7002
7003 static bool
7004 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
7005 enum machine_mode mode ATTRIBUTE_UNUSED,
7006 const_tree type, bool named ATTRIBUTE_UNUSED)
7007 {
7008 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7009
7010 /* See Windows x64 Software Convention. */
7011 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7012 {
7013 int msize = (int) GET_MODE_SIZE (mode);
7014 if (type)
7015 {
7016 /* Arrays are passed by reference. */
7017 if (TREE_CODE (type) == ARRAY_TYPE)
7018 return true;
7019
7020 if (AGGREGATE_TYPE_P (type))
7021 {
7022 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7023 are passed by reference. */
7024 msize = int_size_in_bytes (type);
7025 }
7026 }
7027
7028 /* __m128 is passed by reference. */
7029 switch (msize) {
7030 case 1: case 2: case 4: case 8:
7031 break;
7032 default:
7033 return true;
7034 }
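      /* For illustration (hypothetical types, not from this file): under
         the rule above an __m128 argument or a 12-byte struct is passed
         by reference, while an 8-byte struct travels by value in a single
         integer register.  */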
7035 }
7036 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7037 return true;
7038
7039 return false;
7040 }
7041
7042 /* Return true when TYPE should be 128bit aligned for 32bit argument
7043 passing ABI. XXX: This function is obsolete and is only used for
7044 checking psABI compatibility with previous versions of GCC. */
7045
7046 static bool
7047 ix86_compat_aligned_value_p (const_tree type)
7048 {
7049 enum machine_mode mode = TYPE_MODE (type);
7050 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7051 || mode == TDmode
7052 || mode == TFmode
7053 || mode == TCmode)
7054 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7055 return true;
7056 if (TYPE_ALIGN (type) < 128)
7057 return false;
7058
7059 if (AGGREGATE_TYPE_P (type))
7060 {
7061 /* Walk the aggregates recursively. */
7062 switch (TREE_CODE (type))
7063 {
7064 case RECORD_TYPE:
7065 case UNION_TYPE:
7066 case QUAL_UNION_TYPE:
7067 {
7068 tree field;
7069
7070 /* Walk all the structure fields. */
7071 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7072 {
7073 if (TREE_CODE (field) == FIELD_DECL
7074 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7075 return true;
7076 }
7077 break;
7078 }
7079
7080 case ARRAY_TYPE:
7081 /* Just in case some languages pass arrays by value. */
7082 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7083 return true;
7084 break;
7085
7086 default:
7087 gcc_unreachable ();
7088 }
7089 }
7090 return false;
7091 }
7092
7093 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7094 XXX: This function is obsolete and is only used for checking psABI
7095 compatibility with previous versions of GCC. */
7096
7097 static unsigned int
7098 ix86_compat_function_arg_boundary (enum machine_mode mode,
7099 const_tree type, unsigned int align)
7100 {
7101 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7102 natural boundaries. */
7103 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7104 {
7105 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7106 make an exception for SSE modes since these require 128bit
7107 alignment.
7108
7109 The handling here differs from field_alignment. ICC aligns MMX
7110 arguments to 4 byte boundaries, while structure fields are aligned
7111 to 8 byte boundaries. */
7112 if (!type)
7113 {
7114 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7115 align = PARM_BOUNDARY;
7116 }
7117 else
7118 {
7119 if (!ix86_compat_aligned_value_p (type))
7120 align = PARM_BOUNDARY;
7121 }
7122 }
7123 if (align > BIGGEST_ALIGNMENT)
7124 align = BIGGEST_ALIGNMENT;
7125 return align;
7126 }
7127
7128 /* Return true when TYPE should be 128bit aligned for 32bit argument
7129 passing ABI. */
7130
7131 static bool
7132 ix86_contains_aligned_value_p (const_tree type)
7133 {
7134 enum machine_mode mode = TYPE_MODE (type);
7135
7136 if (mode == XFmode || mode == XCmode)
7137 return false;
7138
7139 if (TYPE_ALIGN (type) < 128)
7140 return false;
7141
7142 if (AGGREGATE_TYPE_P (type))
7143 {
7144 /* Walk the aggregates recursively. */
7145 switch (TREE_CODE (type))
7146 {
7147 case RECORD_TYPE:
7148 case UNION_TYPE:
7149 case QUAL_UNION_TYPE:
7150 {
7151 tree field;
7152
7153 /* Walk all the structure fields. */
7154 for (field = TYPE_FIELDS (type);
7155 field;
7156 field = DECL_CHAIN (field))
7157 {
7158 if (TREE_CODE (field) == FIELD_DECL
7159 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7160 return true;
7161 }
7162 break;
7163 }
7164
7165 case ARRAY_TYPE:
7166 /* Just in case some languages pass arrays by value. */
7167 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7168 return true;
7169 break;
7170
7171 default:
7172 gcc_unreachable ();
7173 }
7174 }
7175 else
7176 return TYPE_ALIGN (type) >= 128;
7177
7178 return false;
7179 }
7180
7181 /* Gives the alignment boundary, in bits, of an argument with the
7182 specified mode and type. */
7183
7184 static unsigned int
7185 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7186 {
7187 unsigned int align;
7188 if (type)
7189 {
7190 /* Since the main variant type is used for the call, convert the
7191 type to its main variant. */
7192 type = TYPE_MAIN_VARIANT (type);
7193 align = TYPE_ALIGN (type);
7194 }
7195 else
7196 align = GET_MODE_ALIGNMENT (mode);
7197 if (align < PARM_BOUNDARY)
7198 align = PARM_BOUNDARY;
7199 else
7200 {
7201 static bool warned;
7202 unsigned int saved_align = align;
7203
7204 if (!TARGET_64BIT)
7205 {
7206 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7207 if (!type)
7208 {
7209 if (mode == XFmode || mode == XCmode)
7210 align = PARM_BOUNDARY;
7211 }
7212 else if (!ix86_contains_aligned_value_p (type))
7213 align = PARM_BOUNDARY;
7214
7215 if (align < 128)
7216 align = PARM_BOUNDARY;
7217 }
7218
7219 if (warn_psabi
7220 && !warned
7221 && align != ix86_compat_function_arg_boundary (mode, type,
7222 saved_align))
7223 {
7224 warned = true;
7225 inform (input_location,
7226 "The ABI for passing parameters with %d-byte"
7227 " alignment has changed in GCC 4.6",
7228 align / BITS_PER_UNIT);
7229 }
7230 }
7231
7232 return align;
7233 }
7234
7235 /* Return true if N is a possible register number of function value. */
7236
7237 static bool
7238 ix86_function_value_regno_p (const unsigned int regno)
7239 {
7240 switch (regno)
7241 {
7242 case AX_REG:
7243 return true;
7244
7245 case FIRST_FLOAT_REG:
7246 /* TODO: The function should depend on the current function's ABI,
7247 but builtins.c would need updating then. Therefore we use the
7248 default ABI. */
7249 if (TARGET_64BIT && ix86_abi == MS_ABI)
7250 return false;
7251 return TARGET_FLOAT_RETURNS_IN_80387;
7252
7253 case FIRST_SSE_REG:
7254 return TARGET_SSE;
7255
7256 case FIRST_MMX_REG:
7257 if (TARGET_MACHO || TARGET_64BIT)
7258 return false;
7259 return TARGET_MMX;
7260 }
7261
7262 return false;
7263 }
7264
7265 /* Define how to find the value returned by a function.
7266 VALTYPE is the data type of the value (as a tree).
7267 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7268 otherwise, FUNC is 0. */
7269
7270 static rtx
7271 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7272 const_tree fntype, const_tree fn)
7273 {
7274 unsigned int regno;
7275
7276 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7277 we normally prevent this case when mmx is not available. However
7278 some ABIs may require the result to be returned like DImode. */
7279 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7280 regno = FIRST_MMX_REG;
7281
7282 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7283 we prevent this case when sse is not available. However some ABIs
7284 may require the result to be returned like integer TImode. */
7285 else if (mode == TImode
7286 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7287 regno = FIRST_SSE_REG;
7288
7289 /* 32-byte vector modes in %ymm0. */
7290 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7291 regno = FIRST_SSE_REG;
7292
7293 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7294 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7295 regno = FIRST_FLOAT_REG;
7296 else
7297 /* Most things go in %eax. */
7298 regno = AX_REG;
7299
7300 /* Override FP return register with %xmm0 for local functions when
7301 SSE math is enabled or for functions with sseregparm attribute. */
7302 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7303 {
7304 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7305 if ((sse_level >= 1 && mode == SFmode)
7306 || (sse_level == 2 && mode == DFmode))
7307 regno = FIRST_SSE_REG;
7308 }
7309
7310 /* OImode shouldn't be used directly. */
7311 gcc_assert (mode != OImode);
7312
7313 return gen_rtx_REG (orig_mode, regno);
7314 }
7315
7316 static rtx
7317 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7318 const_tree valtype)
7319 {
7320 rtx ret;
7321
7322 /* Handle libcalls, which don't provide a type node. */
7323 if (valtype == NULL)
7324 {
7325 unsigned int regno;
7326
7327 switch (mode)
7328 {
7329 case SFmode:
7330 case SCmode:
7331 case DFmode:
7332 case DCmode:
7333 case TFmode:
7334 case SDmode:
7335 case DDmode:
7336 case TDmode:
7337 regno = FIRST_SSE_REG;
7338 break;
7339 case XFmode:
7340 case XCmode:
7341 regno = FIRST_FLOAT_REG;
7342 break;
7343 case TCmode:
7344 return NULL;
7345 default:
7346 regno = AX_REG;
7347 }
7348
7349 return gen_rtx_REG (mode, regno);
7350 }
7351 else if (POINTER_TYPE_P (valtype))
7352 {
7353 /* Pointers are always returned in word_mode. */
7354 mode = word_mode;
7355 }
7356
7357 ret = construct_container (mode, orig_mode, valtype, 1,
7358 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7359 x86_64_int_return_registers, 0);
7360
7361 /* For zero-sized structures, construct_container returns NULL, but we
7362 need to keep the rest of the compiler happy by returning a meaningful value. */
7363 if (!ret)
7364 ret = gen_rtx_REG (orig_mode, AX_REG);
7365
7366 return ret;
7367 }
7368
7369 static rtx
7370 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7371 {
7372 unsigned int regno = AX_REG;
7373
7374 if (TARGET_SSE)
7375 {
7376 switch (GET_MODE_SIZE (mode))
7377 {
7378 case 16:
7379 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7380 && !COMPLEX_MODE_P (mode))
7381 regno = FIRST_SSE_REG;
7382 break;
7383 case 8:
7384 case 4:
7385 if (mode == SFmode || mode == DFmode)
7386 regno = FIRST_SSE_REG;
7387 break;
7388 default:
7389 break;
7390 }
7391 }
7392 return gen_rtx_REG (orig_mode, regno);
7393 }
7394
7395 static rtx
7396 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7397 enum machine_mode orig_mode, enum machine_mode mode)
7398 {
7399 const_tree fn, fntype;
7400
7401 fn = NULL_TREE;
7402 if (fntype_or_decl && DECL_P (fntype_or_decl))
7403 fn = fntype_or_decl;
7404 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7405
7406 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7407 return function_value_ms_64 (orig_mode, mode);
7408 else if (TARGET_64BIT)
7409 return function_value_64 (orig_mode, mode, valtype);
7410 else
7411 return function_value_32 (orig_mode, mode, fntype, fn);
7412 }
7413
7414 static rtx
7415 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7416 bool outgoing ATTRIBUTE_UNUSED)
7417 {
7418 enum machine_mode mode, orig_mode;
7419
7420 orig_mode = TYPE_MODE (valtype);
7421 mode = type_natural_mode (valtype, NULL);
7422 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7423 }
7424
7425 /* Pointer function arguments and return values are promoted to
7426 word_mode. */
7427
7428 static enum machine_mode
7429 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7430 int *punsignedp, const_tree fntype,
7431 int for_return)
7432 {
7433 if (type != NULL_TREE && POINTER_TYPE_P (type))
7434 {
7435 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7436 return word_mode;
7437 }
7438 return default_promote_function_mode (type, mode, punsignedp, fntype,
7439 for_return);
7440 }
7441
7442 rtx
7443 ix86_libcall_value (enum machine_mode mode)
7444 {
7445 return ix86_function_value_1 (NULL, NULL, mode, mode);
7446 }
7447
7448 /* Return true iff type is returned in memory. */
7449
7450 static bool ATTRIBUTE_UNUSED
7451 return_in_memory_32 (const_tree type, enum machine_mode mode)
7452 {
7453 HOST_WIDE_INT size;
7454
7455 if (mode == BLKmode)
7456 return true;
7457
7458 size = int_size_in_bytes (type);
7459
7460 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7461 return false;
7462
7463 if (VECTOR_MODE_P (mode) || mode == TImode)
7464 {
7465 /* User-created vectors small enough to fit in EAX. */
7466 if (size < 8)
7467 return false;
7468
7469 /* MMX/3dNow values are returned in MM0,
7470 except when it doesn't exist or the ABI prescribes otherwise. */
7471 if (size == 8)
7472 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7473
7474 /* SSE values are returned in XMM0, except when it doesn't exist. */
7475 if (size == 16)
7476 return !TARGET_SSE;
7477
7478 /* AVX values are returned in YMM0, except when it doesn't exist. */
7479 if (size == 32)
7480 return !TARGET_AVX;
7481 }
7482
7483 if (mode == XFmode)
7484 return false;
7485
7486 if (size > 12)
7487 return true;
7488
7489 /* OImode shouldn't be used directly. */
7490 gcc_assert (mode != OImode);
7491
7492 return false;
7493 }
7494
7495 static bool ATTRIBUTE_UNUSED
7496 return_in_memory_64 (const_tree type, enum machine_mode mode)
7497 {
7498 int needed_intregs, needed_sseregs;
7499 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7500 }
7501
7502 static bool ATTRIBUTE_UNUSED
7503 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7504 {
7505 HOST_WIDE_INT size = int_size_in_bytes (type);
7506
7507 /* __m128 is returned in xmm0. */
7508 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7509 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7510 return false;
7511
7512 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7513 return size != 1 && size != 2 && size != 4 && size != 8;
7514 }
7515
7516 static bool
7517 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7518 {
7519 #ifdef SUBTARGET_RETURN_IN_MEMORY
7520 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7521 #else
7522 const enum machine_mode mode = type_natural_mode (type, NULL);
7523
7524 if (TARGET_64BIT)
7525 {
7526 if (ix86_function_type_abi (fntype) == MS_ABI)
7527 return return_in_memory_ms_64 (type, mode);
7528 else
7529 return return_in_memory_64 (type, mode);
7530 }
7531 else
7532 return return_in_memory_32 (type, mode);
7533 #endif
7534 }
7535
7536 /* When returning SSE vector types, we have a choice of either
7537 (1) being abi incompatible with a -march switch, or
7538 (2) generating an error.
7539 Given no good solution, I think the safest thing is one warning.
7540 The user won't be able to use -Werror, but....
7541
7542 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7543 called in response to actually generating a caller or callee that
7544 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7545 via aggregate_value_p for general type probing from tree-ssa. */
7546
7547 static rtx
7548 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7549 {
7550 static bool warnedsse, warnedmmx;
7551
7552 if (!TARGET_64BIT && type)
7553 {
7554 /* Look at the return type of the function, not the function type. */
7555 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7556
7557 if (!TARGET_SSE && !warnedsse)
7558 {
7559 if (mode == TImode
7560 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7561 {
7562 warnedsse = true;
7563 warning (0, "SSE vector return without SSE enabled "
7564 "changes the ABI");
7565 }
7566 }
7567
7568 if (!TARGET_MMX && !warnedmmx)
7569 {
7570 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7571 {
7572 warnedmmx = true;
7573 warning (0, "MMX vector return without MMX enabled "
7574 "changes the ABI");
7575 }
7576 }
7577 }
7578
7579 return NULL;
7580 }
7581
7582 \f
7583 /* Create the va_list data type. */
7584
7585 /* Returns the calling convention specific va_list data type.
7586 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7587
7588 static tree
7589 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7590 {
7591 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7592
7593 /* For i386 we use plain pointer to argument area. */
7594 if (!TARGET_64BIT || abi == MS_ABI)
7595 return build_pointer_type (char_type_node);
7596
7597 record = lang_hooks.types.make_type (RECORD_TYPE);
7598 type_decl = build_decl (BUILTINS_LOCATION,
7599 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7600
7601 f_gpr = build_decl (BUILTINS_LOCATION,
7602 FIELD_DECL, get_identifier ("gp_offset"),
7603 unsigned_type_node);
7604 f_fpr = build_decl (BUILTINS_LOCATION,
7605 FIELD_DECL, get_identifier ("fp_offset"),
7606 unsigned_type_node);
7607 f_ovf = build_decl (BUILTINS_LOCATION,
7608 FIELD_DECL, get_identifier ("overflow_arg_area"),
7609 ptr_type_node);
7610 f_sav = build_decl (BUILTINS_LOCATION,
7611 FIELD_DECL, get_identifier ("reg_save_area"),
7612 ptr_type_node);
7613
7614 va_list_gpr_counter_field = f_gpr;
7615 va_list_fpr_counter_field = f_fpr;
7616
7617 DECL_FIELD_CONTEXT (f_gpr) = record;
7618 DECL_FIELD_CONTEXT (f_fpr) = record;
7619 DECL_FIELD_CONTEXT (f_ovf) = record;
7620 DECL_FIELD_CONTEXT (f_sav) = record;
7621
7622 TYPE_STUB_DECL (record) = type_decl;
7623 TYPE_NAME (record) = type_decl;
7624 TYPE_FIELDS (record) = f_gpr;
7625 DECL_CHAIN (f_gpr) = f_fpr;
7626 DECL_CHAIN (f_fpr) = f_ovf;
7627 DECL_CHAIN (f_ovf) = f_sav;
7628
7629 layout_type (record);
7630
7631 /* The correct type is an array type of one element. */
7632 return build_array_type (record, build_index_type (size_zero_node));
7633 }
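/* For reference (illustration only, not emitted by this code), the record
   built above matches the psABI's picture of the 64-bit va_list:

     typedef struct {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } va_list[1];  */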
7634
7635 /* Set up the builtin va_list data type and, for 64-bit, the additional
7636 calling-convention-specific va_list data types. */
7637
7638 static tree
7639 ix86_build_builtin_va_list (void)
7640 {
7641 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7642
7643 /* Initialize abi specific va_list builtin types. */
7644 if (TARGET_64BIT)
7645 {
7646 tree t;
7647 if (ix86_abi == MS_ABI)
7648 {
7649 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7650 if (TREE_CODE (t) != RECORD_TYPE)
7651 t = build_variant_type_copy (t);
7652 sysv_va_list_type_node = t;
7653 }
7654 else
7655 {
7656 t = ret;
7657 if (TREE_CODE (t) != RECORD_TYPE)
7658 t = build_variant_type_copy (t);
7659 sysv_va_list_type_node = t;
7660 }
7661 if (ix86_abi != MS_ABI)
7662 {
7663 t = ix86_build_builtin_va_list_abi (MS_ABI);
7664 if (TREE_CODE (t) != RECORD_TYPE)
7665 t = build_variant_type_copy (t);
7666 ms_va_list_type_node = t;
7667 }
7668 else
7669 {
7670 t = ret;
7671 if (TREE_CODE (t) != RECORD_TYPE)
7672 t = build_variant_type_copy (t);
7673 ms_va_list_type_node = t;
7674 }
7675 }
7676
7677 return ret;
7678 }
7679
7680 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7681
7682 static void
7683 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7684 {
7685 rtx save_area, mem;
7686 alias_set_type set;
7687 int i, max;
7688
7689 /* GPR size of varargs save area. */
7690 if (cfun->va_list_gpr_size)
7691 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7692 else
7693 ix86_varargs_gpr_size = 0;
7694
7695 /* FPR size of varargs save area. We don't need it if we don't pass
7696 anything in SSE registers. */
7697 if (TARGET_SSE && cfun->va_list_fpr_size)
7698 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7699 else
7700 ix86_varargs_fpr_size = 0;
7701
7702 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7703 return;
7704
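  /* A sketch of the register save area laid out below (SysV x86-64 ABI,
     assuming both parts are needed):

       offset   0 ..  47   %rdi, %rsi, %rdx, %rcx, %r8, %r9  (6 x 8 bytes)
       offset  48 .. 175   %xmm0 .. %xmm7                     (8 x 16 bytes)

     The va_list gp_offset and fp_offset fields index into this block.  */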
7705 save_area = frame_pointer_rtx;
7706 set = get_varargs_alias_set ();
7707
7708 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7709 if (max > X86_64_REGPARM_MAX)
7710 max = X86_64_REGPARM_MAX;
7711
7712 for (i = cum->regno; i < max; i++)
7713 {
7714 mem = gen_rtx_MEM (word_mode,
7715 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7716 MEM_NOTRAP_P (mem) = 1;
7717 set_mem_alias_set (mem, set);
7718 emit_move_insn (mem,
7719 gen_rtx_REG (word_mode,
7720 x86_64_int_parameter_registers[i]));
7721 }
7722
7723 if (ix86_varargs_fpr_size)
7724 {
7725 enum machine_mode smode;
7726 rtx label, test;
7727
7728 /* Now emit code to save SSE registers. The AX parameter contains the number
7729 of SSE parameter registers used to call this function, though all we
7730 actually check here is the zero/non-zero status. */
7731
7732 label = gen_label_rtx ();
7733 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7734 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7735 label));
7736
7737 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7738 we used movdqa (i.e. TImode) instead? Perhaps even better would
7739 be if we could determine the real mode of the data, via a hook
7740 into pass_stdarg. Ignore all that for now. */
7741 smode = V4SFmode;
7742 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7743 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7744
7745 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7746 if (max > X86_64_SSE_REGPARM_MAX)
7747 max = X86_64_SSE_REGPARM_MAX;
7748
7749 for (i = cum->sse_regno; i < max; ++i)
7750 {
7751 mem = plus_constant (Pmode, save_area,
7752 i * 16 + ix86_varargs_gpr_size);
7753 mem = gen_rtx_MEM (smode, mem);
7754 MEM_NOTRAP_P (mem) = 1;
7755 set_mem_alias_set (mem, set);
7756 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7757
7758 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7759 }
7760
7761 emit_label (label);
7762 }
7763 }
7764
7765 static void
7766 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7767 {
7768 alias_set_type set = get_varargs_alias_set ();
7769 int i;
7770
7771 /* Reset to zero, as a sysv va_arg might have been used
7772 before. */
7773 ix86_varargs_gpr_size = 0;
7774 ix86_varargs_fpr_size = 0;
7775
7776 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7777 {
7778 rtx reg, mem;
7779
7780 mem = gen_rtx_MEM (Pmode,
7781 plus_constant (Pmode, virtual_incoming_args_rtx,
7782 i * UNITS_PER_WORD));
7783 MEM_NOTRAP_P (mem) = 1;
7784 set_mem_alias_set (mem, set);
7785
7786 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7787 emit_move_insn (mem, reg);
7788 }
7789 }
7790
7791 static void
7792 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7793 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7794 int no_rtl)
7795 {
7796 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7797 CUMULATIVE_ARGS next_cum;
7798 tree fntype;
7799
7800 /* This argument doesn't appear to be used anymore, which is good,
7801 because the old code here didn't suppress rtl generation. */
7802 gcc_assert (!no_rtl);
7803
7804 if (!TARGET_64BIT)
7805 return;
7806
7807 fntype = TREE_TYPE (current_function_decl);
7808
7809 /* For varargs, we do not want to skip the dummy va_dcl argument.
7810 For stdargs, we do want to skip the last named argument. */
7811 next_cum = *cum;
7812 if (stdarg_p (fntype))
7813 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7814 true);
7815
7816 if (cum->call_abi == MS_ABI)
7817 setup_incoming_varargs_ms_64 (&next_cum);
7818 else
7819 setup_incoming_varargs_64 (&next_cum);
7820 }
7821
7822 /* Checks if TYPE is of kind va_list char *. */
7823
7824 static bool
7825 is_va_list_char_pointer (tree type)
7826 {
7827 tree canonic;
7828
7829 /* For 32-bit it is always true. */
7830 if (!TARGET_64BIT)
7831 return true;
7832 canonic = ix86_canonical_va_list_type (type);
7833 return (canonic == ms_va_list_type_node
7834 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7835 }
7836
7837 /* Implement va_start. */
7838
7839 static void
7840 ix86_va_start (tree valist, rtx nextarg)
7841 {
7842 HOST_WIDE_INT words, n_gpr, n_fpr;
7843 tree f_gpr, f_fpr, f_ovf, f_sav;
7844 tree gpr, fpr, ovf, sav, t;
7845 tree type;
7846 rtx ovf_rtx;
7847
7848 if (flag_split_stack
7849 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7850 {
7851 unsigned int scratch_regno;
7852
7853 /* When we are splitting the stack, we can't refer to the stack
7854 arguments using internal_arg_pointer, because they may be on
7855 the old stack. The split stack prologue will arrange to
7856 leave a pointer to the old stack arguments in a scratch
7857 register, which we here copy to a pseudo-register. The split
7858 stack prologue can't set the pseudo-register directly because
7859 it (the prologue) runs before any registers have been saved. */
7860
7861 scratch_regno = split_stack_prologue_scratch_regno ();
7862 if (scratch_regno != INVALID_REGNUM)
7863 {
7864 rtx reg, seq;
7865
7866 reg = gen_reg_rtx (Pmode);
7867 cfun->machine->split_stack_varargs_pointer = reg;
7868
7869 start_sequence ();
7870 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7871 seq = get_insns ();
7872 end_sequence ();
7873
7874 push_topmost_sequence ();
7875 emit_insn_after (seq, entry_of_function ());
7876 pop_topmost_sequence ();
7877 }
7878 }
7879
7880 /* Only 64bit target needs something special. */
7881 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7882 {
7883 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7884 std_expand_builtin_va_start (valist, nextarg);
7885 else
7886 {
7887 rtx va_r, next;
7888
7889 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7890 next = expand_binop (ptr_mode, add_optab,
7891 cfun->machine->split_stack_varargs_pointer,
7892 crtl->args.arg_offset_rtx,
7893 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7894 convert_move (va_r, next, 0);
7895 }
7896 return;
7897 }
7898
7899 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7900 f_fpr = DECL_CHAIN (f_gpr);
7901 f_ovf = DECL_CHAIN (f_fpr);
7902 f_sav = DECL_CHAIN (f_ovf);
7903
7904 valist = build_simple_mem_ref (valist);
7905 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7906 /* The following should be folded into the MEM_REF offset. */
7907 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7908 f_gpr, NULL_TREE);
7909 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7910 f_fpr, NULL_TREE);
7911 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7912 f_ovf, NULL_TREE);
7913 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7914 f_sav, NULL_TREE);
7915
7916 /* Count number of gp and fp argument registers used. */
7917 words = crtl->args.info.words;
7918 n_gpr = crtl->args.info.regno;
7919 n_fpr = crtl->args.info.sse_regno;
7920
7921 if (cfun->va_list_gpr_size)
7922 {
7923 type = TREE_TYPE (gpr);
7924 t = build2 (MODIFY_EXPR, type,
7925 gpr, build_int_cst (type, n_gpr * 8));
7926 TREE_SIDE_EFFECTS (t) = 1;
7927 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7928 }
7929
7930 if (TARGET_SSE && cfun->va_list_fpr_size)
7931 {
7932 type = TREE_TYPE (fpr);
7933 t = build2 (MODIFY_EXPR, type, fpr,
7934 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7935 TREE_SIDE_EFFECTS (t) = 1;
7936 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7937 }
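  /* Worked example (hypothetical function, for illustration only): in

       void f (int a, double b, ...)

     the named arguments use one GPR and one SSE register, so the code
     above sets gp_offset to 1 * 8 = 8 and fp_offset to
     1 * 16 + 8 * X86_64_REGPARM_MAX = 64.  */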
7938
7939 /* Find the overflow area. */
7940 type = TREE_TYPE (ovf);
7941 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7942 ovf_rtx = crtl->args.internal_arg_pointer;
7943 else
7944 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7945 t = make_tree (type, ovf_rtx);
7946 if (words != 0)
7947 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7948 t = build2 (MODIFY_EXPR, type, ovf, t);
7949 TREE_SIDE_EFFECTS (t) = 1;
7950 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7951
7952 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7953 {
7954 /* Find the register save area.
7955 The function prologue saves it right above the stack frame. */
7956 type = TREE_TYPE (sav);
7957 t = make_tree (type, frame_pointer_rtx);
7958 if (!ix86_varargs_gpr_size)
7959 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7960 t = build2 (MODIFY_EXPR, type, sav, t);
7961 TREE_SIDE_EFFECTS (t) = 1;
7962 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7963 }
7964 }
7965
7966 /* Implement va_arg. */
7967
7968 static tree
7969 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7970 gimple_seq *post_p)
7971 {
7972 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7973 tree f_gpr, f_fpr, f_ovf, f_sav;
7974 tree gpr, fpr, ovf, sav, t;
7975 int size, rsize;
7976 tree lab_false, lab_over = NULL_TREE;
7977 tree addr, t2;
7978 rtx container;
7979 int indirect_p = 0;
7980 tree ptrtype;
7981 enum machine_mode nat_mode;
7982 unsigned int arg_boundary;
7983
7984 /* Only 64bit target needs something special. */
7985 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7986 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7987
7988 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7989 f_fpr = DECL_CHAIN (f_gpr);
7990 f_ovf = DECL_CHAIN (f_fpr);
7991 f_sav = DECL_CHAIN (f_ovf);
7992
7993 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7994 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7995 valist = build_va_arg_indirect_ref (valist);
7996 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7997 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7998 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7999
8000 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8001 if (indirect_p)
8002 type = build_pointer_type (type);
8003 size = int_size_in_bytes (type);
8004 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8005
8006 nat_mode = type_natural_mode (type, NULL);
8007 switch (nat_mode)
8008 {
8009 case V8SFmode:
8010 case V8SImode:
8011 case V32QImode:
8012 case V16HImode:
8013 case V4DFmode:
8014 case V4DImode:
8015 /* Unnamed 256bit vector mode parameters are passed on the stack. */
8016 if (!TARGET_64BIT_MS_ABI)
8017 {
8018 container = NULL;
8019 break;
8020 }
8021
8022 default:
8023 container = construct_container (nat_mode, TYPE_MODE (type),
8024 type, 0, X86_64_REGPARM_MAX,
8025 X86_64_SSE_REGPARM_MAX, intreg,
8026 0);
8027 break;
8028 }
8029
8030 /* Pull the value out of the saved registers. */
8031
8032 addr = create_tmp_var (ptr_type_node, "addr");
8033
8034 if (container)
8035 {
8036 int needed_intregs, needed_sseregs;
8037 bool need_temp;
8038 tree int_addr, sse_addr;
8039
8040 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8041 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8042
8043 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8044
8045 need_temp = (!REG_P (container)
8046 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8047 || TYPE_ALIGN (type) > 128));
8048
8049 /* If we are passing a structure, verify that it is a consecutive block
8050 in the register save area. If not, we need to do moves. */
8051 if (!need_temp && !REG_P (container))
8052 {
8053 /* Verify that all registers are strictly consecutive. */
8054 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8055 {
8056 int i;
8057
8058 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8059 {
8060 rtx slot = XVECEXP (container, 0, i);
8061 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8062 || INTVAL (XEXP (slot, 1)) != i * 16)
8063 need_temp = 1;
8064 }
8065 }
8066 else
8067 {
8068 int i;
8069
8070 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8071 {
8072 rtx slot = XVECEXP (container, 0, i);
8073 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8074 || INTVAL (XEXP (slot, 1)) != i * 8)
8075 need_temp = 1;
8076 }
8077 }
8078 }
8079 if (!need_temp)
8080 {
8081 int_addr = addr;
8082 sse_addr = addr;
8083 }
8084 else
8085 {
8086 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8087 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8088 }
8089
8090 /* First ensure that we fit completely in registers. */
8091 if (needed_intregs)
8092 {
8093 t = build_int_cst (TREE_TYPE (gpr),
8094 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8095 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8096 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8097 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8098 gimplify_and_add (t, pre_p);
8099 }
8100 if (needed_sseregs)
8101 {
8102 t = build_int_cst (TREE_TYPE (fpr),
8103 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8104 + X86_64_REGPARM_MAX * 8);
8105 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8106 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8107 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8108 gimplify_and_add (t, pre_p);
8109 }
8110
8111 /* Compute index to start of area used for integer regs. */
8112 if (needed_intregs)
8113 {
8114 /* int_addr = gpr + sav; */
8115 t = fold_build_pointer_plus (sav, gpr);
8116 gimplify_assign (int_addr, t, pre_p);
8117 }
8118 if (needed_sseregs)
8119 {
8120 /* sse_addr = fpr + sav; */
8121 t = fold_build_pointer_plus (sav, fpr);
8122 gimplify_assign (sse_addr, t, pre_p);
8123 }
8124 if (need_temp)
8125 {
8126 int i, prev_size = 0;
8127 tree temp = create_tmp_var (type, "va_arg_tmp");
8128
8129 /* addr = &temp; */
8130 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8131 gimplify_assign (addr, t, pre_p);
8132
8133 for (i = 0; i < XVECLEN (container, 0); i++)
8134 {
8135 rtx slot = XVECEXP (container, 0, i);
8136 rtx reg = XEXP (slot, 0);
8137 enum machine_mode mode = GET_MODE (reg);
8138 tree piece_type;
8139 tree addr_type;
8140 tree daddr_type;
8141 tree src_addr, src;
8142 int src_offset;
8143 tree dest_addr, dest;
8144 int cur_size = GET_MODE_SIZE (mode);
8145
8146 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8147 prev_size = INTVAL (XEXP (slot, 1));
8148 if (prev_size + cur_size > size)
8149 {
8150 cur_size = size - prev_size;
8151 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8152 if (mode == BLKmode)
8153 mode = QImode;
8154 }
8155 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8156 if (mode == GET_MODE (reg))
8157 addr_type = build_pointer_type (piece_type);
8158 else
8159 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8160 true);
8161 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8162 true);
8163
8164 if (SSE_REGNO_P (REGNO (reg)))
8165 {
8166 src_addr = sse_addr;
8167 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8168 }
8169 else
8170 {
8171 src_addr = int_addr;
8172 src_offset = REGNO (reg) * 8;
8173 }
8174 src_addr = fold_convert (addr_type, src_addr);
8175 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8176
8177 dest_addr = fold_convert (daddr_type, addr);
8178 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8179 if (cur_size == GET_MODE_SIZE (mode))
8180 {
8181 src = build_va_arg_indirect_ref (src_addr);
8182 dest = build_va_arg_indirect_ref (dest_addr);
8183
8184 gimplify_assign (dest, src, pre_p);
8185 }
8186 else
8187 {
8188 tree copy
8189 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8190 3, dest_addr, src_addr,
8191 size_int (cur_size));
8192 gimplify_and_add (copy, pre_p);
8193 }
8194 prev_size += cur_size;
8195 }
8196 }
8197
8198 if (needed_intregs)
8199 {
8200 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8201 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8202 gimplify_assign (gpr, t, pre_p);
8203 }
8204
8205 if (needed_sseregs)
8206 {
8207 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8208 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8209 gimplify_assign (fpr, t, pre_p);
8210 }
8211
8212 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8213
8214 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8215 }
8216
8217 /* ... otherwise out of the overflow area. */
8218
8219 /* When the caller aligns a parameter on the stack, any alignment
8220 beyond MAX_SUPPORTED_STACK_ALIGNMENT is capped at
8221 MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee's expectations
8222 here with the caller's. */
8223 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8224 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8225 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8226
8227 /* Care for on-stack alignment if needed. */
8228 if (arg_boundary <= 64 || size == 0)
8229 t = ovf;
8230 else
8231 {
8232 HOST_WIDE_INT align = arg_boundary / 8;
8233 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8234 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8235 build_int_cst (TREE_TYPE (t), -align));
8236 }
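  /* For instance, with a 16-byte-aligned argument (ALIGN == 16) and an
     overflow pointer ending in 0x1008, the computation above yields
     (0x1008 + 15) & -16 == 0x1010, i.e. the next 16-byte boundary.  */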
8237
8238 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8239 gimplify_assign (addr, t, pre_p);
8240
8241 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8242 gimplify_assign (unshare_expr (ovf), t, pre_p);
8243
8244 if (container)
8245 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8246
8247 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8248 addr = fold_convert (ptrtype, addr);
8249
8250 if (indirect_p)
8251 addr = build_va_arg_indirect_ref (addr);
8252 return build_va_arg_indirect_ref (addr);
8253 }
8254 \f
8255 /* Return true if OPNUM's MEM should be matched
8256 in movabs* patterns. */
8257
8258 bool
8259 ix86_check_movabs (rtx insn, int opnum)
8260 {
8261 rtx set, mem;
8262
8263 set = PATTERN (insn);
8264 if (GET_CODE (set) == PARALLEL)
8265 set = XVECEXP (set, 0, 0);
8266 gcc_assert (GET_CODE (set) == SET);
8267 mem = XEXP (set, opnum);
8268 while (GET_CODE (mem) == SUBREG)
8269 mem = SUBREG_REG (mem);
8270 gcc_assert (MEM_P (mem));
8271 return volatile_ok || !MEM_VOLATILE_P (mem);
8272 }
8273 \f
8274 /* Initialize the table of extra 80387 mathematical constants. */
8275
8276 static void
8277 init_ext_80387_constants (void)
8278 {
8279 static const char * cst[5] =
8280 {
8281 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8282 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8283 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8284 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8285 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8286 };
8287 int i;
8288
8289 for (i = 0; i < 5; i++)
8290 {
8291 real_from_string (&ext_80387_constants_table[i], cst[i]);
8292 /* Ensure each constant is rounded to XFmode precision. */
8293 real_convert (&ext_80387_constants_table[i],
8294 XFmode, &ext_80387_constants_table[i]);
8295 }
8296
8297 ext_80387_constants_init = 1;
8298 }
8299
8300 /* Return non-zero if the constant is something that
8301 can be loaded with a special instruction. */
8302
8303 int
8304 standard_80387_constant_p (rtx x)
8305 {
8306 enum machine_mode mode = GET_MODE (x);
8307
8308 REAL_VALUE_TYPE r;
8309
8310 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8311 return -1;
8312
8313 if (x == CONST0_RTX (mode))
8314 return 1;
8315 if (x == CONST1_RTX (mode))
8316 return 2;
8317
8318 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8319
8320 /* For XFmode constants, try to find a special 80387 instruction when
8321 optimizing for size or on those CPUs that benefit from them. */
8322 if (mode == XFmode
8323 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8324 {
8325 int i;
8326
8327 if (! ext_80387_constants_init)
8328 init_ext_80387_constants ();
8329
8330 for (i = 0; i < 5; i++)
8331 if (real_identical (&r, &ext_80387_constants_table[i]))
8332 return i + 3;
8333 }
8334
8335 /* A load of the constant -0.0 or -1.0 will be split into an
8336 fldz;fchs or fld1;fchs sequence. */
8337 if (real_isnegzero (&r))
8338 return 8;
8339 if (real_identical (&r, &dconstm1))
8340 return 9;
8341
8342 return 0;
8343 }
8344
8345 /* Return the opcode of the special instruction to be used to load
8346 the constant X. */
8347
8348 const char *
8349 standard_80387_constant_opcode (rtx x)
8350 {
8351 switch (standard_80387_constant_p (x))
8352 {
8353 case 1:
8354 return "fldz";
8355 case 2:
8356 return "fld1";
8357 case 3:
8358 return "fldlg2";
8359 case 4:
8360 return "fldln2";
8361 case 5:
8362 return "fldl2e";
8363 case 6:
8364 return "fldl2t";
8365 case 7:
8366 return "fldpi";
8367 case 8:
8368 case 9:
8369 return "#";
8370 default:
8371 gcc_unreachable ();
8372 }
8373 }
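/* For example, an XFmode constant equal to log2(10) makes
   standard_80387_constant_p return 6 (when the special constants are
   enabled), so the value is materialized with a single "fldl2t" rather
   than a constant-pool load.  */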
8374
8375 /* Return the CONST_DOUBLE representing the 80387 constant that is
8376 loaded by the specified special instruction. The argument IDX
8377 matches the return value from standard_80387_constant_p. */
8378
8379 rtx
8380 standard_80387_constant_rtx (int idx)
8381 {
8382 int i;
8383
8384 if (! ext_80387_constants_init)
8385 init_ext_80387_constants ();
8386
8387 switch (idx)
8388 {
8389 case 3:
8390 case 4:
8391 case 5:
8392 case 6:
8393 case 7:
8394 i = idx - 3;
8395 break;
8396
8397 default:
8398 gcc_unreachable ();
8399 }
8400
8401 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8402 XFmode);
8403 }
8404
8405 /* Return 1 if X is all zeros and 2 if X is all ones, in a supported
8406 SSE/AVX vector mode; return 0 otherwise. */
8407
8408 int
8409 standard_sse_constant_p (rtx x)
8410 {
8411 enum machine_mode mode = GET_MODE (x);
8412
8413 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8414 return 1;
8415 if (vector_all_ones_operand (x, mode))
8416 switch (mode)
8417 {
8418 case V16QImode:
8419 case V8HImode:
8420 case V4SImode:
8421 case V2DImode:
8422 if (TARGET_SSE2)
8423 return 2;
8424 case V32QImode:
8425 case V16HImode:
8426 case V8SImode:
8427 case V4DImode:
8428 if (TARGET_AVX2)
8429 return 2;
8430 default:
8431 break;
8432 }
8433
8434 return 0;
8435 }
8436
8437 /* Return the opcode of the special instruction to be used to load
8438 the constant X. */
8439
8440 const char *
8441 standard_sse_constant_opcode (rtx insn, rtx x)
8442 {
8443 switch (standard_sse_constant_p (x))
8444 {
8445 case 1:
8446 switch (get_attr_mode (insn))
8447 {
8448 case MODE_TI:
8449 return "%vpxor\t%0, %d0";
8450 case MODE_V2DF:
8451 return "%vxorpd\t%0, %d0";
8452 case MODE_V4SF:
8453 return "%vxorps\t%0, %d0";
8454
8455 case MODE_OI:
8456 return "vpxor\t%x0, %x0, %x0";
8457 case MODE_V4DF:
8458 return "vxorpd\t%x0, %x0, %x0";
8459 case MODE_V8SF:
8460 return "vxorps\t%x0, %x0, %x0";
8461
8462 default:
8463 break;
8464 }
8465
8466 case 2:
8467 if (TARGET_AVX)
8468 return "vpcmpeqd\t%0, %0, %0";
8469 else
8470 return "pcmpeqd\t%0, %0";
8471
8472 default:
8473 break;
8474 }
8475 gcc_unreachable ();
8476 }
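/* For example, a V4SImode constant with all bits set yields 2 above, so
   the register is materialized with "pcmpeqd %xmm0, %xmm0" (or its VEX
   form under AVX) instead of being loaded from memory.  */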
8477
8478 /* Returns true if OP contains a symbol reference */
8479
8480 bool
8481 symbolic_reference_mentioned_p (rtx op)
8482 {
8483 const char *fmt;
8484 int i;
8485
8486 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8487 return true;
8488
8489 fmt = GET_RTX_FORMAT (GET_CODE (op));
8490 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8491 {
8492 if (fmt[i] == 'E')
8493 {
8494 int j;
8495
8496 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8497 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8498 return true;
8499 }
8500
8501 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8502 return true;
8503 }
8504
8505 return false;
8506 }
8507
8508 /* Return true if it is appropriate to emit `ret' instructions in the
8509 body of a function. Do this only if the epilogue is simple, needing a
8510 couple of insns. Prior to reloading, we can't tell how many registers
8511 must be saved, so return false then. Return false if there is no frame
8512 marker to de-allocate. */
8513
8514 bool
8515 ix86_can_use_return_insn_p (void)
8516 {
8517 struct ix86_frame frame;
8518
8519 if (! reload_completed || frame_pointer_needed)
8520 return 0;
8521
8522 /* Don't allow popping more than 32k of arguments, since that's all
8523 we can do with one instruction. */
8524 if (crtl->args.pops_args && crtl->args.size >= 32768)
8525 return 0;
8526
8527 ix86_compute_frame_layout (&frame);
8528 return (frame.stack_pointer_offset == UNITS_PER_WORD
8529 && (frame.nregs + frame.nsseregs) == 0);
8530 }
8531 \f
8532 /* Value should be nonzero if functions must have frame pointers.
8533 Zero means the frame pointer need not be set up (and parms may
8534 be accessed via the stack pointer) in functions that seem suitable. */
8535
8536 static bool
8537 ix86_frame_pointer_required (void)
8538 {
8539 /* If we accessed previous frames, then the generated code expects
8540 to be able to access the saved ebp value in our frame. */
8541 if (cfun->machine->accesses_prev_frame)
8542 return true;
8543
8544 /* Several x86 OSes need a frame pointer for other reasons,
8545 usually pertaining to setjmp. */
8546 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8547 return true;
8548
8549 /* For older 32-bit runtimes, setjmp requires a valid frame pointer. */
8550 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8551 return true;
8552
8553 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8554 turns off the frame pointer by default. Turn it back on now if
8555 we've not got a leaf function. */
8556 if (TARGET_OMIT_LEAF_FRAME_POINTER
8557 && (!current_function_is_leaf
8558 || ix86_current_function_calls_tls_descriptor))
8559 return true;
8560
8561 if (crtl->profile && !flag_fentry)
8562 return true;
8563
8564 return false;
8565 }
8566
8567 /* Record that the current function accesses previous call frames. */
8568
8569 void
8570 ix86_setup_frame_addresses (void)
8571 {
8572 cfun->machine->accesses_prev_frame = 1;
8573 }
8574 \f
8575 #ifndef USE_HIDDEN_LINKONCE
8576 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8577 # define USE_HIDDEN_LINKONCE 1
8578 # else
8579 # define USE_HIDDEN_LINKONCE 0
8580 # endif
8581 #endif
8582
8583 static int pic_labels_used;
8584
8585 /* Fills in the label name that should be used for a pc thunk for
8586 the given register. */
8587
8588 static void
8589 get_pc_thunk_name (char name[32], unsigned int regno)
8590 {
8591 gcc_assert (!TARGET_64BIT);
8592
8593 if (USE_HIDDEN_LINKONCE)
8594 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8595 else
8596 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8597 }
8598
8599
8600 /* This function generates the -fpic pc thunks: tiny functions that
8601 load a register with the return address of the caller and then return. */
8602
8603 static void
8604 ix86_code_end (void)
8605 {
8606 rtx xops[2];
8607 int regno;
8608
8609 for (regno = AX_REG; regno <= SP_REG; regno++)
8610 {
8611 char name[32];
8612 tree decl;
8613
8614 if (!(pic_labels_used & (1 << regno)))
8615 continue;
8616
8617 get_pc_thunk_name (name, regno);
8618
8619 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8620 get_identifier (name),
8621 build_function_type_list (void_type_node, NULL_TREE));
8622 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8623 NULL_TREE, void_type_node);
8624 TREE_PUBLIC (decl) = 1;
8625 TREE_STATIC (decl) = 1;
8626 DECL_IGNORED_P (decl) = 1;
8627
8628 #if TARGET_MACHO
8629 if (TARGET_MACHO)
8630 {
8631 switch_to_section (darwin_sections[text_coal_section]);
8632 fputs ("\t.weak_definition\t", asm_out_file);
8633 assemble_name (asm_out_file, name);
8634 fputs ("\n\t.private_extern\t", asm_out_file);
8635 assemble_name (asm_out_file, name);
8636 putc ('\n', asm_out_file);
8637 ASM_OUTPUT_LABEL (asm_out_file, name);
8638 DECL_WEAK (decl) = 1;
8639 }
8640 else
8641 #endif
8642 if (USE_HIDDEN_LINKONCE)
8643 {
8644 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8645
8646 targetm.asm_out.unique_section (decl, 0);
8647 switch_to_section (get_named_section (decl, NULL, 0));
8648
8649 targetm.asm_out.globalize_label (asm_out_file, name);
8650 fputs ("\t.hidden\t", asm_out_file);
8651 assemble_name (asm_out_file, name);
8652 putc ('\n', asm_out_file);
8653 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8654 }
8655 else
8656 {
8657 switch_to_section (text_section);
8658 ASM_OUTPUT_LABEL (asm_out_file, name);
8659 }
8660
8661 DECL_INITIAL (decl) = make_node (BLOCK);
8662 current_function_decl = decl;
8663 init_function_start (decl);
8664 first_function_block_is_cold = false;
8665 /* Make sure unwind info is emitted for the thunk if needed. */
8666 final_start_function (emit_barrier (), asm_out_file, 1);
8667
8668 /* Pad stack IP move with 4 instructions (two NOPs count
8669 as one instruction). */
8670 if (TARGET_PAD_SHORT_FUNCTION)
8671 {
8672 int i = 8;
8673
8674 while (i--)
8675 fputs ("\tnop\n", asm_out_file);
8676 }
8677
8678 xops[0] = gen_rtx_REG (Pmode, regno);
8679 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8680 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8681 fputs ("\tret\n", asm_out_file);
8682 final_end_function ();
8683 init_insn_lengths ();
8684 free_after_compilation (cfun);
8685 set_cfun (NULL);
8686 current_function_decl = NULL;
8687 }
8688
8689 if (flag_split_stack)
8690 file_end_indicate_split_stack ();
8691 }
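/* Roughly, the thunk emitted above for, say, %ebx is just

     __x86.get_pc_thunk.bx:
             movl    (%esp), %ebx
             ret

   (plus padding nops on targets that ask for short-function padding).  */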
8692
8693 /* Emit code for the SET_GOT patterns. */
8694
8695 const char *
8696 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8697 {
8698 rtx xops[3];
8699
8700 xops[0] = dest;
8701
8702 if (TARGET_VXWORKS_RTP && flag_pic)
8703 {
8704 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8705 xops[2] = gen_rtx_MEM (Pmode,
8706 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8707 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8708
8709 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8710 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8711 an unadorned address. */
8712 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8713 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8714 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8715 return "";
8716 }
8717
8718 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8719
8720 if (!flag_pic)
8721 {
8722 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8723
8724 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8725
8726 #if TARGET_MACHO
8727 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8728 is what will be referenced by the Mach-O PIC subsystem. */
8729 if (!label)
8730 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8731 #endif
8732
8733 targetm.asm_out.internal_label (asm_out_file, "L",
8734 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8735 }
8736 else
8737 {
8738 char name[32];
8739 get_pc_thunk_name (name, REGNO (dest));
8740 pic_labels_used |= 1 << REGNO (dest);
8741
8742 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8743 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8744 output_asm_insn ("call\t%X2", xops);
8745 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8746 is what will be referenced by the Mach-O PIC subsystem. */
8747 #if TARGET_MACHO
8748 if (!label)
8749 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8750 else
8751 targetm.asm_out.internal_label (asm_out_file, "L",
8752 CODE_LABEL_NUMBER (label));
8753 #endif
8754 }
8755
8756 if (!TARGET_MACHO)
8757 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8758
8759 return "";
8760 }
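/* For example, in the common 32-bit ELF PIC case the sequence emitted
   above to load the GOT pointer into %ebx is roughly

	call	<pc thunk for %ebx>		# name from get_pc_thunk_name
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   where the thunk loads the return address (the current PC) into %ebx
   and the add rebases it to the GOT.  (Illustrative sketch; the exact
   thunk name and operand syntax depend on the target.)  */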
8761
8762 /* Generate a "push" pattern for input ARG. */
8763
8764 static rtx
8765 gen_push (rtx arg)
8766 {
8767 struct machine_function *m = cfun->machine;
8768
8769 if (m->fs.cfa_reg == stack_pointer_rtx)
8770 m->fs.cfa_offset += UNITS_PER_WORD;
8771 m->fs.sp_offset += UNITS_PER_WORD;
8772
8773 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8774 arg = gen_rtx_REG (word_mode, REGNO (arg));
8775
8776 return gen_rtx_SET (VOIDmode,
8777 gen_rtx_MEM (word_mode,
8778 gen_rtx_PRE_DEC (Pmode,
8779 stack_pointer_rtx)),
8780 arg);
8781 }
8782
8783 /* Generate a "pop" pattern for input ARG. */
8784
8785 static rtx
8786 gen_pop (rtx arg)
8787 {
8788 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8789 arg = gen_rtx_REG (word_mode, REGNO (arg));
8790
8791 return gen_rtx_SET (VOIDmode,
8792 arg,
8793 gen_rtx_MEM (word_mode,
8794 gen_rtx_POST_INC (Pmode,
8795 stack_pointer_rtx)));
8796 }
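/* As a concrete example, on a 64-bit target gen_push for %rdi builds

	(set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI di))

   and gen_pop builds the mirror image using post_inc; the push/pop insn
   patterns then match these forms.  (Illustrative sketch of the
   generated RTL only.)  */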
8797
8798 /* Return the number of an unused call-clobbered register that is available
8799 for the entire function, or INVALID_REGNUM if there is none. */
8800
8801 static unsigned int
8802 ix86_select_alt_pic_regnum (void)
8803 {
8804 if (current_function_is_leaf
8805 && !crtl->profile
8806 && !ix86_current_function_calls_tls_descriptor)
8807 {
8808 int i, drap;
8809 /* Can't use the same register for both PIC and DRAP. */
8810 if (crtl->drap_reg)
8811 drap = REGNO (crtl->drap_reg);
8812 else
8813 drap = -1;
8814 for (i = 2; i >= 0; --i)
8815 if (i != drap && !df_regs_ever_live_p (i))
8816 return i;
8817 }
8818
8819 return INVALID_REGNUM;
8820 }
8821
8822 /* Return TRUE if we need to save REGNO. */
8823
8824 static bool
8825 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8826 {
8827 if (pic_offset_table_rtx
8828 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8829 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8830 || crtl->profile
8831 || crtl->calls_eh_return
8832 || crtl->uses_const_pool))
8833 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8834
8835 if (crtl->calls_eh_return && maybe_eh_return)
8836 {
8837 unsigned i;
8838 for (i = 0; ; i++)
8839 {
8840 unsigned test = EH_RETURN_DATA_REGNO (i);
8841 if (test == INVALID_REGNUM)
8842 break;
8843 if (test == regno)
8844 return true;
8845 }
8846 }
8847
8848 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8849 return true;
8850
8851 return (df_regs_ever_live_p (regno)
8852 && !call_used_regs[regno]
8853 && !fixed_regs[regno]
8854 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8855 }
8856
8857 /* Return the number of saved general purpose registers. */
8858
8859 static int
8860 ix86_nsaved_regs (void)
8861 {
8862 int nregs = 0;
8863 int regno;
8864
8865 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8866 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8867 nregs ++;
8868 return nregs;
8869 }
8870
8871 /* Return the number of saved SSE registers. */
8872
8873 static int
8874 ix86_nsaved_sseregs (void)
8875 {
8876 int nregs = 0;
8877 int regno;
8878
8879 if (!TARGET_64BIT_MS_ABI)
8880 return 0;
8881 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8882 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8883 nregs ++;
8884 return nregs;
8885 }
8886
8887 /* Given FROM and TO register numbers, say whether this elimination is
8888 allowed. If stack alignment is needed, we can only replace argument
8889 pointer with hard frame pointer, or replace frame pointer with stack
8890 pointer. Otherwise, frame pointer elimination is automatically
8891 handled and all other eliminations are valid. */
8892
8893 static bool
8894 ix86_can_eliminate (const int from, const int to)
8895 {
8896 if (stack_realign_fp)
8897 return ((from == ARG_POINTER_REGNUM
8898 && to == HARD_FRAME_POINTER_REGNUM)
8899 || (from == FRAME_POINTER_REGNUM
8900 && to == STACK_POINTER_REGNUM));
8901 else
8902 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8903 }
8904
8905 /* Return the offset between two registers, one to be eliminated, and the other
8906 its replacement, at the start of a routine. */
8907
8908 HOST_WIDE_INT
8909 ix86_initial_elimination_offset (int from, int to)
8910 {
8911 struct ix86_frame frame;
8912 ix86_compute_frame_layout (&frame);
8913
8914 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8915 return frame.hard_frame_pointer_offset;
8916 else if (from == FRAME_POINTER_REGNUM
8917 && to == HARD_FRAME_POINTER_REGNUM)
8918 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8919 else
8920 {
8921 gcc_assert (to == STACK_POINTER_REGNUM);
8922
8923 if (from == ARG_POINTER_REGNUM)
8924 return frame.stack_pointer_offset;
8925
8926 gcc_assert (from == FRAME_POINTER_REGNUM);
8927 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8928 }
8929 }
8930
8931 /* In a dynamically-aligned function, we can't know the offset from
8932 stack pointer to frame pointer, so we must ensure that setjmp
8933 eliminates fp against the hard fp (%ebp) rather than trying to
8934 index from %esp up to the top of the frame across a gap that is
8935 of unknown (at compile-time) size. */
8936 static rtx
8937 ix86_builtin_setjmp_frame_value (void)
8938 {
8939 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8940 }
8941
8942 /* When using -fsplit-stack, the allocation routines set a field in
8943 the TCB to the bottom of the stack plus this much space, measured
8944 in bytes. */
8945
8946 #define SPLIT_STACK_AVAILABLE 256
8947
8948 /* Fill structure ix86_frame about frame of currently computed function. */
8949
8950 static void
8951 ix86_compute_frame_layout (struct ix86_frame *frame)
8952 {
8953 unsigned HOST_WIDE_INT stack_alignment_needed;
8954 HOST_WIDE_INT offset;
8955 unsigned HOST_WIDE_INT preferred_alignment;
8956 HOST_WIDE_INT size = get_frame_size ();
8957 HOST_WIDE_INT to_allocate;
8958
8959 frame->nregs = ix86_nsaved_regs ();
8960 frame->nsseregs = ix86_nsaved_sseregs ();
8961
8962 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8963 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8964
8965 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
8966 except in function prologues and in leaf functions. */
8967 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8968 && (!current_function_is_leaf || cfun->calls_alloca != 0
8969 || ix86_current_function_calls_tls_descriptor))
8970 {
8971 preferred_alignment = 16;
8972 stack_alignment_needed = 16;
8973 crtl->preferred_stack_boundary = 128;
8974 crtl->stack_alignment_needed = 128;
8975 }
8976
8977 gcc_assert (!size || stack_alignment_needed);
8978 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8979 gcc_assert (preferred_alignment <= stack_alignment_needed);
8980
8981 /* For SEH we have to limit the amount of code movement into the prologue.
8982 At present we do this via a BLOCKAGE, at which point there's very little
8983 scheduling that can be done, which means that there's very little point
8984 in doing anything except PUSHs. */
8985 if (TARGET_SEH)
8986 cfun->machine->use_fast_prologue_epilogue = false;
8987
8988 /* During reload iterations the number of saved registers can change.
8989 Recompute the value as needed. Do not recompute when the number of
8990 registers didn't change, as reload calls this function multiple times
8991 and does not expect the decision to change within a single iteration. */
8992 else if (!optimize_function_for_size_p (cfun)
8993 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8994 {
8995 int count = frame->nregs;
8996 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8997
8998 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8999
9000 /* The fast prologue uses move instead of push to save registers. This
9001 is significantly longer, but also executes faster as modern hardware
9002 can execute the moves in parallel, but can't do that for push/pop.
9003
9004 Be careful about choosing which prologue to emit: when the function
9005 takes many instructions to execute, we may as well use the slow version;
9006 likewise when the function is known to be outside a hot spot (this is
9007 known with feedback only). Weight the size of the function by the number
9008 of registers to save, as it is cheap to use one or two push instructions
9009 but very slow to use many of them. */
9010 if (count)
9011 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9012 if (node->frequency < NODE_FREQUENCY_NORMAL
9013 || (flag_branch_probabilities
9014 && node->frequency < NODE_FREQUENCY_HOT))
9015 cfun->machine->use_fast_prologue_epilogue = false;
9016 else
9017 cfun->machine->use_fast_prologue_epilogue
9018 = !expensive_function_p (count);
9019 }
9020
9021 frame->save_regs_using_mov
9022 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9023 /* If static stack checking is enabled and done with probes,
9024 the registers need to be saved before allocating the frame. */
9025 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9026
9027 /* Skip return address. */
9028 offset = UNITS_PER_WORD;
9029
9030 /* Skip pushed static chain. */
9031 if (ix86_static_chain_on_stack)
9032 offset += UNITS_PER_WORD;
9033
9034 /* Skip saved base pointer. */
9035 if (frame_pointer_needed)
9036 offset += UNITS_PER_WORD;
9037 frame->hfp_save_offset = offset;
9038
9039 /* The traditional frame pointer location is at the top of the frame. */
9040 frame->hard_frame_pointer_offset = offset;
9041
9042 /* Register save area */
9043 offset += frame->nregs * UNITS_PER_WORD;
9044 frame->reg_save_offset = offset;
9045
9046 /* Align and set SSE register save area. */
9047 if (frame->nsseregs)
9048 {
9049 /* The only ABI that has saved SSE registers (Win64) also has a
9050 16-byte aligned default stack, and thus we don't need to be
9051 within the re-aligned local stack frame to save them. */
9052 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9053 offset = (offset + 16 - 1) & -16;
9054 offset += frame->nsseregs * 16;
9055 }
9056 frame->sse_reg_save_offset = offset;
9057
9058 /* The re-aligned stack starts here. Values before this point are not
9059 directly comparable with values below this point. In order to make
9060 sure that no value happens to be the same before and after, force
9061 the alignment computation below to add a non-zero value. */
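/* For example, with a 32-byte stack_alignment_needed an offset of 32
   becomes (32 + 32) & -32 = 64 and an offset of 40 becomes
   (40 + 32) & -32 = 64, so the new offset is always strictly larger
   than the old one.  (Worked example of the adjustment below.)  */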
9062 if (stack_realign_fp)
9063 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9064
9065 /* Va-arg area */
9066 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9067 offset += frame->va_arg_size;
9068
9069 /* Align start of frame for local function. */
9070 if (stack_realign_fp
9071 || offset != frame->sse_reg_save_offset
9072 || size != 0
9073 || !current_function_is_leaf
9074 || cfun->calls_alloca
9075 || ix86_current_function_calls_tls_descriptor)
9076 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9077
9078 /* Frame pointer points here. */
9079 frame->frame_pointer_offset = offset;
9080
9081 offset += size;
9082
9083 /* Add outgoing arguments area. Can be skipped if we eliminated
9084 all the function calls as dead code.
9085 Skipping is however impossible when function calls alloca. Alloca
9086 expander assumes that last crtl->outgoing_args_size
9087 of stack frame are unused. */
9088 if (ACCUMULATE_OUTGOING_ARGS
9089 && (!current_function_is_leaf || cfun->calls_alloca
9090 || ix86_current_function_calls_tls_descriptor))
9091 {
9092 offset += crtl->outgoing_args_size;
9093 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9094 }
9095 else
9096 frame->outgoing_arguments_size = 0;
9097
9098 /* Align stack boundary. Only needed if we're calling another function
9099 or using alloca. */
9100 if (!current_function_is_leaf || cfun->calls_alloca
9101 || ix86_current_function_calls_tls_descriptor)
9102 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9103
9104 /* We've reached end of stack frame. */
9105 frame->stack_pointer_offset = offset;
9106
9107 /* Size prologue needs to allocate. */
9108 to_allocate = offset - frame->sse_reg_save_offset;
9109
9110 if ((!to_allocate && frame->nregs <= 1)
9111 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9112 frame->save_regs_using_mov = false;
9113
9114 if (ix86_using_red_zone ()
9115 && current_function_sp_is_unchanging
9116 && current_function_is_leaf
9117 && !ix86_current_function_calls_tls_descriptor)
9118 {
9119 frame->red_zone_size = to_allocate;
9120 if (frame->save_regs_using_mov)
9121 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9122 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9123 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9124 }
9125 else
9126 frame->red_zone_size = 0;
9127 frame->stack_pointer_offset -= frame->red_zone_size;
9128
9129 /* The SEH frame pointer location is near the bottom of the frame.
9130 This is enforced by the fact that the difference between the
9131 stack pointer and the frame pointer is limited to 240 bytes in
9132 the unwind data structure. */
9133 if (TARGET_SEH)
9134 {
9135 HOST_WIDE_INT diff;
9136
9137 /* If we can leave the frame pointer where it is, do so. */
9138 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9139 if (diff > 240 || (diff & 15) != 0)
9140 {
9141 /* Ideally we'd determine what portion of the local stack frame
9142 (within the constraint of the lowest 240) is most heavily used.
9143 But without that complication, simply bias the frame pointer
9144 by 128 bytes so as to maximize the amount of the local stack
9145 frame that is addressable with 8-bit offsets. */
9146 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9147 }
9148 }
9149 }
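/* A rough sketch of the layout computed above, with offsets measured
   downward from the CFA:

	return address
	pushed static chain		(if ix86_static_chain_on_stack)
	saved frame pointer		(if frame_pointer_needed)
					<- hard_frame_pointer_offset
	GP register save area		(nregs words)
					<- reg_save_offset
	SSE register save area		(nsseregs * 16, 16-byte aligned)
					<- sse_reg_save_offset
	realignment padding		(if stack_realign_fp)
	va_arg register save area
					<- frame_pointer_offset
	local variables			(get_frame_size () bytes)
	outgoing argument area
					<- stack_pointer_offset

   (Informal summary only; the red-zone and TARGET_SEH adjustments at the
   end of the function can modify stack_pointer_offset and
   hard_frame_pointer_offset respectively.)  */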
9150
9151 /* This is semi-inlined memory_address_length, but simplified
9152 since we know that we're always dealing with reg+offset, and
9153 to avoid having to create and discard all that rtl. */
9154
9155 static inline int
9156 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9157 {
9158 int len = 4;
9159
9160 if (offset == 0)
9161 {
9162 /* EBP and R13 cannot be encoded without an offset. */
9163 len = (regno == BP_REG || regno == R13_REG);
9164 }
9165 else if (IN_RANGE (offset, -128, 127))
9166 len = 1;
9167
9168 /* ESP and R12 must be encoded with a SIB byte. */
9169 if (regno == SP_REG || regno == R12_REG)
9170 len++;
9171
9172 return len;
9173 }
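/* A few worked examples of the length returned above (bytes of address
   encoding beyond the opcode and ModRM byte):

	(%eax)		-> 0	no displacement, no SIB
	(%ebp)		-> 1	needs a zero disp8
	(%esp)		-> 1	needs a SIB byte
	8(%esp)		-> 2	disp8 + SIB
	1024(%ebx)	-> 4	disp32

   (Illustrative only; memory_address_length handles the general case.)  */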
9174
9175 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9176 The valid base registers are taken from CFUN->MACHINE->FS. */
9177
9178 static rtx
9179 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9180 {
9181 const struct machine_function *m = cfun->machine;
9182 rtx base_reg = NULL;
9183 HOST_WIDE_INT base_offset = 0;
9184
9185 if (m->use_fast_prologue_epilogue)
9186 {
9187 /* Choose the base register most likely to allow the most scheduling
9188 opportunities. Generally FP is valid throughout the function,
9189 while DRAP must be reloaded within the epilogue. But choose either
9190 over the SP due to increased encoding size. */
9191
9192 if (m->fs.fp_valid)
9193 {
9194 base_reg = hard_frame_pointer_rtx;
9195 base_offset = m->fs.fp_offset - cfa_offset;
9196 }
9197 else if (m->fs.drap_valid)
9198 {
9199 base_reg = crtl->drap_reg;
9200 base_offset = 0 - cfa_offset;
9201 }
9202 else if (m->fs.sp_valid)
9203 {
9204 base_reg = stack_pointer_rtx;
9205 base_offset = m->fs.sp_offset - cfa_offset;
9206 }
9207 }
9208 else
9209 {
9210 HOST_WIDE_INT toffset;
9211 int len = 16, tlen;
9212
9213 /* Choose the base register with the smallest address encoding.
9214 With a tie, choose FP > DRAP > SP. */
9215 if (m->fs.sp_valid)
9216 {
9217 base_reg = stack_pointer_rtx;
9218 base_offset = m->fs.sp_offset - cfa_offset;
9219 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9220 }
9221 if (m->fs.drap_valid)
9222 {
9223 toffset = 0 - cfa_offset;
9224 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9225 if (tlen <= len)
9226 {
9227 base_reg = crtl->drap_reg;
9228 base_offset = toffset;
9229 len = tlen;
9230 }
9231 }
9232 if (m->fs.fp_valid)
9233 {
9234 toffset = m->fs.fp_offset - cfa_offset;
9235 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9236 if (tlen <= len)
9237 {
9238 base_reg = hard_frame_pointer_rtx;
9239 base_offset = toffset;
9240 len = tlen;
9241 }
9242 }
9243 }
9244 gcc_assert (base_reg != NULL);
9245
9246 return plus_constant (Pmode, base_reg, base_offset);
9247 }
9248
9249 /* Emit code to save registers in the prologue. */
9250
9251 static void
9252 ix86_emit_save_regs (void)
9253 {
9254 unsigned int regno;
9255 rtx insn;
9256
9257 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9258 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9259 {
9260 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9261 RTX_FRAME_RELATED_P (insn) = 1;
9262 }
9263 }
9264
9265 /* Emit a single register save at CFA - CFA_OFFSET. */
9266
9267 static void
9268 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9269 HOST_WIDE_INT cfa_offset)
9270 {
9271 struct machine_function *m = cfun->machine;
9272 rtx reg = gen_rtx_REG (mode, regno);
9273 rtx mem, addr, base, insn;
9274
9275 addr = choose_baseaddr (cfa_offset);
9276 mem = gen_frame_mem (mode, addr);
9277
9278 /* For SSE saves, we need to indicate the 128-bit alignment. */
9279 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9280
9281 insn = emit_move_insn (mem, reg);
9282 RTX_FRAME_RELATED_P (insn) = 1;
9283
9284 base = addr;
9285 if (GET_CODE (base) == PLUS)
9286 base = XEXP (base, 0);
9287 gcc_checking_assert (REG_P (base));
9288
9289 /* When saving registers into a re-aligned local stack frame, avoid
9290 any tricky guessing by dwarf2out. */
9291 if (m->fs.realigned)
9292 {
9293 gcc_checking_assert (stack_realign_drap);
9294
9295 if (regno == REGNO (crtl->drap_reg))
9296 {
9297 /* A bit of a hack. We force the DRAP register to be saved in
9298 the re-aligned stack frame, which provides us with a copy
9299 of the CFA that will last past the prologue. Install it. */
9300 gcc_checking_assert (cfun->machine->fs.fp_valid);
9301 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9302 cfun->machine->fs.fp_offset - cfa_offset);
9303 mem = gen_rtx_MEM (mode, addr);
9304 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9305 }
9306 else
9307 {
9308 /* The frame pointer is a stable reference within the
9309 aligned frame. Use it. */
9310 gcc_checking_assert (cfun->machine->fs.fp_valid);
9311 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9312 cfun->machine->fs.fp_offset - cfa_offset);
9313 mem = gen_rtx_MEM (mode, addr);
9314 add_reg_note (insn, REG_CFA_EXPRESSION,
9315 gen_rtx_SET (VOIDmode, mem, reg));
9316 }
9317 }
9318
9319 /* The memory may not be relative to the current CFA register,
9320 which means that we may need to generate a new pattern for
9321 use by the unwind info. */
9322 else if (base != m->fs.cfa_reg)
9323 {
9324 addr = plus_constant (Pmode, m->fs.cfa_reg,
9325 m->fs.cfa_offset - cfa_offset);
9326 mem = gen_rtx_MEM (mode, addr);
9327 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9328 }
9329 }
9330
9331 /* Emit code to save registers using MOV insns.
9332 First register is stored at CFA - CFA_OFFSET. */
9333 static void
9334 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9335 {
9336 unsigned int regno;
9337
9338 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9339 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9340 {
9341 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9342 cfa_offset -= UNITS_PER_WORD;
9343 }
9344 }
9345
9346 /* Emit code to save SSE registers using MOV insns.
9347 First register is stored at CFA - CFA_OFFSET. */
9348 static void
9349 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9350 {
9351 unsigned int regno;
9352
9353 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9354 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9355 {
9356 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9357 cfa_offset -= 16;
9358 }
9359 }
9360
9361 static GTY(()) rtx queued_cfa_restores;
9362
9363 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9364 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9365 Don't add the note if the previously saved value will be left untouched
9366 within the stack red-zone till return, as unwinders can find the same value
9367 in the register and on the stack. */
9368
9369 static void
9370 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9371 {
9372 if (!crtl->shrink_wrapped
9373 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9374 return;
9375
9376 if (insn)
9377 {
9378 add_reg_note (insn, REG_CFA_RESTORE, reg);
9379 RTX_FRAME_RELATED_P (insn) = 1;
9380 }
9381 else
9382 queued_cfa_restores
9383 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9384 }
9385
9386 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9387
9388 static void
9389 ix86_add_queued_cfa_restore_notes (rtx insn)
9390 {
9391 rtx last;
9392 if (!queued_cfa_restores)
9393 return;
9394 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9395 ;
9396 XEXP (last, 1) = REG_NOTES (insn);
9397 REG_NOTES (insn) = queued_cfa_restores;
9398 queued_cfa_restores = NULL_RTX;
9399 RTX_FRAME_RELATED_P (insn) = 1;
9400 }
9401
9402 /* Expand prologue or epilogue stack adjustment.
9403 The pattern exists to put a dependency on all ebp-based memory accesses.
9404 STYLE should be negative if instructions should be marked as frame related,
9405 zero if the %r11 register is live and cannot be freely used, and positive
9406 otherwise. */
9407
9408 static void
9409 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9410 int style, bool set_cfa)
9411 {
9412 struct machine_function *m = cfun->machine;
9413 rtx insn;
9414 bool add_frame_related_expr = false;
9415
9416 if (Pmode == SImode)
9417 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9418 else if (x86_64_immediate_operand (offset, DImode))
9419 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9420 else
9421 {
9422 rtx tmp;
9423 /* r11 is used by indirect sibcall return as well, set before the
9424 epilogue and used after the epilogue. */
9425 if (style)
9426 tmp = gen_rtx_REG (DImode, R11_REG);
9427 else
9428 {
9429 gcc_assert (src != hard_frame_pointer_rtx
9430 && dest != hard_frame_pointer_rtx);
9431 tmp = hard_frame_pointer_rtx;
9432 }
9433 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9434 if (style < 0)
9435 add_frame_related_expr = true;
9436
9437 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9438 }
9439
9440 insn = emit_insn (insn);
9441 if (style >= 0)
9442 ix86_add_queued_cfa_restore_notes (insn);
9443
9444 if (set_cfa)
9445 {
9446 rtx r;
9447
9448 gcc_assert (m->fs.cfa_reg == src);
9449 m->fs.cfa_offset += INTVAL (offset);
9450 m->fs.cfa_reg = dest;
9451
9452 r = gen_rtx_PLUS (Pmode, src, offset);
9453 r = gen_rtx_SET (VOIDmode, dest, r);
9454 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9455 RTX_FRAME_RELATED_P (insn) = 1;
9456 }
9457 else if (style < 0)
9458 {
9459 RTX_FRAME_RELATED_P (insn) = 1;
9460 if (add_frame_related_expr)
9461 {
9462 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9463 r = gen_rtx_SET (VOIDmode, dest, r);
9464 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9465 }
9466 }
9467
9468 if (dest == stack_pointer_rtx)
9469 {
9470 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9471 bool valid = m->fs.sp_valid;
9472
9473 if (src == hard_frame_pointer_rtx)
9474 {
9475 valid = m->fs.fp_valid;
9476 ooffset = m->fs.fp_offset;
9477 }
9478 else if (src == crtl->drap_reg)
9479 {
9480 valid = m->fs.drap_valid;
9481 ooffset = 0;
9482 }
9483 else
9484 {
9485 /* Else there are two possibilities: SP itself, which we set
9486 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9487 taken care of by hand along the eh_return path. */
9488 gcc_checking_assert (src == stack_pointer_rtx
9489 || offset == const0_rtx);
9490 }
9491
9492 m->fs.sp_offset = ooffset - INTVAL (offset);
9493 m->fs.sp_valid = valid;
9494 }
9495 }
9496
9497 /* Find an available register to be used as the dynamic realign argument
9498 pointer register. Such a register will be written in the prologue and
9499 used at the beginning of the body, so it must not be
9500 1. parameter passing register.
9501 2. GOT pointer.
9502 We reuse static-chain register if it is available. Otherwise, we
9503 use DI for i386 and R13 for x86-64. We chose R13 since it has
9504 shorter encoding.
9505
9506 Return: the regno of chosen register. */
9507
9508 static unsigned int
9509 find_drap_reg (void)
9510 {
9511 tree decl = cfun->decl;
9512
9513 if (TARGET_64BIT)
9514 {
9515 /* Use R13 for a nested function or a function that needs a static
9516 chain. Since a function with a tail call may use any caller-saved
9517 register in the epilogue, DRAP must not use a caller-saved
9518 register in such a case. */
9519 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9520 return R13_REG;
9521
9522 return R10_REG;
9523 }
9524 else
9525 {
9526 /* Use DI for a nested function or a function that needs a static
9527 chain. Since a function with a tail call may use any caller-saved
9528 register in the epilogue, DRAP must not use a caller-saved
9529 register in such a case. */
9530 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9531 return DI_REG;
9532
9533 /* Reuse static chain register if it isn't used for parameter
9534 passing. */
9535 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9536 {
9537 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9538 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9539 return CX_REG;
9540 }
9541 return DI_REG;
9542 }
9543 }
9544
9545 /* Return minimum incoming stack alignment. */
9546
9547 static unsigned int
9548 ix86_minimum_incoming_stack_boundary (bool sibcall)
9549 {
9550 unsigned int incoming_stack_boundary;
9551
9552 /* Prefer the one specified at command line. */
9553 if (ix86_user_incoming_stack_boundary)
9554 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9555 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9556 if -mstackrealign is used, this isn't a sibcall check, and the
9557 estimated stack alignment is 128 bits. */
9558 else if (!sibcall
9559 && !TARGET_64BIT
9560 && ix86_force_align_arg_pointer
9561 && crtl->stack_alignment_estimated == 128)
9562 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9563 else
9564 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9565
9566 /* Incoming stack alignment can be changed on individual functions
9567 via force_align_arg_pointer attribute. We use the smallest
9568 incoming stack boundary. */
9569 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9570 && lookup_attribute (ix86_force_align_arg_pointer_string,
9571 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9572 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9573
9574 /* The incoming stack frame has to be aligned at least at
9575 parm_stack_boundary. */
9576 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9577 incoming_stack_boundary = crtl->parm_stack_boundary;
9578
9579 /* Stack at entrance of main is aligned by runtime. We use the
9580 smallest incoming stack boundary. */
9581 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9582 && DECL_NAME (current_function_decl)
9583 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9584 && DECL_FILE_SCOPE_P (current_function_decl))
9585 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9586
9587 return incoming_stack_boundary;
9588 }
9589
9590 /* Update incoming stack boundary and estimated stack alignment. */
9591
9592 static void
9593 ix86_update_stack_boundary (void)
9594 {
9595 ix86_incoming_stack_boundary
9596 = ix86_minimum_incoming_stack_boundary (false);
9597
9598 /* x86_64 vararg needs 16byte stack alignment for register save
9599 area. */
9600 if (TARGET_64BIT
9601 && cfun->stdarg
9602 && crtl->stack_alignment_estimated < 128)
9603 crtl->stack_alignment_estimated = 128;
9604 }
9605
9606 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9607 needed or an rtx for DRAP otherwise. */
9608
9609 static rtx
9610 ix86_get_drap_rtx (void)
9611 {
9612 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9613 crtl->need_drap = true;
9614
9615 if (stack_realign_drap)
9616 {
9617 /* Assign DRAP to vDRAP and return vDRAP. */
9618 unsigned int regno = find_drap_reg ();
9619 rtx drap_vreg;
9620 rtx arg_ptr;
9621 rtx seq, insn;
9622
9623 arg_ptr = gen_rtx_REG (Pmode, regno);
9624 crtl->drap_reg = arg_ptr;
9625
9626 start_sequence ();
9627 drap_vreg = copy_to_reg (arg_ptr);
9628 seq = get_insns ();
9629 end_sequence ();
9630
9631 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9632 if (!optimize)
9633 {
9634 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9635 RTX_FRAME_RELATED_P (insn) = 1;
9636 }
9637 return drap_vreg;
9638 }
9639 else
9640 return NULL;
9641 }
9642
9643 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9644
9645 static rtx
9646 ix86_internal_arg_pointer (void)
9647 {
9648 return virtual_incoming_args_rtx;
9649 }
9650
9651 struct scratch_reg {
9652 rtx reg;
9653 bool saved;
9654 };
9655
9656 /* Return a short-lived scratch register for use on function entry.
9657 In 32-bit mode, it is valid only after the registers are saved
9658 in the prologue. This register must be released by means of
9659 release_scratch_register_on_entry once it is dead. */
9660
9661 static void
9662 get_scratch_register_on_entry (struct scratch_reg *sr)
9663 {
9664 int regno;
9665
9666 sr->saved = false;
9667
9668 if (TARGET_64BIT)
9669 {
9670 /* We always use R11 in 64-bit mode. */
9671 regno = R11_REG;
9672 }
9673 else
9674 {
9675 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9676 bool fastcall_p
9677 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9678 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9679 int regparm = ix86_function_regparm (fntype, decl);
9680 int drap_regno
9681 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9682
9683 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9684 for the static chain register. */
9685 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9686 && drap_regno != AX_REG)
9687 regno = AX_REG;
9688 else if (regparm < 2 && drap_regno != DX_REG)
9689 regno = DX_REG;
9690 /* ecx is the static chain register. */
9691 else if (regparm < 3 && !fastcall_p && !static_chain_p
9692 && drap_regno != CX_REG)
9693 regno = CX_REG;
9694 else if (ix86_save_reg (BX_REG, true))
9695 regno = BX_REG;
9696 /* esi is the static chain register. */
9697 else if (!(regparm == 3 && static_chain_p)
9698 && ix86_save_reg (SI_REG, true))
9699 regno = SI_REG;
9700 else if (ix86_save_reg (DI_REG, true))
9701 regno = DI_REG;
9702 else
9703 {
9704 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9705 sr->saved = true;
9706 }
9707 }
9708
9709 sr->reg = gen_rtx_REG (Pmode, regno);
9710 if (sr->saved)
9711 {
9712 rtx insn = emit_insn (gen_push (sr->reg));
9713 RTX_FRAME_RELATED_P (insn) = 1;
9714 }
9715 }
9716
9717 /* Release a scratch register obtained from the preceding function. */
9718
9719 static void
9720 release_scratch_register_on_entry (struct scratch_reg *sr)
9721 {
9722 if (sr->saved)
9723 {
9724 rtx x, insn = emit_insn (gen_pop (sr->reg));
9725
9726 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9727 RTX_FRAME_RELATED_P (insn) = 1;
9728 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9729 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9730 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9731 }
9732 }
9733
9734 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
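/* With the default STACK_CHECK_PROBE_INTERVAL_EXP of 12 this is 4096 bytes,
   so the code below touches the stack at least once per 4 KiB page while
   extending it.  (Assumes the default from defaults.h; targets may
   override the exponent.)  */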
9735
9736 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9737
9738 static void
9739 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9740 {
9741 /* We skip the probe for the first interval + a small dope of 4 words and
9742 probe that many bytes past the specified size to maintain a protection
9743 area at the bottom of the stack. */
9744 const int dope = 4 * UNITS_PER_WORD;
9745 rtx size_rtx = GEN_INT (size), last;
9746
9747 /* See if we have a constant small number of probes to generate. If so,
9748 that's the easy case. The run-time loop is made up of 11 insns in the
9749 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9750 for n # of intervals. */
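/* E.g. for n = 5 intervals the unrolled sequence costs 3 + 2*(5-1) = 11
   insns, the same as the run-time loop, which is consistent with the
   5 * PROBE_INTERVAL cut-off used below.  (Worked example of the
   estimate in the comment above.)  */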
9751 if (size <= 5 * PROBE_INTERVAL)
9752 {
9753 HOST_WIDE_INT i, adjust;
9754 bool first_probe = true;
9755
9756 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9757 values of N from 1 until it exceeds SIZE. If only one probe is
9758 needed, this will not generate any code. Then adjust and probe
9759 to PROBE_INTERVAL + SIZE. */
9760 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9761 {
9762 if (first_probe)
9763 {
9764 adjust = 2 * PROBE_INTERVAL + dope;
9765 first_probe = false;
9766 }
9767 else
9768 adjust = PROBE_INTERVAL;
9769
9770 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9771 plus_constant (Pmode, stack_pointer_rtx,
9772 -adjust)));
9773 emit_stack_probe (stack_pointer_rtx);
9774 }
9775
9776 if (first_probe)
9777 adjust = size + PROBE_INTERVAL + dope;
9778 else
9779 adjust = size + PROBE_INTERVAL - i;
9780
9781 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9782 plus_constant (Pmode, stack_pointer_rtx,
9783 -adjust)));
9784 emit_stack_probe (stack_pointer_rtx);
9785
9786 /* Adjust back to account for the additional first interval. */
9787 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9788 plus_constant (Pmode, stack_pointer_rtx,
9789 PROBE_INTERVAL + dope)));
9790 }
9791
9792 /* Otherwise, do the same as above, but in a loop. Note that we must be
9793 extra careful with variables wrapping around because we might be at
9794 the very top (or the very bottom) of the address space and we have
9795 to be able to handle this case properly; in particular, we use an
9796 equality test for the loop condition. */
9797 else
9798 {
9799 HOST_WIDE_INT rounded_size;
9800 struct scratch_reg sr;
9801
9802 get_scratch_register_on_entry (&sr);
9803
9804
9805 /* Step 1: round SIZE to the previous multiple of the interval. */
9806
9807 rounded_size = size & -PROBE_INTERVAL;
9808
9809
9810 /* Step 2: compute initial and final value of the loop counter. */
9811
9812 /* SP = SP_0 + PROBE_INTERVAL. */
9813 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9814 plus_constant (Pmode, stack_pointer_rtx,
9815 - (PROBE_INTERVAL + dope))));
9816
9817 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9818 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9819 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9820 gen_rtx_PLUS (Pmode, sr.reg,
9821 stack_pointer_rtx)));
9822
9823
9824 /* Step 3: the loop
9825
9826 while (SP != LAST_ADDR)
9827 {
9828 SP = SP + PROBE_INTERVAL
9829 probe at SP
9830 }
9831
9832 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9833 values of N from 1 until it is equal to ROUNDED_SIZE. */
9834
9835 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9836
9837
9838 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9839 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9840
9841 if (size != rounded_size)
9842 {
9843 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9844 plus_constant (Pmode, stack_pointer_rtx,
9845 rounded_size - size)));
9846 emit_stack_probe (stack_pointer_rtx);
9847 }
9848
9849 /* Adjust back to account for the additional first interval. */
9850 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9851 plus_constant (Pmode, stack_pointer_rtx,
9852 PROBE_INTERVAL + dope)));
9853
9854 release_scratch_register_on_entry (&sr);
9855 }
9856
9857 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9858
9859 /* Even if the stack pointer isn't the CFA register, we need to correctly
9860 describe the adjustments made to it, in particular differentiate the
9861 frame-related ones from the frame-unrelated ones. */
9862 if (size > 0)
9863 {
9864 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9865 XVECEXP (expr, 0, 0)
9866 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9867 plus_constant (Pmode, stack_pointer_rtx, -size));
9868 XVECEXP (expr, 0, 1)
9869 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9870 plus_constant (Pmode, stack_pointer_rtx,
9871 PROBE_INTERVAL + dope + size));
9872 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9873 RTX_FRAME_RELATED_P (last) = 1;
9874
9875 cfun->machine->fs.sp_offset += size;
9876 }
9877
9878 /* Make sure nothing is scheduled before we are done. */
9879 emit_insn (gen_blockage ());
9880 }
9881
9882 /* Adjust the stack pointer up to REG while probing it. */
9883
9884 const char *
9885 output_adjust_stack_and_probe (rtx reg)
9886 {
9887 static int labelno = 0;
9888 char loop_lab[32], end_lab[32];
9889 rtx xops[2];
9890
9891 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9892 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9893
9894 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9895
9896 /* Jump to END_LAB if SP == LAST_ADDR. */
9897 xops[0] = stack_pointer_rtx;
9898 xops[1] = reg;
9899 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9900 fputs ("\tje\t", asm_out_file);
9901 assemble_name_raw (asm_out_file, end_lab);
9902 fputc ('\n', asm_out_file);
9903
9904 /* SP = SP + PROBE_INTERVAL. */
9905 xops[1] = GEN_INT (PROBE_INTERVAL);
9906 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9907
9908 /* Probe at SP. */
9909 xops[1] = const0_rtx;
9910 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9911
9912 fprintf (asm_out_file, "\tjmp\t");
9913 assemble_name_raw (asm_out_file, loop_lab);
9914 fputc ('\n', asm_out_file);
9915
9916 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9917
9918 return "";
9919 }
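/* For reference, on ia32 the templates above expand to a loop of roughly
   this shape (labels come from ASM_GENERATE_INTERNAL_LABEL):

	.LPSRL0:
		cmpl	%<reg>, %esp
		je	.LPSRE0
		subl	$PROBE_INTERVAL, %esp
		orl	$0, (%esp)
		jmp	.LPSRL0
	.LPSRE0:

   (Illustrative sketch of the emitted assembly only.)  */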
9920
9921 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9922 inclusive. These are offsets from the current stack pointer. */
9923
9924 static void
9925 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9926 {
9927 /* See if we have a constant small number of probes to generate. If so,
9928 that's the easy case. The run-time loop is made up of 7 insns in the
9929 generic case while the compile-time loop is made up of n insns for n #
9930 of intervals. */
9931 if (size <= 7 * PROBE_INTERVAL)
9932 {
9933 HOST_WIDE_INT i;
9934
9935 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9936 it exceeds SIZE. If only one probe is needed, this will not
9937 generate any code. Then probe at FIRST + SIZE. */
9938 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9939 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9940 -(first + i)));
9941
9942 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9943 -(first + size)));
9944 }
9945
9946 /* Otherwise, do the same as above, but in a loop. Note that we must be
9947 extra careful with variables wrapping around because we might be at
9948 the very top (or the very bottom) of the address space and we have
9949 to be able to handle this case properly; in particular, we use an
9950 equality test for the loop condition. */
9951 else
9952 {
9953 HOST_WIDE_INT rounded_size, last;
9954 struct scratch_reg sr;
9955
9956 get_scratch_register_on_entry (&sr);
9957
9958
9959 /* Step 1: round SIZE to the previous multiple of the interval. */
9960
9961 rounded_size = size & -PROBE_INTERVAL;
9962
9963
9964 /* Step 2: compute initial and final value of the loop counter. */
9965
9966 /* TEST_OFFSET = FIRST. */
9967 emit_move_insn (sr.reg, GEN_INT (-first));
9968
9969 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9970 last = first + rounded_size;
9971
9972
9973 /* Step 3: the loop
9974
9975 while (TEST_ADDR != LAST_ADDR)
9976 {
9977 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9978 probe at TEST_ADDR
9979 }
9980
9981 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9982 until it is equal to ROUNDED_SIZE. */
9983
9984 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9985
9986
9987 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9988 that SIZE is equal to ROUNDED_SIZE. */
9989
9990 if (size != rounded_size)
9991 emit_stack_probe (plus_constant (Pmode,
9992 gen_rtx_PLUS (Pmode,
9993 stack_pointer_rtx,
9994 sr.reg),
9995 rounded_size - size));
9996
9997 release_scratch_register_on_entry (&sr);
9998 }
9999
10000 /* Make sure nothing is scheduled before we are done. */
10001 emit_insn (gen_blockage ());
10002 }
10003
10004 /* Probe a range of stack addresses from REG to END, inclusive. These are
10005 offsets from the current stack pointer. */
10006
10007 const char *
10008 output_probe_stack_range (rtx reg, rtx end)
10009 {
10010 static int labelno = 0;
10011 char loop_lab[32], end_lab[32];
10012 rtx xops[3];
10013
10014 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10015 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10016
10017 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10018
10019 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10020 xops[0] = reg;
10021 xops[1] = end;
10022 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10023 fputs ("\tje\t", asm_out_file);
10024 assemble_name_raw (asm_out_file, end_lab);
10025 fputc ('\n', asm_out_file);
10026
10027 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10028 xops[1] = GEN_INT (PROBE_INTERVAL);
10029 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10030
10031 /* Probe at TEST_ADDR. */
10032 xops[0] = stack_pointer_rtx;
10033 xops[1] = reg;
10034 xops[2] = const0_rtx;
10035 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10036
10037 fprintf (asm_out_file, "\tjmp\t");
10038 assemble_name_raw (asm_out_file, loop_lab);
10039 fputc ('\n', asm_out_file);
10040
10041 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10042
10043 return "";
10044 }
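/* Likewise, output_probe_stack_range above emits roughly

	.LPSRL1:
		cmpl	%<end>, %<reg>
		je	.LPSRE1
		subl	$PROBE_INTERVAL, %<reg>
		orl	$0, (%esp,%<reg>)
		jmp	.LPSRL1
	.LPSRE1:

   probing successive addresses below the stack pointer without actually
   moving it.  (Illustrative sketch of the emitted assembly only.)  */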
10045
10046 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
10047 to be generated in correct form. */
10048 static void
10049 ix86_finalize_stack_realign_flags (void)
10050 {
10051 /* Check if stack realignment is really needed after reload, and
10052 store the result in cfun. */
10053 unsigned int incoming_stack_boundary
10054 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10055 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10056 unsigned int stack_realign = (incoming_stack_boundary
10057 < (current_function_is_leaf
10058 ? crtl->max_used_stack_slot_alignment
10059 : crtl->stack_alignment_needed));
10060
10061 if (crtl->stack_realign_finalized)
10062 {
10063 /* After stack_realign_needed is finalized, we can no longer
10064 change it. */
10065 gcc_assert (crtl->stack_realign_needed == stack_realign);
10066 return;
10067 }
10068
10069 /* If the only reason for frame_pointer_needed is that we conservatively
10070 assumed stack realignment might be needed, but in the end nothing that
10071 needed the stack alignment had been spilled, clear frame_pointer_needed
10072 and say we don't need stack realignment. */
10073 if (stack_realign
10074 && !crtl->need_drap
10075 && frame_pointer_needed
10076 && current_function_is_leaf
10077 && flag_omit_frame_pointer
10078 && current_function_sp_is_unchanging
10079 && !ix86_current_function_calls_tls_descriptor
10080 && !crtl->accesses_prior_frames
10081 && !cfun->calls_alloca
10082 && !crtl->calls_eh_return
10083 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10084 && !ix86_frame_pointer_required ()
10085 && get_frame_size () == 0
10086 && ix86_nsaved_sseregs () == 0
10087 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10088 {
10089 HARD_REG_SET set_up_by_prologue, prologue_used;
10090 basic_block bb;
10091
10092 CLEAR_HARD_REG_SET (prologue_used);
10093 CLEAR_HARD_REG_SET (set_up_by_prologue);
10094 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10095 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10096 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10097 HARD_FRAME_POINTER_REGNUM);
10098 FOR_EACH_BB (bb)
10099 {
10100 rtx insn;
10101 FOR_BB_INSNS (bb, insn)
10102 if (NONDEBUG_INSN_P (insn)
10103 && requires_stack_frame_p (insn, prologue_used,
10104 set_up_by_prologue))
10105 {
10106 crtl->stack_realign_needed = stack_realign;
10107 crtl->stack_realign_finalized = true;
10108 return;
10109 }
10110 }
10111
10112 frame_pointer_needed = false;
10113 stack_realign = false;
10114 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10115 crtl->stack_alignment_needed = incoming_stack_boundary;
10116 crtl->stack_alignment_estimated = incoming_stack_boundary;
10117 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10118 crtl->preferred_stack_boundary = incoming_stack_boundary;
10119 df_finish_pass (true);
10120 df_scan_alloc (NULL);
10121 df_scan_blocks ();
10122 df_compute_regs_ever_live (true);
10123 df_analyze ();
10124 }
10125
10126 crtl->stack_realign_needed = stack_realign;
10127 crtl->stack_realign_finalized = true;
10128 }
10129
10130 /* Expand the prologue into a bunch of separate insns. */
10131
10132 void
10133 ix86_expand_prologue (void)
10134 {
10135 struct machine_function *m = cfun->machine;
10136 rtx insn, t;
10137 bool pic_reg_used;
10138 struct ix86_frame frame;
10139 HOST_WIDE_INT allocate;
10140 bool int_registers_saved;
10141
10142 ix86_finalize_stack_realign_flags ();
10143
10144 /* DRAP should not coexist with stack_realign_fp */
10145 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10146
10147 memset (&m->fs, 0, sizeof (m->fs));
10148
10149 /* Initialize CFA state for before the prologue. */
10150 m->fs.cfa_reg = stack_pointer_rtx;
10151 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10152
10153 /* Track SP offset to the CFA. We continue tracking this after we've
10154 swapped the CFA register away from SP. In the case of re-alignment
10155 this is fudged; we're interested in offsets within the local frame. */
10156 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10157 m->fs.sp_valid = true;
10158
10159 ix86_compute_frame_layout (&frame);
10160
10161 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10162 {
10163 /* We should have already generated an error for any use of
10164 ms_hook on a nested function. */
10165 gcc_checking_assert (!ix86_static_chain_on_stack);
10166
10167 /* Check if profiling is active and whether we shall use the
10168 profiling-before-prologue variant. If so, sorry. */
10169 if (crtl->profile && flag_fentry != 0)
10170 sorry ("ms_hook_prologue attribute isn%'t compatible "
10171 "with -mfentry for 32-bit");
10172
10173 /* In ix86_asm_output_function_label we emitted:
10174 8b ff movl.s %edi,%edi
10175 55 push %ebp
10176 8b ec movl.s %esp,%ebp
10177
10178 This matches the hookable function prologue in Win32 API
10179 functions in Microsoft Windows XP Service Pack 2 and newer.
10180 Wine uses this to enable Windows apps to hook the Win32 API
10181 functions provided by Wine.
10182
10183 What that means is that we've already set up the frame pointer. */
10184
10185 if (frame_pointer_needed
10186 && !(crtl->drap_reg && crtl->stack_realign_needed))
10187 {
10188 rtx push, mov;
10189
10190 /* We've decided to use the frame pointer already set up.
10191 Describe this to the unwinder by pretending that both
10192 push and mov insns happen right here.
10193
10194 Putting the unwind info here at the end of the ms_hook
10195 is done so that we can make absolutely certain we get
10196 the required byte sequence at the start of the function,
10197 rather than relying on an assembler that can produce
10198 the exact encoding required.
10199
10200 However it does mean (in the unpatched case) that we have
10201 a 1 insn window where the asynchronous unwind info is
10202 incorrect. However, if we placed the unwind info at
10203 its correct location we would have incorrect unwind info
10204 in the patched case. Which is probably all moot since
10205 I don't expect Wine generates dwarf2 unwind info for the
10206 system libraries that use this feature. */
10207
10208 insn = emit_insn (gen_blockage ());
10209
10210 push = gen_push (hard_frame_pointer_rtx);
10211 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10212 stack_pointer_rtx);
10213 RTX_FRAME_RELATED_P (push) = 1;
10214 RTX_FRAME_RELATED_P (mov) = 1;
10215
10216 RTX_FRAME_RELATED_P (insn) = 1;
10217 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10218 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10219
10220 /* Note that gen_push incremented m->fs.cfa_offset, even
10221 though we didn't emit the push insn here. */
10222 m->fs.cfa_reg = hard_frame_pointer_rtx;
10223 m->fs.fp_offset = m->fs.cfa_offset;
10224 m->fs.fp_valid = true;
10225 }
10226 else
10227 {
10228 /* The frame pointer is not needed so pop %ebp again.
10229 This leaves us with a pristine state. */
10230 emit_insn (gen_pop (hard_frame_pointer_rtx));
10231 }
10232 }
10233
10234 /* The first insn of a function that accepts its static chain on the
10235 stack is to push the register that would be filled in by a direct
10236 call. This insn will be skipped by the trampoline. */
10237 else if (ix86_static_chain_on_stack)
10238 {
10239 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10240 emit_insn (gen_blockage ());
10241
10242 /* We don't want to interpret this push insn as a register save,
10243 only as a stack adjustment. The real copy of the register as
10244 a save will be done later, if needed. */
10245 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10246 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10247 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10248 RTX_FRAME_RELATED_P (insn) = 1;
10249 }
10250
10251 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10252 DRAP is needed and stack realignment is really needed after reload. */
10253 if (stack_realign_drap)
10254 {
10255 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10256
10257 /* Only need to push parameter pointer reg if it is caller saved. */
10258 if (!call_used_regs[REGNO (crtl->drap_reg)])
10259 {
10260 /* Push arg pointer reg */
10261 insn = emit_insn (gen_push (crtl->drap_reg));
10262 RTX_FRAME_RELATED_P (insn) = 1;
10263 }
10264
10265 /* Grab the argument pointer. */
10266 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10267 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10268 RTX_FRAME_RELATED_P (insn) = 1;
10269 m->fs.cfa_reg = crtl->drap_reg;
10270 m->fs.cfa_offset = 0;
10271
10272 /* Align the stack. */
10273 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10274 stack_pointer_rtx,
10275 GEN_INT (-align_bytes)));
10276 RTX_FRAME_RELATED_P (insn) = 1;
10277
10278 /* Replicate the return address on the stack so that return
10279 address can be reached via (argp - 1) slot. This is needed
10280 to implement macro RETURN_ADDR_RTX and intrinsic function
10281 expand_builtin_return_addr etc. */
10282 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10283 t = gen_frame_mem (word_mode, t);
10284 insn = emit_insn (gen_push (t));
10285 RTX_FRAME_RELATED_P (insn) = 1;
10286
10287 /* For the purposes of frame and register save area addressing,
10288 we've started over with a new frame. */
10289 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10290 m->fs.realigned = true;
10291 }
10292
10293 if (frame_pointer_needed && !m->fs.fp_valid)
10294 {
10295 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10296 slower on all targets. Also sdb doesn't like it. */
10297 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10298 RTX_FRAME_RELATED_P (insn) = 1;
10299
10300 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10301 {
10302 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10303 RTX_FRAME_RELATED_P (insn) = 1;
10304
10305 if (m->fs.cfa_reg == stack_pointer_rtx)
10306 m->fs.cfa_reg = hard_frame_pointer_rtx;
10307 m->fs.fp_offset = m->fs.sp_offset;
10308 m->fs.fp_valid = true;
10309 }
10310 }
10311
10312 int_registers_saved = (frame.nregs == 0);
10313
10314 if (!int_registers_saved)
10315 {
10316 /* If saving registers via PUSH, do so now. */
10317 if (!frame.save_regs_using_mov)
10318 {
10319 ix86_emit_save_regs ();
10320 int_registers_saved = true;
10321 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10322 }
10323
10324 /* When using the red zone we may start saving registers before allocating
10325 the stack frame, saving one cycle of the prologue. However, avoid
10326 doing this if we have to probe the stack; at least on x86_64 the
10327 stack probe can turn into a call that clobbers a red zone location. */
10328 else if (ix86_using_red_zone ()
10329 && (! TARGET_STACK_PROBE
10330 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10331 {
10332 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10333 int_registers_saved = true;
10334 }
10335 }
10336
10337 if (stack_realign_fp)
10338 {
10339 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10340 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10341
10342 /* The computation of the size of the re-aligned stack frame means
10343 that we must allocate the size of the register save area before
10344 performing the actual alignment. Otherwise we cannot guarantee
10345 that there's enough storage above the realignment point. */
10346 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10347 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10348 GEN_INT (m->fs.sp_offset
10349 - frame.sse_reg_save_offset),
10350 -1, false);
10351
10352 /* Align the stack. */
10353 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10354 stack_pointer_rtx,
10355 GEN_INT (-align_bytes)));
10356
10357 /* For the purposes of register save area addressing, the stack
10358 pointer is no longer valid. As for the value of sp_offset,
10359 see ix86_compute_frame_layout, which we need to match in order
10360 to pass verification of stack_pointer_offset at the end. */
10361 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10362 m->fs.sp_valid = false;
10363 }
10364
10365 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10366
10367 if (flag_stack_usage_info)
10368 {
10369 /* We start to count from ARG_POINTER. */
10370 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10371
10372 /* If it was realigned, take into account the fake frame. */
10373 if (stack_realign_drap)
10374 {
10375 if (ix86_static_chain_on_stack)
10376 stack_size += UNITS_PER_WORD;
10377
10378 if (!call_used_regs[REGNO (crtl->drap_reg)])
10379 stack_size += UNITS_PER_WORD;
10380
10381 /* This over-estimates by 1 minimal-stack-alignment-unit but
10382 mitigates that by counting in the new return address slot. */
10383 current_function_dynamic_stack_size
10384 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10385 }
10386
10387 current_function_static_stack_size = stack_size;
10388 }
10389
10390 /* The stack has already been decremented by the instruction calling us
10391 so probe if the size is non-negative to preserve the protection area. */
10392 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10393 {
10394 /* We expect the registers to be saved when probes are used. */
10395 gcc_assert (int_registers_saved);
10396
10397 if (STACK_CHECK_MOVING_SP)
10398 {
10399 ix86_adjust_stack_and_probe (allocate);
10400 allocate = 0;
10401 }
10402 else
10403 {
10404 HOST_WIDE_INT size = allocate;
10405
10406 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10407 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10408
10409 if (TARGET_STACK_PROBE)
10410 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10411 else
10412 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10413 }
10414 }
10415
10416 if (allocate == 0)
10417 ;
10418 else if (!ix86_target_stack_probe ()
10419 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10420 {
10421 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10422 GEN_INT (-allocate), -1,
10423 m->fs.cfa_reg == stack_pointer_rtx);
10424 }
10425 else
10426 {
10427 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10428 rtx r10 = NULL;
10429 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10430
10431 bool eax_live = false;
10432 bool r10_live = false;
10433
10434 if (TARGET_64BIT)
10435 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10436 if (!TARGET_64BIT_MS_ABI)
10437 eax_live = ix86_eax_live_at_start_p ();
10438
10439 if (eax_live)
10440 {
10441 emit_insn (gen_push (eax));
10442 allocate -= UNITS_PER_WORD;
10443 }
10444 if (r10_live)
10445 {
10446 r10 = gen_rtx_REG (Pmode, R10_REG);
10447 emit_insn (gen_push (r10));
10448 allocate -= UNITS_PER_WORD;
10449 }
10450
10451 emit_move_insn (eax, GEN_INT (allocate));
10452 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10453
10454 /* Use the fact that AX still contains ALLOCATE. */
10455 adjust_stack_insn = (Pmode == DImode
10456 ? gen_pro_epilogue_adjust_stack_di_sub
10457 : gen_pro_epilogue_adjust_stack_si_sub);
10458
10459 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10460 stack_pointer_rtx, eax));
10461
10462 /* Note that SEH directives need to continue tracking the stack
10463 pointer even after the frame pointer has been set up. */
10464 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10465 {
10466 if (m->fs.cfa_reg == stack_pointer_rtx)
10467 m->fs.cfa_offset += allocate;
10468
10469 RTX_FRAME_RELATED_P (insn) = 1;
10470 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10471 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10472 plus_constant (Pmode, stack_pointer_rtx,
10473 -allocate)));
10474 }
10475 m->fs.sp_offset += allocate;
10476
10477 if (r10_live && eax_live)
10478 {
10479 t = choose_baseaddr (m->fs.sp_offset - allocate);
10480 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10481 gen_frame_mem (word_mode, t));
10482 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10483 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10484 gen_frame_mem (word_mode, t));
10485 }
10486 else if (eax_live || r10_live)
10487 {
10488 t = choose_baseaddr (m->fs.sp_offset - allocate);
10489 emit_move_insn (gen_rtx_REG (word_mode,
10490 (eax_live ? AX_REG : R10_REG)),
10491 gen_frame_mem (word_mode, t));
10492 }
10493 }
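/* For illustration only: on targets whose allocation worker is a
   __chkstk-style routine (e.g. ___chkstk_ms on 64-bit Windows), the
   sequence above amounts to roughly
       mov   $ALLOCATE, %eax
       call  ___chkstk_ms        # probe the new pages
       sub   %rax, %rsp          # AX still holds ALLOCATE
   with a live %eax/%r10 pushed beforehand and reloaded from the frame
   afterwards.  */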
10494 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10495
10496 /* If we haven't already set up the frame pointer, do so now. */
10497 if (frame_pointer_needed && !m->fs.fp_valid)
10498 {
10499 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10500 GEN_INT (frame.stack_pointer_offset
10501 - frame.hard_frame_pointer_offset));
10502 insn = emit_insn (insn);
10503 RTX_FRAME_RELATED_P (insn) = 1;
10504 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10505
10506 if (m->fs.cfa_reg == stack_pointer_rtx)
10507 m->fs.cfa_reg = hard_frame_pointer_rtx;
10508 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10509 m->fs.fp_valid = true;
10510 }
10511
10512 if (!int_registers_saved)
10513 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10514 if (frame.nsseregs)
10515 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10516
10517 pic_reg_used = false;
10518 if (pic_offset_table_rtx
10519 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10520 || crtl->profile))
10521 {
10522 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10523
10524 if (alt_pic_reg_used != INVALID_REGNUM)
10525 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10526
10527 pic_reg_used = true;
10528 }
10529
10530 if (pic_reg_used)
10531 {
10532 if (TARGET_64BIT)
10533 {
10534 if (ix86_cmodel == CM_LARGE_PIC)
10535 {
10536 rtx label, tmp_reg;
10537
10538 gcc_assert (Pmode == DImode);
10539 label = gen_label_rtx ();
10540 emit_label (label);
10541 LABEL_PRESERVE_P (label) = 1;
10542 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10543 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10544 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10545 label));
10546 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10547 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10548 pic_offset_table_rtx, tmp_reg));
10549 }
10550 else
10551 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10552 }
10553 else
10554 {
10555 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10556 RTX_FRAME_RELATED_P (insn) = 1;
10557 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10558 }
10559 }
10560
10561 /* In the pic_reg_used case, make sure that the got load isn't deleted
10562 when mcount needs it. Blockage to avoid call movement across mcount
10563 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10564 note. */
10565 if (crtl->profile && !flag_fentry && pic_reg_used)
10566 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10567
10568 if (crtl->drap_reg && !crtl->stack_realign_needed)
10569 {
10570 /* vDRAP is set up, but after reload it turns out stack realignment
10571 isn't necessary; here we emit the prologue to set up DRAP
10572 without the stack realignment adjustment. */
10573 t = choose_baseaddr (0);
10574 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10575 }
10576
10577 /* Prevent instructions from being scheduled into the register save push
10578 sequence when access to the red-zone area is done through the frame pointer.
10579 The offset between the frame pointer and the stack pointer is calculated
10580 relative to the value of the stack pointer at the end of the function
10581 prologue, and moving instructions that access the red-zone area via the frame
10582 pointer inside the push sequence violates this assumption. */
10583 if (frame_pointer_needed && frame.red_zone_size)
10584 emit_insn (gen_memory_blockage ());
10585
10586 /* Emit cld instruction if stringops are used in the function. */
10587 if (TARGET_CLD && ix86_current_function_needs_cld)
10588 emit_insn (gen_cld ());
10589
10590 /* SEH requires that the prologue end within 256 bytes of the start of
10591 the function. Prevent instruction schedules that would extend that.
10592 Further, prevent alloca modifications to the stack pointer from being
10593 combined with prologue modifications. */
10594 if (TARGET_SEH)
10595 emit_insn (gen_prologue_use (stack_pointer_rtx));
10596 }
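/* Rough example of the result (assuming a simple 64-bit non-leaf
   function with a frame pointer, no realignment and no DRAP):
       push  %rbp
       mov   %rsp, %rbp
       push  %rbx                # callee-saved GP registers
       sub   $N, %rsp            # local frame allocation
   SSE saves, stack probing, PIC register setup and SEH bookkeeping are
   layered on top of this skeleton as described above.  */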
10597
10598 /* Emit code to restore REG using a POP insn. */
10599
10600 static void
10601 ix86_emit_restore_reg_using_pop (rtx reg)
10602 {
10603 struct machine_function *m = cfun->machine;
10604 rtx insn = emit_insn (gen_pop (reg));
10605
10606 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10607 m->fs.sp_offset -= UNITS_PER_WORD;
10608
10609 if (m->fs.cfa_reg == crtl->drap_reg
10610 && REGNO (reg) == REGNO (crtl->drap_reg))
10611 {
10612 /* Previously we'd represented the CFA as an expression
10613 like *(%ebp - 8). We've just popped that value from
10614 the stack, which means we need to reset the CFA to
10615 the drap register. This will remain until we restore
10616 the stack pointer. */
10617 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10618 RTX_FRAME_RELATED_P (insn) = 1;
10619
10620 /* This means that the DRAP register is valid for addressing too. */
10621 m->fs.drap_valid = true;
10622 return;
10623 }
10624
10625 if (m->fs.cfa_reg == stack_pointer_rtx)
10626 {
10627 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10628 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10629 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10630 RTX_FRAME_RELATED_P (insn) = 1;
10631
10632 m->fs.cfa_offset -= UNITS_PER_WORD;
10633 }
10634
10635 /* When the frame pointer is the CFA, and we pop it, we are
10636 swapping back to the stack pointer as the CFA. This happens
10637 for stack frames that don't allocate other data, so we assume
10638 the stack pointer is now pointing at the return address, i.e.
10639 the function entry state, which makes the offset be 1 word. */
10640 if (reg == hard_frame_pointer_rtx)
10641 {
10642 m->fs.fp_valid = false;
10643 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10644 {
10645 m->fs.cfa_reg = stack_pointer_rtx;
10646 m->fs.cfa_offset -= UNITS_PER_WORD;
10647
10648 add_reg_note (insn, REG_CFA_DEF_CFA,
10649 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10650 GEN_INT (m->fs.cfa_offset)));
10651 RTX_FRAME_RELATED_P (insn) = 1;
10652 }
10653 }
10654 }
10655
10656 /* Emit code to restore saved registers using POP insns. */
10657
10658 static void
10659 ix86_emit_restore_regs_using_pop (void)
10660 {
10661 unsigned int regno;
10662
10663 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10664 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10665 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10666 }
10667
10668 /* Emit code and notes for the LEAVE instruction. */
10669
10670 static void
10671 ix86_emit_leave (void)
10672 {
10673 struct machine_function *m = cfun->machine;
10674 rtx insn = emit_insn (ix86_gen_leave ());
10675
10676 ix86_add_queued_cfa_restore_notes (insn);
10677
10678 gcc_assert (m->fs.fp_valid);
10679 m->fs.sp_valid = true;
10680 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10681 m->fs.fp_valid = false;
10682
10683 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10684 {
10685 m->fs.cfa_reg = stack_pointer_rtx;
10686 m->fs.cfa_offset = m->fs.sp_offset;
10687
10688 add_reg_note (insn, REG_CFA_DEF_CFA,
10689 plus_constant (Pmode, stack_pointer_rtx,
10690 m->fs.sp_offset));
10691 RTX_FRAME_RELATED_P (insn) = 1;
10692 }
10693 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10694 m->fs.fp_offset);
10695 }
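/* Reminder: "leave" is equivalent to copying the frame pointer into the
   stack pointer and then popping the saved frame pointer, which is why
   sp becomes valid at fp_offset - UNITS_PER_WORD above.  */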
10696
10697 /* Emit code to restore saved registers using MOV insns.
10698 First register is restored from CFA - CFA_OFFSET. */
10699 static void
10700 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10701 bool maybe_eh_return)
10702 {
10703 struct machine_function *m = cfun->machine;
10704 unsigned int regno;
10705
10706 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10707 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10708 {
10709 rtx reg = gen_rtx_REG (word_mode, regno);
10710 rtx insn, mem;
10711
10712 mem = choose_baseaddr (cfa_offset);
10713 mem = gen_frame_mem (word_mode, mem);
10714 insn = emit_move_insn (reg, mem);
10715
10716 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10717 {
10718 /* Previously we'd represented the CFA as an expression
10719 like *(%ebp - 8). We've just reloaded that value from
10720 the stack, which means we need to reset the CFA to
10721 the drap register. This will remain until we restore
10722 the stack pointer. */
10723 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10724 RTX_FRAME_RELATED_P (insn) = 1;
10725
10726 /* This means that the DRAP register is valid for addressing. */
10727 m->fs.drap_valid = true;
10728 }
10729 else
10730 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10731
10732 cfa_offset -= UNITS_PER_WORD;
10733 }
10734 }
10735
10736 /* Emit code to restore saved SSE registers using MOV insns.
10737 First register is restored from CFA - CFA_OFFSET. */
10738 static void
10739 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10740 bool maybe_eh_return)
10741 {
10742 unsigned int regno;
10743
10744 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10745 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10746 {
10747 rtx reg = gen_rtx_REG (V4SFmode, regno);
10748 rtx mem;
10749
10750 mem = choose_baseaddr (cfa_offset);
10751 mem = gen_rtx_MEM (V4SFmode, mem);
10752 set_mem_align (mem, 128);
10753 emit_move_insn (reg, mem);
10754
10755 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10756
10757 cfa_offset -= 16;
10758 }
10759 }
10760
10761 /* Emit vzeroupper if needed. */
10762
10763 void
10764 ix86_maybe_emit_epilogue_vzeroupper (void)
10765 {
10766 if (TARGET_VZEROUPPER
10767 && !TREE_THIS_VOLATILE (cfun->decl)
10768 && !cfun->machine->caller_return_avx256_p)
10769 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
10770 }
10771
10772 /* Restore function stack, frame, and registers. */
10773
10774 void
10775 ix86_expand_epilogue (int style)
10776 {
10777 struct machine_function *m = cfun->machine;
10778 struct machine_frame_state frame_state_save = m->fs;
10779 struct ix86_frame frame;
10780 bool restore_regs_via_mov;
10781 bool using_drap;
10782
10783 ix86_finalize_stack_realign_flags ();
10784 ix86_compute_frame_layout (&frame);
10785
10786 m->fs.sp_valid = (!frame_pointer_needed
10787 || (current_function_sp_is_unchanging
10788 && !stack_realign_fp));
10789 gcc_assert (!m->fs.sp_valid
10790 || m->fs.sp_offset == frame.stack_pointer_offset);
10791
10792 /* The FP must be valid if the frame pointer is present. */
10793 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10794 gcc_assert (!m->fs.fp_valid
10795 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10796
10797 /* We must have *some* valid pointer to the stack frame. */
10798 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10799
10800 /* The DRAP is never valid at this point. */
10801 gcc_assert (!m->fs.drap_valid);
10802
10803 /* See the comment about red zone and frame
10804 pointer usage in ix86_expand_prologue. */
10805 if (frame_pointer_needed && frame.red_zone_size)
10806 emit_insn (gen_memory_blockage ());
10807
10808 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10809 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10810
10811 /* Determine the CFA offset of the end of the red-zone. */
10812 m->fs.red_zone_offset = 0;
10813 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10814 {
10815 /* The red-zone begins below the return address. */
10816 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10817
10818 /* When the register save area is in the aligned portion of
10819 the stack, determine the maximum runtime displacement that
10820 matches up with the aligned frame. */
10821 if (stack_realign_drap)
10822 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10823 + UNITS_PER_WORD);
10824 }
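/* Concretely, RED_ZONE_SIZE is 128 on x86-64, so in the common non-DRAP
   case red_zone_offset ends up as 128 + UNITS_PER_WORD, i.e. the red
   zone ends 136 bytes below the CFA.  */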
10825
10826 /* Special care must be taken for the normal return case of a function
10827 using eh_return: the eax and edx registers are marked as saved, but
10828 not restored along this path. Adjust the save location to match. */
10829 if (crtl->calls_eh_return && style != 2)
10830 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10831
10832 /* EH_RETURN requires the use of moves to function properly. */
10833 if (crtl->calls_eh_return)
10834 restore_regs_via_mov = true;
10835 /* SEH requires the use of pops to identify the epilogue. */
10836 else if (TARGET_SEH)
10837 restore_regs_via_mov = false;
10838 /* If we're only restoring one register and sp is not valid, then
10839 use a move instruction to restore the register, since it's
10840 less work than reloading sp and popping the register. */
10841 else if (!m->fs.sp_valid && frame.nregs <= 1)
10842 restore_regs_via_mov = true;
10843 else if (TARGET_EPILOGUE_USING_MOVE
10844 && cfun->machine->use_fast_prologue_epilogue
10845 && (frame.nregs > 1
10846 || m->fs.sp_offset != frame.reg_save_offset))
10847 restore_regs_via_mov = true;
10848 else if (frame_pointer_needed
10849 && !frame.nregs
10850 && m->fs.sp_offset != frame.reg_save_offset)
10851 restore_regs_via_mov = true;
10852 else if (frame_pointer_needed
10853 && TARGET_USE_LEAVE
10854 && cfun->machine->use_fast_prologue_epilogue
10855 && frame.nregs == 1)
10856 restore_regs_via_mov = true;
10857 else
10858 restore_regs_via_mov = false;
10859
10860 if (restore_regs_via_mov || frame.nsseregs)
10861 {
10862 /* Ensure that the entire register save area is addressable via
10863 the stack pointer, if we will restore via sp. */
10864 if (TARGET_64BIT
10865 && m->fs.sp_offset > 0x7fffffff
10866 && !(m->fs.fp_valid || m->fs.drap_valid)
10867 && (frame.nsseregs + frame.nregs) != 0)
10868 {
10869 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10870 GEN_INT (m->fs.sp_offset
10871 - frame.sse_reg_save_offset),
10872 style,
10873 m->fs.cfa_reg == stack_pointer_rtx);
10874 }
10875 }
10876
10877 /* If there are any SSE registers to restore, then we have to do it
10878 via moves, since there's obviously no pop for SSE regs. */
10879 if (frame.nsseregs)
10880 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10881 style == 2);
10882
10883 if (restore_regs_via_mov)
10884 {
10885 rtx t;
10886
10887 if (frame.nregs)
10888 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10889
10890 /* eh_return epilogues need %ecx added to the stack pointer. */
10891 if (style == 2)
10892 {
10893 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10894
10895 /* Stack align doesn't work with eh_return. */
10896 gcc_assert (!stack_realign_drap);
10897 /* Neither do regparm nested functions. */
10898 gcc_assert (!ix86_static_chain_on_stack);
10899
10900 if (frame_pointer_needed)
10901 {
10902 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10903 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
10904 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10905
10906 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10907 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10908
10909 /* Note that we use SA as a temporary CFA, as the return
10910 address is at the proper place relative to it. We
10911 pretend this happens at the FP restore insn because
10912 prior to this insn the FP would be stored at the wrong
10913 offset relative to SA, and after this insn we have no
10914 other reasonable register to use for the CFA. We don't
10915 bother resetting the CFA to the SP for the duration of
10916 the return insn. */
10917 add_reg_note (insn, REG_CFA_DEF_CFA,
10918 plus_constant (Pmode, sa, UNITS_PER_WORD));
10919 ix86_add_queued_cfa_restore_notes (insn);
10920 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10921 RTX_FRAME_RELATED_P (insn) = 1;
10922
10923 m->fs.cfa_reg = sa;
10924 m->fs.cfa_offset = UNITS_PER_WORD;
10925 m->fs.fp_valid = false;
10926
10927 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10928 const0_rtx, style, false);
10929 }
10930 else
10931 {
10932 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10933 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
10934 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10935 ix86_add_queued_cfa_restore_notes (insn);
10936
10937 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10938 if (m->fs.cfa_offset != UNITS_PER_WORD)
10939 {
10940 m->fs.cfa_offset = UNITS_PER_WORD;
10941 add_reg_note (insn, REG_CFA_DEF_CFA,
10942 plus_constant (Pmode, stack_pointer_rtx,
10943 UNITS_PER_WORD));
10944 RTX_FRAME_RELATED_P (insn) = 1;
10945 }
10946 }
10947 m->fs.sp_offset = UNITS_PER_WORD;
10948 m->fs.sp_valid = true;
10949 }
10950 }
10951 else
10952 {
10953 /* SEH requires that the function end with (1) a stack adjustment
10954 if necessary, (2) a sequence of pops, and (3) a return or
10955 jump instruction. Prevent insns from the function body from
10956 being scheduled into this sequence. */
10957 if (TARGET_SEH)
10958 {
10959 /* Prevent a catch region from being adjacent to the standard
10960 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
10961 several other flags that would be interesting to test are
10962 yet set up. */
10963 if (flag_non_call_exceptions)
10964 emit_insn (gen_nops (const1_rtx));
10965 else
10966 emit_insn (gen_blockage ());
10967 }
10968
10969 /* First step is to deallocate the stack frame so that we can
10970 pop the registers. */
10971 if (!m->fs.sp_valid)
10972 {
10973 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
10974 GEN_INT (m->fs.fp_offset
10975 - frame.reg_save_offset),
10976 style, false);
10977 }
10978 else if (m->fs.sp_offset != frame.reg_save_offset)
10979 {
10980 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10981 GEN_INT (m->fs.sp_offset
10982 - frame.reg_save_offset),
10983 style,
10984 m->fs.cfa_reg == stack_pointer_rtx);
10985 }
10986
10987 ix86_emit_restore_regs_using_pop ();
10988 }
10989
10990 /* If we used a frame pointer and haven't already got rid of it,
10991 then do so now. */
10992 if (m->fs.fp_valid)
10993 {
10994 /* If the stack pointer is valid and pointing at the frame
10995 pointer store address, then we only need a pop. */
10996 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
10997 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10998 /* Leave results in shorter dependency chains on CPUs that are
10999 able to grok it fast. */
11000 else if (TARGET_USE_LEAVE
11001 || optimize_function_for_size_p (cfun)
11002 || !cfun->machine->use_fast_prologue_epilogue)
11003 ix86_emit_leave ();
11004 else
11005 {
11006 pro_epilogue_adjust_stack (stack_pointer_rtx,
11007 hard_frame_pointer_rtx,
11008 const0_rtx, style, !using_drap);
11009 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11010 }
11011 }
11012
11013 if (using_drap)
11014 {
11015 int param_ptr_offset = UNITS_PER_WORD;
11016 rtx insn;
11017
11018 gcc_assert (stack_realign_drap);
11019
11020 if (ix86_static_chain_on_stack)
11021 param_ptr_offset += UNITS_PER_WORD;
11022 if (!call_used_regs[REGNO (crtl->drap_reg)])
11023 param_ptr_offset += UNITS_PER_WORD;
11024
11025 insn = emit_insn (gen_rtx_SET
11026 (VOIDmode, stack_pointer_rtx,
11027 gen_rtx_PLUS (Pmode,
11028 crtl->drap_reg,
11029 GEN_INT (-param_ptr_offset))));
11030 m->fs.cfa_reg = stack_pointer_rtx;
11031 m->fs.cfa_offset = param_ptr_offset;
11032 m->fs.sp_offset = param_ptr_offset;
11033 m->fs.realigned = false;
11034
11035 add_reg_note (insn, REG_CFA_DEF_CFA,
11036 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11037 GEN_INT (param_ptr_offset)));
11038 RTX_FRAME_RELATED_P (insn) = 1;
11039
11040 if (!call_used_regs[REGNO (crtl->drap_reg)])
11041 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11042 }
11043
11044 /* At this point the stack pointer must be valid, and we must have
11045 restored all of the registers. We may not have deallocated the
11046 entire stack frame. We've delayed this until now because it may
11047 be possible to merge the local stack deallocation with the
11048 deallocation forced by ix86_static_chain_on_stack. */
11049 gcc_assert (m->fs.sp_valid);
11050 gcc_assert (!m->fs.fp_valid);
11051 gcc_assert (!m->fs.realigned);
11052 if (m->fs.sp_offset != UNITS_PER_WORD)
11053 {
11054 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11055 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11056 style, true);
11057 }
11058 else
11059 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11060
11061 /* Sibcall epilogues don't want a return instruction. */
11062 if (style == 0)
11063 {
11064 m->fs = frame_state_save;
11065 return;
11066 }
11067
11068 /* Emit vzeroupper if needed. */
11069 ix86_maybe_emit_epilogue_vzeroupper ();
11070
11071 if (crtl->args.pops_args && crtl->args.size)
11072 {
11073 rtx popc = GEN_INT (crtl->args.pops_args);
11074
11075 /* i386 can only pop 64K bytes. If asked to pop more, pop return
11076 address, do explicit add, and jump indirectly to the caller. */
11077
11078 if (crtl->args.pops_args >= 65536)
11079 {
11080 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11081 rtx insn;
11082
11083 /* There is no "pascal" calling convention in any 64bit ABI. */
11084 gcc_assert (!TARGET_64BIT);
11085
11086 insn = emit_insn (gen_pop (ecx));
11087 m->fs.cfa_offset -= UNITS_PER_WORD;
11088 m->fs.sp_offset -= UNITS_PER_WORD;
11089
11090 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11091 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11092 add_reg_note (insn, REG_CFA_REGISTER,
11093 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11094 RTX_FRAME_RELATED_P (insn) = 1;
11095
11096 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11097 popc, -1, true);
11098 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11099 }
11100 else
11101 emit_jump_insn (gen_simple_return_pop_internal (popc));
11102 }
11103 else
11104 emit_jump_insn (gen_simple_return_internal ());
11105
11106 /* Restore the state back to the state from the prologue,
11107 so that it's correct for the next epilogue. */
11108 m->fs = frame_state_save;
11109 }
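/* Rough illustration of the two restore paths above for a 64-bit
   frame-pointer function:
     pop-based:   add $N, %rsp; pop %rbx; pop %rbp (or leave); ret
     mov-based:   reload each saved register from its frame slot, then
                  perform the final stack adjustment and return.
   eh_return, DRAP and pops_args handling modify this skeleton.  */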
11110
11111 /* Reset from the function's potential modifications. */
11112
11113 static void
11114 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11115 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11116 {
11117 if (pic_offset_table_rtx)
11118 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11119 #if TARGET_MACHO
11120 /* Mach-O doesn't support labels at the end of objects, so if
11121 it looks like we might want one, insert a NOP. */
11122 {
11123 rtx insn = get_last_insn ();
11124 rtx deleted_debug_label = NULL_RTX;
11125 while (insn
11126 && NOTE_P (insn)
11127 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11128 {
11129 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11130 notes; instead set their CODE_LABEL_NUMBER to -1,
11131 otherwise there would be code generation differences
11132 between -g and -g0. */
11133 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11134 deleted_debug_label = insn;
11135 insn = PREV_INSN (insn);
11136 }
11137 if (insn
11138 && (LABEL_P (insn)
11139 || (NOTE_P (insn)
11140 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11141 fputs ("\tnop\n", file);
11142 else if (deleted_debug_label)
11143 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11144 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11145 CODE_LABEL_NUMBER (insn) = -1;
11146 }
11147 #endif
11148
11149 }
11150
11151 /* Return a scratch register to use in the split stack prologue. The
11152 split stack prologue is used for -fsplit-stack. It consists of the first
11153 instructions in the function, even before the regular prologue.
11154 The scratch register can be any caller-saved register which is not
11155 used for parameters or for the static chain. */
11156
11157 static unsigned int
11158 split_stack_prologue_scratch_regno (void)
11159 {
11160 if (TARGET_64BIT)
11161 return R11_REG;
11162 else
11163 {
11164 bool is_fastcall;
11165 int regparm;
11166
11167 is_fastcall = (lookup_attribute ("fastcall",
11168 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11169 != NULL);
11170 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11171
11172 if (is_fastcall)
11173 {
11174 if (DECL_STATIC_CHAIN (cfun->decl))
11175 {
11176 sorry ("-fsplit-stack does not support fastcall with "
11177 "nested function");
11178 return INVALID_REGNUM;
11179 }
11180 return AX_REG;
11181 }
11182 else if (regparm < 3)
11183 {
11184 if (!DECL_STATIC_CHAIN (cfun->decl))
11185 return CX_REG;
11186 else
11187 {
11188 if (regparm >= 2)
11189 {
11190 sorry ("-fsplit-stack does not support 2 register "
11191 " parameters for a nested function");
11192 return INVALID_REGNUM;
11193 }
11194 return DX_REG;
11195 }
11196 }
11197 else
11198 {
11199 /* FIXME: We could make this work by pushing a register
11200 around the addition and comparison. */
11201 sorry ("-fsplit-stack does not support 3 register parameters");
11202 return INVALID_REGNUM;
11203 }
11204 }
11205 }
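/* Summary of the choices above: 64-bit uses %r11; 32-bit fastcall uses
   %eax; otherwise %ecx, or %edx when a static chain occupies %ecx and at
   most one register parameter is used; three register parameters are
   not supported.  */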
11206
11207 /* A SYMBOL_REF for the function which allocates new stack space for
11208 -fsplit-stack. */
11209
11210 static GTY(()) rtx split_stack_fn;
11211
11212 /* A SYMBOL_REF for the more stack function when using the large
11213 model. */
11214
11215 static GTY(()) rtx split_stack_fn_large;
11216
11217 /* Handle -fsplit-stack. These are the first instructions in the
11218 function, even before the regular prologue. */
11219
11220 void
11221 ix86_expand_split_stack_prologue (void)
11222 {
11223 struct ix86_frame frame;
11224 HOST_WIDE_INT allocate;
11225 unsigned HOST_WIDE_INT args_size;
11226 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11227 rtx scratch_reg = NULL_RTX;
11228 rtx varargs_label = NULL_RTX;
11229 rtx fn;
11230
11231 gcc_assert (flag_split_stack && reload_completed);
11232
11233 ix86_finalize_stack_realign_flags ();
11234 ix86_compute_frame_layout (&frame);
11235 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11236
11237 /* This is the label we will branch to if we have enough stack
11238 space. We expect the basic block reordering pass to reverse this
11239 branch if optimizing, so that we branch in the unlikely case. */
11240 label = gen_label_rtx ();
11241
11242 /* We need to compare the stack pointer minus the frame size with
11243 the stack boundary in the TCB. The stack boundary always gives
11244 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11245 can compare directly. Otherwise we need to do an addition. */
11246
11247 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11248 UNSPEC_STACK_CHECK);
11249 limit = gen_rtx_CONST (Pmode, limit);
11250 limit = gen_rtx_MEM (Pmode, limit);
11251 if (allocate < SPLIT_STACK_AVAILABLE)
11252 current = stack_pointer_rtx;
11253 else
11254 {
11255 unsigned int scratch_regno;
11256 rtx offset;
11257
11258 /* We need a scratch register to hold the stack pointer minus
11259 the required frame size. Since this is the very start of the
11260 function, the scratch register can be any caller-saved
11261 register which is not used for parameters. */
11262 offset = GEN_INT (- allocate);
11263 scratch_regno = split_stack_prologue_scratch_regno ();
11264 if (scratch_regno == INVALID_REGNUM)
11265 return;
11266 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11267 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11268 {
11269 /* We don't use ix86_gen_add3 in this case because it will
11270 want to split to lea, but when not optimizing the insn
11271 will not be split after this point. */
11272 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11273 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11274 offset)));
11275 }
11276 else
11277 {
11278 emit_move_insn (scratch_reg, offset);
11279 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11280 stack_pointer_rtx));
11281 }
11282 current = scratch_reg;
11283 }
11284
11285 ix86_expand_branch (GEU, current, limit, label);
11286 jump_insn = get_last_insn ();
11287 JUMP_LABEL (jump_insn) = label;
11288
11289 /* Mark the jump as very likely to be taken. */
11290 add_reg_note (jump_insn, REG_BR_PROB,
11291 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11292
11293 if (split_stack_fn == NULL_RTX)
11294 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11295 fn = split_stack_fn;
11296
11297 /* Get more stack space. We pass in the desired stack space and the
11298 size of the arguments to copy to the new stack. In 32-bit mode
11299 we push the parameters; __morestack will return on a new stack
11300 anyhow. In 64-bit mode we pass the parameters in r10 and
11301 r11. */
11302 allocate_rtx = GEN_INT (allocate);
11303 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11304 call_fusage = NULL_RTX;
11305 if (TARGET_64BIT)
11306 {
11307 rtx reg10, reg11;
11308
11309 reg10 = gen_rtx_REG (Pmode, R10_REG);
11310 reg11 = gen_rtx_REG (Pmode, R11_REG);
11311
11312 /* If this function uses a static chain, it will be in %r10.
11313 Preserve it across the call to __morestack. */
11314 if (DECL_STATIC_CHAIN (cfun->decl))
11315 {
11316 rtx rax;
11317
11318 rax = gen_rtx_REG (word_mode, AX_REG);
11319 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11320 use_reg (&call_fusage, rax);
11321 }
11322
11323 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11324 {
11325 HOST_WIDE_INT argval;
11326
11327 gcc_assert (Pmode == DImode);
11328 /* When using the large model we need to load the address
11329 into a register, and we've run out of registers. So we
11330 switch to a different calling convention, and we call a
11331 different function: __morestack_large. We pass the
11332 argument size in the upper 32 bits of r10 and pass the
11333 frame size in the lower 32 bits. */
11334 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11335 gcc_assert ((args_size & 0xffffffff) == args_size);
11336
11337 if (split_stack_fn_large == NULL_RTX)
11338 split_stack_fn_large =
11339 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11340
11341 if (ix86_cmodel == CM_LARGE_PIC)
11342 {
11343 rtx label, x;
11344
11345 label = gen_label_rtx ();
11346 emit_label (label);
11347 LABEL_PRESERVE_P (label) = 1;
11348 emit_insn (gen_set_rip_rex64 (reg10, label));
11349 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11350 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11351 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11352 UNSPEC_GOT);
11353 x = gen_rtx_CONST (Pmode, x);
11354 emit_move_insn (reg11, x);
11355 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11356 x = gen_const_mem (Pmode, x);
11357 emit_move_insn (reg11, x);
11358 }
11359 else
11360 emit_move_insn (reg11, split_stack_fn_large);
11361
11362 fn = reg11;
11363
11364 argval = ((args_size << 16) << 16) + allocate;
11365 emit_move_insn (reg10, GEN_INT (argval));
11366 }
11367 else
11368 {
11369 emit_move_insn (reg10, allocate_rtx);
11370 emit_move_insn (reg11, GEN_INT (args_size));
11371 use_reg (&call_fusage, reg11);
11372 }
11373
11374 use_reg (&call_fusage, reg10);
11375 }
11376 else
11377 {
11378 emit_insn (gen_push (GEN_INT (args_size)));
11379 emit_insn (gen_push (allocate_rtx));
11380 }
11381 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11382 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11383 NULL_RTX, false);
11384 add_function_usage_to (call_insn, call_fusage);
11385
11386 /* In order to make call/return prediction work right, we now need
11387 to execute a return instruction. See
11388 libgcc/config/i386/morestack.S for the details on how this works.
11389
11390 For flow purposes gcc must not see this as a return
11391 instruction--we need control flow to continue at the subsequent
11392 label. Therefore, we use an unspec. */
11393 gcc_assert (crtl->args.pops_args < 65536);
11394 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11395
11396 /* If we are in 64-bit mode and this function uses a static chain,
11397 we saved %r10 in %rax before calling __morestack. */
11398 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11399 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11400 gen_rtx_REG (word_mode, AX_REG));
11401
11402 /* If this function calls va_start, we need to store a pointer to
11403 the arguments on the old stack, because they may not have been
11404 all copied to the new stack. At this point the old stack can be
11405 found at the frame pointer value used by __morestack, because
11406 __morestack has set that up before calling back to us. Here we
11407 store that pointer in a scratch register, and in
11408 ix86_expand_prologue we store the scratch register in a stack
11409 slot. */
11410 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11411 {
11412 unsigned int scratch_regno;
11413 rtx frame_reg;
11414 int words;
11415
11416 scratch_regno = split_stack_prologue_scratch_regno ();
11417 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11418 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11419
11420 /* 64-bit:
11421 fp -> old fp value
11422 return address within this function
11423 return address of caller of this function
11424 stack arguments
11425 So we add three words to get to the stack arguments.
11426
11427 32-bit:
11428 fp -> old fp value
11429 return address within this function
11430 first argument to __morestack
11431 second argument to __morestack
11432 return address of caller of this function
11433 stack arguments
11434 So we add five words to get to the stack arguments.
11435 */
11436 words = TARGET_64BIT ? 3 : 5;
11437 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11438 gen_rtx_PLUS (Pmode, frame_reg,
11439 GEN_INT (words * UNITS_PER_WORD))));
11440
11441 varargs_label = gen_label_rtx ();
11442 emit_jump_insn (gen_jump (varargs_label));
11443 JUMP_LABEL (get_last_insn ()) = varargs_label;
11444
11445 emit_barrier ();
11446 }
11447
11448 emit_label (label);
11449 LABEL_NUSES (label) = 1;
11450
11451 /* If this function calls va_start, we now have to set the scratch
11452 register for the case where we do not call __morestack. In this
11453 case we need to set it based on the stack pointer. */
11454 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11455 {
11456 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11457 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11458 GEN_INT (UNITS_PER_WORD))));
11459
11460 emit_label (varargs_label);
11461 LABEL_NUSES (varargs_label) = 1;
11462 }
11463 }
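/* Roughly, the code above emits (64-bit, small code model, large frame):
       lea   -FRAME(%rsp), %r11     # skipped for small frames
       cmp   %fs:OFFSET, %r11       # stack limit kept in the TCB
       jae   1f
       mov   $FRAME, %r10
       mov   $ARGS, %r11
       call  __morestack
       ret                          # split-stack return
   1:  ...function body...
   OFFSET stands for the target-specific TCB slot addressed through
   UNSPEC_STACK_CHECK; see libgcc/config/i386/morestack.S.  */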
11464
11465 /* We may have to tell the dataflow pass that the split stack prologue
11466 is initializing a scratch register. */
11467
11468 static void
11469 ix86_live_on_entry (bitmap regs)
11470 {
11471 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11472 {
11473 gcc_assert (flag_split_stack);
11474 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11475 }
11476 }
11477 \f
11478 /* Determine if OP is a suitable SUBREG RTX for an address. */
11479
11480 static bool
11481 ix86_address_subreg_operand (rtx op)
11482 {
11483 enum machine_mode mode;
11484
11485 if (!REG_P (op))
11486 return false;
11487
11488 mode = GET_MODE (op);
11489
11490 if (GET_MODE_CLASS (mode) != MODE_INT)
11491 return false;
11492
11493 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11494 failures when the register is one word out of a two word structure. */
11495 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11496 return false;
11497
11498 /* Allow only SUBREGs of non-eliminable hard registers. */
11499 return register_no_elim_operand (op, mode);
11500 }
11501
11502 /* Extract the parts of an RTL expression that is a valid memory address
11503 for an instruction. Return 0 if the structure of the address is
11504 grossly off. Return -1 if the address contains ASHIFT, so it is not
11505 strictly valid, but still used for computing length of lea instruction. */
11506
11507 int
11508 ix86_decompose_address (rtx addr, struct ix86_address *out)
11509 {
11510 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11511 rtx base_reg, index_reg;
11512 HOST_WIDE_INT scale = 1;
11513 rtx scale_rtx = NULL_RTX;
11514 rtx tmp;
11515 int retval = 1;
11516 enum ix86_address_seg seg = SEG_DEFAULT;
11517
11518 /* Allow zero-extended SImode addresses;
11519 they will be emitted with the addr32 prefix. */
11520 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11521 {
11522 if (GET_CODE (addr) == ZERO_EXTEND
11523 && GET_MODE (XEXP (addr, 0)) == SImode)
11524 addr = XEXP (addr, 0);
11525 else if (GET_CODE (addr) == AND
11526 && const_32bit_mask (XEXP (addr, 1), DImode))
11527 {
11528 addr = XEXP (addr, 0);
11529
11530 /* Adjust SUBREGs. */
11531 if (GET_CODE (addr) == SUBREG
11532 && GET_MODE (SUBREG_REG (addr)) == SImode)
11533 addr = SUBREG_REG (addr);
11534 else if (GET_MODE (addr) == DImode)
11535 addr = gen_rtx_SUBREG (SImode, addr, 0);
11536 else if (GET_MODE (addr) != VOIDmode)
11537 return 0;
11538 }
11539 }
11540
11541 if (REG_P (addr))
11542 base = addr;
11543 else if (GET_CODE (addr) == SUBREG)
11544 {
11545 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11546 base = addr;
11547 else
11548 return 0;
11549 }
11550 else if (GET_CODE (addr) == PLUS)
11551 {
11552 rtx addends[4], op;
11553 int n = 0, i;
11554
11555 op = addr;
11556 do
11557 {
11558 if (n >= 4)
11559 return 0;
11560 addends[n++] = XEXP (op, 1);
11561 op = XEXP (op, 0);
11562 }
11563 while (GET_CODE (op) == PLUS);
11564 if (n >= 4)
11565 return 0;
11566 addends[n] = op;
11567
11568 for (i = n; i >= 0; --i)
11569 {
11570 op = addends[i];
11571 switch (GET_CODE (op))
11572 {
11573 case MULT:
11574 if (index)
11575 return 0;
11576 index = XEXP (op, 0);
11577 scale_rtx = XEXP (op, 1);
11578 break;
11579
11580 case ASHIFT:
11581 if (index)
11582 return 0;
11583 index = XEXP (op, 0);
11584 tmp = XEXP (op, 1);
11585 if (!CONST_INT_P (tmp))
11586 return 0;
11587 scale = INTVAL (tmp);
11588 if ((unsigned HOST_WIDE_INT) scale > 3)
11589 return 0;
11590 scale = 1 << scale;
11591 break;
11592
11593 case ZERO_EXTEND:
11594 op = XEXP (op, 0);
11595 if (GET_CODE (op) != UNSPEC)
11596 return 0;
11597 /* FALLTHRU */
11598
11599 case UNSPEC:
11600 if (XINT (op, 1) == UNSPEC_TP
11601 && TARGET_TLS_DIRECT_SEG_REFS
11602 && seg == SEG_DEFAULT)
11603 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11604 else
11605 return 0;
11606 break;
11607
11608 case SUBREG:
11609 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11610 return 0;
11611 /* FALLTHRU */
11612
11613 case REG:
11614 if (!base)
11615 base = op;
11616 else if (!index)
11617 index = op;
11618 else
11619 return 0;
11620 break;
11621
11622 case CONST:
11623 case CONST_INT:
11624 case SYMBOL_REF:
11625 case LABEL_REF:
11626 if (disp)
11627 return 0;
11628 disp = op;
11629 break;
11630
11631 default:
11632 return 0;
11633 }
11634 }
11635 }
11636 else if (GET_CODE (addr) == MULT)
11637 {
11638 index = XEXP (addr, 0); /* index*scale */
11639 scale_rtx = XEXP (addr, 1);
11640 }
11641 else if (GET_CODE (addr) == ASHIFT)
11642 {
11643 /* We're called for lea too, which implements ashift on occasion. */
11644 index = XEXP (addr, 0);
11645 tmp = XEXP (addr, 1);
11646 if (!CONST_INT_P (tmp))
11647 return 0;
11648 scale = INTVAL (tmp);
11649 if ((unsigned HOST_WIDE_INT) scale > 3)
11650 return 0;
11651 scale = 1 << scale;
11652 retval = -1;
11653 }
11654 else
11655 disp = addr; /* displacement */
11656
11657 if (index)
11658 {
11659 if (REG_P (index))
11660 ;
11661 else if (GET_CODE (index) == SUBREG
11662 && ix86_address_subreg_operand (SUBREG_REG (index)))
11663 ;
11664 else
11665 return 0;
11666 }
11667
11668 /* Address override works only on the (%reg) part of %fs:(%reg). */
11669 if (seg != SEG_DEFAULT
11670 && ((base && GET_MODE (base) != word_mode)
11671 || (index && GET_MODE (index) != word_mode)))
11672 return 0;
11673
11674 /* Extract the integral value of scale. */
11675 if (scale_rtx)
11676 {
11677 if (!CONST_INT_P (scale_rtx))
11678 return 0;
11679 scale = INTVAL (scale_rtx);
11680 }
11681
11682 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11683 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11684
11685 /* Avoid useless 0 displacement. */
11686 if (disp == const0_rtx && (base || index))
11687 disp = NULL_RTX;
11688
11689 /* Allow arg pointer and stack pointer as index if there is no scaling. */
11690 if (base_reg && index_reg && scale == 1
11691 && (index_reg == arg_pointer_rtx
11692 || index_reg == frame_pointer_rtx
11693 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11694 {
11695 rtx tmp;
11696 tmp = base, base = index, index = tmp;
11697 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11698 }
11699
11700 /* Special case: %ebp cannot be encoded as a base without a displacement.
11701 Similarly %r13. */
11702 if (!disp
11703 && base_reg
11704 && (base_reg == hard_frame_pointer_rtx
11705 || base_reg == frame_pointer_rtx
11706 || base_reg == arg_pointer_rtx
11707 || (REG_P (base_reg)
11708 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11709 || REGNO (base_reg) == R13_REG))))
11710 disp = const0_rtx;
11711
11712 /* Special case: on K6, [%esi] makes the instruction vector-decoded.
11713 Avoid this by transforming to [%esi+0].
11714 Reload calls address legitimization without cfun defined, so we need
11715 to test cfun for being non-NULL. */
11716 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11717 && base_reg && !index_reg && !disp
11718 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11719 disp = const0_rtx;
11720
11721 /* Special case: encode reg+reg instead of reg*2. */
11722 if (!base && index && scale == 2)
11723 base = index, base_reg = index_reg, scale = 1;
11724
11725 /* Special case: scaling cannot be encoded without base or displacement. */
11726 if (!base && !disp && index && scale != 1)
11727 disp = const0_rtx;
11728
11729 out->base = base;
11730 out->index = index;
11731 out->disp = disp;
11732 out->scale = scale;
11733 out->seg = seg;
11734
11735 return retval;
11736 }
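/* Illustrative example: the canonical address
       (plus (plus (mult (reg B) (const_int 4)) (reg A)) (const_int 8))
   decomposes into base = A, index = B, scale = 4, disp = 8, i.e. the
   operand that assembles as 8(%eax,%ebx,4) when A is %eax and B is
   %ebx.  */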
11737 \f
11738 /* Return cost of the memory address x.
11739 For i386, it is better to use a complex address than let gcc copy
11740 the address into a reg and make a new pseudo. But not if the address
11741 requires two regs - that would mean more pseudos with longer
11742 lifetimes. */
11743 static int
11744 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
11745 {
11746 struct ix86_address parts;
11747 int cost = 1;
11748 int ok = ix86_decompose_address (x, &parts);
11749
11750 gcc_assert (ok);
11751
11752 if (parts.base && GET_CODE (parts.base) == SUBREG)
11753 parts.base = SUBREG_REG (parts.base);
11754 if (parts.index && GET_CODE (parts.index) == SUBREG)
11755 parts.index = SUBREG_REG (parts.index);
11756
11757 /* Attempt to minimize number of registers in the address. */
11758 if ((parts.base
11759 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11760 || (parts.index
11761 && (!REG_P (parts.index)
11762 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11763 cost++;
11764
11765 if (parts.base
11766 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11767 && parts.index
11768 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11769 && parts.base != parts.index)
11770 cost++;
11771
11772 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11773 since its predecode logic can't detect the length of instructions
11774 and it degenerates to vector decoding. Increase the cost of such
11775 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
11776 to split such addresses or even refuse such addresses at all.
11777
11778 The following addressing modes are affected:
11779 [base+scale*index]
11780 [scale*index+disp]
11781 [base+index]
11782
11783 The first and last cases may be avoidable by explicitly coding a zero in
11784 the memory address, but I don't have an AMD-K6 machine handy to check this
11785 theory. */
11786
11787 if (TARGET_K6
11788 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11789 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11790 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11791 cost += 10;
11792
11793 return cost;
11794 }
11795 \f
11796 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
11797 this is used to form addresses to local data when -fPIC is in
11798 use. */
11799
11800 static bool
11801 darwin_local_data_pic (rtx disp)
11802 {
11803 return (GET_CODE (disp) == UNSPEC
11804 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11805 }
11806
11807 /* Determine if a given RTX is a valid constant. We already know this
11808 satisfies CONSTANT_P. */
11809
11810 static bool
11811 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11812 {
11813 switch (GET_CODE (x))
11814 {
11815 case CONST:
11816 x = XEXP (x, 0);
11817
11818 if (GET_CODE (x) == PLUS)
11819 {
11820 if (!CONST_INT_P (XEXP (x, 1)))
11821 return false;
11822 x = XEXP (x, 0);
11823 }
11824
11825 if (TARGET_MACHO && darwin_local_data_pic (x))
11826 return true;
11827
11828 /* Only some unspecs are valid as "constants". */
11829 if (GET_CODE (x) == UNSPEC)
11830 switch (XINT (x, 1))
11831 {
11832 case UNSPEC_GOT:
11833 case UNSPEC_GOTOFF:
11834 case UNSPEC_PLTOFF:
11835 return TARGET_64BIT;
11836 case UNSPEC_TPOFF:
11837 case UNSPEC_NTPOFF:
11838 x = XVECEXP (x, 0, 0);
11839 return (GET_CODE (x) == SYMBOL_REF
11840 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11841 case UNSPEC_DTPOFF:
11842 x = XVECEXP (x, 0, 0);
11843 return (GET_CODE (x) == SYMBOL_REF
11844 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11845 default:
11846 return false;
11847 }
11848
11849 /* We must have drilled down to a symbol. */
11850 if (GET_CODE (x) == LABEL_REF)
11851 return true;
11852 if (GET_CODE (x) != SYMBOL_REF)
11853 return false;
11854 /* FALLTHRU */
11855
11856 case SYMBOL_REF:
11857 /* TLS symbols are never valid. */
11858 if (SYMBOL_REF_TLS_MODEL (x))
11859 return false;
11860
11861 /* DLLIMPORT symbols are never valid. */
11862 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11863 && SYMBOL_REF_DLLIMPORT_P (x))
11864 return false;
11865
11866 #if TARGET_MACHO
11867 /* mdynamic-no-pic */
11868 if (MACHO_DYNAMIC_NO_PIC_P)
11869 return machopic_symbol_defined_p (x);
11870 #endif
11871 break;
11872
11873 case CONST_DOUBLE:
11874 if (GET_MODE (x) == TImode
11875 && x != CONST0_RTX (TImode)
11876 && !TARGET_64BIT)
11877 return false;
11878 break;
11879
11880 case CONST_VECTOR:
11881 if (!standard_sse_constant_p (x))
11882 return false;
11883
11884 default:
11885 break;
11886 }
11887
11888 /* Otherwise we handle everything else in the move patterns. */
11889 return true;
11890 }
11891
11892 /* Determine if it's legal to put X into the constant pool. This
11893 is not possible for the address of thread-local symbols, which
11894 is checked above. */
11895
11896 static bool
11897 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11898 {
11899 /* We can always put integral constants and vectors in memory. */
11900 switch (GET_CODE (x))
11901 {
11902 case CONST_INT:
11903 case CONST_DOUBLE:
11904 case CONST_VECTOR:
11905 return false;
11906
11907 default:
11908 break;
11909 }
11910 return !ix86_legitimate_constant_p (mode, x);
11911 }
11912
11913
11914 /* Nonzero if the constant value X is a legitimate general operand
11915 when generating PIC code. It is given that flag_pic is on and
11916 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11917
11918 bool
11919 legitimate_pic_operand_p (rtx x)
11920 {
11921 rtx inner;
11922
11923 switch (GET_CODE (x))
11924 {
11925 case CONST:
11926 inner = XEXP (x, 0);
11927 if (GET_CODE (inner) == PLUS
11928 && CONST_INT_P (XEXP (inner, 1)))
11929 inner = XEXP (inner, 0);
11930
11931 /* Only some unspecs are valid as "constants". */
11932 if (GET_CODE (inner) == UNSPEC)
11933 switch (XINT (inner, 1))
11934 {
11935 case UNSPEC_GOT:
11936 case UNSPEC_GOTOFF:
11937 case UNSPEC_PLTOFF:
11938 return TARGET_64BIT;
11939 case UNSPEC_TPOFF:
11940 x = XVECEXP (inner, 0, 0);
11941 return (GET_CODE (x) == SYMBOL_REF
11942 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11943 case UNSPEC_MACHOPIC_OFFSET:
11944 return legitimate_pic_address_disp_p (x);
11945 default:
11946 return false;
11947 }
11948 /* FALLTHRU */
11949
11950 case SYMBOL_REF:
11951 case LABEL_REF:
11952 return legitimate_pic_address_disp_p (x);
11953
11954 default:
11955 return true;
11956 }
11957 }
11958
11959 /* Determine if a given CONST RTX is a valid memory displacement
11960 in PIC mode. */
11961
11962 bool
11963 legitimate_pic_address_disp_p (rtx disp)
11964 {
11965 bool saw_plus;
11966
11967 /* In 64bit mode we can allow direct addresses of symbols and labels
11968 when they are not dynamic symbols. */
11969 if (TARGET_64BIT)
11970 {
11971 rtx op0 = disp, op1;
11972
11973 switch (GET_CODE (disp))
11974 {
11975 case LABEL_REF:
11976 return true;
11977
11978 case CONST:
11979 if (GET_CODE (XEXP (disp, 0)) != PLUS)
11980 break;
11981 op0 = XEXP (XEXP (disp, 0), 0);
11982 op1 = XEXP (XEXP (disp, 0), 1);
11983 if (!CONST_INT_P (op1)
11984 || INTVAL (op1) >= 16*1024*1024
11985 || INTVAL (op1) < -16*1024*1024)
11986 break;
11987 if (GET_CODE (op0) == LABEL_REF)
11988 return true;
11989 if (GET_CODE (op0) == CONST
11990 && GET_CODE (XEXP (op0, 0)) == UNSPEC
11991 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
11992 return true;
11993 if (GET_CODE (op0) == UNSPEC
11994 && XINT (op0, 1) == UNSPEC_PCREL)
11995 return true;
11996 if (GET_CODE (op0) != SYMBOL_REF)
11997 break;
11998 /* FALLTHRU */
11999
12000 case SYMBOL_REF:
12001 /* TLS references should always be enclosed in UNSPEC. */
12002 if (SYMBOL_REF_TLS_MODEL (op0))
12003 return false;
12004 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
12005 && ix86_cmodel != CM_LARGE_PIC)
12006 return true;
12007 break;
12008
12009 default:
12010 break;
12011 }
12012 }
12013 if (GET_CODE (disp) != CONST)
12014 return false;
12015 disp = XEXP (disp, 0);
12016
12017 if (TARGET_64BIT)
12018 {
12019 /* It is not safe to allow PLUS expressions; this limits the allowed distance
12020 within GOT tables. We should not need these anyway. */
12021 if (GET_CODE (disp) != UNSPEC
12022 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12023 && XINT (disp, 1) != UNSPEC_GOTOFF
12024 && XINT (disp, 1) != UNSPEC_PCREL
12025 && XINT (disp, 1) != UNSPEC_PLTOFF))
12026 return false;
12027
12028 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12029 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12030 return false;
12031 return true;
12032 }
12033
12034 saw_plus = false;
12035 if (GET_CODE (disp) == PLUS)
12036 {
12037 if (!CONST_INT_P (XEXP (disp, 1)))
12038 return false;
12039 disp = XEXP (disp, 0);
12040 saw_plus = true;
12041 }
12042
12043 if (TARGET_MACHO && darwin_local_data_pic (disp))
12044 return true;
12045
12046 if (GET_CODE (disp) != UNSPEC)
12047 return false;
12048
12049 switch (XINT (disp, 1))
12050 {
12051 case UNSPEC_GOT:
12052 if (saw_plus)
12053 return false;
12054 /* We need to check for both symbols and labels because VxWorks loads
12055 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12056 details. */
12057 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12058 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12059 case UNSPEC_GOTOFF:
12060 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12061 While the ABI also specifies a 32bit relocation, we don't produce it in
12062 the small PIC model at all. */
12063 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12064 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12065 && !TARGET_64BIT)
12066 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12067 return false;
12068 case UNSPEC_GOTTPOFF:
12069 case UNSPEC_GOTNTPOFF:
12070 case UNSPEC_INDNTPOFF:
12071 if (saw_plus)
12072 return false;
12073 disp = XVECEXP (disp, 0, 0);
12074 return (GET_CODE (disp) == SYMBOL_REF
12075 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12076 case UNSPEC_NTPOFF:
12077 disp = XVECEXP (disp, 0, 0);
12078 return (GET_CODE (disp) == SYMBOL_REF
12079 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12080 case UNSPEC_DTPOFF:
12081 disp = XVECEXP (disp, 0, 0);
12082 return (GET_CODE (disp) == SYMBOL_REF
12083 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12084 }
12085
12086 return false;
12087 }
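/* For instance, a 32-bit PIC reference to local data is represented as
   (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)) and is accepted
   above, whereas a bare (symbol_ref "foo") displacement is not a
   legitimate PIC displacement and must be loaded through the GOT
   instead.  */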
12088
12089 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns true if
12090 the invalid parts of the address have been pushed for reload, in which
12091 case the calling macro should goto WIN, and false if no replacement
12092 is called for. */
12093
12094 bool
12095 ix86_legitimize_reload_address (rtx x,
12096 enum machine_mode mode ATTRIBUTE_UNUSED,
12097 int opnum, int type,
12098 int ind_levels ATTRIBUTE_UNUSED)
12099 {
12100 /* Reload can generate:
12101
12102 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12103 (reg:DI 97))
12104 (reg:DI 2 cx))
12105
12106 This RTX is rejected from ix86_legitimate_address_p due to
12107 non-strictness of base register 97. Following this rejection,
12108 reload pushes all three components into separate registers,
12109 creating invalid memory address RTX.
12110
12111 Following code reloads only the invalid part of the
12112 memory address RTX. */
12113
12114 if (GET_CODE (x) == PLUS
12115 && REG_P (XEXP (x, 1))
12116 && GET_CODE (XEXP (x, 0)) == PLUS
12117 && REG_P (XEXP (XEXP (x, 0), 1)))
12118 {
12119 rtx base, index;
12120 bool something_reloaded = false;
12121
12122 base = XEXP (XEXP (x, 0), 1);
12123 if (!REG_OK_FOR_BASE_STRICT_P (base))
12124 {
12125 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12126 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12127 opnum, (enum reload_type) type);
12128 something_reloaded = true;
12129 }
12130
12131 index = XEXP (x, 1);
12132 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12133 {
12134 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12135 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12136 opnum, (enum reload_type) type);
12137 something_reloaded = true;
12138 }
12139
12140 gcc_assert (something_reloaded);
12141 return true;
12142 }
12143
12144 return false;
12145 }
12146
12147 /* Recognizes RTL expressions that are valid memory addresses for an
12148 instruction. The MODE argument is the machine mode for the MEM
12149 expression that wants to use this address.
12150
12151 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12152 convert common non-canonical forms to canonical form so that they will
12153 be recognized. */
12154
12155 static bool
12156 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12157 rtx addr, bool strict)
12158 {
12159 struct ix86_address parts;
12160 rtx base, index, disp;
12161 HOST_WIDE_INT scale;
12162
12163 /* Since a constant address in x32 is sign-extended to 64bit,
12164 we have to prevent addresses from 0x80000000 to 0xffffffff. */
12165 if (TARGET_X32
12166 && CONST_INT_P (addr)
12167 && INTVAL (addr) < 0)
12168 return false;
12169
12170 if (ix86_decompose_address (addr, &parts) <= 0)
12171 /* Decomposition failed. */
12172 return false;
12173
12174 base = parts.base;
12175 index = parts.index;
12176 disp = parts.disp;
12177 scale = parts.scale;
12178
12179 /* Validate base register. */
12180 if (base)
12181 {
12182 rtx reg;
12183
12184 if (REG_P (base))
12185 reg = base;
12186 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12187 reg = SUBREG_REG (base);
12188 else
12189 /* Base is not a register. */
12190 return false;
12191
12192 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12193 return false;
12194
12195 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12196 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12197 /* Base is not valid. */
12198 return false;
12199 }
12200
12201 /* Validate index register. */
12202 if (index)
12203 {
12204 rtx reg;
12205
12206 if (REG_P (index))
12207 reg = index;
12208 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12209 reg = SUBREG_REG (index);
12210 else
12211 /* Index is not a register. */
12212 return false;
12213
12214 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12215 return false;
12216
12217 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12218 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12219 /* Index is not valid. */
12220 return false;
12221 }
12222
12223 /* Index and base should have the same mode. */
12224 if (base && index
12225 && GET_MODE (base) != GET_MODE (index))
12226 return false;
12227
12228 /* Validate scale factor. */
12229 if (scale != 1)
12230 {
12231 if (!index)
12232 /* Scale without index. */
12233 return false;
12234
12235 if (scale != 2 && scale != 4 && scale != 8)
12236 /* Scale is not a valid multiplier. */
12237 return false;
12238 }
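
      /* Illustrative note (added, not in the original source): the x86 SIB
	 byte can only encode scale factors of 1, 2, 4 and 8, so an address
	 such as 8(%rax,%rbx,4) is accepted here while a scale of 3 or 5
	 must instead be expressed through extra arithmetic.  */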
12239
12240 /* Validate displacement. */
12241 if (disp)
12242 {
12243 if (GET_CODE (disp) == CONST
12244 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12245 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12246 switch (XINT (XEXP (disp, 0), 1))
12247 {
12248 	  /* Refuse GOTOFF and GOT in 64bit mode since they are always 64bit
12249 	     when used.  While the ABI also specifies 32bit relocations, we
12250 	     don't produce them at all and use IP-relative addressing instead.  */
12251 case UNSPEC_GOT:
12252 case UNSPEC_GOTOFF:
12253 gcc_assert (flag_pic);
12254 if (!TARGET_64BIT)
12255 goto is_legitimate_pic;
12256
12257 /* 64bit address unspec. */
12258 return false;
12259
12260 case UNSPEC_GOTPCREL:
12261 case UNSPEC_PCREL:
12262 gcc_assert (flag_pic);
12263 goto is_legitimate_pic;
12264
12265 case UNSPEC_GOTTPOFF:
12266 case UNSPEC_GOTNTPOFF:
12267 case UNSPEC_INDNTPOFF:
12268 case UNSPEC_NTPOFF:
12269 case UNSPEC_DTPOFF:
12270 break;
12271
12272 case UNSPEC_STACK_CHECK:
12273 gcc_assert (flag_split_stack);
12274 break;
12275
12276 default:
12277 /* Invalid address unspec. */
12278 return false;
12279 }
12280
12281 else if (SYMBOLIC_CONST (disp)
12282 && (flag_pic
12283 || (TARGET_MACHO
12284 #if TARGET_MACHO
12285 && MACHOPIC_INDIRECT
12286 && !machopic_operand_p (disp)
12287 #endif
12288 )))
12289 {
12290
12291 is_legitimate_pic:
12292 if (TARGET_64BIT && (index || base))
12293 {
12294 /* foo@dtpoff(%rX) is ok. */
12295 if (GET_CODE (disp) != CONST
12296 || GET_CODE (XEXP (disp, 0)) != PLUS
12297 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12298 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12299 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12300 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12301 /* Non-constant pic memory reference. */
12302 return false;
12303 }
12304 else if ((!TARGET_MACHO || flag_pic)
12305 && ! legitimate_pic_address_disp_p (disp))
12306 /* Displacement is an invalid pic construct. */
12307 return false;
12308 #if TARGET_MACHO
12309 else if (MACHO_DYNAMIC_NO_PIC_P
12310 && !ix86_legitimate_constant_p (Pmode, disp))
12311 	    /* Displacement must be referenced via a non_lazy_pointer.  */
12312 return false;
12313 #endif
12314
12315 /* This code used to verify that a symbolic pic displacement
12316 includes the pic_offset_table_rtx register.
12317
12318 	     While this is a good idea, unfortunately these constructs may
12319 	     be created by the "adds using lea" optimization for incorrect
12320 	     code like:
12321
12322 int a;
12323 int foo(int i)
12324 {
12325 return *(&a+i);
12326 }
12327
12328 	     This code is nonsensical, but results in addressing the
12329 	     GOT table with a pic_offset_table_rtx base.  We can't
12330 	     just refuse it easily, since it gets matched by the
12331 	     "addsi3" pattern, which later gets split into an lea when
12332 	     the output register differs from the input.  While this
12333 	     could be handled by a separate addsi pattern for this case
12334 	     that never results in an lea, disabling this test seems to
12335 	     be the easier and correct fix for the crash.  */
12336 }
12337 else if (GET_CODE (disp) != LABEL_REF
12338 && !CONST_INT_P (disp)
12339 && (GET_CODE (disp) != CONST
12340 || !ix86_legitimate_constant_p (Pmode, disp))
12341 && (GET_CODE (disp) != SYMBOL_REF
12342 || !ix86_legitimate_constant_p (Pmode, disp)))
12343 /* Displacement is not constant. */
12344 return false;
12345 else if (TARGET_64BIT
12346 && !x86_64_immediate_operand (disp, VOIDmode))
12347 /* Displacement is out of range. */
12348 return false;
12349 }
12350
12351 /* Everything looks valid. */
12352 return true;
12353 }
12354
12355 /* Determine if a given RTX is a valid constant address. */
12356
12357 bool
12358 constant_address_p (rtx x)
12359 {
12360 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12361 }
12362 \f
12363 /* Return a unique alias set for the GOT. */
12364
12365 static alias_set_type
12366 ix86_GOT_alias_set (void)
12367 {
12368 static alias_set_type set = -1;
12369 if (set == -1)
12370 set = new_alias_set ();
12371 return set;
12372 }
12373
12374 /* Return a legitimate reference for ORIG (an address) using the
12375 register REG. If REG is 0, a new pseudo is generated.
12376
12377 There are two types of references that must be handled:
12378
12379 1. Global data references must load the address from the GOT, via
12380 the PIC reg. An insn is emitted to do this load, and the reg is
12381 returned.
12382
12383 2. Static data references, constant pool addresses, and code labels
12384 compute the address as an offset from the GOT, whose base is in
12385 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12386 differentiate them from global data objects. The returned
12387 address is the PIC reg + an unspec constant.
12388
12389 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12390 reg also appears in the address. */
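
/* Illustrative note (an assumption about typical ia32 ELF output, not part
   of the original comment): case 1 usually ends up as
       movl  foo@GOT(%ebx), %eax
   i.e. a load of foo's address from the GOT, while case 2 becomes
       leal  bar@GOTOFF(%ebx), %eax
   i.e. an address computed as an offset from the GOT base held in %ebx.  */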
12391
12392 static rtx
12393 legitimize_pic_address (rtx orig, rtx reg)
12394 {
12395 rtx addr = orig;
12396 rtx new_rtx = orig;
12397 rtx base;
12398
12399 #if TARGET_MACHO
12400 if (TARGET_MACHO && !TARGET_64BIT)
12401 {
12402 if (reg == 0)
12403 reg = gen_reg_rtx (Pmode);
12404 /* Use the generic Mach-O PIC machinery. */
12405 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12406 }
12407 #endif
12408
12409 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12410 new_rtx = addr;
12411 else if (TARGET_64BIT
12412 && ix86_cmodel != CM_SMALL_PIC
12413 && gotoff_operand (addr, Pmode))
12414 {
12415 rtx tmpreg;
12416 /* This symbol may be referenced via a displacement from the PIC
12417 base address (@GOTOFF). */
12418
12419 if (reload_in_progress)
12420 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12421 if (GET_CODE (addr) == CONST)
12422 addr = XEXP (addr, 0);
12423 if (GET_CODE (addr) == PLUS)
12424 {
12425 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12426 UNSPEC_GOTOFF);
12427 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12428 }
12429 else
12430 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12431 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12432 if (!reg)
12433 tmpreg = gen_reg_rtx (Pmode);
12434 else
12435 tmpreg = reg;
12436 emit_move_insn (tmpreg, new_rtx);
12437
12438 if (reg != 0)
12439 {
12440 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12441 tmpreg, 1, OPTAB_DIRECT);
12442 new_rtx = reg;
12443 }
12444 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12445 }
12446 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12447 {
12448 /* This symbol may be referenced via a displacement from the PIC
12449 base address (@GOTOFF). */
12450
12451 if (reload_in_progress)
12452 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12453 if (GET_CODE (addr) == CONST)
12454 addr = XEXP (addr, 0);
12455 if (GET_CODE (addr) == PLUS)
12456 {
12457 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12458 UNSPEC_GOTOFF);
12459 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12460 }
12461 else
12462 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12463 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12464 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12465
12466 if (reg != 0)
12467 {
12468 emit_move_insn (reg, new_rtx);
12469 new_rtx = reg;
12470 }
12471 }
12472 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12473 /* We can't use @GOTOFF for text labels on VxWorks;
12474 see gotoff_operand. */
12475 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12476 {
12477 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12478 {
12479 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12480 return legitimize_dllimport_symbol (addr, true);
12481 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12482 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12483 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12484 {
12485 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12486 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12487 }
12488 }
12489
12490       /* For x64 PE-COFF there is no GOT table, so we use the
12491 	 address directly.  */
12492 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12493 {
12494 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12495 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12496
12497 if (reg == 0)
12498 reg = gen_reg_rtx (Pmode);
12499 emit_move_insn (reg, new_rtx);
12500 new_rtx = reg;
12501 }
12502 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12503 {
12504 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12505 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12506 new_rtx = gen_const_mem (Pmode, new_rtx);
12507 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12508
12509 if (reg == 0)
12510 reg = gen_reg_rtx (Pmode);
12511 	  /* Use gen_movsi directly, otherwise the address is loaded
12512 	     into a register for CSE.  We don't want to CSE these
12513 	     addresses; instead we CSE addresses from the GOT table, so skip this.  */
12514 emit_insn (gen_movsi (reg, new_rtx));
12515 new_rtx = reg;
12516 }
12517 else
12518 {
12519 /* This symbol must be referenced via a load from the
12520 Global Offset Table (@GOT). */
12521
12522 if (reload_in_progress)
12523 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12524 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12525 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12526 if (TARGET_64BIT)
12527 new_rtx = force_reg (Pmode, new_rtx);
12528 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12529 new_rtx = gen_const_mem (Pmode, new_rtx);
12530 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12531
12532 if (reg == 0)
12533 reg = gen_reg_rtx (Pmode);
12534 emit_move_insn (reg, new_rtx);
12535 new_rtx = reg;
12536 }
12537 }
12538 else
12539 {
12540 if (CONST_INT_P (addr)
12541 && !x86_64_immediate_operand (addr, VOIDmode))
12542 {
12543 if (reg)
12544 {
12545 emit_move_insn (reg, addr);
12546 new_rtx = reg;
12547 }
12548 else
12549 new_rtx = force_reg (Pmode, addr);
12550 }
12551 else if (GET_CODE (addr) == CONST)
12552 {
12553 addr = XEXP (addr, 0);
12554
12555 	  /* We must match stuff we generated before.  Assume the only
12556 unspecs that can get here are ours. Not that we could do
12557 anything with them anyway.... */
12558 if (GET_CODE (addr) == UNSPEC
12559 || (GET_CODE (addr) == PLUS
12560 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12561 return orig;
12562 gcc_assert (GET_CODE (addr) == PLUS);
12563 }
12564 if (GET_CODE (addr) == PLUS)
12565 {
12566 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12567
12568 /* Check first to see if this is a constant offset from a @GOTOFF
12569 symbol reference. */
12570 if (gotoff_operand (op0, Pmode)
12571 && CONST_INT_P (op1))
12572 {
12573 if (!TARGET_64BIT)
12574 {
12575 if (reload_in_progress)
12576 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12577 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12578 UNSPEC_GOTOFF);
12579 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12580 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12581 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12582
12583 if (reg != 0)
12584 {
12585 emit_move_insn (reg, new_rtx);
12586 new_rtx = reg;
12587 }
12588 }
12589 else
12590 {
12591 if (INTVAL (op1) < -16*1024*1024
12592 || INTVAL (op1) >= 16*1024*1024)
12593 {
12594 if (!x86_64_immediate_operand (op1, Pmode))
12595 op1 = force_reg (Pmode, op1);
12596 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12597 }
12598 }
12599 }
12600 else
12601 {
12602 base = legitimize_pic_address (XEXP (addr, 0), reg);
12603 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12604 base == reg ? NULL_RTX : reg);
12605
12606 if (CONST_INT_P (new_rtx))
12607 new_rtx = plus_constant (Pmode, base, INTVAL (new_rtx));
12608 else
12609 {
12610 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12611 {
12612 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12613 new_rtx = XEXP (new_rtx, 1);
12614 }
12615 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12616 }
12617 }
12618 }
12619 }
12620 return new_rtx;
12621 }
12622 \f
12623 /* Load the thread pointer. If TO_REG is true, force it into a register. */
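/* Illustrative note (assumption, GNU/Linux conventions): the UNSPEC_TP
   built below is ultimately rendered as a segment-relative access, e.g.
   "movl %gs:0, %eax" on ia32 or "movq %fs:0, %rax" on x86-64, when the
   thread pointer has to be materialized in a register.  */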
12624
12625 static rtx
12626 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12627 {
12628 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12629
12630 if (GET_MODE (tp) != tp_mode)
12631 {
12632 gcc_assert (GET_MODE (tp) == SImode);
12633 gcc_assert (tp_mode == DImode);
12634
12635 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12636 }
12637
12638 if (to_reg)
12639 tp = copy_to_mode_reg (tp_mode, tp);
12640
12641 return tp;
12642 }
12643
12644 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12645
12646 static GTY(()) rtx ix86_tls_symbol;
12647
12648 static rtx
12649 ix86_tls_get_addr (void)
12650 {
12651 if (!ix86_tls_symbol)
12652 {
12653 const char *sym
12654 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12655 ? "___tls_get_addr" : "__tls_get_addr");
12656
12657 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12658 }
12659
12660 return ix86_tls_symbol;
12661 }
12662
12663 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12664
12665 static GTY(()) rtx ix86_tls_module_base_symbol;
12666
12667 rtx
12668 ix86_tls_module_base (void)
12669 {
12670 if (!ix86_tls_module_base_symbol)
12671 {
12672 ix86_tls_module_base_symbol
12673 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12674
12675 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12676 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12677 }
12678
12679 return ix86_tls_module_base_symbol;
12680 }
12681
12682 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12683 false if we expect this to be used for a memory address and true if
12684 we expect to load the address into a register. */
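
/* Illustrative note (assumption about typical GNU TLS output, not from the
   original comment): a global-dynamic access to "x" on x86-64 is built
   around
       leaq  x@tlsgd(%rip), %rdi
   followed by a call to __tls_get_addr, while a local-exec access can be
   as simple as "movq %fs:x@tpoff, %rax"; the switch below picks the
   sequence that matches MODEL.  */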
12685
12686 static rtx
12687 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12688 {
12689 rtx dest, base, off;
12690 rtx pic = NULL_RTX, tp = NULL_RTX;
12691 enum machine_mode tp_mode = Pmode;
12692 int type;
12693
12694 switch (model)
12695 {
12696 case TLS_MODEL_GLOBAL_DYNAMIC:
12697 dest = gen_reg_rtx (Pmode);
12698
12699 if (!TARGET_64BIT)
12700 {
12701 if (flag_pic)
12702 pic = pic_offset_table_rtx;
12703 else
12704 {
12705 pic = gen_reg_rtx (Pmode);
12706 emit_insn (gen_set_got (pic));
12707 }
12708 }
12709
12710 if (TARGET_GNU2_TLS)
12711 {
12712 if (TARGET_64BIT)
12713 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12714 else
12715 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12716
12717 tp = get_thread_pointer (Pmode, true);
12718 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12719
12720 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12721 }
12722 else
12723 {
12724 rtx caddr = ix86_tls_get_addr ();
12725
12726 if (TARGET_64BIT)
12727 {
12728 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
12729
12730 start_sequence ();
12731 emit_call_insn (ix86_gen_tls_global_dynamic_64 (rax, x,
12732 caddr));
12733 insns = get_insns ();
12734 end_sequence ();
12735
12736 RTL_CONST_CALL_P (insns) = 1;
12737 emit_libcall_block (insns, dest, rax, x);
12738 }
12739 else
12740 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12741 }
12742 break;
12743
12744 case TLS_MODEL_LOCAL_DYNAMIC:
12745 base = gen_reg_rtx (Pmode);
12746
12747 if (!TARGET_64BIT)
12748 {
12749 if (flag_pic)
12750 pic = pic_offset_table_rtx;
12751 else
12752 {
12753 pic = gen_reg_rtx (Pmode);
12754 emit_insn (gen_set_got (pic));
12755 }
12756 }
12757
12758 if (TARGET_GNU2_TLS)
12759 {
12760 rtx tmp = ix86_tls_module_base ();
12761
12762 if (TARGET_64BIT)
12763 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12764 else
12765 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12766
12767 tp = get_thread_pointer (Pmode, true);
12768 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12769 gen_rtx_MINUS (Pmode, tmp, tp));
12770 }
12771 else
12772 {
12773 rtx caddr = ix86_tls_get_addr ();
12774
12775 if (TARGET_64BIT)
12776 {
12777 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
12778
12779 start_sequence ();
12780 emit_call_insn (ix86_gen_tls_local_dynamic_base_64 (rax,
12781 caddr));
12782 insns = get_insns ();
12783 end_sequence ();
12784
12785 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12786 share the LD_BASE result with other LD model accesses. */
12787 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12788 UNSPEC_TLS_LD_BASE);
12789
12790 RTL_CONST_CALL_P (insns) = 1;
12791 emit_libcall_block (insns, base, rax, eqv);
12792 }
12793 else
12794 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12795 }
12796
12797 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12798 off = gen_rtx_CONST (Pmode, off);
12799
12800 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12801
12802 if (TARGET_GNU2_TLS)
12803 {
12804 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12805
12806 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12807 }
12808 break;
12809
12810 case TLS_MODEL_INITIAL_EXEC:
12811 if (TARGET_64BIT)
12812 {
12813 if (TARGET_SUN_TLS && !TARGET_X32)
12814 {
12815 /* The Sun linker took the AMD64 TLS spec literally
12816 	     and can only handle %rax as the destination of the
12817 	     initial-exec code sequence.  */
12818
12819 dest = gen_reg_rtx (DImode);
12820 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12821 return dest;
12822 }
12823
12824 /* Generate DImode references to avoid %fs:(%reg32)
12825 	 problems and a linker IE->LE relaxation bug.  */
12826 tp_mode = DImode;
12827 pic = NULL;
12828 type = UNSPEC_GOTNTPOFF;
12829 }
12830 else if (flag_pic)
12831 {
12832 if (reload_in_progress)
12833 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12834 pic = pic_offset_table_rtx;
12835 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12836 }
12837 else if (!TARGET_ANY_GNU_TLS)
12838 {
12839 pic = gen_reg_rtx (Pmode);
12840 emit_insn (gen_set_got (pic));
12841 type = UNSPEC_GOTTPOFF;
12842 }
12843 else
12844 {
12845 pic = NULL;
12846 type = UNSPEC_INDNTPOFF;
12847 }
12848
12849 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
12850 off = gen_rtx_CONST (tp_mode, off);
12851 if (pic)
12852 off = gen_rtx_PLUS (tp_mode, pic, off);
12853 off = gen_const_mem (tp_mode, off);
12854 set_mem_alias_set (off, ix86_GOT_alias_set ());
12855
12856 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12857 {
12858 base = get_thread_pointer (tp_mode,
12859 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12860 off = force_reg (tp_mode, off);
12861 return gen_rtx_PLUS (tp_mode, base, off);
12862 }
12863 else
12864 {
12865 base = get_thread_pointer (Pmode, true);
12866 dest = gen_reg_rtx (Pmode);
12867 emit_insn (ix86_gen_sub3 (dest, base, off));
12868 }
12869 break;
12870
12871 case TLS_MODEL_LOCAL_EXEC:
12872 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12873 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12874 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12875 off = gen_rtx_CONST (Pmode, off);
12876
12877 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12878 {
12879 base = get_thread_pointer (Pmode,
12880 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12881 return gen_rtx_PLUS (Pmode, base, off);
12882 }
12883 else
12884 {
12885 base = get_thread_pointer (Pmode, true);
12886 dest = gen_reg_rtx (Pmode);
12887 emit_insn (ix86_gen_sub3 (dest, base, off));
12888 }
12889 break;
12890
12891 default:
12892 gcc_unreachable ();
12893 }
12894
12895 return dest;
12896 }
12897
12898 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12899 to symbol DECL. */
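
/* Illustrative note (added): for a dllimported symbol "foo" this builds an
   artificial VAR_DECL whose DECL_RTL is a memory reference through the
   import pointer, conceptually (mem (symbol_ref "*__imp_foo")) or
   "*__imp__foo" depending on the label prefix in use, so that later
   references load the real address from the import table.  */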
12900
12901 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12902 htab_t dllimport_map;
12903
12904 static tree
12905 get_dllimport_decl (tree decl)
12906 {
12907 struct tree_map *h, in;
12908 void **loc;
12909 const char *name;
12910 const char *prefix;
12911 size_t namelen, prefixlen;
12912 char *imp_name;
12913 tree to;
12914 rtx rtl;
12915
12916 if (!dllimport_map)
12917 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12918
12919 in.hash = htab_hash_pointer (decl);
12920 in.base.from = decl;
12921 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12922 h = (struct tree_map *) *loc;
12923 if (h)
12924 return h->to;
12925
12926 *loc = h = ggc_alloc_tree_map ();
12927 h->hash = in.hash;
12928 h->base.from = decl;
12929 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
12930 VAR_DECL, NULL, ptr_type_node);
12931 DECL_ARTIFICIAL (to) = 1;
12932 DECL_IGNORED_P (to) = 1;
12933 DECL_EXTERNAL (to) = 1;
12934 TREE_READONLY (to) = 1;
12935
12936 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
12937 name = targetm.strip_name_encoding (name);
12938 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
12939 ? "*__imp_" : "*__imp__";
12940 namelen = strlen (name);
12941 prefixlen = strlen (prefix);
12942 imp_name = (char *) alloca (namelen + prefixlen + 1);
12943 memcpy (imp_name, prefix, prefixlen);
12944 memcpy (imp_name + prefixlen, name, namelen + 1);
12945
12946 name = ggc_alloc_string (imp_name, namelen + prefixlen);
12947 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
12948 SET_SYMBOL_REF_DECL (rtl, to);
12949 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
12950
12951 rtl = gen_const_mem (Pmode, rtl);
12952 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
12953
12954 SET_DECL_RTL (to, rtl);
12955 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
12956
12957 return to;
12958 }
12959
12960 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
12961 true if we require the result be a register. */
12962
12963 static rtx
12964 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
12965 {
12966 tree imp_decl;
12967 rtx x;
12968
12969 gcc_assert (SYMBOL_REF_DECL (symbol));
12970 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
12971
12972 x = DECL_RTL (imp_decl);
12973 if (want_reg)
12974 x = force_reg (Pmode, x);
12975 return x;
12976 }
12977
12978 /* Try machine-dependent ways of modifying an illegitimate address
12979 to be legitimate. If we find one, return the new, valid address.
12980 This macro is used in only one place: `memory_address' in explow.c.
12981
12982 OLDX is the address as it was before break_out_memory_refs was called.
12983 In some cases it is useful to look at this to decide what needs to be done.
12984
12985 It is always safe for this macro to do nothing. It exists to recognize
12986 opportunities to optimize the output.
12987
12988 For the 80386, we handle X+REG by loading X into a register R and
12989 using R+REG. R will go in a general reg and indexing will be used.
12990 However, if REG is a broken-out memory address or multiplication,
12991 nothing needs to be done because REG can certainly go in a general reg.
12992
12993 When -fpic is used, special handling is needed for symbolic references.
12994 See comments by legitimize_pic_address in i386.c for details. */
12995
12996 static rtx
12997 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
12998 enum machine_mode mode)
12999 {
13000 int changed = 0;
13001 unsigned log;
13002
13003 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13004 if (log)
13005 return legitimize_tls_address (x, (enum tls_model) log, false);
13006 if (GET_CODE (x) == CONST
13007 && GET_CODE (XEXP (x, 0)) == PLUS
13008 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13009 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13010 {
13011 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13012 (enum tls_model) log, false);
13013 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13014 }
13015
13016 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13017 {
13018 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
13019 return legitimize_dllimport_symbol (x, true);
13020 if (GET_CODE (x) == CONST
13021 && GET_CODE (XEXP (x, 0)) == PLUS
13022 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13023 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
13024 {
13025 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
13026 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13027 }
13028 }
13029
13030 if (flag_pic && SYMBOLIC_CONST (x))
13031 return legitimize_pic_address (x, 0);
13032
13033 #if TARGET_MACHO
13034 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13035 return machopic_indirect_data_reference (x, 0);
13036 #endif
13037
13038 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
13039 if (GET_CODE (x) == ASHIFT
13040 && CONST_INT_P (XEXP (x, 1))
13041 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13042 {
13043 changed = 1;
13044 log = INTVAL (XEXP (x, 1));
13045 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13046 GEN_INT (1 << log));
13047 }
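
  /* Illustrative note (added): e.g. (ashift (reg) (const_int 2)) becomes
     (mult (reg) (const_int 4)) above, matching the index*scale form that
     x86 addressing modes such as (%rax,%rbx,4) can encode directly.  */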
13048
13049 if (GET_CODE (x) == PLUS)
13050 {
13051 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13052
13053 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13054 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13055 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13056 {
13057 changed = 1;
13058 log = INTVAL (XEXP (XEXP (x, 0), 1));
13059 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13060 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13061 GEN_INT (1 << log));
13062 }
13063
13064 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13065 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13066 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13067 {
13068 changed = 1;
13069 log = INTVAL (XEXP (XEXP (x, 1), 1));
13070 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13071 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13072 GEN_INT (1 << log));
13073 }
13074
13075 /* Put multiply first if it isn't already. */
13076 if (GET_CODE (XEXP (x, 1)) == MULT)
13077 {
13078 rtx tmp = XEXP (x, 0);
13079 XEXP (x, 0) = XEXP (x, 1);
13080 XEXP (x, 1) = tmp;
13081 changed = 1;
13082 }
13083
13084 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13085 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13086 created by virtual register instantiation, register elimination, and
13087 similar optimizations. */
13088 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13089 {
13090 changed = 1;
13091 x = gen_rtx_PLUS (Pmode,
13092 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13093 XEXP (XEXP (x, 1), 0)),
13094 XEXP (XEXP (x, 1), 1));
13095 }
13096
13097 /* Canonicalize
13098 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13099 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13100 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13101 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13102 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13103 && CONSTANT_P (XEXP (x, 1)))
13104 {
13105 rtx constant;
13106 rtx other = NULL_RTX;
13107
13108 if (CONST_INT_P (XEXP (x, 1)))
13109 {
13110 constant = XEXP (x, 1);
13111 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13112 }
13113 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13114 {
13115 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13116 other = XEXP (x, 1);
13117 }
13118 else
13119 constant = 0;
13120
13121 if (constant)
13122 {
13123 changed = 1;
13124 x = gen_rtx_PLUS (Pmode,
13125 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13126 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13127 plus_constant (Pmode, other,
13128 INTVAL (constant)));
13129 }
13130 }
13131
13132 if (changed && ix86_legitimate_address_p (mode, x, false))
13133 return x;
13134
13135 if (GET_CODE (XEXP (x, 0)) == MULT)
13136 {
13137 changed = 1;
13138 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13139 }
13140
13141 if (GET_CODE (XEXP (x, 1)) == MULT)
13142 {
13143 changed = 1;
13144 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13145 }
13146
13147 if (changed
13148 && REG_P (XEXP (x, 1))
13149 && REG_P (XEXP (x, 0)))
13150 return x;
13151
13152 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13153 {
13154 changed = 1;
13155 x = legitimize_pic_address (x, 0);
13156 }
13157
13158 if (changed && ix86_legitimate_address_p (mode, x, false))
13159 return x;
13160
13161 if (REG_P (XEXP (x, 0)))
13162 {
13163 rtx temp = gen_reg_rtx (Pmode);
13164 rtx val = force_operand (XEXP (x, 1), temp);
13165 if (val != temp)
13166 {
13167 if (GET_MODE (val) != Pmode)
13168 val = convert_to_mode (Pmode, val, 1);
13169 emit_move_insn (temp, val);
13170 }
13171
13172 XEXP (x, 1) = temp;
13173 return x;
13174 }
13175
13176 else if (REG_P (XEXP (x, 1)))
13177 {
13178 rtx temp = gen_reg_rtx (Pmode);
13179 rtx val = force_operand (XEXP (x, 0), temp);
13180 if (val != temp)
13181 {
13182 if (GET_MODE (val) != Pmode)
13183 val = convert_to_mode (Pmode, val, 1);
13184 emit_move_insn (temp, val);
13185 }
13186
13187 XEXP (x, 0) = temp;
13188 return x;
13189 }
13190 }
13191
13192 return x;
13193 }
13194 \f
13195 /* Print an integer constant expression in assembler syntax. Addition
13196 and subtraction are the only arithmetic that may appear in these
13197 expressions. FILE is the stdio stream to write to, X is the rtx, and
13198 CODE is the operand print code from the output string. */
13199
13200 static void
13201 output_pic_addr_const (FILE *file, rtx x, int code)
13202 {
13203 char buf[256];
13204
13205 switch (GET_CODE (x))
13206 {
13207 case PC:
13208 gcc_assert (flag_pic);
13209 putc ('.', file);
13210 break;
13211
13212 case SYMBOL_REF:
13213 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13214 output_addr_const (file, x);
13215 else
13216 {
13217 const char *name = XSTR (x, 0);
13218
13219 /* Mark the decl as referenced so that cgraph will
13220 output the function. */
13221 if (SYMBOL_REF_DECL (x))
13222 mark_decl_referenced (SYMBOL_REF_DECL (x));
13223
13224 #if TARGET_MACHO
13225 if (MACHOPIC_INDIRECT
13226 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13227 name = machopic_indirection_name (x, /*stub_p=*/true);
13228 #endif
13229 assemble_name (file, name);
13230 }
13231 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13232 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13233 fputs ("@PLT", file);
13234 break;
13235
13236 case LABEL_REF:
13237 x = XEXP (x, 0);
13238 /* FALLTHRU */
13239 case CODE_LABEL:
13240 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13241 assemble_name (asm_out_file, buf);
13242 break;
13243
13244 case CONST_INT:
13245 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13246 break;
13247
13248 case CONST:
13249 /* This used to output parentheses around the expression,
13250 but that does not work on the 386 (either ATT or BSD assembler). */
13251 output_pic_addr_const (file, XEXP (x, 0), code);
13252 break;
13253
13254 case CONST_DOUBLE:
13255 if (GET_MODE (x) == VOIDmode)
13256 {
13257 /* We can use %d if the number is <32 bits and positive. */
13258 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13259 fprintf (file, "0x%lx%08lx",
13260 (unsigned long) CONST_DOUBLE_HIGH (x),
13261 (unsigned long) CONST_DOUBLE_LOW (x));
13262 else
13263 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13264 }
13265 else
13266 /* We can't handle floating point constants;
13267 TARGET_PRINT_OPERAND must handle them. */
13268 output_operand_lossage ("floating constant misused");
13269 break;
13270
13271 case PLUS:
13272 /* Some assemblers need integer constants to appear first. */
13273 if (CONST_INT_P (XEXP (x, 0)))
13274 {
13275 output_pic_addr_const (file, XEXP (x, 0), code);
13276 putc ('+', file);
13277 output_pic_addr_const (file, XEXP (x, 1), code);
13278 }
13279 else
13280 {
13281 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13282 output_pic_addr_const (file, XEXP (x, 1), code);
13283 putc ('+', file);
13284 output_pic_addr_const (file, XEXP (x, 0), code);
13285 }
13286 break;
13287
13288 case MINUS:
13289 if (!TARGET_MACHO)
13290 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13291 output_pic_addr_const (file, XEXP (x, 0), code);
13292 putc ('-', file);
13293 output_pic_addr_const (file, XEXP (x, 1), code);
13294 if (!TARGET_MACHO)
13295 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13296 break;
13297
13298 case UNSPEC:
13299 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13300 {
13301 bool f = i386_asm_output_addr_const_extra (file, x);
13302 gcc_assert (f);
13303 break;
13304 }
13305
13306 gcc_assert (XVECLEN (x, 0) == 1);
13307 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13308 switch (XINT (x, 1))
13309 {
13310 case UNSPEC_GOT:
13311 fputs ("@GOT", file);
13312 break;
13313 case UNSPEC_GOTOFF:
13314 fputs ("@GOTOFF", file);
13315 break;
13316 case UNSPEC_PLTOFF:
13317 fputs ("@PLTOFF", file);
13318 break;
13319 case UNSPEC_PCREL:
13320 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13321 "(%rip)" : "[rip]", file);
13322 break;
13323 case UNSPEC_GOTPCREL:
13324 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13325 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13326 break;
13327 case UNSPEC_GOTTPOFF:
13328 /* FIXME: This might be @TPOFF in Sun ld too. */
13329 fputs ("@gottpoff", file);
13330 break;
13331 case UNSPEC_TPOFF:
13332 fputs ("@tpoff", file);
13333 break;
13334 case UNSPEC_NTPOFF:
13335 if (TARGET_64BIT)
13336 fputs ("@tpoff", file);
13337 else
13338 fputs ("@ntpoff", file);
13339 break;
13340 case UNSPEC_DTPOFF:
13341 fputs ("@dtpoff", file);
13342 break;
13343 case UNSPEC_GOTNTPOFF:
13344 if (TARGET_64BIT)
13345 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13346 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13347 else
13348 fputs ("@gotntpoff", file);
13349 break;
13350 case UNSPEC_INDNTPOFF:
13351 fputs ("@indntpoff", file);
13352 break;
13353 #if TARGET_MACHO
13354 case UNSPEC_MACHOPIC_OFFSET:
13355 putc ('-', file);
13356 machopic_output_function_base_name (file);
13357 break;
13358 #endif
13359 default:
13360 output_operand_lossage ("invalid UNSPEC as operand");
13361 break;
13362 }
13363 break;
13364
13365 default:
13366 output_operand_lossage ("invalid expression as operand");
13367 }
13368 }
13369
13370 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13371 We need to emit DTP-relative relocations. */
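
/* Illustrative note (based on the code below): with SIZE == 4 this emits
   ".long x@dtpoff", and with SIZE == 8 it emits ".long x@dtpoff, 0",
   padding the upper 32 bits of the 64-bit DTP-relative value with zero.  */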
13372
13373 static void ATTRIBUTE_UNUSED
13374 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13375 {
13376 fputs (ASM_LONG, file);
13377 output_addr_const (file, x);
13378 fputs ("@dtpoff", file);
13379 switch (size)
13380 {
13381 case 4:
13382 break;
13383 case 8:
13384 fputs (", 0", file);
13385 break;
13386 default:
13387 gcc_unreachable ();
13388 }
13389 }
13390
13391 /* Return true if X is a representation of the PIC register. This copes
13392 with calls from ix86_find_base_term, where the register might have
13393 been replaced by a cselib value. */
13394
13395 static bool
13396 ix86_pic_register_p (rtx x)
13397 {
13398 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13399 return (pic_offset_table_rtx
13400 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13401 else
13402 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13403 }
13404
13405 /* Helper function for ix86_delegitimize_address.
13406 Attempt to delegitimize TLS local-exec accesses. */
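
/* Illustrative note (added): a local-exec access of the form
     (mem (plus (reg) (const (unspec [(symbol_ref "x")] UNSPEC_NTPOFF))))
   using the %fs/%gs segment is rewritten here back into an address built
   around the plain SYMBOL_REF "x", which reads better in debug output.  */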
13407
13408 static rtx
13409 ix86_delegitimize_tls_address (rtx orig_x)
13410 {
13411 rtx x = orig_x, unspec;
13412 struct ix86_address addr;
13413
13414 if (!TARGET_TLS_DIRECT_SEG_REFS)
13415 return orig_x;
13416 if (MEM_P (x))
13417 x = XEXP (x, 0);
13418 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13419 return orig_x;
13420 if (ix86_decompose_address (x, &addr) == 0
13421 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13422 || addr.disp == NULL_RTX
13423 || GET_CODE (addr.disp) != CONST)
13424 return orig_x;
13425 unspec = XEXP (addr.disp, 0);
13426 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13427 unspec = XEXP (unspec, 0);
13428 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13429 return orig_x;
13430 x = XVECEXP (unspec, 0, 0);
13431 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13432 if (unspec != XEXP (addr.disp, 0))
13433 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13434 if (addr.index)
13435 {
13436 rtx idx = addr.index;
13437 if (addr.scale != 1)
13438 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13439 x = gen_rtx_PLUS (Pmode, idx, x);
13440 }
13441 if (addr.base)
13442 x = gen_rtx_PLUS (Pmode, addr.base, x);
13443 if (MEM_P (orig_x))
13444 x = replace_equiv_address_nv (orig_x, x);
13445 return x;
13446 }
13447
13448 /* In the name of slightly smaller debug output, and to cater to
13449 general assembler lossage, recognize PIC+GOTOFF and turn it back
13450 into a direct symbol reference.
13451
13452 On Darwin, this is necessary to avoid a crash, because Darwin
13453 has a different PIC label for each routine but the DWARF debugging
13454 information is not associated with any particular routine, so it's
13455 necessary to remove references to the PIC label from RTL stored by
13456 the DWARF output code. */
13457
13458 static rtx
13459 ix86_delegitimize_address (rtx x)
13460 {
13461 rtx orig_x = delegitimize_mem_from_attrs (x);
13462 /* addend is NULL or some rtx if x is something+GOTOFF where
13463 something doesn't include the PIC register. */
13464 rtx addend = NULL_RTX;
13465 /* reg_addend is NULL or a multiple of some register. */
13466 rtx reg_addend = NULL_RTX;
13467 /* const_addend is NULL or a const_int. */
13468 rtx const_addend = NULL_RTX;
13469 /* This is the result, or NULL. */
13470 rtx result = NULL_RTX;
13471
13472 x = orig_x;
13473
13474 if (MEM_P (x))
13475 x = XEXP (x, 0);
13476
13477 if (TARGET_64BIT)
13478 {
13479 if (GET_CODE (x) == CONST
13480 && GET_CODE (XEXP (x, 0)) == PLUS
13481 && GET_MODE (XEXP (x, 0)) == Pmode
13482 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13483 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13484 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13485 {
13486 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13487 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13488 if (MEM_P (orig_x))
13489 x = replace_equiv_address_nv (orig_x, x);
13490 return x;
13491 }
13492 if (GET_CODE (x) != CONST
13493 || GET_CODE (XEXP (x, 0)) != UNSPEC
13494 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13495 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13496 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13497 return ix86_delegitimize_tls_address (orig_x);
13498 x = XVECEXP (XEXP (x, 0), 0, 0);
13499 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13500 {
13501 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13502 GET_MODE (x), 0);
13503 if (x == NULL_RTX)
13504 return orig_x;
13505 }
13506 return x;
13507 }
13508
13509 if (GET_CODE (x) != PLUS
13510 || GET_CODE (XEXP (x, 1)) != CONST)
13511 return ix86_delegitimize_tls_address (orig_x);
13512
13513 if (ix86_pic_register_p (XEXP (x, 0)))
13514 /* %ebx + GOT/GOTOFF */
13515 ;
13516 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13517 {
13518 /* %ebx + %reg * scale + GOT/GOTOFF */
13519 reg_addend = XEXP (x, 0);
13520 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13521 reg_addend = XEXP (reg_addend, 1);
13522 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13523 reg_addend = XEXP (reg_addend, 0);
13524 else
13525 {
13526 reg_addend = NULL_RTX;
13527 addend = XEXP (x, 0);
13528 }
13529 }
13530 else
13531 addend = XEXP (x, 0);
13532
13533 x = XEXP (XEXP (x, 1), 0);
13534 if (GET_CODE (x) == PLUS
13535 && CONST_INT_P (XEXP (x, 1)))
13536 {
13537 const_addend = XEXP (x, 1);
13538 x = XEXP (x, 0);
13539 }
13540
13541 if (GET_CODE (x) == UNSPEC
13542 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13543 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13544 result = XVECEXP (x, 0, 0);
13545
13546 if (TARGET_MACHO && darwin_local_data_pic (x)
13547 && !MEM_P (orig_x))
13548 result = XVECEXP (x, 0, 0);
13549
13550 if (! result)
13551 return ix86_delegitimize_tls_address (orig_x);
13552
13553 if (const_addend)
13554 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13555 if (reg_addend)
13556 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13557 if (addend)
13558 {
13559 /* If the rest of original X doesn't involve the PIC register, add
13560 addend and subtract pic_offset_table_rtx. This can happen e.g.
13561 for code like:
13562 leal (%ebx, %ecx, 4), %ecx
13563 ...
13564 movl foo@GOTOFF(%ecx), %edx
13565 in which case we return (%ecx - %ebx) + foo. */
13566 if (pic_offset_table_rtx)
13567 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13568 pic_offset_table_rtx),
13569 result);
13570 else
13571 return orig_x;
13572 }
13573 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13574 {
13575 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13576 if (result == NULL_RTX)
13577 return orig_x;
13578 }
13579 return result;
13580 }
13581
13582 /* If X is a machine specific address (i.e. a symbol or label being
13583 referenced as a displacement from the GOT implemented using an
13584 UNSPEC), then return the base term. Otherwise return X. */
13585
13586 rtx
13587 ix86_find_base_term (rtx x)
13588 {
13589 rtx term;
13590
13591 if (TARGET_64BIT)
13592 {
13593 if (GET_CODE (x) != CONST)
13594 return x;
13595 term = XEXP (x, 0);
13596 if (GET_CODE (term) == PLUS
13597 && (CONST_INT_P (XEXP (term, 1))
13598 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13599 term = XEXP (term, 0);
13600 if (GET_CODE (term) != UNSPEC
13601 || (XINT (term, 1) != UNSPEC_GOTPCREL
13602 && XINT (term, 1) != UNSPEC_PCREL))
13603 return x;
13604
13605 return XVECEXP (term, 0, 0);
13606 }
13607
13608 return ix86_delegitimize_address (x);
13609 }
13610 \f
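/* Descriptive comment (added for clarity): print to FILE the condition
   suffix that encodes comparison CODE under condition-code mode MODE,
   reversing the condition when REVERSE is true and using the FP-style
   spellings (e.g. "nbe" instead of "a") when FP is true.  For instance,
   EQ in CCmode prints "e" and GTU in CCmode prints "a".  */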
13611 static void
13612 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
13613 bool fp, FILE *file)
13614 {
13615 const char *suffix;
13616
13617 if (mode == CCFPmode || mode == CCFPUmode)
13618 {
13619 code = ix86_fp_compare_code_to_integer (code);
13620 mode = CCmode;
13621 }
13622 if (reverse)
13623 code = reverse_condition (code);
13624
13625 switch (code)
13626 {
13627 case EQ:
13628 switch (mode)
13629 {
13630 case CCAmode:
13631 suffix = "a";
13632 break;
13633
13634 case CCCmode:
13635 suffix = "c";
13636 break;
13637
13638 case CCOmode:
13639 suffix = "o";
13640 break;
13641
13642 case CCSmode:
13643 suffix = "s";
13644 break;
13645
13646 default:
13647 suffix = "e";
13648 }
13649 break;
13650 case NE:
13651 switch (mode)
13652 {
13653 case CCAmode:
13654 suffix = "na";
13655 break;
13656
13657 case CCCmode:
13658 suffix = "nc";
13659 break;
13660
13661 case CCOmode:
13662 suffix = "no";
13663 break;
13664
13665 case CCSmode:
13666 suffix = "ns";
13667 break;
13668
13669 default:
13670 suffix = "ne";
13671 }
13672 break;
13673 case GT:
13674 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13675 suffix = "g";
13676 break;
13677 case GTU:
13678 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13679 Those same assemblers have the same but opposite lossage on cmov. */
13680 if (mode == CCmode)
13681 suffix = fp ? "nbe" : "a";
13682 else if (mode == CCCmode)
13683 suffix = "b";
13684 else
13685 gcc_unreachable ();
13686 break;
13687 case LT:
13688 switch (mode)
13689 {
13690 case CCNOmode:
13691 case CCGOCmode:
13692 suffix = "s";
13693 break;
13694
13695 case CCmode:
13696 case CCGCmode:
13697 suffix = "l";
13698 break;
13699
13700 default:
13701 gcc_unreachable ();
13702 }
13703 break;
13704 case LTU:
13705 gcc_assert (mode == CCmode || mode == CCCmode);
13706 suffix = "b";
13707 break;
13708 case GE:
13709 switch (mode)
13710 {
13711 case CCNOmode:
13712 case CCGOCmode:
13713 suffix = "ns";
13714 break;
13715
13716 case CCmode:
13717 case CCGCmode:
13718 suffix = "ge";
13719 break;
13720
13721 default:
13722 gcc_unreachable ();
13723 }
13724 break;
13725 case GEU:
13726 /* ??? As above. */
13727 gcc_assert (mode == CCmode || mode == CCCmode);
13728 suffix = fp ? "nb" : "ae";
13729 break;
13730 case LE:
13731 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13732 suffix = "le";
13733 break;
13734 case LEU:
13735 /* ??? As above. */
13736 if (mode == CCmode)
13737 suffix = "be";
13738 else if (mode == CCCmode)
13739 suffix = fp ? "nb" : "ae";
13740 else
13741 gcc_unreachable ();
13742 break;
13743 case UNORDERED:
13744 suffix = fp ? "u" : "p";
13745 break;
13746 case ORDERED:
13747 suffix = fp ? "nu" : "np";
13748 break;
13749 default:
13750 gcc_unreachable ();
13751 }
13752 fputs (suffix, file);
13753 }
13754
13755 /* Print the name of register X to FILE based on its machine mode and number.
13756 If CODE is 'w', pretend the mode is HImode.
13757 If CODE is 'b', pretend the mode is QImode.
13758 If CODE is 'k', pretend the mode is SImode.
13759 If CODE is 'q', pretend the mode is DImode.
13760 If CODE is 'x', pretend the mode is V4SFmode.
13761 If CODE is 't', pretend the mode is V8SFmode.
13762 If CODE is 'h', pretend the reg is the 'high' byte register.
13763    If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
13764    If CODE is 'd', duplicate the operand for an AVX instruction.
13765 */
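
/* Illustrative note (added): for the hard register ax on a 64-bit target,
   code 'q' prints "rax", 'k' prints "eax", 'w' prints "ax", 'b' prints
   "al" and 'h' prints "ah"; in AT&T syntax each name is preceded by '%'.  */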
13766
13767 void
13768 print_reg (rtx x, int code, FILE *file)
13769 {
13770 const char *reg;
13771 bool duplicated = code == 'd' && TARGET_AVX;
13772
13773 gcc_assert (x == pc_rtx
13774 || (REGNO (x) != ARG_POINTER_REGNUM
13775 && REGNO (x) != FRAME_POINTER_REGNUM
13776 && REGNO (x) != FLAGS_REG
13777 && REGNO (x) != FPSR_REG
13778 && REGNO (x) != FPCR_REG));
13779
13780 if (ASSEMBLER_DIALECT == ASM_ATT)
13781 putc ('%', file);
13782
13783 if (x == pc_rtx)
13784 {
13785 gcc_assert (TARGET_64BIT);
13786 fputs ("rip", file);
13787 return;
13788 }
13789
13790 if (code == 'w' || MMX_REG_P (x))
13791 code = 2;
13792 else if (code == 'b')
13793 code = 1;
13794 else if (code == 'k')
13795 code = 4;
13796 else if (code == 'q')
13797 code = 8;
13798 else if (code == 'y')
13799 code = 3;
13800 else if (code == 'h')
13801 code = 0;
13802 else if (code == 'x')
13803 code = 16;
13804 else if (code == 't')
13805 code = 32;
13806 else
13807 code = GET_MODE_SIZE (GET_MODE (x));
13808
13809   /* Irritatingly, AMD extended registers use a different naming convention
13810      from the normal registers: "r%d[bwd]".  */
13811 if (REX_INT_REG_P (x))
13812 {
13813 gcc_assert (TARGET_64BIT);
13814 putc ('r', file);
13815 fprint_ul (file, REGNO (x) - FIRST_REX_INT_REG + 8);
13816 switch (code)
13817 {
13818 case 0:
13819 error ("extended registers have no high halves");
13820 break;
13821 case 1:
13822 putc ('b', file);
13823 break;
13824 case 2:
13825 putc ('w', file);
13826 break;
13827 case 4:
13828 putc ('d', file);
13829 break;
13830 case 8:
13831 /* no suffix */
13832 break;
13833 default:
13834 error ("unsupported operand size for extended register");
13835 break;
13836 }
13837 return;
13838 }
13839
13840 reg = NULL;
13841 switch (code)
13842 {
13843 case 3:
13844 if (STACK_TOP_P (x))
13845 {
13846 reg = "st(0)";
13847 break;
13848 }
13849 /* FALLTHRU */
13850 case 8:
13851 case 4:
13852 case 12:
13853 if (! ANY_FP_REG_P (x))
13854 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13855 /* FALLTHRU */
13856 case 16:
13857 case 2:
13858 normal:
13859 reg = hi_reg_name[REGNO (x)];
13860 break;
13861 case 1:
13862 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
13863 goto normal;
13864 reg = qi_reg_name[REGNO (x)];
13865 break;
13866 case 0:
13867 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
13868 goto normal;
13869 reg = qi_high_reg_name[REGNO (x)];
13870 break;
13871 case 32:
13872 if (SSE_REG_P (x))
13873 {
13874 gcc_assert (!duplicated);
13875 putc ('y', file);
13876 fputs (hi_reg_name[REGNO (x)] + 1, file);
13877 return;
13878 }
13879 break;
13880 default:
13881 gcc_unreachable ();
13882 }
13883
13884 fputs (reg, file);
13885 if (duplicated)
13886 {
13887 if (ASSEMBLER_DIALECT == ASM_ATT)
13888 fprintf (file, ", %%%s", reg);
13889 else
13890 fprintf (file, ", %s", reg);
13891 }
13892 }
13893
13894 /* Locate some local-dynamic symbol still in use by this function
13895 so that we can print its name in some tls_local_dynamic_base
13896 pattern. */
13897
13898 static int
13899 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13900 {
13901 rtx x = *px;
13902
13903 if (GET_CODE (x) == SYMBOL_REF
13904 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13905 {
13906 cfun->machine->some_ld_name = XSTR (x, 0);
13907 return 1;
13908 }
13909
13910 return 0;
13911 }
13912
13913 static const char *
13914 get_some_local_dynamic_name (void)
13915 {
13916 rtx insn;
13917
13918 if (cfun->machine->some_ld_name)
13919 return cfun->machine->some_ld_name;
13920
13921 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13922 if (NONDEBUG_INSN_P (insn)
13923 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13924 return cfun->machine->some_ld_name;
13925
13926 return NULL;
13927 }
13928
13929 /* Meaning of CODE:
13930 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
13931 C -- print opcode suffix for set/cmov insn.
13932 c -- like C, but print reversed condition
13933 F,f -- likewise, but for floating-point.
13934 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
13935 otherwise nothing
13936 R -- print the prefix for register names.
13937 z -- print the opcode suffix for the size of the current operand.
13938 Z -- likewise, with special suffixes for x87 instructions.
13939 * -- print a star (in certain assembler syntax)
13940 A -- print an absolute memory reference.
13941 E -- print address with DImode register names if TARGET_64BIT.
13942 w -- print the operand as if it's a "word" (HImode) even if it isn't.
13943    s -- print a shift double count, followed by the assembler's argument
13944 delimiter.
13945 b -- print the QImode name of the register for the indicated operand.
13946 %b0 would print %al if operands[0] is reg 0.
13947 w -- likewise, print the HImode name of the register.
13948 k -- likewise, print the SImode name of the register.
13949 q -- likewise, print the DImode name of the register.
13950 x -- likewise, print the V4SFmode name of the register.
13951 t -- likewise, print the V8SFmode name of the register.
13952 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
13953 y -- print "st(0)" instead of "st" as a register.
13954 d -- print duplicated register operand for AVX instruction.
13955 D -- print condition for SSE cmp instruction.
13956 P -- if PIC, print an @PLT suffix.
13957 p -- print raw symbol name.
13958 X -- don't print any sort of PIC '@' suffix for a symbol.
13959 & -- print some in-use local-dynamic symbol name.
13960 H -- print a memory address offset by 8; used for sse high-parts
13961 Y -- print condition for XOP pcom* instruction.
13962 + -- print a branch hint as 'cs' or 'ds' prefix
13963 ; -- print a semicolon (after prefixes due to bug in older gas).
13964 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
13965 @ -- print a segment register of thread base pointer load
13966 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
13967 */
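
/* Illustrative note (added): in an output template such as
   "add%z0\t{%1, %0|%0, %1}", the "%z0" expands to "b", "w", "l" or "q"
   according to the mode of operand 0, so an SImode operand yields "addl"
   in AT&T syntax (Intel syntax omits the suffix).  */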
13968
13969 void
13970 ix86_print_operand (FILE *file, rtx x, int code)
13971 {
13972 if (code)
13973 {
13974 switch (code)
13975 {
13976 case 'A':
13977 switch (ASSEMBLER_DIALECT)
13978 {
13979 case ASM_ATT:
13980 putc ('*', file);
13981 break;
13982
13983 case ASM_INTEL:
13984 	  /* Intel syntax.  For absolute addresses, registers should not
13985 	     be surrounded by brackets.  */
13986 if (!REG_P (x))
13987 {
13988 putc ('[', file);
13989 ix86_print_operand (file, x, 0);
13990 putc (']', file);
13991 return;
13992 }
13993 break;
13994
13995 default:
13996 gcc_unreachable ();
13997 }
13998
13999 ix86_print_operand (file, x, 0);
14000 return;
14001
14002 case 'E':
14003 /* Wrap address in an UNSPEC to declare special handling. */
14004 if (TARGET_64BIT)
14005 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14006
14007 output_address (x);
14008 return;
14009
14010 case 'L':
14011 if (ASSEMBLER_DIALECT == ASM_ATT)
14012 putc ('l', file);
14013 return;
14014
14015 case 'W':
14016 if (ASSEMBLER_DIALECT == ASM_ATT)
14017 putc ('w', file);
14018 return;
14019
14020 case 'B':
14021 if (ASSEMBLER_DIALECT == ASM_ATT)
14022 putc ('b', file);
14023 return;
14024
14025 case 'Q':
14026 if (ASSEMBLER_DIALECT == ASM_ATT)
14027 putc ('l', file);
14028 return;
14029
14030 case 'S':
14031 if (ASSEMBLER_DIALECT == ASM_ATT)
14032 putc ('s', file);
14033 return;
14034
14035 case 'T':
14036 if (ASSEMBLER_DIALECT == ASM_ATT)
14037 putc ('t', file);
14038 return;
14039
14040 case 'O':
14041 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14042 if (ASSEMBLER_DIALECT != ASM_ATT)
14043 return;
14044
14045 switch (GET_MODE_SIZE (GET_MODE (x)))
14046 {
14047 case 2:
14048 putc ('w', file);
14049 break;
14050
14051 case 4:
14052 putc ('l', file);
14053 break;
14054
14055 case 8:
14056 putc ('q', file);
14057 break;
14058
14059 default:
14060 output_operand_lossage
14061 ("invalid operand size for operand code 'O'");
14062 return;
14063 }
14064
14065 putc ('.', file);
14066 #endif
14067 return;
14068
14069 case 'z':
14070 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14071 {
14072 	      /* Opcodes don't get size suffixes when using Intel syntax.  */
14073 if (ASSEMBLER_DIALECT == ASM_INTEL)
14074 return;
14075
14076 switch (GET_MODE_SIZE (GET_MODE (x)))
14077 {
14078 case 1:
14079 putc ('b', file);
14080 return;
14081
14082 case 2:
14083 putc ('w', file);
14084 return;
14085
14086 case 4:
14087 putc ('l', file);
14088 return;
14089
14090 case 8:
14091 putc ('q', file);
14092 return;
14093
14094 default:
14095 output_operand_lossage
14096 ("invalid operand size for operand code 'z'");
14097 return;
14098 }
14099 }
14100
14101 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14102 warning
14103 (0, "non-integer operand used with operand code 'z'");
14104 /* FALLTHRU */
14105
14106 case 'Z':
14107 	  /* 387 opcodes don't get size suffixes when using Intel syntax.  */
14108 if (ASSEMBLER_DIALECT == ASM_INTEL)
14109 return;
14110
14111 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14112 {
14113 switch (GET_MODE_SIZE (GET_MODE (x)))
14114 {
14115 case 2:
14116 #ifdef HAVE_AS_IX86_FILDS
14117 putc ('s', file);
14118 #endif
14119 return;
14120
14121 case 4:
14122 putc ('l', file);
14123 return;
14124
14125 case 8:
14126 #ifdef HAVE_AS_IX86_FILDQ
14127 putc ('q', file);
14128 #else
14129 fputs ("ll", file);
14130 #endif
14131 return;
14132
14133 default:
14134 break;
14135 }
14136 }
14137 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14138 {
14139 /* 387 opcodes don't get size suffixes
14140 if the operands are registers. */
14141 if (STACK_REG_P (x))
14142 return;
14143
14144 switch (GET_MODE_SIZE (GET_MODE (x)))
14145 {
14146 case 4:
14147 putc ('s', file);
14148 return;
14149
14150 case 8:
14151 putc ('l', file);
14152 return;
14153
14154 case 12:
14155 case 16:
14156 putc ('t', file);
14157 return;
14158
14159 default:
14160 break;
14161 }
14162 }
14163 else
14164 {
14165 output_operand_lossage
14166 ("invalid operand type used with operand code 'Z'");
14167 return;
14168 }
14169
14170 output_operand_lossage
14171 ("invalid operand size for operand code 'Z'");
14172 return;
14173
14174 case 'd':
14175 case 'b':
14176 case 'w':
14177 case 'k':
14178 case 'q':
14179 case 'h':
14180 case 't':
14181 case 'y':
14182 case 'x':
14183 case 'X':
14184 case 'P':
14185 case 'p':
14186 break;
14187
14188 case 's':
14189 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14190 {
14191 ix86_print_operand (file, x, 0);
14192 fputs (", ", file);
14193 }
14194 return;
14195
14196 case 'Y':
14197 switch (GET_CODE (x))
14198 {
14199 case NE:
14200 fputs ("neq", file);
14201 break;
14202 case EQ:
14203 fputs ("eq", file);
14204 break;
14205 case GE:
14206 case GEU:
14207 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14208 break;
14209 case GT:
14210 case GTU:
14211 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14212 break;
14213 case LE:
14214 case LEU:
14215 fputs ("le", file);
14216 break;
14217 case LT:
14218 case LTU:
14219 fputs ("lt", file);
14220 break;
14221 case UNORDERED:
14222 fputs ("unord", file);
14223 break;
14224 case ORDERED:
14225 fputs ("ord", file);
14226 break;
14227 case UNEQ:
14228 fputs ("ueq", file);
14229 break;
14230 case UNGE:
14231 fputs ("nlt", file);
14232 break;
14233 case UNGT:
14234 fputs ("nle", file);
14235 break;
14236 case UNLE:
14237 fputs ("ule", file);
14238 break;
14239 case UNLT:
14240 fputs ("ult", file);
14241 break;
14242 case LTGT:
14243 fputs ("une", file);
14244 break;
14245 default:
14246 output_operand_lossage ("operand is not a condition code, "
14247 "invalid operand code 'Y'");
14248 return;
14249 }
14250 return;
14251
14252 case 'D':
14253 /* A little bit of braindamage here. The SSE compare instructions
14254 use completely different names for the comparisons than the
14255 fp conditional moves do. */
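/* E.g. an LE comparison prints "le", so the SSE compare patterns end
   up emitting pseudo-mnemonics such as "cmpless"/"cmplesd"
   (illustrative). */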
14256 switch (GET_CODE (x))
14257 {
14258 case UNEQ:
14259 if (TARGET_AVX)
14260 {
14261 fputs ("eq_us", file);
14262 break;
14263 }
14264 case EQ:
14265 fputs ("eq", file);
14266 break;
14267 case UNLT:
14268 if (TARGET_AVX)
14269 {
14270 fputs ("nge", file);
14271 break;
14272 }
14273 case LT:
14274 fputs ("lt", file);
14275 break;
14276 case UNLE:
14277 if (TARGET_AVX)
14278 {
14279 fputs ("ngt", file);
14280 break;
14281 }
14282 case LE:
14283 fputs ("le", file);
14284 break;
14285 case UNORDERED:
14286 fputs ("unord", file);
14287 break;
14288 case LTGT:
14289 if (TARGET_AVX)
14290 {
14291 fputs ("neq_oq", file);
14292 break;
14293 }
14294 case NE:
14295 fputs ("neq", file);
14296 break;
14297 case GE:
14298 if (TARGET_AVX)
14299 {
14300 fputs ("ge", file);
14301 break;
14302 }
14303 case UNGE:
14304 fputs ("nlt", file);
14305 break;
14306 case GT:
14307 if (TARGET_AVX)
14308 {
14309 fputs ("gt", file);
14310 break;
14311 }
14312 case UNGT:
14313 fputs ("nle", file);
14314 break;
14315 case ORDERED:
14316 fputs ("ord", file);
14317 break;
14318 default:
14319 output_operand_lossage ("operand is not a condition code, "
14320 "invalid operand code 'D'");
14321 return;
14322 }
14323 return;
14324
14325 case 'F':
14326 case 'f':
14327 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14328 if (ASSEMBLER_DIALECT == ASM_ATT)
14329 putc ('.', file);
14330 #endif
14331
14332 case 'C':
14333 case 'c':
14334 if (!COMPARISON_P (x))
14335 {
14336 output_operand_lossage ("operand is not a condition code, "
14337 "invalid operand code '%c'", code);
14338 return;
14339 }
14340 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14341 code == 'c' || code == 'f',
14342 code == 'F' || code == 'f',
14343 file);
14344 return;
14345
14346 case 'H':
14347 if (!offsettable_memref_p (x))
14348 {
14349 output_operand_lossage ("operand is not an offsettable memory "
14350 "reference, invalid operand code 'H'");
14351 return;
14352 }
14353 /* It doesn't actually matter what mode we use here, as we're
14354 only going to use this for printing. */
14355 x = adjust_address_nv (x, DImode, 8);
14356 break;
14357
14358 case 'K':
14359 gcc_assert (CONST_INT_P (x));
14360
14361 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14362 #ifdef HAVE_AS_IX86_HLE
14363 fputs ("xacquire ", file);
14364 #else
14365 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14366 #endif
14367 else if (INTVAL (x) & IX86_HLE_RELEASE)
14368 #ifdef HAVE_AS_IX86_HLE
14369 fputs ("xrelease ", file);
14370 #else
14371 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14372 #endif
14373 /* We do not want to print the value of the operand. */
14374 return;
14375
14376 case '*':
14377 if (ASSEMBLER_DIALECT == ASM_ATT)
14378 putc ('*', file);
14379 return;
14380
14381 case '&':
14382 {
14383 const char *name = get_some_local_dynamic_name ();
14384 if (name == NULL)
14385 output_operand_lossage ("'%%&' used without any "
14386 "local dynamic TLS references");
14387 else
14388 assemble_name (file, name);
14389 return;
14390 }
14391
14392 case '+':
14393 {
14394 rtx x;
14395
14396 if (!optimize
14397 || optimize_function_for_size_p (cfun)
14398 || !TARGET_BRANCH_PREDICTION_HINTS)
14399 return;
14400
14401 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14402 if (x)
14403 {
14404 int pred_val = INTVAL (XEXP (x, 0));
14405
14406 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14407 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14408 {
14409 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14410 bool cputaken
14411 = final_forward_branch_p (current_output_insn) == 0;
14412
14413 /* Emit hints only in the case where the default branch prediction
14414 heuristics would fail. */
14415 if (taken != cputaken)
14416 {
14417 /* We use 3e (DS) prefix for taken branches and
14418 2e (CS) prefix for not taken branches. */
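/* Illustrative: for a forward branch that the profile says is taken,
   the static heuristic (forward not taken, backward taken) would be
   wrong, so "ds ; " is printed in front of the jump mnemonic. */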
14419 if (taken)
14420 fputs ("ds ; ", file);
14421 else
14422 fputs ("cs ; ", file);
14423 }
14424 }
14425 }
14426 return;
14427 }
14428
14429 case ';':
14430 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14431 putc (';', file);
14432 #endif
14433 return;
14434
14435 case '@':
14436 if (ASSEMBLER_DIALECT == ASM_ATT)
14437 putc ('%', file);
14438
14439 /* The kernel uses a different segment register for performance
14440 reasons; a system call would not have to trash the userspace
14441 segment register, which would be expensive. */
14442 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14443 fputs ("fs", file);
14444 else
14445 fputs ("gs", file);
14446 return;
14447
14448 case '~':
14449 putc (TARGET_AVX2 ? 'i' : 'f', file);
14450 return;
14451
14452 case '^':
14453 if (TARGET_64BIT && Pmode != word_mode)
14454 fputs ("addr32 ", file);
14455 return;
14456
14457 default:
14458 output_operand_lossage ("invalid operand code '%c'", code);
14459 }
14460 }
14461
14462 if (REG_P (x))
14463 print_reg (x, code, file);
14464
14465 else if (MEM_P (x))
14466 {
14467 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14468 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14469 && GET_MODE (x) != BLKmode)
14470 {
14471 const char * size;
14472 switch (GET_MODE_SIZE (GET_MODE (x)))
14473 {
14474 case 1: size = "BYTE"; break;
14475 case 2: size = "WORD"; break;
14476 case 4: size = "DWORD"; break;
14477 case 8: size = "QWORD"; break;
14478 case 12: size = "TBYTE"; break;
14479 case 16:
14480 if (GET_MODE (x) == XFmode)
14481 size = "TBYTE";
14482 else
14483 size = "XMMWORD";
14484 break;
14485 case 32: size = "YMMWORD"; break;
14486 default:
14487 gcc_unreachable ();
14488 }
14489
14490 /* Check for explicit size override (codes 'b', 'w', 'k',
14491 'q' and 'x') */
14492 if (code == 'b')
14493 size = "BYTE";
14494 else if (code == 'w')
14495 size = "WORD";
14496 else if (code == 'k')
14497 size = "DWORD";
14498 else if (code == 'q')
14499 size = "QWORD";
14500 else if (code == 'x')
14501 size = "XMMWORD";
14502
14503 fputs (size, file);
14504 fputs (" PTR ", file);
14505 }
14506
14507 x = XEXP (x, 0);
14508 /* Avoid (%rip) for call operands. */
14509 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14510 && !CONST_INT_P (x))
14511 output_addr_const (file, x);
14512 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14513 output_operand_lossage ("invalid constraints for operand");
14514 else
14515 output_address (x);
14516 }
14517
14518 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14519 {
14520 REAL_VALUE_TYPE r;
14521 long l;
14522
14523 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14524 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14525
14526 if (ASSEMBLER_DIALECT == ASM_ATT)
14527 putc ('$', file);
14528 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14529 if (code == 'q')
14530 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14531 else
14532 fprintf (file, "0x%08x", (unsigned int) l);
14533 }
14534
14535 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14536 {
14537 REAL_VALUE_TYPE r;
14538 long l[2];
14539
14540 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14541 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14542
14543 if (ASSEMBLER_DIALECT == ASM_ATT)
14544 putc ('$', file);
14545 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14546 }
14547
14548 /* These float cases don't actually occur as immediate operands. */
14549 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14550 {
14551 char dstr[30];
14552
14553 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14554 fputs (dstr, file);
14555 }
14556
14557 else
14558 {
14559 /* We have patterns that allow zero sets of memory, for instance.
14560 In 64-bit mode, we should probably support all 8-byte vectors,
14561 since we can in fact encode that into an immediate. */
14562 if (GET_CODE (x) == CONST_VECTOR)
14563 {
14564 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14565 x = const0_rtx;
14566 }
14567
14568 if (code != 'P' && code != 'p')
14569 {
14570 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14571 {
14572 if (ASSEMBLER_DIALECT == ASM_ATT)
14573 putc ('$', file);
14574 }
14575 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14576 || GET_CODE (x) == LABEL_REF)
14577 {
14578 if (ASSEMBLER_DIALECT == ASM_ATT)
14579 putc ('$', file);
14580 else
14581 fputs ("OFFSET FLAT:", file);
14582 }
14583 }
14584 if (CONST_INT_P (x))
14585 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14586 else if (flag_pic || MACHOPIC_INDIRECT)
14587 output_pic_addr_const (file, x, code);
14588 else
14589 output_addr_const (file, x);
14590 }
14591 }
14592
14593 static bool
14594 ix86_print_operand_punct_valid_p (unsigned char code)
14595 {
14596 return (code == '@' || code == '*' || code == '+' || code == '&'
14597 || code == ';' || code == '~' || code == '^');
14598 }
14599 \f
14600 /* Print a memory operand whose address is ADDR. */
14601
14602 static void
14603 ix86_print_operand_address (FILE *file, rtx addr)
14604 {
14605 struct ix86_address parts;
14606 rtx base, index, disp;
14607 int scale;
14608 int ok;
14609 bool vsib = false;
14610 int code = 0;
14611
14612 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14613 {
14614 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14615 gcc_assert (parts.index == NULL_RTX);
14616 parts.index = XVECEXP (addr, 0, 1);
14617 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14618 addr = XVECEXP (addr, 0, 0);
14619 vsib = true;
14620 }
14621 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14622 {
14623 gcc_assert (TARGET_64BIT);
14624 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14625 code = 'q';
14626 }
14627 else
14628 ok = ix86_decompose_address (addr, &parts);
14629
14630 gcc_assert (ok);
14631
14632 if (parts.base && GET_CODE (parts.base) == SUBREG)
14633 {
14634 rtx tmp = SUBREG_REG (parts.base);
14635 parts.base = simplify_subreg (GET_MODE (parts.base),
14636 tmp, GET_MODE (tmp), 0);
14637 }
14638
14639 if (parts.index && GET_CODE (parts.index) == SUBREG)
14640 {
14641 rtx tmp = SUBREG_REG (parts.index);
14642 parts.index = simplify_subreg (GET_MODE (parts.index),
14643 tmp, GET_MODE (tmp), 0);
14644 }
14645
14646 base = parts.base;
14647 index = parts.index;
14648 disp = parts.disp;
14649 scale = parts.scale;
14650
14651 switch (parts.seg)
14652 {
14653 case SEG_DEFAULT:
14654 break;
14655 case SEG_FS:
14656 case SEG_GS:
14657 if (ASSEMBLER_DIALECT == ASM_ATT)
14658 putc ('%', file);
14659 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14660 break;
14661 default:
14662 gcc_unreachable ();
14663 }
14664
14665 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14666 if (TARGET_64BIT && !base && !index)
14667 {
14668 rtx symbol = disp;
14669
14670 if (GET_CODE (disp) == CONST
14671 && GET_CODE (XEXP (disp, 0)) == PLUS
14672 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14673 symbol = XEXP (XEXP (disp, 0), 0);
14674
14675 if (GET_CODE (symbol) == LABEL_REF
14676 || (GET_CODE (symbol) == SYMBOL_REF
14677 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14678 base = pc_rtx;
14679 }
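/* With base set to pc_rtx the address is later printed RIP-relative,
   e.g. "foo(%rip)" in AT&T syntax (illustrative). */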
14680 if (!base && !index)
14681 {
14682 /* Displacement only requires special attention. */
14683
14684 if (CONST_INT_P (disp))
14685 {
14686 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14687 fputs ("ds:", file);
14688 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14689 }
14690 else if (flag_pic)
14691 output_pic_addr_const (file, disp, 0);
14692 else
14693 output_addr_const (file, disp);
14694 }
14695 else
14696 {
14697 /* Print SImode register names for zero-extended
14698 addresses to force addr32 prefix. */
14699 if (TARGET_64BIT
14700 && (GET_CODE (addr) == ZERO_EXTEND
14701 || GET_CODE (addr) == AND))
14702 {
14703 gcc_assert (!code);
14704 code = 'l';
14705 }
14706
14707 if (ASSEMBLER_DIALECT == ASM_ATT)
14708 {
14709 if (disp)
14710 {
14711 if (flag_pic)
14712 output_pic_addr_const (file, disp, 0);
14713 else if (GET_CODE (disp) == LABEL_REF)
14714 output_asm_label (disp);
14715 else
14716 output_addr_const (file, disp);
14717 }
14718
14719 putc ('(', file);
14720 if (base)
14721 print_reg (base, code, file);
14722 if (index)
14723 {
14724 putc (',', file);
14725 print_reg (index, vsib ? 0 : code, file);
14726 if (scale != 1 || vsib)
14727 fprintf (file, ",%d", scale);
14728 }
14729 putc (')', file);
14730 }
14731 else
14732 {
14733 rtx offset = NULL_RTX;
14734
14735 if (disp)
14736 {
14737 /* Pull out the offset of a symbol; print any symbol itself. */
14738 if (GET_CODE (disp) == CONST
14739 && GET_CODE (XEXP (disp, 0)) == PLUS
14740 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14741 {
14742 offset = XEXP (XEXP (disp, 0), 1);
14743 disp = gen_rtx_CONST (VOIDmode,
14744 XEXP (XEXP (disp, 0), 0));
14745 }
14746
14747 if (flag_pic)
14748 output_pic_addr_const (file, disp, 0);
14749 else if (GET_CODE (disp) == LABEL_REF)
14750 output_asm_label (disp);
14751 else if (CONST_INT_P (disp))
14752 offset = disp;
14753 else
14754 output_addr_const (file, disp);
14755 }
14756
14757 putc ('[', file);
14758 if (base)
14759 {
14760 print_reg (base, code, file);
14761 if (offset)
14762 {
14763 if (INTVAL (offset) >= 0)
14764 putc ('+', file);
14765 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14766 }
14767 }
14768 else if (offset)
14769 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14770 else
14771 putc ('0', file);
14772
14773 if (index)
14774 {
14775 putc ('+', file);
14776 print_reg (index, vsib ? 0 : code, file);
14777 if (scale != 1 || vsib)
14778 fprintf (file, "*%d", scale);
14779 }
14780 putc (']', file);
14781 }
14782 }
14783 }
14784
14785 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14786
14787 static bool
14788 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14789 {
14790 rtx op;
14791
14792 if (GET_CODE (x) != UNSPEC)
14793 return false;
14794
14795 op = XVECEXP (x, 0, 0);
14796 switch (XINT (x, 1))
14797 {
14798 case UNSPEC_GOTTPOFF:
14799 output_addr_const (file, op);
14800 /* FIXME: This might be @TPOFF in Sun ld. */
14801 fputs ("@gottpoff", file);
14802 break;
14803 case UNSPEC_TPOFF:
14804 output_addr_const (file, op);
14805 fputs ("@tpoff", file);
14806 break;
14807 case UNSPEC_NTPOFF:
14808 output_addr_const (file, op);
14809 if (TARGET_64BIT)
14810 fputs ("@tpoff", file);
14811 else
14812 fputs ("@ntpoff", file);
14813 break;
14814 case UNSPEC_DTPOFF:
14815 output_addr_const (file, op);
14816 fputs ("@dtpoff", file);
14817 break;
14818 case UNSPEC_GOTNTPOFF:
14819 output_addr_const (file, op);
14820 if (TARGET_64BIT)
14821 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14822 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14823 else
14824 fputs ("@gotntpoff", file);
14825 break;
14826 case UNSPEC_INDNTPOFF:
14827 output_addr_const (file, op);
14828 fputs ("@indntpoff", file);
14829 break;
14830 #if TARGET_MACHO
14831 case UNSPEC_MACHOPIC_OFFSET:
14832 output_addr_const (file, op);
14833 putc ('-', file);
14834 machopic_output_function_base_name (file);
14835 break;
14836 #endif
14837
14838 case UNSPEC_STACK_CHECK:
14839 {
14840 int offset;
14841
14842 gcc_assert (flag_split_stack);
14843
14844 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14845 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14846 #else
14847 gcc_unreachable ();
14848 #endif
14849
14850 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14851 }
14852 break;
14853
14854 default:
14855 return false;
14856 }
14857
14858 return true;
14859 }
14860 \f
14861 /* Split one or more double-mode RTL references into pairs of half-mode
14862 references. The RTL can be REG, offsettable MEM, integer constant, or
14863 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14864 split and "num" is its length. lo_half and hi_half are output arrays
14865 that parallel "operands". */
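/* For example (illustrative): splitting a DImode register or memory
   operand on ia32 produces two SImode halves, the low part at byte
   offset 0 and the high part at byte offset 4; TImode splits into two
   DImode halves at offsets 0 and 8. */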
14866
14867 void
14868 split_double_mode (enum machine_mode mode, rtx operands[],
14869 int num, rtx lo_half[], rtx hi_half[])
14870 {
14871 enum machine_mode half_mode;
14872 unsigned int byte;
14873
14874 switch (mode)
14875 {
14876 case TImode:
14877 half_mode = DImode;
14878 break;
14879 case DImode:
14880 half_mode = SImode;
14881 break;
14882 default:
14883 gcc_unreachable ();
14884 }
14885
14886 byte = GET_MODE_SIZE (half_mode);
14887
14888 while (num--)
14889 {
14890 rtx op = operands[num];
14891
14892 /* simplify_subreg refuses to split volatile memory addresses,
14893 but we still have to handle them. */
14894 if (MEM_P (op))
14895 {
14896 lo_half[num] = adjust_address (op, half_mode, 0);
14897 hi_half[num] = adjust_address (op, half_mode, byte);
14898 }
14899 else
14900 {
14901 lo_half[num] = simplify_gen_subreg (half_mode, op,
14902 GET_MODE (op) == VOIDmode
14903 ? mode : GET_MODE (op), 0);
14904 hi_half[num] = simplify_gen_subreg (half_mode, op,
14905 GET_MODE (op) == VOIDmode
14906 ? mode : GET_MODE (op), byte);
14907 }
14908 }
14909 }
14910 \f
14911 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
14912 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
14913 is the expression of the binary operation. The output may either be
14914 emitted here, or returned to the caller, like all output_* functions.
14915
14916 There is no guarantee that the operands are the same mode, as they
14917 might be within FLOAT or FLOAT_EXTEND expressions. */
14918
14919 #ifndef SYSV386_COMPAT
14920 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
14921 wants to fix the assemblers because that causes incompatibility
14922 with gcc. No-one wants to fix gcc because that causes
14923 incompatibility with assemblers... You can use the option of
14924 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
14925 #define SYSV386_COMPAT 1
14926 #endif
14927
14928 const char *
14929 output_387_binary_op (rtx insn, rtx *operands)
14930 {
14931 static char buf[40];
14932 const char *p;
14933 const char *ssep;
14934 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
14935
14936 #ifdef ENABLE_CHECKING
14937 /* Even if we do not want to check the inputs, this documents the input
14938 constraints, which helps in understanding the following code. */
14939 if (STACK_REG_P (operands[0])
14940 && ((REG_P (operands[1])
14941 && REGNO (operands[0]) == REGNO (operands[1])
14942 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
14943 || (REG_P (operands[2])
14944 && REGNO (operands[0]) == REGNO (operands[2])
14945 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
14946 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
14947 ; /* ok */
14948 else
14949 gcc_assert (is_sse);
14950 #endif
14951
14952 switch (GET_CODE (operands[3]))
14953 {
14954 case PLUS:
14955 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14956 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14957 p = "fiadd";
14958 else
14959 p = "fadd";
14960 ssep = "vadd";
14961 break;
14962
14963 case MINUS:
14964 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14965 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14966 p = "fisub";
14967 else
14968 p = "fsub";
14969 ssep = "vsub";
14970 break;
14971
14972 case MULT:
14973 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14974 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14975 p = "fimul";
14976 else
14977 p = "fmul";
14978 ssep = "vmul";
14979 break;
14980
14981 case DIV:
14982 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14983 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14984 p = "fidiv";
14985 else
14986 p = "fdiv";
14987 ssep = "vdiv";
14988 break;
14989
14990 default:
14991 gcc_unreachable ();
14992 }
14993
14994 if (is_sse)
14995 {
14996 if (TARGET_AVX)
14997 {
14998 strcpy (buf, ssep);
14999 if (GET_MODE (operands[0]) == SFmode)
15000 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15001 else
15002 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15003 }
15004 else
15005 {
15006 strcpy (buf, ssep + 1);
15007 if (GET_MODE (operands[0]) == SFmode)
15008 strcat (buf, "ss\t{%2, %0|%0, %2}");
15009 else
15010 strcat (buf, "sd\t{%2, %0|%0, %2}");
15011 }
15012 return buf;
15013 }
15014 strcpy (buf, p);
15015
15016 switch (GET_CODE (operands[3]))
15017 {
15018 case MULT:
15019 case PLUS:
15020 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15021 {
15022 rtx temp = operands[2];
15023 operands[2] = operands[1];
15024 operands[1] = temp;
15025 }
15026
15027 /* We know operands[0] == operands[1]. */
15028
15029 if (MEM_P (operands[2]))
15030 {
15031 p = "%Z2\t%2";
15032 break;
15033 }
15034
15035 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15036 {
15037 if (STACK_TOP_P (operands[0]))
15038 /* How is it that we are storing to a dead operand[2]?
15039 Well, presumably operands[1] is dead too. We can't
15040 store the result to st(0) as st(0) gets popped on this
15041 instruction. Instead store to operands[2] (which I
15042 think has to be st(1)). st(1) will be popped later.
15043 gcc <= 2.8.1 didn't have this check and generated
15044 assembly code that the Unixware assembler rejected. */
15045 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15046 else
15047 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15048 break;
15049 }
15050
15051 if (STACK_TOP_P (operands[0]))
15052 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15053 else
15054 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15055 break;
15056
15057 case MINUS:
15058 case DIV:
15059 if (MEM_P (operands[1]))
15060 {
15061 p = "r%Z1\t%1";
15062 break;
15063 }
15064
15065 if (MEM_P (operands[2]))
15066 {
15067 p = "%Z2\t%2";
15068 break;
15069 }
15070
15071 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15072 {
15073 #if SYSV386_COMPAT
15074 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15075 derived assemblers, confusingly reverse the direction of
15076 the operation for fsub{r} and fdiv{r} when the
15077 destination register is not st(0). The Intel assembler
15078 doesn't have this brain damage. Read !SYSV386_COMPAT to
15079 figure out what the hardware really does. */
15080 if (STACK_TOP_P (operands[0]))
15081 p = "{p\t%0, %2|rp\t%2, %0}";
15082 else
15083 p = "{rp\t%2, %0|p\t%0, %2}";
15084 #else
15085 if (STACK_TOP_P (operands[0]))
15086 /* As above for fmul/fadd, we can't store to st(0). */
15087 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15088 else
15089 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15090 #endif
15091 break;
15092 }
15093
15094 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15095 {
15096 #if SYSV386_COMPAT
15097 if (STACK_TOP_P (operands[0]))
15098 p = "{rp\t%0, %1|p\t%1, %0}";
15099 else
15100 p = "{p\t%1, %0|rp\t%0, %1}";
15101 #else
15102 if (STACK_TOP_P (operands[0]))
15103 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15104 else
15105 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15106 #endif
15107 break;
15108 }
15109
15110 if (STACK_TOP_P (operands[0]))
15111 {
15112 if (STACK_TOP_P (operands[1]))
15113 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15114 else
15115 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15116 break;
15117 }
15118 else if (STACK_TOP_P (operands[1]))
15119 {
15120 #if SYSV386_COMPAT
15121 p = "{\t%1, %0|r\t%0, %1}";
15122 #else
15123 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15124 #endif
15125 }
15126 else
15127 {
15128 #if SYSV386_COMPAT
15129 p = "{r\t%2, %0|\t%0, %2}";
15130 #else
15131 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15132 #endif
15133 }
15134 break;
15135
15136 default:
15137 gcc_unreachable ();
15138 }
15139
15140 strcat (buf, p);
15141 return buf;
15142 }
15143
15144 /* Return the mode needed for ENTITY in the optimize_mode_switching pass. */
15145
15146 int
15147 ix86_mode_needed (int entity, rtx insn)
15148 {
15149 enum attr_i387_cw mode;
15150
15151 /* The mode UNINITIALIZED is used to store the control word after a
15152 function call or ASM pattern. The mode ANY specifies that the function
15153 has no requirements on the control word and makes no changes in the
15154 bits we are interested in. */
15155
15156 if (CALL_P (insn)
15157 || (NONJUMP_INSN_P (insn)
15158 && (asm_noperands (PATTERN (insn)) >= 0
15159 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15160 return I387_CW_UNINITIALIZED;
15161
15162 if (recog_memoized (insn) < 0)
15163 return I387_CW_ANY;
15164
15165 mode = get_attr_i387_cw (insn);
15166
15167 switch (entity)
15168 {
15169 case I387_TRUNC:
15170 if (mode == I387_CW_TRUNC)
15171 return mode;
15172 break;
15173
15174 case I387_FLOOR:
15175 if (mode == I387_CW_FLOOR)
15176 return mode;
15177 break;
15178
15179 case I387_CEIL:
15180 if (mode == I387_CW_CEIL)
15181 return mode;
15182 break;
15183
15184 case I387_MASK_PM:
15185 if (mode == I387_CW_MASK_PM)
15186 return mode;
15187 break;
15188
15189 default:
15190 gcc_unreachable ();
15191 }
15192
15193 return I387_CW_ANY;
15194 }
15195
15196 /* Output code to initialize control word copies used by trunc?f?i and
15197 rounding patterns. CURRENT_MODE is set to the current control word,
15198 while NEW_MODE is set to the new control word. */
15199
15200 void
15201 emit_i387_cw_initialization (int mode)
15202 {
15203 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15204 rtx new_mode;
15205
15206 enum ix86_stack_slot slot;
15207
15208 rtx reg = gen_reg_rtx (HImode);
15209
15210 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15211 emit_move_insn (reg, copy_rtx (stored_mode));
15212
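/* Bits 10-11 of the 387 control word hold the rounding control field:
   00 = round to nearest, 01 = round down, 10 = round up, 11 = round
   toward zero.  Bit 5 (0x0020) masks the precision exception.  The
   constants used below set these bits, either directly or via a byte
   insert into the high byte of the word. */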
15213 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15214 || optimize_function_for_size_p (cfun))
15215 {
15216 switch (mode)
15217 {
15218 case I387_CW_TRUNC:
15219 /* round toward zero (truncate) */
15220 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15221 slot = SLOT_CW_TRUNC;
15222 break;
15223
15224 case I387_CW_FLOOR:
15225 /* round down toward -oo */
15226 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15227 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15228 slot = SLOT_CW_FLOOR;
15229 break;
15230
15231 case I387_CW_CEIL:
15232 /* round up toward +oo */
15233 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15234 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15235 slot = SLOT_CW_CEIL;
15236 break;
15237
15238 case I387_CW_MASK_PM:
15239 /* mask precision exception for nearbyint() */
15240 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15241 slot = SLOT_CW_MASK_PM;
15242 break;
15243
15244 default:
15245 gcc_unreachable ();
15246 }
15247 }
15248 else
15249 {
15250 switch (mode)
15251 {
15252 case I387_CW_TRUNC:
15253 /* round toward zero (truncate) */
15254 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15255 slot = SLOT_CW_TRUNC;
15256 break;
15257
15258 case I387_CW_FLOOR:
15259 /* round down toward -oo */
15260 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15261 slot = SLOT_CW_FLOOR;
15262 break;
15263
15264 case I387_CW_CEIL:
15265 /* round up toward +oo */
15266 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15267 slot = SLOT_CW_CEIL;
15268 break;
15269
15270 case I387_CW_MASK_PM:
15271 /* mask precision exception for nearbyint() */
15272 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15273 slot = SLOT_CW_MASK_PM;
15274 break;
15275
15276 default:
15277 gcc_unreachable ();
15278 }
15279 }
15280
15281 gcc_assert (slot < MAX_386_STACK_LOCALS);
15282
15283 new_mode = assign_386_stack_local (HImode, slot);
15284 emit_move_insn (new_mode, reg);
15285 }
15286
15287 /* Output code for INSN to convert a float to a signed int. OPERANDS
15288 are the insn operands. The output may be [HSD]Imode and the input
15289 operand may be [SDX]Fmode. */
15290
15291 const char *
15292 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15293 {
15294 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15295 int dimode_p = GET_MODE (operands[0]) == DImode;
15296 int round_mode = get_attr_i387_cw (insn);
15297
15298 /* Jump through a hoop or two for DImode, since the hardware has no
15299 non-popping instruction. We used to do this a different way, but
15300 that was somewhat fragile and broke with post-reload splitters. */
15301 if ((dimode_p || fisttp) && !stack_top_dies)
15302 output_asm_insn ("fld\t%y1", operands);
15303
15304 gcc_assert (STACK_TOP_P (operands[1]));
15305 gcc_assert (MEM_P (operands[0]));
15306 gcc_assert (GET_MODE (operands[1]) != TFmode);
15307
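/* fisttp (SSE3) always converts with truncation regardless of the
   control word; otherwise the rounding control word is temporarily
   loaded from operands[3] and the original restored from operands[2]. */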
15308 if (fisttp)
15309 output_asm_insn ("fisttp%Z0\t%0", operands);
15310 else
15311 {
15312 if (round_mode != I387_CW_ANY)
15313 output_asm_insn ("fldcw\t%3", operands);
15314 if (stack_top_dies || dimode_p)
15315 output_asm_insn ("fistp%Z0\t%0", operands);
15316 else
15317 output_asm_insn ("fist%Z0\t%0", operands);
15318 if (round_mode != I387_CW_ANY)
15319 output_asm_insn ("fldcw\t%2", operands);
15320 }
15321
15322 return "";
15323 }
15324
15325 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15326 have the values zero or one, indicates the ffreep insn's operand
15327 from the OPERANDS array. */
15328
15329 static const char *
15330 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15331 {
15332 if (TARGET_USE_FFREEP)
15333 #ifdef HAVE_AS_IX86_FFREEP
15334 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15335 #else
15336 {
15337 static char retval[32];
15338 int regno = REGNO (operands[opno]);
15339
15340 gcc_assert (FP_REGNO_P (regno));
15341
15342 regno -= FIRST_STACK_REG;
15343
15344 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15345 return retval;
15346 }
15347 #endif
15348
15349 return opno ? "fstp\t%y1" : "fstp\t%y0";
15350 }
15351
15352
15353 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15354 should be used. UNORDERED_P is true when fucom should be used. */
15355
15356 const char *
15357 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15358 {
15359 int stack_top_dies;
15360 rtx cmp_op0, cmp_op1;
15361 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15362
15363 if (eflags_p)
15364 {
15365 cmp_op0 = operands[0];
15366 cmp_op1 = operands[1];
15367 }
15368 else
15369 {
15370 cmp_op0 = operands[1];
15371 cmp_op1 = operands[2];
15372 }
15373
15374 if (is_sse)
15375 {
15376 if (GET_MODE (operands[0]) == SFmode)
15377 if (unordered_p)
15378 return "%vucomiss\t{%1, %0|%0, %1}";
15379 else
15380 return "%vcomiss\t{%1, %0|%0, %1}";
15381 else
15382 if (unordered_p)
15383 return "%vucomisd\t{%1, %0|%0, %1}";
15384 else
15385 return "%vcomisd\t{%1, %0|%0, %1}";
15386 }
15387
15388 gcc_assert (STACK_TOP_P (cmp_op0));
15389
15390 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15391
15392 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15393 {
15394 if (stack_top_dies)
15395 {
15396 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15397 return output_387_ffreep (operands, 1);
15398 }
15399 else
15400 return "ftst\n\tfnstsw\t%0";
15401 }
15402
15403 if (STACK_REG_P (cmp_op1)
15404 && stack_top_dies
15405 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15406 && REGNO (cmp_op1) != FIRST_STACK_REG)
15407 {
15408 /* If the top of the 387 stack dies, and the other operand
15409 is also a stack register that dies, then this must be an
15410 `fcompp' float compare. */
15411
15412 if (eflags_p)
15413 {
15414 /* There is no double popping fcomi variant. Fortunately,
15415 eflags is immune from the fstp's cc clobbering. */
15416 if (unordered_p)
15417 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15418 else
15419 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15420 return output_387_ffreep (operands, 0);
15421 }
15422 else
15423 {
15424 if (unordered_p)
15425 return "fucompp\n\tfnstsw\t%0";
15426 else
15427 return "fcompp\n\tfnstsw\t%0";
15428 }
15429 }
15430 else
15431 {
15432 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15433
15434 static const char * const alt[16] =
15435 {
15436 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15437 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15438 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15439 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15440
15441 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15442 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15443 NULL,
15444 NULL,
15445
15446 "fcomi\t{%y1, %0|%0, %y1}",
15447 "fcomip\t{%y1, %0|%0, %y1}",
15448 "fucomi\t{%y1, %0|%0, %y1}",
15449 "fucomip\t{%y1, %0|%0, %y1}",
15450
15451 NULL,
15452 NULL,
15453 NULL,
15454 NULL
15455 };
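/* For example, mask 0b1011 (eflags_p set, FP operand, unordered,
   stack top dies) selects "fucomip" above (illustrative). */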
15456
15457 int mask;
15458 const char *ret;
15459
15460 mask = eflags_p << 3;
15461 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15462 mask |= unordered_p << 1;
15463 mask |= stack_top_dies;
15464
15465 gcc_assert (mask < 16);
15466 ret = alt[mask];
15467 gcc_assert (ret);
15468
15469 return ret;
15470 }
15471 }
15472
15473 void
15474 ix86_output_addr_vec_elt (FILE *file, int value)
15475 {
15476 const char *directive = ASM_LONG;
15477
15478 #ifdef ASM_QUAD
15479 if (TARGET_LP64)
15480 directive = ASM_QUAD;
15481 #else
15482 gcc_assert (!TARGET_64BIT);
15483 #endif
15484
15485 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15486 }
15487
15488 void
15489 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15490 {
15491 const char *directive = ASM_LONG;
15492
15493 #ifdef ASM_QUAD
15494 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15495 directive = ASM_QUAD;
15496 #else
15497 gcc_assert (!TARGET_64BIT);
15498 #endif
15499 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15500 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15501 fprintf (file, "%s%s%d-%s%d\n",
15502 directive, LPREFIX, value, LPREFIX, rel);
15503 else if (HAVE_AS_GOTOFF_IN_DATA)
15504 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15505 #if TARGET_MACHO
15506 else if (TARGET_MACHO)
15507 {
15508 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15509 machopic_output_function_base_name (file);
15510 putc ('\n', file);
15511 }
15512 #endif
15513 else
15514 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15515 GOT_SYMBOL_NAME, LPREFIX, value);
15516 }
15517 \f
15518 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15519 for the target. */
15520
15521 void
15522 ix86_expand_clear (rtx dest)
15523 {
15524 rtx tmp;
15525
15526 /* We play register width games, which are only valid after reload. */
15527 gcc_assert (reload_completed);
15528
15529 /* Avoid HImode and its attendant prefix byte. */
15530 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15531 dest = gen_rtx_REG (SImode, REGNO (dest));
15532 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15533
15534 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15535 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15536 {
15537 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15538 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15539 }
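/* With the flags clobber this matches the xor patterns and typically
   assembles to e.g. "xorl %eax, %eax"; without it, "movl $0, %eax" is
   used, which is longer but preserves the flags (illustrative). */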
15540
15541 emit_insn (tmp);
15542 }
15543
15544 /* X is an unchanging MEM. If it is a constant pool reference, return
15545 the constant pool rtx, else NULL. */
15546
15547 rtx
15548 maybe_get_pool_constant (rtx x)
15549 {
15550 x = ix86_delegitimize_address (XEXP (x, 0));
15551
15552 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15553 return get_pool_constant (x);
15554
15555 return NULL_RTX;
15556 }
15557
15558 void
15559 ix86_expand_move (enum machine_mode mode, rtx operands[])
15560 {
15561 rtx op0, op1;
15562 enum tls_model model;
15563
15564 op0 = operands[0];
15565 op1 = operands[1];
15566
15567 if (GET_CODE (op1) == SYMBOL_REF)
15568 {
15569 model = SYMBOL_REF_TLS_MODEL (op1);
15570 if (model)
15571 {
15572 op1 = legitimize_tls_address (op1, model, true);
15573 op1 = force_operand (op1, op0);
15574 if (op1 == op0)
15575 return;
15576 if (GET_MODE (op1) != mode)
15577 op1 = convert_to_mode (mode, op1, 1);
15578 }
15579 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15580 && SYMBOL_REF_DLLIMPORT_P (op1))
15581 op1 = legitimize_dllimport_symbol (op1, false);
15582 }
15583 else if (GET_CODE (op1) == CONST
15584 && GET_CODE (XEXP (op1, 0)) == PLUS
15585 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15586 {
15587 rtx addend = XEXP (XEXP (op1, 0), 1);
15588 rtx symbol = XEXP (XEXP (op1, 0), 0);
15589 rtx tmp = NULL;
15590
15591 model = SYMBOL_REF_TLS_MODEL (symbol);
15592 if (model)
15593 tmp = legitimize_tls_address (symbol, model, true);
15594 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15595 && SYMBOL_REF_DLLIMPORT_P (symbol))
15596 tmp = legitimize_dllimport_symbol (symbol, true);
15597
15598 if (tmp)
15599 {
15600 tmp = force_operand (tmp, NULL);
15601 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15602 op0, 1, OPTAB_DIRECT);
15603 if (tmp == op0)
15604 return;
15605 if (GET_MODE (tmp) != mode)
15606 op1 = convert_to_mode (mode, tmp, 1);
15607 }
15608 }
15609
15610 if ((flag_pic || MACHOPIC_INDIRECT)
15611 && symbolic_operand (op1, mode))
15612 {
15613 if (TARGET_MACHO && !TARGET_64BIT)
15614 {
15615 #if TARGET_MACHO
15616 /* dynamic-no-pic */
15617 if (MACHOPIC_INDIRECT)
15618 {
15619 rtx temp = ((reload_in_progress
15620 || ((op0 && REG_P (op0))
15621 && mode == Pmode))
15622 ? op0 : gen_reg_rtx (Pmode));
15623 op1 = machopic_indirect_data_reference (op1, temp);
15624 if (MACHOPIC_PURE)
15625 op1 = machopic_legitimize_pic_address (op1, mode,
15626 temp == op1 ? 0 : temp);
15627 }
15628 if (op0 != op1 && GET_CODE (op0) != MEM)
15629 {
15630 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15631 emit_insn (insn);
15632 return;
15633 }
15634 if (GET_CODE (op0) == MEM)
15635 op1 = force_reg (Pmode, op1);
15636 else
15637 {
15638 rtx temp = op0;
15639 if (GET_CODE (temp) != REG)
15640 temp = gen_reg_rtx (Pmode);
15641 temp = legitimize_pic_address (op1, temp);
15642 if (temp == op0)
15643 return;
15644 op1 = temp;
15645 }
15646 /* dynamic-no-pic */
15647 #endif
15648 }
15649 else
15650 {
15651 if (MEM_P (op0))
15652 op1 = force_reg (mode, op1);
15653 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15654 {
15655 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15656 op1 = legitimize_pic_address (op1, reg);
15657 if (op0 == op1)
15658 return;
15659 if (GET_MODE (op1) != mode)
15660 op1 = convert_to_mode (mode, op1, 1);
15661 }
15662 }
15663 }
15664 else
15665 {
15666 if (MEM_P (op0)
15667 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15668 || !push_operand (op0, mode))
15669 && MEM_P (op1))
15670 op1 = force_reg (mode, op1);
15671
15672 if (push_operand (op0, mode)
15673 && ! general_no_elim_operand (op1, mode))
15674 op1 = copy_to_mode_reg (mode, op1);
15675
15676 /* Force large constants in 64bit compilation into register
15677 to get them CSEed. */
15678 if (can_create_pseudo_p ()
15679 && (mode == DImode) && TARGET_64BIT
15680 && immediate_operand (op1, mode)
15681 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15682 && !register_operand (op0, mode)
15683 && optimize)
15684 op1 = copy_to_mode_reg (mode, op1);
15685
15686 if (can_create_pseudo_p ()
15687 && FLOAT_MODE_P (mode)
15688 && GET_CODE (op1) == CONST_DOUBLE)
15689 {
15690 /* If we are loading a floating point constant to a register,
15691 force the value to memory now, since we'll get better code
15692 out the back end. */
15693
15694 op1 = validize_mem (force_const_mem (mode, op1));
15695 if (!register_operand (op0, mode))
15696 {
15697 rtx temp = gen_reg_rtx (mode);
15698 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15699 emit_move_insn (op0, temp);
15700 return;
15701 }
15702 }
15703 }
15704
15705 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15706 }
15707
15708 void
15709 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15710 {
15711 rtx op0 = operands[0], op1 = operands[1];
15712 unsigned int align = GET_MODE_ALIGNMENT (mode);
15713
15714 /* Force constants other than zero into memory. We do not know how
15715 the instructions used to build constants modify the upper 64 bits
15716 of the register; once we have that information we may be able
15717 to handle some of them more efficiently. */
15718 if (can_create_pseudo_p ()
15719 && register_operand (op0, mode)
15720 && (CONSTANT_P (op1)
15721 || (GET_CODE (op1) == SUBREG
15722 && CONSTANT_P (SUBREG_REG (op1))))
15723 && !standard_sse_constant_p (op1))
15724 op1 = validize_mem (force_const_mem (mode, op1));
15725
15726 /* We need to check memory alignment for SSE mode since attributes
15727 can make operands unaligned. */
15728 if (can_create_pseudo_p ()
15729 && SSE_REG_MODE_P (mode)
15730 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15731 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15732 {
15733 rtx tmp[2];
15734
15735 /* ix86_expand_vector_move_misalign() does not like constants ... */
15736 if (CONSTANT_P (op1)
15737 || (GET_CODE (op1) == SUBREG
15738 && CONSTANT_P (SUBREG_REG (op1))))
15739 op1 = validize_mem (force_const_mem (mode, op1));
15740
15741 /* ... nor both arguments in memory. */
15742 if (!register_operand (op0, mode)
15743 && !register_operand (op1, mode))
15744 op1 = force_reg (mode, op1);
15745
15746 tmp[0] = op0; tmp[1] = op1;
15747 ix86_expand_vector_move_misalign (mode, tmp);
15748 return;
15749 }
15750
15751 /* Make operand1 a register if it isn't already. */
15752 if (can_create_pseudo_p ()
15753 && !register_operand (op0, mode)
15754 && !register_operand (op1, mode))
15755 {
15756 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15757 return;
15758 }
15759
15760 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15761 }
15762
15763 /* Split 32-byte AVX unaligned load and store if needed. */
15764
15765 static void
15766 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15767 {
15768 rtx m;
15769 rtx (*extract) (rtx, rtx, rtx);
15770 rtx (*move_unaligned) (rtx, rtx);
15771 enum machine_mode mode;
15772
15773 switch (GET_MODE (op0))
15774 {
15775 default:
15776 gcc_unreachable ();
15777 case V32QImode:
15778 extract = gen_avx_vextractf128v32qi;
15779 move_unaligned = gen_avx_movdqu256;
15780 mode = V16QImode;
15781 break;
15782 case V8SFmode:
15783 extract = gen_avx_vextractf128v8sf;
15784 move_unaligned = gen_avx_movups256;
15785 mode = V4SFmode;
15786 break;
15787 case V4DFmode:
15788 extract = gen_avx_vextractf128v4df;
15789 move_unaligned = gen_avx_movupd256;
15790 mode = V2DFmode;
15791 break;
15792 }
15793
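/* Illustrative: a split unaligned 32-byte load becomes two 16-byte
   loads whose halves are recombined through a VEC_CONCAT (typically a
   vinsertf128); a split store becomes two vextractf128 stores. */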
15794 if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
15795 {
15796 rtx r = gen_reg_rtx (mode);
15797 m = adjust_address (op1, mode, 0);
15798 emit_move_insn (r, m);
15799 m = adjust_address (op1, mode, 16);
15800 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
15801 emit_move_insn (op0, r);
15802 }
15803 else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
15804 {
15805 m = adjust_address (op0, mode, 0);
15806 emit_insn (extract (m, op1, const0_rtx));
15807 m = adjust_address (op0, mode, 16);
15808 emit_insn (extract (m, op1, const1_rtx));
15809 }
15810 else
15811 emit_insn (move_unaligned (op0, op1));
15812 }
15813
15814 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
15815 straight to ix86_expand_vector_move. */
15816 /* Code generation for scalar reg-reg moves of single and double precision data:
15817 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
15818 movaps reg, reg
15819 else
15820 movss reg, reg
15821 if (x86_sse_partial_reg_dependency == true)
15822 movapd reg, reg
15823 else
15824 movsd reg, reg
15825
15826 Code generation for scalar loads of double precision data:
15827 if (x86_sse_split_regs == true)
15828 movlpd mem, reg (gas syntax)
15829 else
15830 movsd mem, reg
15831
15832 Code generation for unaligned packed loads of single precision data
15833 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
15834 if (x86_sse_unaligned_move_optimal)
15835 movups mem, reg
15836
15837 if (x86_sse_partial_reg_dependency == true)
15838 {
15839 xorps reg, reg
15840 movlps mem, reg
15841 movhps mem+8, reg
15842 }
15843 else
15844 {
15845 movlps mem, reg
15846 movhps mem+8, reg
15847 }
15848
15849 Code generation for unaligned packed loads of double precision data
15850 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
15851 if (x86_sse_unaligned_move_optimal)
15852 movupd mem, reg
15853
15854 if (x86_sse_split_regs == true)
15855 {
15856 movlpd mem, reg
15857 movhpd mem+8, reg
15858 }
15859 else
15860 {
15861 movsd mem, reg
15862 movhpd mem+8, reg
15863 }
15864 */
15865
15866 void
15867 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
15868 {
15869 rtx op0, op1, m;
15870
15871 op0 = operands[0];
15872 op1 = operands[1];
15873
15874 if (TARGET_AVX
15875 && GET_MODE_SIZE (mode) == 32)
15876 {
15877 switch (GET_MODE_CLASS (mode))
15878 {
15879 case MODE_VECTOR_INT:
15880 case MODE_INT:
15881 op0 = gen_lowpart (V32QImode, op0);
15882 op1 = gen_lowpart (V32QImode, op1);
15883 /* FALLTHRU */
15884
15885 case MODE_VECTOR_FLOAT:
15886 ix86_avx256_split_vector_move_misalign (op0, op1);
15887 break;
15888
15889 default:
15890 gcc_unreachable ();
15891 }
15892
15893 return;
15894 }
15895
15896 if (MEM_P (op1))
15897 {
15898 /* ??? If we have typed data, then it would appear that using
15899 movdqu is the only way to get unaligned data loaded with
15900 integer type. */
15901 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15902 {
15903 op0 = gen_lowpart (V16QImode, op0);
15904 op1 = gen_lowpart (V16QImode, op1);
15905 /* We will eventually emit movups based on insn attributes. */
15906 emit_insn (gen_sse2_movdqu (op0, op1));
15907 }
15908 else if (TARGET_SSE2 && mode == V2DFmode)
15909 {
15910 rtx zero;
15911
15912 if (TARGET_AVX
15913 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
15914 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
15915 || optimize_function_for_size_p (cfun))
15916 {
15917 /* We will eventually emit movups based on insn attributes. */
15918 emit_insn (gen_sse2_movupd (op0, op1));
15919 return;
15920 }
15921
15922 /* When SSE registers are split into halves, we can avoid
15923 writing to the top half twice. */
15924 if (TARGET_SSE_SPLIT_REGS)
15925 {
15926 emit_clobber (op0);
15927 zero = op0;
15928 }
15929 else
15930 {
15931 /* ??? Not sure about the best option for the Intel chips.
15932 The following would seem to satisfy; the register is
15933 entirely cleared, breaking the dependency chain. We
15934 then store to the upper half, with a dependency depth
15935 of one. A rumor has it that Intel recommends two movsd
15936 followed by an unpacklpd, but this is unconfirmed. And
15937 given that the dependency depth of the unpacklpd would
15938 still be one, I'm not sure why this would be better. */
15939 zero = CONST0_RTX (V2DFmode);
15940 }
15941
15942 m = adjust_address (op1, DFmode, 0);
15943 emit_insn (gen_sse2_loadlpd (op0, zero, m));
15944 m = adjust_address (op1, DFmode, 8);
15945 emit_insn (gen_sse2_loadhpd (op0, op0, m));
15946 }
15947 else
15948 {
15949 if (TARGET_AVX
15950 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
15951 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
15952 || optimize_function_for_size_p (cfun))
15953 {
15954 op0 = gen_lowpart (V4SFmode, op0);
15955 op1 = gen_lowpart (V4SFmode, op1);
15956 emit_insn (gen_sse_movups (op0, op1));
15957 return;
15958 }
15959
15960 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
15961 emit_move_insn (op0, CONST0_RTX (mode));
15962 else
15963 emit_clobber (op0);
15964
15965 if (mode != V4SFmode)
15966 op0 = gen_lowpart (V4SFmode, op0);
15967
15968 m = adjust_address (op1, V2SFmode, 0);
15969 emit_insn (gen_sse_loadlps (op0, op0, m));
15970 m = adjust_address (op1, V2SFmode, 8);
15971 emit_insn (gen_sse_loadhps (op0, op0, m));
15972 }
15973 }
15974 else if (MEM_P (op0))
15975 {
15976 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15977 {
15978 op0 = gen_lowpart (V16QImode, op0);
15979 op1 = gen_lowpart (V16QImode, op1);
15980 /* We will eventually emit movups based on insn attributes. */
15981 emit_insn (gen_sse2_movdqu (op0, op1));
15982 }
15983 else if (TARGET_SSE2 && mode == V2DFmode)
15984 {
15985 if (TARGET_AVX
15986 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
15987 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
15988 || optimize_function_for_size_p (cfun))
15989 /* We will eventually emit movups based on insn attributes. */
15990 emit_insn (gen_sse2_movupd (op0, op1));
15991 else
15992 {
15993 m = adjust_address (op0, DFmode, 0);
15994 emit_insn (gen_sse2_storelpd (m, op1));
15995 m = adjust_address (op0, DFmode, 8);
15996 emit_insn (gen_sse2_storehpd (m, op1));
15997 }
15998 }
15999 else
16000 {
16001 if (mode != V4SFmode)
16002 op1 = gen_lowpart (V4SFmode, op1);
16003
16004 if (TARGET_AVX
16005 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16006 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16007 || optimize_function_for_size_p (cfun))
16008 {
16009 op0 = gen_lowpart (V4SFmode, op0);
16010 emit_insn (gen_sse_movups (op0, op1));
16011 }
16012 else
16013 {
16014 m = adjust_address (op0, V2SFmode, 0);
16015 emit_insn (gen_sse_storelps (m, op1));
16016 m = adjust_address (op0, V2SFmode, 8);
16017 emit_insn (gen_sse_storehps (m, op1));
16018 }
16019 }
16020 }
16021 else
16022 gcc_unreachable ();
16023 }
16024
16025 /* Expand a push in MODE. This is some mode for which we do not support
16026 proper push instructions, at least from the registers that we expect
16027 the value to live in. */
16028
16029 void
16030 ix86_expand_push (enum machine_mode mode, rtx x)
16031 {
16032 rtx tmp;
16033
16034 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16035 GEN_INT (-GET_MODE_SIZE (mode)),
16036 stack_pointer_rtx, 1, OPTAB_DIRECT);
16037 if (tmp != stack_pointer_rtx)
16038 emit_move_insn (stack_pointer_rtx, tmp);
16039
16040 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16041
16042 /* When we push an operand onto the stack, it has to be aligned at least
16043 at the function argument boundary. However, since we don't have
16044 the argument type, we can't determine the actual argument
16045 boundary. */
16046 emit_move_insn (tmp, x);
16047 }
16048
16049 /* Helper function of ix86_fixup_binary_operands to canonicalize
16050 operand order. Returns true if the operands should be swapped. */
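/* For example (illustrative): for a commutative PLUS where src2 equals
   the destination, swapping makes src1 match dst; immediates and memory
   references are pushed into the second source position. */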
16051
16052 static bool
16053 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16054 rtx operands[])
16055 {
16056 rtx dst = operands[0];
16057 rtx src1 = operands[1];
16058 rtx src2 = operands[2];
16059
16060 /* If the operation is not commutative, we can't do anything. */
16061 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16062 return false;
16063
16064 /* Highest priority is that src1 should match dst. */
16065 if (rtx_equal_p (dst, src1))
16066 return false;
16067 if (rtx_equal_p (dst, src2))
16068 return true;
16069
16070 /* Next highest priority is that immediate constants come second. */
16071 if (immediate_operand (src2, mode))
16072 return false;
16073 if (immediate_operand (src1, mode))
16074 return true;
16075
16076 /* Lowest priority is that memory references should come second. */
16077 if (MEM_P (src2))
16078 return false;
16079 if (MEM_P (src1))
16080 return true;
16081
16082 return false;
16083 }
16084
16085
16086 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16087 destination to use for the operation. If different from the true
16088 destination in operands[0], a copy operation will be required. */
16089
16090 rtx
16091 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16092 rtx operands[])
16093 {
16094 rtx dst = operands[0];
16095 rtx src1 = operands[1];
16096 rtx src2 = operands[2];
16097
16098 /* Canonicalize operand order. */
16099 if (ix86_swap_binary_operands_p (code, mode, operands))
16100 {
16101 rtx temp;
16102
16103 /* It is invalid to swap operands of different modes. */
16104 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16105
16106 temp = src1;
16107 src1 = src2;
16108 src2 = temp;
16109 }
16110
16111 /* Both source operands cannot be in memory. */
16112 if (MEM_P (src1) && MEM_P (src2))
16113 {
16114 /* Optimization: Only read from memory once. */
16115 if (rtx_equal_p (src1, src2))
16116 {
16117 src2 = force_reg (mode, src2);
16118 src1 = src2;
16119 }
16120 else
16121 src2 = force_reg (mode, src2);
16122 }
16123
16124 /* If the destination is memory, and we do not have matching source
16125 operands, do things in registers. */
16126 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16127 dst = gen_reg_rtx (mode);
16128
16129 /* Source 1 cannot be a constant. */
16130 if (CONSTANT_P (src1))
16131 src1 = force_reg (mode, src1);
16132
16133 /* Source 1 cannot be a non-matching memory. */
16134 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16135 src1 = force_reg (mode, src1);
16136
16137 /* Improve address combine. */
16138 if (code == PLUS
16139 && GET_MODE_CLASS (mode) == MODE_INT
16140 && MEM_P (src2))
16141 src2 = force_reg (mode, src2);
16142
16143 operands[1] = src1;
16144 operands[2] = src2;
16145 return dst;
16146 }
16147
16148 /* Similarly, but assume that the destination has already been
16149 set up properly. */
16150
16151 void
16152 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16153 enum machine_mode mode, rtx operands[])
16154 {
16155 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16156 gcc_assert (dst == operands[0]);
16157 }
16158
16159 /* Attempt to expand a binary operator. Make the expansion closer to the
16160 actual machine than just general_operand, which would allow 3 separate
16161 memory references (one output, two inputs) in a single insn. */
16162
16163 void
16164 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16165 rtx operands[])
16166 {
16167 rtx src1, src2, dst, op, clob;
16168
16169 dst = ix86_fixup_binary_operands (code, mode, operands);
16170 src1 = operands[1];
16171 src2 = operands[2];
16172
16173 /* Emit the instruction. */
16174
16175 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16176 if (reload_in_progress)
16177 {
16178 /* Reload doesn't know about the flags register, and doesn't know that
16179 it doesn't want to clobber it. We can only do this with PLUS. */
16180 gcc_assert (code == PLUS);
16181 emit_insn (op);
16182 }
16183 else if (reload_completed
16184 && code == PLUS
16185 && !rtx_equal_p (dst, src1))
16186 {
16187 /* This is going to be an LEA; avoid splitting it later. */
16188 emit_insn (op);
16189 }
16190 else
16191 {
16192 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16193 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16194 }
16195
16196 /* Fix up the destination if needed. */
16197 if (dst != operands[0])
16198 emit_move_insn (operands[0], dst);
16199 }
16200
16201 /* Return TRUE or FALSE depending on whether the binary operator meets the
16202 appropriate constraints. */
16203
16204 bool
16205 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16206 rtx operands[3])
16207 {
16208 rtx dst = operands[0];
16209 rtx src1 = operands[1];
16210 rtx src2 = operands[2];
16211
16212 /* Both source operands cannot be in memory. */
16213 if (MEM_P (src1) && MEM_P (src2))
16214 return false;
16215
16216 /* Canonicalize operand order for commutative operators. */
16217 if (ix86_swap_binary_operands_p (code, mode, operands))
16218 {
16219 rtx temp = src1;
16220 src1 = src2;
16221 src2 = temp;
16222 }
16223
16224 /* If the destination is memory, we must have a matching source operand. */
16225 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16226 return false;
16227
16228 /* Source 1 cannot be a constant. */
16229 if (CONSTANT_P (src1))
16230 return false;
16231
16232 /* Source 1 cannot be a non-matching memory. */
16233 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16234 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16235 return (code == AND
16236 && (mode == HImode
16237 || mode == SImode
16238 || (TARGET_64BIT && mode == DImode))
16239 && satisfies_constraint_L (src2));
16240
16241 return true;
16242 }
16243
16244 /* Attempt to expand a unary operator.  Make the expansion closer to the
16245 actual machine than just general_operand, which would allow 2 separate
16246 memory references (one output, one input) in a single insn. */
16247
16248 void
16249 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16250 rtx operands[])
16251 {
16252 int matching_memory;
16253 rtx src, dst, op, clob;
16254
16255 dst = operands[0];
16256 src = operands[1];
16257
16258 /* If the destination is memory, and we do not have matching source
16259 operands, do things in registers. */
16260 matching_memory = 0;
16261 if (MEM_P (dst))
16262 {
16263 if (rtx_equal_p (dst, src))
16264 matching_memory = 1;
16265 else
16266 dst = gen_reg_rtx (mode);
16267 }
16268
16269 /* When source operand is memory, destination must match. */
16270 if (MEM_P (src) && !matching_memory)
16271 src = force_reg (mode, src);
16272
16273 /* Emit the instruction. */
16274
16275 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16276 if (reload_in_progress || code == NOT)
16277 {
16278 /* Reload doesn't know about the flags register, and doesn't know that
16279 it doesn't want to clobber it. */
16280 gcc_assert (code == NOT);
16281 emit_insn (op);
16282 }
16283 else
16284 {
16285 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16286 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16287 }
16288
16289 /* Fix up the destination if needed. */
16290 if (dst != operands[0])
16291 emit_move_insn (operands[0], dst);
16292 }
16293
16294 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16295 divisor are within the range [0-255]. */
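/* For illustration only, the emitted sequence for the unsigned SImode
   case looks roughly like

	movl	dividend, scratch
	orl	divisor, scratch
	testl	$-0x100, scratch
	je	.Lqimode
	divl	divisor		# full-width divide
	jmp	.Ldone
   .Lqimode:
	divb	divisor		# 8-bit divide; AL = quotient, AH = remainder
   .Ldone:

   Register names and labels here are purely illustrative; the real
   operands come from OPERANDS[].  */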
16296
16297 void
16298 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16299 bool signed_p)
16300 {
16301 rtx end_label, qimode_label;
16302 rtx insn, div, mod;
16303 rtx scratch, tmp0, tmp1, tmp2;
16304 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16305 rtx (*gen_zero_extend) (rtx, rtx);
16306 rtx (*gen_test_ccno_1) (rtx, rtx);
16307
16308 switch (mode)
16309 {
16310 case SImode:
16311 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16312 gen_test_ccno_1 = gen_testsi_ccno_1;
16313 gen_zero_extend = gen_zero_extendqisi2;
16314 break;
16315 case DImode:
16316 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16317 gen_test_ccno_1 = gen_testdi_ccno_1;
16318 gen_zero_extend = gen_zero_extendqidi2;
16319 break;
16320 default:
16321 gcc_unreachable ();
16322 }
16323
16324 end_label = gen_label_rtx ();
16325 qimode_label = gen_label_rtx ();
16326
16327 scratch = gen_reg_rtx (mode);
16328
16329 /* Use 8bit unsigned divmod if dividend and divisor are within
16330 the range [0-255]. */
16331 emit_move_insn (scratch, operands[2]);
16332 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16333 scratch, 1, OPTAB_DIRECT);
16334 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16335 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16336 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16337 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16338 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16339 pc_rtx);
16340 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16341 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16342 JUMP_LABEL (insn) = qimode_label;
16343
16344 /* Generate original signed/unsigned divmod. */
16345 div = gen_divmod4_1 (operands[0], operands[1],
16346 operands[2], operands[3]);
16347 emit_insn (div);
16348
16349 /* Branch to the end. */
16350 emit_jump_insn (gen_jump (end_label));
16351 emit_barrier ();
16352
16353 /* Generate 8bit unsigned divide. */
16354 emit_label (qimode_label);
16355 /* Don't use operands[0] for result of 8bit divide since not all
16356 registers support QImode ZERO_EXTRACT. */
16357 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16358 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16359 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16360 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16361
16362 if (signed_p)
16363 {
16364 div = gen_rtx_DIV (mode, operands[2], operands[3]);
16365 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
16366 }
16367 else
16368 {
16369 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
16370 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
16371 }
16372
16373 /* Extract remainder from AH. */
16374 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16375 if (REG_P (operands[1]))
16376 insn = emit_move_insn (operands[1], tmp1);
16377 else
16378 {
16379 /* Need a new scratch register since the old one has result
16380 of 8bit divide. */
16381 scratch = gen_reg_rtx (mode);
16382 emit_move_insn (scratch, tmp1);
16383 insn = emit_move_insn (operands[1], scratch);
16384 }
16385 set_unique_reg_note (insn, REG_EQUAL, mod);
16386
16387 /* Zero extend quotient from AL. */
16388 tmp1 = gen_lowpart (QImode, tmp0);
16389 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16390 set_unique_reg_note (insn, REG_EQUAL, div);
16391
16392 emit_label (end_label);
16393 }
16394
16395 #define LEA_MAX_STALL (3)
16396 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
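/* Note that LEA_MAX_STALL is measured in full cycles, while the search
   functions below accumulate distances in half-cycles (see
   increase_distance), which is why the search threshold is twice the
   maximum stall.  */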
16397
16398 /* Increase given DISTANCE in half-cycles according to
16399 dependencies between PREV and NEXT instructions.
16400 Add 1 half-cycle if there is no dependency and
16401 go to the next cycle if there is some dependency. */
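/* For example, with DISTANCE = 3 half-cycles and a dependency between
   PREV and NEXT, the result is 3 + (3 & 1) + 2 = 6: round up to the
   next full-cycle boundary and then add one more cycle.  */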
16402
16403 static unsigned int
16404 increase_distance (rtx prev, rtx next, unsigned int distance)
16405 {
16406 df_ref *use_rec;
16407 df_ref *def_rec;
16408
16409 if (!prev || !next)
16410 return distance + (distance & 1) + 2;
16411
16412 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16413 return distance + 1;
16414
16415 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16416 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16417 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16418 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16419 return distance + (distance & 1) + 2;
16420
16421 return distance + 1;
16422 }
16423
16424 /* Return true if instruction INSN defines register number
16425 REGNO1 or REGNO2. */
16426
16427 static bool
16428 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16429 rtx insn)
16430 {
16431 df_ref *def_rec;
16432
16433 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16434 if (DF_REF_REG_DEF_P (*def_rec)
16435 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16436 && (regno1 == DF_REF_REGNO (*def_rec)
16437 || regno2 == DF_REF_REGNO (*def_rec)))
16438 {
16439 return true;
16440 }
16441
16442 return false;
16443 }
16444
16445 /* Return true if instruction INSN uses register number
16446 REGNO as part of an address expression. */
16447
16448 static bool
16449 insn_uses_reg_mem (unsigned int regno, rtx insn)
16450 {
16451 df_ref *use_rec;
16452
16453 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16454 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16455 return true;
16456
16457 return false;
16458 }
16459
16460 /* Search backward for non-agu definition of register number REGNO1
16461 or register number REGNO2 in basic block starting from instruction
16462 START up to head of basic block or instruction INSN.
16463
16464 Set *FOUND to true if a definition was found and to false
16465 otherwise.
16466 
16467 The distance in half-cycles between START and the found instruction,
16468 or the head of the BB, is added to DISTANCE and returned. */
16469
16470 static int
16471 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16472 rtx insn, int distance,
16473 rtx start, bool *found)
16474 {
16475 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16476 rtx prev = start;
16477 rtx next = NULL;
16478
16479 *found = false;
16480
16481 while (prev
16482 && prev != insn
16483 && distance < LEA_SEARCH_THRESHOLD)
16484 {
16485 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16486 {
16487 distance = increase_distance (prev, next, distance);
16488 if (insn_defines_reg (regno1, regno2, prev))
16489 {
16490 if (recog_memoized (prev) < 0
16491 || get_attr_type (prev) != TYPE_LEA)
16492 {
16493 *found = true;
16494 return distance;
16495 }
16496 }
16497
16498 next = prev;
16499 }
16500 if (prev == BB_HEAD (bb))
16501 break;
16502
16503 prev = PREV_INSN (prev);
16504 }
16505
16506 return distance;
16507 }
16508
16509 /* Search backward for non-agu definition of register number REGNO1
16510 or register number REGNO2 in INSN's basic block until
16511 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16512 2. Reach a neighbouring BB boundary, or
16513 3. Reach agu definition.
16514 Returns the distance between the non-agu definition point and INSN.
16515 If no definition point, returns -1. */
16516
16517 static int
16518 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16519 rtx insn)
16520 {
16521 basic_block bb = BLOCK_FOR_INSN (insn);
16522 int distance = 0;
16523 bool found = false;
16524
16525 if (insn != BB_HEAD (bb))
16526 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16527 distance, PREV_INSN (insn),
16528 &found);
16529
16530 if (!found && distance < LEA_SEARCH_THRESHOLD)
16531 {
16532 edge e;
16533 edge_iterator ei;
16534 bool simple_loop = false;
16535
16536 FOR_EACH_EDGE (e, ei, bb->preds)
16537 if (e->src == bb)
16538 {
16539 simple_loop = true;
16540 break;
16541 }
16542
16543 if (simple_loop)
16544 distance = distance_non_agu_define_in_bb (regno1, regno2,
16545 insn, distance,
16546 BB_END (bb), &found);
16547 else
16548 {
16549 int shortest_dist = -1;
16550 bool found_in_bb = false;
16551
16552 FOR_EACH_EDGE (e, ei, bb->preds)
16553 {
16554 int bb_dist
16555 = distance_non_agu_define_in_bb (regno1, regno2,
16556 insn, distance,
16557 BB_END (e->src),
16558 &found_in_bb);
16559 if (found_in_bb)
16560 {
16561 if (shortest_dist < 0)
16562 shortest_dist = bb_dist;
16563 else if (bb_dist > 0)
16564 shortest_dist = MIN (bb_dist, shortest_dist);
16565
16566 found = true;
16567 }
16568 }
16569
16570 distance = shortest_dist;
16571 }
16572 }
16573
16574 /* get_attr_type may modify recog data. We want to make sure
16575 that recog data is valid for instruction INSN, on which
16576 distance_non_agu_define is called. INSN is unchanged here. */
16577 extract_insn_cached (insn);
16578
16579 if (!found)
16580 return -1;
16581
16582 return distance >> 1;
16583 }
16584
16585 /* Return the distance in half-cycles between INSN and the next
16586 insn that uses register number REGNO in a memory address, added
16587 to DISTANCE. Return -1 if REGNO is set.
16588 
16589 Set *FOUND to true if a use of the register was found and to
16590 false otherwise.
16591 Set *REDEFINED to true if a redefinition of the register was
16592 found and to false otherwise. */
16593
16594 static int
16595 distance_agu_use_in_bb (unsigned int regno,
16596 rtx insn, int distance, rtx start,
16597 bool *found, bool *redefined)
16598 {
16599 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16600 rtx next = start;
16601 rtx prev = NULL;
16602
16603 *found = false;
16604 *redefined = false;
16605
16606 while (next
16607 && next != insn
16608 && distance < LEA_SEARCH_THRESHOLD)
16609 {
16610 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16611 {
16612 distance = increase_distance (prev, next, distance);
16613 if (insn_uses_reg_mem (regno, next))
16614 {
16615 /* Return DISTANCE if OP0 is used in memory
16616 address in NEXT. */
16617 *found = true;
16618 return distance;
16619 }
16620
16621 if (insn_defines_reg (regno, INVALID_REGNUM, next))
16622 {
16623 /* Return -1 if OP0 is set in NEXT. */
16624 *redefined = true;
16625 return -1;
16626 }
16627
16628 prev = next;
16629 }
16630
16631 if (next == BB_END (bb))
16632 break;
16633
16634 next = NEXT_INSN (next);
16635 }
16636
16637 return distance;
16638 }
16639
16640 /* Return the distance between INSN and the next insn that uses
16641 register number REGNO0 in a memory address. Return -1 if no such
16642 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
16643
16644 static int
16645 distance_agu_use (unsigned int regno0, rtx insn)
16646 {
16647 basic_block bb = BLOCK_FOR_INSN (insn);
16648 int distance = 0;
16649 bool found = false;
16650 bool redefined = false;
16651
16652 if (insn != BB_END (bb))
16653 distance = distance_agu_use_in_bb (regno0, insn, distance,
16654 NEXT_INSN (insn),
16655 &found, &redefined);
16656
16657 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16658 {
16659 edge e;
16660 edge_iterator ei;
16661 bool simple_loop = false;
16662
16663 FOR_EACH_EDGE (e, ei, bb->succs)
16664 if (e->dest == bb)
16665 {
16666 simple_loop = true;
16667 break;
16668 }
16669
16670 if (simple_loop)
16671 distance = distance_agu_use_in_bb (regno0, insn,
16672 distance, BB_HEAD (bb),
16673 &found, &redefined);
16674 else
16675 {
16676 int shortest_dist = -1;
16677 bool found_in_bb = false;
16678 bool redefined_in_bb = false;
16679
16680 FOR_EACH_EDGE (e, ei, bb->succs)
16681 {
16682 int bb_dist
16683 = distance_agu_use_in_bb (regno0, insn,
16684 distance, BB_HEAD (e->dest),
16685 &found_in_bb, &redefined_in_bb);
16686 if (found_in_bb)
16687 {
16688 if (shortest_dist < 0)
16689 shortest_dist = bb_dist;
16690 else if (bb_dist > 0)
16691 shortest_dist = MIN (bb_dist, shortest_dist);
16692
16693 found = true;
16694 }
16695 }
16696
16697 distance = shortest_dist;
16698 }
16699 }
16700
16701 if (!found || redefined)
16702 return -1;
16703
16704 return distance >> 1;
16705 }
16706
16707 /* Define this macro to tune LEA priority vs ADD; it takes effect when
16708 there is a dilemma of choosing LEA or ADD.
16709 Negative value: ADD is preferred over LEA
16710 Zero: Neutral
16711 Positive value: LEA is preferred over ADD.  */
16712 #define IX86_LEA_PRIORITY 0
16713
16714 /* Return true if use of the lea INSN has a performance advantage
16715 over a sequence of instructions.  The instruction sequence has
16716 SPLIT_COST cycles higher latency than the lea latency. */
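/* As a hypothetical example: if the closest non-AGU definition of an
   input register is 1 cycle back (dist_define = 1), SPLIT_COST is 1 and
   the result feeds an address 2 cycles ahead (dist_use = 2), then
   1 + 1 + IX86_LEA_PRIORITY >= 2 holds and the lea is considered
   profitable.  */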
16717
16718 bool
16719 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
16720 unsigned int regno2, unsigned int split_cost)
16721 {
16722 int dist_define, dist_use;
16723
16724 dist_define = distance_non_agu_define (regno1, regno2, insn);
16725 dist_use = distance_agu_use (regno0, insn);
16726
16727 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
16728 {
16729 /* If there is no non-AGU operand definition, no AGU
16730 operand usage and the split cost is 0, then both the lea
16731 and non-lea variants have the same priority.  Currently
16732 we prefer lea for 64-bit code and non-lea for 32-bit
16733 code. */
16734 if (dist_use < 0 && split_cost == 0)
16735 return TARGET_64BIT || IX86_LEA_PRIORITY;
16736 else
16737 return true;
16738 }
16739
16740 /* With a longer definition distance, lea is more preferable.
16741 Here we adjust the distance to take into account the splitting
16742 cost and lea priority. */
16743 dist_define += split_cost + IX86_LEA_PRIORITY;
16744
16745 /* If there is no use in a memory address then we just check
16746 that the split cost does not exceed the AGU stall. */
16747 if (dist_use < 0)
16748 return dist_define >= LEA_MAX_STALL;
16749
16750 /* If this insn has both backward non-agu dependence and forward
16751 agu dependence, the one with the shorter distance takes effect. */
16752 return dist_define >= dist_use;
16753 }
16754
16755 /* Return true if it is legal to clobber flags by INSN and
16756 false otherwise. */
16757
16758 static bool
16759 ix86_ok_to_clobber_flags (rtx insn)
16760 {
16761 basic_block bb = BLOCK_FOR_INSN (insn);
16762 df_ref *use;
16763 bitmap live;
16764
16765 while (insn)
16766 {
16767 if (NONDEBUG_INSN_P (insn))
16768 {
16769 for (use = DF_INSN_USES (insn); *use; use++)
16770 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
16771 return false;
16772
16773 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
16774 return true;
16775 }
16776
16777 if (insn == BB_END (bb))
16778 break;
16779
16780 insn = NEXT_INSN (insn);
16781 }
16782
16783 live = df_get_live_out (bb);
16784 return !REGNO_REG_SET_P (live, FLAGS_REG);
16785 }
16786
16787 /* Return true if we need to split op0 = op1 + op2 into a sequence of
16788 move and add to avoid AGU stalls. */
16789
16790 bool
16791 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
16792 {
16793 unsigned int regno0 = true_regnum (operands[0]);
16794 unsigned int regno1 = true_regnum (operands[1]);
16795 unsigned int regno2 = true_regnum (operands[2]);
16796
16797 /* Check if we need to optimize. */
16798 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16799 return false;
16800
16801 /* Check it is correct to split here. */
16802 if (!ix86_ok_to_clobber_flags (insn))
16803 return false;
16804
16805 /* We need to split only adds with a non-destructive
16806 destination operand. */
16807 if (regno0 == regno1 || regno0 == regno2)
16808 return false;
16809 else
16810 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
16811 }
16812
16813 /* Return true if we should emit lea instruction instead of mov
16814 instruction. */
16815
16816 bool
16817 ix86_use_lea_for_mov (rtx insn, rtx operands[])
16818 {
16819 unsigned int regno0;
16820 unsigned int regno1;
16821
16822 /* Check if we need to optimize. */
16823 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16824 return false;
16825
16826 /* Use lea for reg to reg moves only. */
16827 if (!REG_P (operands[0]) || !REG_P (operands[1]))
16828 return false;
16829
16830 regno0 = true_regnum (operands[0]);
16831 regno1 = true_regnum (operands[1]);
16832
16833 return ix86_lea_outperforms (insn, regno0, regno1, -1, 0);
16834 }
16835
16836 /* Return true if we need to split lea into a sequence of
16837 instructions to avoid AGU stalls. */
16838
16839 bool
16840 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
16841 {
16842 unsigned int regno0 = true_regnum (operands[0]);
16843 unsigned int regno1 = -1;
16844 unsigned int regno2 = -1;
16845 unsigned int split_cost = 0;
16846 struct ix86_address parts;
16847 int ok;
16848
16849 /* Check we need to optimize. */
16850 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16851 return false;
16852
16853 /* Check it is correct to split here. */
16854 if (!ix86_ok_to_clobber_flags (insn))
16855 return false;
16856
16857 ok = ix86_decompose_address (operands[1], &parts);
16858 gcc_assert (ok);
16859
16860 /* We should not split to an add if a non-legitimate PIC
16861 operand is used as the displacement. */
16862 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
16863 return false;
16864
16865 if (parts.base)
16866 regno1 = true_regnum (parts.base);
16867 if (parts.index)
16868 regno2 = true_regnum (parts.index);
16869
16870 /* Compute how many cycles we will add to the execution time
16871 if we split the lea into a sequence of instructions. */
16872 if (parts.base || parts.index)
16873 {
16874 /* Have to use a mov instruction if the non-destructive
16875 destination form is used. */
16876 if (regno1 != regno0 && regno2 != regno0)
16877 split_cost += 1;
16878
16879 /* Have to add index to base if both exist. */
16880 if (parts.base && parts.index)
16881 split_cost += 1;
16882
16883 /* Have to use shift and adds if scale is 2 or greater. */
16884 if (parts.scale > 1)
16885 {
16886 if (regno0 != regno1)
16887 split_cost += 1;
16888 else if (regno2 == regno0)
16889 split_cost += 4;
16890 else
16891 split_cost += parts.scale;
16892 }
16893
16894 /* Have to use an add instruction with an immediate if
16895 disp is non-zero. */
16896 if (parts.disp && parts.disp != const0_rtx)
16897 split_cost += 1;
16898
16899 /* Subtract the price of lea. */
16900 split_cost -= 1;
16901 }
16902
16903 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
16904 }
16905
16906 /* Emit x86 binary operator CODE in mode MODE, where the first operand
16907 matches the destination.  The RTX includes a clobber of FLAGS_REG. */
16908
16909 static void
16910 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
16911 rtx dst, rtx src)
16912 {
16913 rtx op, clob;
16914
16915 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
16916 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16917
16918 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16919 }
16920
16921 /* Split an lea instruction into a sequence of instructions
16922 which are executed on the ALU to avoid AGU stalls.
16923 It is assumed that it is allowed to clobber the flags register
16924 at the lea position. */
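/* A purely illustrative example: for the address (base + index*4 + 16)
   with a destination register distinct from both inputs, the split is
   roughly

	mov	index, dst
	shl	$2, dst
	add	base, dst
	add	$16, dst

   instead of a single lea.  */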
16925
16926 extern void
16927 ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
16928 {
16929 unsigned int regno0 = true_regnum (operands[0]);
16930 unsigned int regno1 = INVALID_REGNUM;
16931 unsigned int regno2 = INVALID_REGNUM;
16932 struct ix86_address parts;
16933 rtx tmp;
16934 int ok, adds;
16935
16936 ok = ix86_decompose_address (operands[1], &parts);
16937 gcc_assert (ok);
16938
16939 if (parts.base)
16940 {
16941 if (GET_MODE (parts.base) != mode)
16942 parts.base = gen_rtx_SUBREG (mode, parts.base, 0);
16943 regno1 = true_regnum (parts.base);
16944 }
16945
16946 if (parts.index)
16947 {
16948 if (GET_MODE (parts.index) != mode)
16949 parts.index = gen_rtx_SUBREG (mode, parts.index, 0);
16950 regno2 = true_regnum (parts.index);
16951 }
16952
16953 if (parts.scale > 1)
16954 {
16955 /* Case r1 = r1 + ... */
16956 if (regno1 == regno0)
16957 {
16958 /* If we have the case r1 = r1 + C * r1 then we
16959 would have to use multiplication, which is very
16960 expensive.  Assume the cost model is wrong if we
16961 get such a case here. */
16962 gcc_assert (regno2 != regno0);
16963
16964 for (adds = parts.scale; adds > 0; adds--)
16965 ix86_emit_binop (PLUS, mode, operands[0], parts.index);
16966 }
16967 else
16968 {
16969 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
16970 if (regno0 != regno2)
16971 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16972
16973 /* Use shift for scaling. */
16974 ix86_emit_binop (ASHIFT, mode, operands[0],
16975 GEN_INT (exact_log2 (parts.scale)));
16976
16977 if (parts.base)
16978 ix86_emit_binop (PLUS, mode, operands[0], parts.base);
16979
16980 if (parts.disp && parts.disp != const0_rtx)
16981 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16982 }
16983 }
16984 else if (!parts.base && !parts.index)
16985 {
16986 gcc_assert (parts.disp);
16987 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.disp));
16988 }
16989 else
16990 {
16991 if (!parts.base)
16992 {
16993 if (regno0 != regno2)
16994 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16995 }
16996 else if (!parts.index)
16997 {
16998 if (regno0 != regno1)
16999 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
17000 }
17001 else
17002 {
17003 if (regno0 == regno1)
17004 tmp = parts.index;
17005 else if (regno0 == regno2)
17006 tmp = parts.base;
17007 else
17008 {
17009 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
17010 tmp = parts.index;
17011 }
17012
17013 ix86_emit_binop (PLUS, mode, operands[0], tmp);
17014 }
17015
17016 if (parts.disp && parts.disp != const0_rtx)
17017 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
17018 }
17019 }
17020
17021 /* Return true if it is ok to optimize an ADD operation to an LEA
17022 operation to avoid flag register consumption. For most processors,
17023 ADD is faster than LEA. For processors like ATOM, if the
17024 destination register of the LEA holds an actual address which will
17025 be used soon, LEA is better, otherwise ADD is better. */
17026
17027 bool
17028 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17029 {
17030 unsigned int regno0 = true_regnum (operands[0]);
17031 unsigned int regno1 = true_regnum (operands[1]);
17032 unsigned int regno2 = true_regnum (operands[2]);
17033
17034 /* If a = b + c, (a != b && a != c), we must use the lea form. */
17035 if (regno0 != regno1 && regno0 != regno2)
17036 return true;
17037
17038 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17039 return false;
17040
17041 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17042 }
17043
17044 /* Return true if destination reg of SET_BODY is shift count of
17045 USE_BODY. */
17046
17047 static bool
17048 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17049 {
17050 rtx set_dest;
17051 rtx shift_rtx;
17052 int i;
17053
17054 /* Retrieve destination of SET_BODY. */
17055 switch (GET_CODE (set_body))
17056 {
17057 case SET:
17058 set_dest = SET_DEST (set_body);
17059 if (!set_dest || !REG_P (set_dest))
17060 return false;
17061 break;
17062 case PARALLEL:
17063 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17064 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17065 use_body))
17066 return true;
17067 default:
17068 return false;
17069 break;
17070 }
17071
17072 /* Retrieve shift count of USE_BODY. */
17073 switch (GET_CODE (use_body))
17074 {
17075 case SET:
17076 shift_rtx = XEXP (use_body, 1);
17077 break;
17078 case PARALLEL:
17079 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17080 if (ix86_dep_by_shift_count_body (set_body,
17081 XVECEXP (use_body, 0, i)))
17082 return true;
17083 default:
17084 return false;
17085 break;
17086 }
17087
17088 if (shift_rtx
17089 && (GET_CODE (shift_rtx) == ASHIFT
17090 || GET_CODE (shift_rtx) == LSHIFTRT
17091 || GET_CODE (shift_rtx) == ASHIFTRT
17092 || GET_CODE (shift_rtx) == ROTATE
17093 || GET_CODE (shift_rtx) == ROTATERT))
17094 {
17095 rtx shift_count = XEXP (shift_rtx, 1);
17096
17097 /* Return true if shift count is dest of SET_BODY. */
17098 if (REG_P (shift_count)
17099 && true_regnum (set_dest) == true_regnum (shift_count))
17100 return true;
17101 }
17102
17103 return false;
17104 }
17105
17106 /* Return true if destination reg of SET_INSN is shift count of
17107 USE_INSN. */
17108
17109 bool
17110 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17111 {
17112 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17113 PATTERN (use_insn));
17114 }
17115
17116 /* Return TRUE or FALSE depending on whether the unary operator meets the
17117 appropriate constraints. */
17118
17119 bool
17120 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17121 enum machine_mode mode ATTRIBUTE_UNUSED,
17122 rtx operands[2] ATTRIBUTE_UNUSED)
17123 {
17124 /* If one of operands is memory, source and destination must match. */
17125 if ((MEM_P (operands[0])
17126 || MEM_P (operands[1]))
17127 && ! rtx_equal_p (operands[0], operands[1]))
17128 return false;
17129 return true;
17130 }
17131
17132 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17133 are ok, keeping in mind the possible movddup alternative. */
17134
17135 bool
17136 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17137 {
17138 if (MEM_P (operands[0]))
17139 return rtx_equal_p (operands[0], operands[1 + high]);
17140 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17141 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17142 return true;
17143 }
17144
17145 /* Post-reload splitter for converting an SF or DFmode value in an
17146 SSE register into an unsigned SImode. */
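/* Sketch of the idea: values below 0x1.0p31 are converted directly with
   the signed truncating conversion; for values >= 0x1.0p31 we subtract
   0x1.0p31 before converting and then XOR the sign bit back into the
   integer result, which adds 2^31 again.  */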
17147
17148 void
17149 ix86_split_convert_uns_si_sse (rtx operands[])
17150 {
17151 enum machine_mode vecmode;
17152 rtx value, large, zero_or_two31, input, two31, x;
17153
17154 large = operands[1];
17155 zero_or_two31 = operands[2];
17156 input = operands[3];
17157 two31 = operands[4];
17158 vecmode = GET_MODE (large);
17159 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17160
17161 /* Load up the value into the low element. We must ensure that the other
17162 elements are valid floats -- zero is the easiest such value. */
17163 if (MEM_P (input))
17164 {
17165 if (vecmode == V4SFmode)
17166 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17167 else
17168 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17169 }
17170 else
17171 {
17172 input = gen_rtx_REG (vecmode, REGNO (input));
17173 emit_move_insn (value, CONST0_RTX (vecmode));
17174 if (vecmode == V4SFmode)
17175 emit_insn (gen_sse_movss (value, value, input));
17176 else
17177 emit_insn (gen_sse2_movsd (value, value, input));
17178 }
17179
17180 emit_move_insn (large, two31);
17181 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17182
17183 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17184 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17185
17186 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17187 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17188
17189 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17190 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17191
17192 large = gen_rtx_REG (V4SImode, REGNO (large));
17193 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17194
17195 x = gen_rtx_REG (V4SImode, REGNO (value));
17196 if (vecmode == V4SFmode)
17197 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17198 else
17199 emit_insn (gen_sse2_cvttpd2dq (x, value));
17200 value = x;
17201
17202 emit_insn (gen_xorv4si3 (value, value, large));
17203 }
17204
17205 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17206 Expects the 64-bit DImode to be supplied in a pair of integral
17207 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17208 -mfpmath=sse, !optimize_size only. */
17209
17210 void
17211 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17212 {
17213 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17214 rtx int_xmm, fp_xmm;
17215 rtx biases, exponents;
17216 rtx x;
17217
17218 int_xmm = gen_reg_rtx (V4SImode);
17219 if (TARGET_INTER_UNIT_MOVES)
17220 emit_insn (gen_movdi_to_sse (int_xmm, input));
17221 else if (TARGET_SSE_SPLIT_REGS)
17222 {
17223 emit_clobber (int_xmm);
17224 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17225 }
17226 else
17227 {
17228 x = gen_reg_rtx (V2DImode);
17229 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17230 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17231 }
17232
17233 x = gen_rtx_CONST_VECTOR (V4SImode,
17234 gen_rtvec (4, GEN_INT (0x43300000UL),
17235 GEN_INT (0x45300000UL),
17236 const0_rtx, const0_rtx));
17237 exponents = validize_mem (force_const_mem (V4SImode, x));
17238
17239 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17240 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17241
17242 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17243 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17244 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17245 (0x1.0p84 + double(fp_value_hi_xmm)).
17246 Note these exponents differ by 32. */
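/* Worked example (illustrative): for the input 0x0000000100000002 the
   low word forms the double 0x1.0p52 + 2.0 and the high word the double
   0x1.0p84 + 0x1.0p32; after the bias subtraction below they are exactly
   2.0 and 4294967296.0, and their sum 4294967298.0 is the wanted
   value.  */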
17247
17248 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17249
17250 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17251 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17252 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17253 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17254 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17255 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17256 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17257 biases = validize_mem (force_const_mem (V2DFmode, biases));
17258 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17259
17260 /* Add the upper and lower DFmode values together. */
17261 if (TARGET_SSE3)
17262 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17263 else
17264 {
17265 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17266 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17267 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17268 }
17269
17270 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17271 }
17272
17273 /* Not used, but eases macroization of patterns. */
17274 void
17275 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17276 rtx input ATTRIBUTE_UNUSED)
17277 {
17278 gcc_unreachable ();
17279 }
17280
17281 /* Convert an unsigned SImode value into a DFmode. Only currently used
17282 for SSE, but applicable anywhere. */
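/* The input is first biased by -2^31 so that it fits in a signed SImode,
   converted with the ordinary signed SImode->DFmode conversion, and then
   0x1.0p31 is added back.  E.g. 0xffffffff becomes 0x7fffffff, converts
   to 2147483647.0, and the final addition yields 4294967295.0.  */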
17283
17284 void
17285 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17286 {
17287 REAL_VALUE_TYPE TWO31r;
17288 rtx x, fp;
17289
17290 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17291 NULL, 1, OPTAB_DIRECT);
17292
17293 fp = gen_reg_rtx (DFmode);
17294 emit_insn (gen_floatsidf2 (fp, x));
17295
17296 real_ldexp (&TWO31r, &dconst1, 31);
17297 x = const_double_from_real_value (TWO31r, DFmode);
17298
17299 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17300 if (x != target)
17301 emit_move_insn (target, x);
17302 }
17303
17304 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17305 32-bit mode; otherwise we have a direct convert instruction. */
17306
17307 void
17308 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17309 {
17310 REAL_VALUE_TYPE TWO32r;
17311 rtx fp_lo, fp_hi, x;
17312
17313 fp_lo = gen_reg_rtx (DFmode);
17314 fp_hi = gen_reg_rtx (DFmode);
17315
17316 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17317
17318 real_ldexp (&TWO32r, &dconst1, 32);
17319 x = const_double_from_real_value (TWO32r, DFmode);
17320 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17321
17322 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17323
17324 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17325 0, OPTAB_DIRECT);
17326 if (x != target)
17327 emit_move_insn (target, x);
17328 }
17329
17330 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17331 For x86_32, -mfpmath=sse, !optimize_size only. */
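/* The value is split into its low and high 16-bit halves, each of which
   converts to SFmode exactly; the result is recombined as hi * 2^16 + lo.
   E.g. 0x00012345 gives hi = 1 and lo = 0x2345, and
   1 * 65536.0 + 9029.0 = 74565.0.  */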
17332 void
17333 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17334 {
17335 REAL_VALUE_TYPE ONE16r;
17336 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17337
17338 real_ldexp (&ONE16r, &dconst1, 16);
17339 x = const_double_from_real_value (ONE16r, SFmode);
17340 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
17341 NULL, 0, OPTAB_DIRECT);
17342 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
17343 NULL, 0, OPTAB_DIRECT);
17344 fp_hi = gen_reg_rtx (SFmode);
17345 fp_lo = gen_reg_rtx (SFmode);
17346 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17347 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17348 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17349 0, OPTAB_DIRECT);
17350 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17351 0, OPTAB_DIRECT);
17352 if (!rtx_equal_p (target, fp_hi))
17353 emit_move_insn (target, fp_hi);
17354 }
17355
17356 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17357 a vector of unsigned ints VAL to vector of floats TARGET. */
17358
17359 void
17360 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17361 {
17362 rtx tmp[8];
17363 REAL_VALUE_TYPE TWO16r;
17364 enum machine_mode intmode = GET_MODE (val);
17365 enum machine_mode fltmode = GET_MODE (target);
17366 rtx (*cvt) (rtx, rtx);
17367
17368 if (intmode == V4SImode)
17369 cvt = gen_floatv4siv4sf2;
17370 else
17371 cvt = gen_floatv8siv8sf2;
17372 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17373 tmp[0] = force_reg (intmode, tmp[0]);
17374 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17375 OPTAB_DIRECT);
17376 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17377 NULL_RTX, 1, OPTAB_DIRECT);
17378 tmp[3] = gen_reg_rtx (fltmode);
17379 emit_insn (cvt (tmp[3], tmp[1]));
17380 tmp[4] = gen_reg_rtx (fltmode);
17381 emit_insn (cvt (tmp[4], tmp[2]));
17382 real_ldexp (&TWO16r, &dconst1, 16);
17383 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17384 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17385 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17386 OPTAB_DIRECT);
17387 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17388 OPTAB_DIRECT);
17389 if (tmp[7] != target)
17390 emit_move_insn (target, tmp[7]);
17391 }
17392
17393 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17394 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17395 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17396 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
17397
17398 rtx
17399 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17400 {
17401 REAL_VALUE_TYPE TWO31r;
17402 rtx two31r, tmp[4];
17403 enum machine_mode mode = GET_MODE (val);
17404 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17405 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17406 rtx (*cmp) (rtx, rtx, rtx, rtx);
17407 int i;
17408
17409 for (i = 0; i < 3; i++)
17410 tmp[i] = gen_reg_rtx (mode);
17411 real_ldexp (&TWO31r, &dconst1, 31);
17412 two31r = const_double_from_real_value (TWO31r, scalarmode);
17413 two31r = ix86_build_const_vector (mode, 1, two31r);
17414 two31r = force_reg (mode, two31r);
17415 switch (mode)
17416 {
17417 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17418 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17419 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17420 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17421 default: gcc_unreachable ();
17422 }
17423 tmp[3] = gen_rtx_LE (mode, two31r, val);
17424 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17425 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17426 0, OPTAB_DIRECT);
17427 if (intmode == V4SImode || TARGET_AVX2)
17428 *xorp = expand_simple_binop (intmode, ASHIFT,
17429 gen_lowpart (intmode, tmp[0]),
17430 GEN_INT (31), NULL_RTX, 0,
17431 OPTAB_DIRECT);
17432 else
17433 {
17434 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17435 two31 = ix86_build_const_vector (intmode, 1, two31);
17436 *xorp = expand_simple_binop (intmode, AND,
17437 gen_lowpart (intmode, tmp[0]),
17438 two31, NULL_RTX, 0,
17439 OPTAB_DIRECT);
17440 }
17441 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17442 0, OPTAB_DIRECT);
17443 }
17444
17445 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17446 then replicate the value for all elements of the vector
17447 register. */
17448
17449 rtx
17450 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17451 {
17452 int i, n_elt;
17453 rtvec v;
17454 enum machine_mode scalar_mode;
17455
17456 switch (mode)
17457 {
17458 case V32QImode:
17459 case V16QImode:
17460 case V16HImode:
17461 case V8HImode:
17462 case V8SImode:
17463 case V4SImode:
17464 case V4DImode:
17465 case V2DImode:
17466 gcc_assert (vect);
17467 case V8SFmode:
17468 case V4SFmode:
17469 case V4DFmode:
17470 case V2DFmode:
17471 n_elt = GET_MODE_NUNITS (mode);
17472 v = rtvec_alloc (n_elt);
17473 scalar_mode = GET_MODE_INNER (mode);
17474
17475 RTVEC_ELT (v, 0) = value;
17476
17477 for (i = 1; i < n_elt; ++i)
17478 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17479
17480 return gen_rtx_CONST_VECTOR (mode, v);
17481
17482 default:
17483 gcc_unreachable ();
17484 }
17485 }
17486
17487 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17488 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17489 for an SSE register. If VECT is true, then replicate the mask for
17490 all elements of the vector register. If INVERT is true, then create
17491 a mask excluding the sign bit. */
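/* For example, for V4SFmode with VECT set this yields a vector whose
   elements all have just bit 31 set (the SFmode sign bit, i.e. -0.0f);
   with INVERT set each element is the complementary 0x7fffffff
   pattern.  */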
17492
17493 rtx
17494 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17495 {
17496 enum machine_mode vec_mode, imode;
17497 HOST_WIDE_INT hi, lo;
17498 int shift = 63;
17499 rtx v;
17500 rtx mask;
17501
17502 /* Find the sign bit, sign extended to 2*HWI. */
17503 switch (mode)
17504 {
17505 case V8SImode:
17506 case V4SImode:
17507 case V8SFmode:
17508 case V4SFmode:
17509 vec_mode = mode;
17510 mode = GET_MODE_INNER (mode);
17511 imode = SImode;
17512 lo = 0x80000000, hi = lo < 0;
17513 break;
17514
17515 case V4DImode:
17516 case V2DImode:
17517 case V4DFmode:
17518 case V2DFmode:
17519 vec_mode = mode;
17520 mode = GET_MODE_INNER (mode);
17521 imode = DImode;
17522 if (HOST_BITS_PER_WIDE_INT >= 64)
17523 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17524 else
17525 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17526 break;
17527
17528 case TImode:
17529 case TFmode:
17530 vec_mode = VOIDmode;
17531 if (HOST_BITS_PER_WIDE_INT >= 64)
17532 {
17533 imode = TImode;
17534 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17535 }
17536 else
17537 {
17538 rtvec vec;
17539
17540 imode = DImode;
17541 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17542
17543 if (invert)
17544 {
17545 lo = ~lo, hi = ~hi;
17546 v = constm1_rtx;
17547 }
17548 else
17549 v = const0_rtx;
17550
17551 mask = immed_double_const (lo, hi, imode);
17552
17553 vec = gen_rtvec (2, v, mask);
17554 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17555 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17556
17557 return v;
17558 }
17559 break;
17560
17561 default:
17562 gcc_unreachable ();
17563 }
17564
17565 if (invert)
17566 lo = ~lo, hi = ~hi;
17567
17568 /* Force this value into the low part of a fp vector constant. */
17569 mask = immed_double_const (lo, hi, imode);
17570 mask = gen_lowpart (mode, mask);
17571
17572 if (vec_mode == VOIDmode)
17573 return force_reg (mode, mask);
17574
17575 v = ix86_build_const_vector (vec_mode, vect, mask);
17576 return force_reg (vec_mode, v);
17577 }
17578
17579 /* Generate code for floating point ABS or NEG. */
17580
17581 void
17582 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17583 rtx operands[])
17584 {
17585 rtx mask, set, dst, src;
17586 bool use_sse = false;
17587 bool vector_mode = VECTOR_MODE_P (mode);
17588 enum machine_mode vmode = mode;
17589
17590 if (vector_mode)
17591 use_sse = true;
17592 else if (mode == TFmode)
17593 use_sse = true;
17594 else if (TARGET_SSE_MATH)
17595 {
17596 use_sse = SSE_FLOAT_MODE_P (mode);
17597 if (mode == SFmode)
17598 vmode = V4SFmode;
17599 else if (mode == DFmode)
17600 vmode = V2DFmode;
17601 }
17602
17603 /* NEG and ABS performed with SSE use bitwise mask operations.
17604 Create the appropriate mask now. */
17605 if (use_sse)
17606 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17607 else
17608 mask = NULL_RTX;
17609
17610 dst = operands[0];
17611 src = operands[1];
17612
17613 set = gen_rtx_fmt_e (code, mode, src);
17614 set = gen_rtx_SET (VOIDmode, dst, set);
17615
17616 if (mask)
17617 {
17618 rtx use, clob;
17619 rtvec par;
17620
17621 use = gen_rtx_USE (VOIDmode, mask);
17622 if (vector_mode)
17623 par = gen_rtvec (2, set, use);
17624 else
17625 {
17626 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17627 par = gen_rtvec (3, set, use, clob);
17628 }
17629 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
17630 }
17631 else
17632 emit_insn (set);
17633 }
17634
17635 /* Expand a copysign operation. Special case operand 0 being a constant. */
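/* The expansion relies on the bitwise identity
     copysign (x, y) = (x & ~sign_mask) | (y & sign_mask)
   implemented with vector AND/ANDN/OR; see ix86_split_copysign_const
   and ix86_split_copysign_var below.  */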
17636
17637 void
17638 ix86_expand_copysign (rtx operands[])
17639 {
17640 enum machine_mode mode, vmode;
17641 rtx dest, op0, op1, mask, nmask;
17642
17643 dest = operands[0];
17644 op0 = operands[1];
17645 op1 = operands[2];
17646
17647 mode = GET_MODE (dest);
17648
17649 if (mode == SFmode)
17650 vmode = V4SFmode;
17651 else if (mode == DFmode)
17652 vmode = V2DFmode;
17653 else
17654 vmode = mode;
17655
17656 if (GET_CODE (op0) == CONST_DOUBLE)
17657 {
17658 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
17659
17660 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
17661 op0 = simplify_unary_operation (ABS, mode, op0, mode);
17662
17663 if (mode == SFmode || mode == DFmode)
17664 {
17665 if (op0 == CONST0_RTX (mode))
17666 op0 = CONST0_RTX (vmode);
17667 else
17668 {
17669 rtx v = ix86_build_const_vector (vmode, false, op0);
17670
17671 op0 = force_reg (vmode, v);
17672 }
17673 }
17674 else if (op0 != CONST0_RTX (mode))
17675 op0 = force_reg (mode, op0);
17676
17677 mask = ix86_build_signbit_mask (vmode, 0, 0);
17678
17679 if (mode == SFmode)
17680 copysign_insn = gen_copysignsf3_const;
17681 else if (mode == DFmode)
17682 copysign_insn = gen_copysigndf3_const;
17683 else
17684 copysign_insn = gen_copysigntf3_const;
17685
17686 emit_insn (copysign_insn (dest, op0, op1, mask));
17687 }
17688 else
17689 {
17690 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
17691
17692 nmask = ix86_build_signbit_mask (vmode, 0, 1);
17693 mask = ix86_build_signbit_mask (vmode, 0, 0);
17694
17695 if (mode == SFmode)
17696 copysign_insn = gen_copysignsf3_var;
17697 else if (mode == DFmode)
17698 copysign_insn = gen_copysigndf3_var;
17699 else
17700 copysign_insn = gen_copysigntf3_var;
17701
17702 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
17703 }
17704 }
17705
17706 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
17707 be a constant, and so has already been expanded into a vector constant. */
17708
17709 void
17710 ix86_split_copysign_const (rtx operands[])
17711 {
17712 enum machine_mode mode, vmode;
17713 rtx dest, op0, mask, x;
17714
17715 dest = operands[0];
17716 op0 = operands[1];
17717 mask = operands[3];
17718
17719 mode = GET_MODE (dest);
17720 vmode = GET_MODE (mask);
17721
17722 dest = simplify_gen_subreg (vmode, dest, mode, 0);
17723 x = gen_rtx_AND (vmode, dest, mask);
17724 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17725
17726 if (op0 != CONST0_RTX (vmode))
17727 {
17728 x = gen_rtx_IOR (vmode, dest, op0);
17729 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17730 }
17731 }
17732
17733 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
17734 so we have to do two masks. */
17735
17736 void
17737 ix86_split_copysign_var (rtx operands[])
17738 {
17739 enum machine_mode mode, vmode;
17740 rtx dest, scratch, op0, op1, mask, nmask, x;
17741
17742 dest = operands[0];
17743 scratch = operands[1];
17744 op0 = operands[2];
17745 op1 = operands[3];
17746 nmask = operands[4];
17747 mask = operands[5];
17748
17749 mode = GET_MODE (dest);
17750 vmode = GET_MODE (mask);
17751
17752 if (rtx_equal_p (op0, op1))
17753 {
17754 /* Shouldn't happen often (it's useless, obviously), but when it does
17755 we'd generate incorrect code if we continue below. */
17756 emit_move_insn (dest, op0);
17757 return;
17758 }
17759
17760 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
17761 {
17762 gcc_assert (REGNO (op1) == REGNO (scratch));
17763
17764 x = gen_rtx_AND (vmode, scratch, mask);
17765 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17766
17767 dest = mask;
17768 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17769 x = gen_rtx_NOT (vmode, dest);
17770 x = gen_rtx_AND (vmode, x, op0);
17771 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17772 }
17773 else
17774 {
17775 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
17776 {
17777 x = gen_rtx_AND (vmode, scratch, mask);
17778 }
17779 else /* alternative 2,4 */
17780 {
17781 gcc_assert (REGNO (mask) == REGNO (scratch));
17782 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
17783 x = gen_rtx_AND (vmode, scratch, op1);
17784 }
17785 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17786
17787 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
17788 {
17789 dest = simplify_gen_subreg (vmode, op0, mode, 0);
17790 x = gen_rtx_AND (vmode, dest, nmask);
17791 }
17792 else /* alternative 3,4 */
17793 {
17794 gcc_assert (REGNO (nmask) == REGNO (dest));
17795 dest = nmask;
17796 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17797 x = gen_rtx_AND (vmode, dest, op0);
17798 }
17799 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17800 }
17801
17802 x = gen_rtx_IOR (vmode, dest, scratch);
17803 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17804 }
17805
17806 /* Return TRUE or FALSE depending on whether the first SET in INSN
17807 has source and destination with matching CC modes, and that the
17808 CC mode is at least as constrained as REQ_MODE. */
17809
17810 bool
17811 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
17812 {
17813 rtx set;
17814 enum machine_mode set_mode;
17815
17816 set = PATTERN (insn);
17817 if (GET_CODE (set) == PARALLEL)
17818 set = XVECEXP (set, 0, 0);
17819 gcc_assert (GET_CODE (set) == SET);
17820 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
17821
17822 set_mode = GET_MODE (SET_DEST (set));
17823 switch (set_mode)
17824 {
17825 case CCNOmode:
17826 if (req_mode != CCNOmode
17827 && (req_mode != CCmode
17828 || XEXP (SET_SRC (set), 1) != const0_rtx))
17829 return false;
17830 break;
17831 case CCmode:
17832 if (req_mode == CCGCmode)
17833 return false;
17834 /* FALLTHRU */
17835 case CCGCmode:
17836 if (req_mode == CCGOCmode || req_mode == CCNOmode)
17837 return false;
17838 /* FALLTHRU */
17839 case CCGOCmode:
17840 if (req_mode == CCZmode)
17841 return false;
17842 /* FALLTHRU */
17843 case CCZmode:
17844 break;
17845
17846 case CCAmode:
17847 case CCCmode:
17848 case CCOmode:
17849 case CCSmode:
17850 if (set_mode != req_mode)
17851 return false;
17852 break;
17853
17854 default:
17855 gcc_unreachable ();
17856 }
17857
17858 return GET_MODE (SET_SRC (set)) == set_mode;
17859 }
17860
17861 /* Generate insn patterns to do an integer compare of OPERANDS. */
17862
17863 static rtx
17864 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
17865 {
17866 enum machine_mode cmpmode;
17867 rtx tmp, flags;
17868
17869 cmpmode = SELECT_CC_MODE (code, op0, op1);
17870 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
17871
17872 /* This is very simple, but making the interface the same as in the
17873 FP case makes the rest of the code easier. */
17874 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
17875 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
17876
17877 /* Return the test that should be put into the flags user, i.e.
17878 the bcc, scc, or cmov instruction. */
17879 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
17880 }
17881
17882 /* Figure out whether to use ordered or unordered fp comparisons.
17883 Return the appropriate mode to use. */
17884
17885 enum machine_mode
17886 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
17887 {
17888 /* ??? In order to make all comparisons reversible, we do all comparisons
17889 non-trapping when compiling for IEEE. Once gcc is able to distinguish
17890 all forms of trapping and nontrapping comparisons, we can make inequality
17891 comparisons trapping again, since it results in better code when using
17892 FCOM based compares. */
17893 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
17894 }
17895
17896 enum machine_mode
17897 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
17898 {
17899 enum machine_mode mode = GET_MODE (op0);
17900
17901 if (SCALAR_FLOAT_MODE_P (mode))
17902 {
17903 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
17904 return ix86_fp_compare_mode (code);
17905 }
17906
17907 switch (code)
17908 {
17909 /* Only zero flag is needed. */
17910 case EQ: /* ZF=0 */
17911 case NE: /* ZF!=0 */
17912 return CCZmode;
17913 /* Codes needing carry flag. */
17914 case GEU: /* CF=0 */
17915 case LTU: /* CF=1 */
17916 /* Detect overflow checks. They need just the carry flag. */
17917 if (GET_CODE (op0) == PLUS
17918 && rtx_equal_p (op1, XEXP (op0, 0)))
17919 return CCCmode;
17920 else
17921 return CCmode;
17922 case GTU: /* CF=0 & ZF=0 */
17923 case LEU: /* CF=1 | ZF=1 */
17924 /* Detect overflow checks. They need just the carry flag. */
17925 if (GET_CODE (op0) == MINUS
17926 && rtx_equal_p (op1, XEXP (op0, 0)))
17927 return CCCmode;
17928 else
17929 return CCmode;
17930 /* Codes possibly doable only with sign flag when
17931 comparing against zero. */
17932 case GE: /* SF=OF or SF=0 */
17933 case LT: /* SF<>OF or SF=1 */
17934 if (op1 == const0_rtx)
17935 return CCGOCmode;
17936 else
17937 /* For other cases Carry flag is not required. */
17938 return CCGCmode;
17939 /* Codes doable only with the sign flag when comparing
17940 against zero, but we miss the jump instruction for it,
17941 so we need to use relational tests against overflow,
17942 which thus needs to be zero. */
17943 case GT: /* ZF=0 & SF=OF */
17944 case LE: /* ZF=1 | SF<>OF */
17945 if (op1 == const0_rtx)
17946 return CCNOmode;
17947 else
17948 return CCGCmode;
17949 /* The strcmp pattern does (use flags) and combine may ask us for a
17950 proper mode. */
17951 case USE:
17952 return CCmode;
17953 default:
17954 gcc_unreachable ();
17955 }
17956 }
17957
17958 /* Return the fixed registers used for condition codes. */
17959
17960 static bool
17961 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
17962 {
17963 *p1 = FLAGS_REG;
17964 *p2 = FPSR_REG;
17965 return true;
17966 }
17967
17968 /* If two condition code modes are compatible, return a condition code
17969 mode which is compatible with both. Otherwise, return
17970 VOIDmode. */
17971
17972 static enum machine_mode
17973 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
17974 {
17975 if (m1 == m2)
17976 return m1;
17977
17978 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
17979 return VOIDmode;
17980
17981 if ((m1 == CCGCmode && m2 == CCGOCmode)
17982 || (m1 == CCGOCmode && m2 == CCGCmode))
17983 return CCGCmode;
17984
17985 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
17986 return m2;
17987 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
17988 return m1;
17989
17990 switch (m1)
17991 {
17992 default:
17993 gcc_unreachable ();
17994
17995 case CCmode:
17996 case CCGCmode:
17997 case CCGOCmode:
17998 case CCNOmode:
17999 case CCAmode:
18000 case CCCmode:
18001 case CCOmode:
18002 case CCSmode:
18003 case CCZmode:
18004 switch (m2)
18005 {
18006 default:
18007 return VOIDmode;
18008
18009 case CCmode:
18010 case CCGCmode:
18011 case CCGOCmode:
18012 case CCNOmode:
18013 case CCAmode:
18014 case CCCmode:
18015 case CCOmode:
18016 case CCSmode:
18017 case CCZmode:
18018 return CCmode;
18019 }
18020
18021 case CCFPmode:
18022 case CCFPUmode:
18023 /* These are only compatible with themselves, which we already
18024 checked above. */
18025 return VOIDmode;
18026 }
18027 }
18028
18029
18030 /* Return a comparison we can do that is equivalent to
18031 swap_condition (code), apart possibly from orderedness.
18032 But never change orderedness if TARGET_IEEE_FP, returning
18033 UNKNOWN in that case if necessary. */
18034
18035 static enum rtx_code
18036 ix86_fp_swap_condition (enum rtx_code code)
18037 {
18038 switch (code)
18039 {
18040 case GT: /* GTU - CF=0 & ZF=0 */
18041 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18042 case GE: /* GEU - CF=0 */
18043 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18044 case UNLT: /* LTU - CF=1 */
18045 return TARGET_IEEE_FP ? UNKNOWN : GT;
18046 case UNLE: /* LEU - CF=1 | ZF=1 */
18047 return TARGET_IEEE_FP ? UNKNOWN : GE;
18048 default:
18049 return swap_condition (code);
18050 }
18051 }
18052
18053 /* Return the cost of comparison CODE using the best strategy for performance.
18054 All following functions use the number of instructions as a cost metric.
18055 In the future this should be tweaked to compute bytes for optimize_size and
18056 to take into account the performance of various instructions on various CPUs. */
18057
18058 static int
18059 ix86_fp_comparison_cost (enum rtx_code code)
18060 {
18061 int arith_cost;
18062
18063 /* The cost of code using bit-twiddling on %ah. */
18064 switch (code)
18065 {
18066 case UNLE:
18067 case UNLT:
18068 case LTGT:
18069 case GT:
18070 case GE:
18071 case UNORDERED:
18072 case ORDERED:
18073 case UNEQ:
18074 arith_cost = 4;
18075 break;
18076 case LT:
18077 case NE:
18078 case EQ:
18079 case UNGE:
18080 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18081 break;
18082 case LE:
18083 case UNGT:
18084 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18085 break;
18086 default:
18087 gcc_unreachable ();
18088 }
18089
18090 switch (ix86_fp_comparison_strategy (code))
18091 {
18092 case IX86_FPCMP_COMI:
18093 return arith_cost > 4 ? 3 : 2;
18094 case IX86_FPCMP_SAHF:
18095 return arith_cost > 4 ? 4 : 3;
18096 default:
18097 return arith_cost;
18098 }
18099 }
18100
18101 /* Return the strategy to use for a floating-point comparison. We assume that
18102 fcomi is always preferable where available, since that is also true when
18103 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
18104
18105 enum ix86_fpcmp_strategy
18106 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18107 {
18108 /* Do fcomi/sahf based test when profitable. */
18109
18110 if (TARGET_CMOVE)
18111 return IX86_FPCMP_COMI;
18112
18113 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18114 return IX86_FPCMP_SAHF;
18115
18116 return IX86_FPCMP_ARITH;
18117 }
18118
18119 /* Swap, force into registers, or otherwise massage the two operands
18120 to a fp comparison. The operands are updated in place; the new
18121 comparison code is returned. */
18122
18123 static enum rtx_code
18124 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18125 {
18126 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18127 rtx op0 = *pop0, op1 = *pop1;
18128 enum machine_mode op_mode = GET_MODE (op0);
18129 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18130
18131 /* All of the unordered compare instructions only work on registers.
18132 The same is true of the fcomi compare instructions. The XFmode
18133 compare instructions require registers except when comparing
18134 against zero or when converting operand 1 from fixed point to
18135 floating point. */
18136
18137 if (!is_sse
18138 && (fpcmp_mode == CCFPUmode
18139 || (op_mode == XFmode
18140 && ! (standard_80387_constant_p (op0) == 1
18141 || standard_80387_constant_p (op1) == 1)
18142 && GET_CODE (op1) != FLOAT)
18143 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18144 {
18145 op0 = force_reg (op_mode, op0);
18146 op1 = force_reg (op_mode, op1);
18147 }
18148 else
18149 {
18150 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18151 things around if they appear profitable, otherwise force op0
18152 into a register. */
18153
18154 if (standard_80387_constant_p (op0) == 0
18155 || (MEM_P (op0)
18156 && ! (standard_80387_constant_p (op1) == 0
18157 || MEM_P (op1))))
18158 {
18159 enum rtx_code new_code = ix86_fp_swap_condition (code);
18160 if (new_code != UNKNOWN)
18161 {
18162 rtx tmp;
18163 tmp = op0, op0 = op1, op1 = tmp;
18164 code = new_code;
18165 }
18166 }
18167
18168 if (!REG_P (op0))
18169 op0 = force_reg (op_mode, op0);
18170
18171 if (CONSTANT_P (op1))
18172 {
18173 int tmp = standard_80387_constant_p (op1);
18174 if (tmp == 0)
18175 op1 = validize_mem (force_const_mem (op_mode, op1));
18176 else if (tmp == 1)
18177 {
18178 if (TARGET_CMOVE)
18179 op1 = force_reg (op_mode, op1);
18180 }
18181 else
18182 op1 = force_reg (op_mode, op1);
18183 }
18184 }
18185
18186 /* Try to rearrange the comparison to make it cheaper. */
18187 if (ix86_fp_comparison_cost (code)
18188 > ix86_fp_comparison_cost (swap_condition (code))
18189 && (REG_P (op1) || can_create_pseudo_p ()))
18190 {
18191 rtx tmp;
18192 tmp = op0, op0 = op1, op1 = tmp;
18193 code = swap_condition (code);
18194 if (!REG_P (op0))
18195 op0 = force_reg (op_mode, op0);
18196 }
18197
18198 *pop0 = op0;
18199 *pop1 = op1;
18200 return code;
18201 }
18202
18203 /* Convert comparison codes we use to represent FP comparison to integer
18204 code that will result in proper branch. Return UNKNOWN if no such code
18205 is available. */
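/* For example, after an fcomi/comiss compare the flags look like those of
   an unsigned integer compare (an unordered result sets CF and ZF), so GT
   maps to GTU ("above") and GE to GEU ("above or equal"). */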
18206
18207 enum rtx_code
18208 ix86_fp_compare_code_to_integer (enum rtx_code code)
18209 {
18210 switch (code)
18211 {
18212 case GT:
18213 return GTU;
18214 case GE:
18215 return GEU;
18216 case ORDERED:
18217 case UNORDERED:
18218 return code;
18220 case UNEQ:
18221 return EQ;
18223 case UNLT:
18224 return LTU;
18226 case UNLE:
18227 return LEU;
18229 case LTGT:
18230 return NE;
18232 default:
18233 return UNKNOWN;
18234 }
18235 }
18236
18237 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18238
18239 static rtx
18240 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18241 {
18242 enum machine_mode fpcmp_mode, intcmp_mode;
18243 rtx tmp, tmp2;
18244
18245 fpcmp_mode = ix86_fp_compare_mode (code);
18246 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18247
18248 /* Do fcomi/sahf based test when profitable. */
18249 switch (ix86_fp_comparison_strategy (code))
18250 {
18251 case IX86_FPCMP_COMI:
18252 intcmp_mode = fpcmp_mode;
18253 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18254 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18255 tmp);
18256 emit_insn (tmp);
18257 break;
18258
18259 case IX86_FPCMP_SAHF:
18260 intcmp_mode = fpcmp_mode;
18261 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18262 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18263 tmp);
18264
18265 if (!scratch)
18266 scratch = gen_reg_rtx (HImode);
18267 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18268 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18269 break;
18270
18271 case IX86_FPCMP_ARITH:
18272 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18273 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18274 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18275 if (!scratch)
18276 scratch = gen_reg_rtx (HImode);
18277 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18278
18279 /* In the unordered case, we have to check C2 for NaN's, which
18280 doesn't happen to work out to anything nice combination-wise.
18281 So do some bit twiddling on the value we've got in AH to come
18282 up with an appropriate set of condition codes. */
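/* For reference: fnstsw copies the x87 status word into %ax, so %ah holds
   C0 (0x01), C2 (0x04) and C3 (0x40).  A compare sets C0 for "less", C3 for
   "equal" and C0|C2|C3 for "unordered"; masks such as 0x45 below therefore
   test C3|C2|C0 at once. */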
18283
18284 intcmp_mode = CCNOmode;
18285 switch (code)
18286 {
18287 case GT:
18288 case UNGT:
18289 if (code == GT || !TARGET_IEEE_FP)
18290 {
18291 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18292 code = EQ;
18293 }
18294 else
18295 {
18296 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18297 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18298 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18299 intcmp_mode = CCmode;
18300 code = GEU;
18301 }
18302 break;
18303 case LT:
18304 case UNLT:
18305 if (code == LT && TARGET_IEEE_FP)
18306 {
18307 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18308 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18309 intcmp_mode = CCmode;
18310 code = EQ;
18311 }
18312 else
18313 {
18314 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18315 code = NE;
18316 }
18317 break;
18318 case GE:
18319 case UNGE:
18320 if (code == GE || !TARGET_IEEE_FP)
18321 {
18322 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18323 code = EQ;
18324 }
18325 else
18326 {
18327 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18328 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18329 code = NE;
18330 }
18331 break;
18332 case LE:
18333 case UNLE:
18334 if (code == LE && TARGET_IEEE_FP)
18335 {
18336 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18337 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18338 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18339 intcmp_mode = CCmode;
18340 code = LTU;
18341 }
18342 else
18343 {
18344 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18345 code = NE;
18346 }
18347 break;
18348 case EQ:
18349 case UNEQ:
18350 if (code == EQ && TARGET_IEEE_FP)
18351 {
18352 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18353 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18354 intcmp_mode = CCmode;
18355 code = EQ;
18356 }
18357 else
18358 {
18359 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18360 code = NE;
18361 }
18362 break;
18363 case NE:
18364 case LTGT:
18365 if (code == NE && TARGET_IEEE_FP)
18366 {
18367 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18368 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18369 GEN_INT (0x40)));
18370 code = NE;
18371 }
18372 else
18373 {
18374 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18375 code = EQ;
18376 }
18377 break;
18378
18379 case UNORDERED:
18380 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18381 code = NE;
18382 break;
18383 case ORDERED:
18384 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18385 code = EQ;
18386 break;
18387
18388 default:
18389 gcc_unreachable ();
18390 }
18391 break;
18392
18393 default:
18394 gcc_unreachable ();
18395 }
18396
18397 /* Return the test that should be put into the flags user, i.e.
18398 the bcc, scc, or cmov instruction. */
18399 return gen_rtx_fmt_ee (code, VOIDmode,
18400 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18401 const0_rtx);
18402 }
18403
18404 static rtx
18405 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18406 {
18407 rtx ret;
18408
18409 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18410 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18411
18412 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18413 {
18414 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18415 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18416 }
18417 else
18418 ret = ix86_expand_int_compare (code, op0, op1);
18419
18420 return ret;
18421 }
18422
18423 void
18424 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18425 {
18426 enum machine_mode mode = GET_MODE (op0);
18427 rtx tmp;
18428
18429 switch (mode)
18430 {
18431 case SFmode:
18432 case DFmode:
18433 case XFmode:
18434 case QImode:
18435 case HImode:
18436 case SImode:
18437 simple:
18438 tmp = ix86_expand_compare (code, op0, op1);
18439 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18440 gen_rtx_LABEL_REF (VOIDmode, label),
18441 pc_rtx);
18442 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18443 return;
18444
18445 case DImode:
18446 if (TARGET_64BIT)
18447 goto simple;
18448 case TImode:
18449 /* Expand DImode branch into multiple compare+branch. */
18450 {
18451 rtx lo[2], hi[2], label2;
18452 enum rtx_code code1, code2, code3;
18453 enum machine_mode submode;
18454
18455 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18456 {
18457 tmp = op0, op0 = op1, op1 = tmp;
18458 code = swap_condition (code);
18459 }
18460
18461 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18462 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18463
18464 submode = mode == DImode ? SImode : DImode;
18465
18466 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18467 avoid two branches. This costs one extra insn, so disable when
18468 optimizing for size. */
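/* I.e. a == b <==> ((hi(a) ^ hi(b)) | (lo(a) ^ lo(b))) == 0, so a single
   compare of the IOR result against zero suffices. */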
18469
18470 if ((code == EQ || code == NE)
18471 && (!optimize_insn_for_size_p ()
18472 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18473 {
18474 rtx xor0, xor1;
18475
18476 xor1 = hi[0];
18477 if (hi[1] != const0_rtx)
18478 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18479 NULL_RTX, 0, OPTAB_WIDEN);
18480
18481 xor0 = lo[0];
18482 if (lo[1] != const0_rtx)
18483 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18484 NULL_RTX, 0, OPTAB_WIDEN);
18485
18486 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18487 NULL_RTX, 0, OPTAB_WIDEN);
18488
18489 ix86_expand_branch (code, tmp, const0_rtx, label);
18490 return;
18491 }
18492
18493 /* Otherwise, if we are doing less-than or greater-or-equal-than,
18494 op1 is a constant and the low word is zero, then we can just
18495 examine the high word. Similarly for low word -1 and
18496 less-or-equal-than or greater-than. */
18497
18498 if (CONST_INT_P (hi[1]))
18499 switch (code)
18500 {
18501 case LT: case LTU: case GE: case GEU:
18502 if (lo[1] == const0_rtx)
18503 {
18504 ix86_expand_branch (code, hi[0], hi[1], label);
18505 return;
18506 }
18507 break;
18508 case LE: case LEU: case GT: case GTU:
18509 if (lo[1] == constm1_rtx)
18510 {
18511 ix86_expand_branch (code, hi[0], hi[1], label);
18512 return;
18513 }
18514 break;
18515 default:
18516 break;
18517 }
18518
18519 /* Otherwise, we need two or three jumps. */
18520
18521 label2 = gen_label_rtx ();
18522
18523 code1 = code;
18524 code2 = swap_condition (code);
18525 code3 = unsigned_condition (code);
18526
18527 switch (code)
18528 {
18529 case LT: case GT: case LTU: case GTU:
18530 break;
18531
18532 case LE: code1 = LT; code2 = GT; break;
18533 case GE: code1 = GT; code2 = LT; break;
18534 case LEU: code1 = LTU; code2 = GTU; break;
18535 case GEU: code1 = GTU; code2 = LTU; break;
18536
18537 case EQ: code1 = UNKNOWN; code2 = NE; break;
18538 case NE: code2 = UNKNOWN; break;
18539
18540 default:
18541 gcc_unreachable ();
18542 }
18543
18544 /*
18545 * a < b =>
18546 * if (hi(a) < hi(b)) goto true;
18547 * if (hi(a) > hi(b)) goto false;
18548 * if (lo(a) < lo(b)) goto true;
18549 * false:
18550 */
18551
18552 if (code1 != UNKNOWN)
18553 ix86_expand_branch (code1, hi[0], hi[1], label);
18554 if (code2 != UNKNOWN)
18555 ix86_expand_branch (code2, hi[0], hi[1], label2);
18556
18557 ix86_expand_branch (code3, lo[0], lo[1], label);
18558
18559 if (code2 != UNKNOWN)
18560 emit_label (label2);
18561 return;
18562 }
18563
18564 default:
18565 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18566 goto simple;
18567 }
18568 }
18569
18570 /* Split branch based on floating point condition. */
18571 void
18572 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18573 rtx target1, rtx target2, rtx tmp, rtx pushed)
18574 {
18575 rtx condition;
18576 rtx i;
18577
18578 if (target2 != pc_rtx)
18579 {
18580 rtx tmp = target2;
18581 code = reverse_condition_maybe_unordered (code);
18582 target2 = target1;
18583 target1 = tmp;
18584 }
18585
18586 condition = ix86_expand_fp_compare (code, op1, op2,
18587 tmp);
18588
18589 /* Remove pushed operand from stack. */
18590 if (pushed)
18591 ix86_free_from_memory (GET_MODE (pushed));
18592
18593 i = emit_jump_insn (gen_rtx_SET
18594 (VOIDmode, pc_rtx,
18595 gen_rtx_IF_THEN_ELSE (VOIDmode,
18596 condition, target1, target2)));
18597 if (split_branch_probability >= 0)
18598 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18599 }
18600
18601 void
18602 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18603 {
18604 rtx ret;
18605
18606 gcc_assert (GET_MODE (dest) == QImode);
18607
18608 ret = ix86_expand_compare (code, op0, op1);
18609 PUT_MODE (ret, QImode);
18610 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
18611 }
18612
18613 /* Expand a comparison setting or clearing the carry flag.  Return true
18614 when successful and set *POP to the comparison rtx for the operation. */
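/* For example, an unsigned "a < b" is exactly the carry flag produced by
   "cmp a, b", so the caller can materialize -1/0 with a single sbb; the
   conversions below rewrite other comparisons into LTU/GEU form where
   possible. */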
18615 static bool
18616 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
18617 {
18618 enum machine_mode mode =
18619 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
18620
18621 /* Do not handle double-mode compares that go through special path. */
18622 if (mode == (TARGET_64BIT ? TImode : DImode))
18623 return false;
18624
18625 if (SCALAR_FLOAT_MODE_P (mode))
18626 {
18627 rtx compare_op, compare_seq;
18628
18629 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18630
18631 /* Shortcut:  the following common codes never translate
18632 into carry flag compares. */
18633 if (code == EQ || code == NE || code == UNEQ || code == LTGT
18634 || code == ORDERED || code == UNORDERED)
18635 return false;
18636
18637 /* These comparisons require zero flag; swap operands so they won't. */
18638 if ((code == GT || code == UNLE || code == LE || code == UNGT)
18639 && !TARGET_IEEE_FP)
18640 {
18641 rtx tmp = op0;
18642 op0 = op1;
18643 op1 = tmp;
18644 code = swap_condition (code);
18645 }
18646
18647 /* Try to expand the comparison and verify that we end up with
18648 a carry flag based comparison.  This fails only when we decide
18649 to expand the comparison using arithmetic, which is not a
18650 common scenario. */
18651 start_sequence ();
18652 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18653 compare_seq = get_insns ();
18654 end_sequence ();
18655
18656 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
18657 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
18658 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
18659 else
18660 code = GET_CODE (compare_op);
18661
18662 if (code != LTU && code != GEU)
18663 return false;
18664
18665 emit_insn (compare_seq);
18666 *pop = compare_op;
18667 return true;
18668 }
18669
18670 if (!INTEGRAL_MODE_P (mode))
18671 return false;
18672
18673 switch (code)
18674 {
18675 case LTU:
18676 case GEU:
18677 break;
18678
18679 /* Convert a==0 into (unsigned)a<1. */
18680 case EQ:
18681 case NE:
18682 if (op1 != const0_rtx)
18683 return false;
18684 op1 = const1_rtx;
18685 code = (code == EQ ? LTU : GEU);
18686 break;
18687
18688 /* Convert a>b into b<a or a>=b+1. */
18689 case GTU:
18690 case LEU:
18691 if (CONST_INT_P (op1))
18692 {
18693 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
18694 /* Bail out on overflow.  We could still swap the operands, but that
18695 would force loading of the constant into a register. */
18696 if (op1 == const0_rtx
18697 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
18698 return false;
18699 code = (code == GTU ? GEU : LTU);
18700 }
18701 else
18702 {
18703 rtx tmp = op1;
18704 op1 = op0;
18705 op0 = tmp;
18706 code = (code == GTU ? LTU : GEU);
18707 }
18708 break;
18709
18710 /* Convert a>=0 into (unsigned)a<0x80000000. */
18711 case LT:
18712 case GE:
18713 if (mode == DImode || op1 != const0_rtx)
18714 return false;
18715 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18716 code = (code == LT ? GEU : LTU);
18717 break;
18718 case LE:
18719 case GT:
18720 if (mode == DImode || op1 != constm1_rtx)
18721 return false;
18722 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18723 code = (code == LE ? GEU : LTU);
18724 break;
18725
18726 default:
18727 return false;
18728 }
18729 /* Swapping operands may cause the constant to appear as the first operand. */
18730 if (!nonimmediate_operand (op0, VOIDmode))
18731 {
18732 if (!can_create_pseudo_p ())
18733 return false;
18734 op0 = force_reg (mode, op0);
18735 }
18736 *pop = ix86_expand_compare (code, op0, op1);
18737 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
18738 return true;
18739 }
18740
18741 bool
18742 ix86_expand_int_movcc (rtx operands[])
18743 {
18744 enum rtx_code code = GET_CODE (operands[1]), compare_code;
18745 rtx compare_seq, compare_op;
18746 enum machine_mode mode = GET_MODE (operands[0]);
18747 bool sign_bit_compare_p = false;
18748 rtx op0 = XEXP (operands[1], 0);
18749 rtx op1 = XEXP (operands[1], 1);
18750
18751 if (GET_MODE (op0) == TImode
18752 || (GET_MODE (op0) == DImode
18753 && !TARGET_64BIT))
18754 return false;
18755
18756 start_sequence ();
18757 compare_op = ix86_expand_compare (code, op0, op1);
18758 compare_seq = get_insns ();
18759 end_sequence ();
18760
18761 compare_code = GET_CODE (compare_op);
18762
18763 if ((op1 == const0_rtx && (code == GE || code == LT))
18764 || (op1 == constm1_rtx && (code == GT || code == LE)))
18765 sign_bit_compare_p = true;
18766
18767 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
18768 HImode insns, we'd be swallowed in word prefix ops. */
18769
18770 if ((mode != HImode || TARGET_FAST_PREFIX)
18771 && (mode != (TARGET_64BIT ? TImode : DImode))
18772 && CONST_INT_P (operands[2])
18773 && CONST_INT_P (operands[3]))
18774 {
18775 rtx out = operands[0];
18776 HOST_WIDE_INT ct = INTVAL (operands[2]);
18777 HOST_WIDE_INT cf = INTVAL (operands[3]);
18778 HOST_WIDE_INT diff;
18779
18780 diff = ct - cf;
18781 /* Sign bit compares are better done using shifts than
18782 by using sbb. */
18783 if (sign_bit_compare_p
18784 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
18785 {
18786 /* Detect overlap between destination and compare sources. */
18787 rtx tmp = out;
18788
18789 if (!sign_bit_compare_p)
18790 {
18791 rtx flags;
18792 bool fpcmp = false;
18793
18794 compare_code = GET_CODE (compare_op);
18795
18796 flags = XEXP (compare_op, 0);
18797
18798 if (GET_MODE (flags) == CCFPmode
18799 || GET_MODE (flags) == CCFPUmode)
18800 {
18801 fpcmp = true;
18802 compare_code
18803 = ix86_fp_compare_code_to_integer (compare_code);
18804 }
18805
18806 /* To simplify rest of code, restrict to the GEU case. */
18807 if (compare_code == LTU)
18808 {
18809 HOST_WIDE_INT tmp = ct;
18810 ct = cf;
18811 cf = tmp;
18812 compare_code = reverse_condition (compare_code);
18813 code = reverse_condition (code);
18814 }
18815 else
18816 {
18817 if (fpcmp)
18818 PUT_CODE (compare_op,
18819 reverse_condition_maybe_unordered
18820 (GET_CODE (compare_op)));
18821 else
18822 PUT_CODE (compare_op,
18823 reverse_condition (GET_CODE (compare_op)));
18824 }
18825 diff = ct - cf;
18826
18827 if (reg_overlap_mentioned_p (out, op0)
18828 || reg_overlap_mentioned_p (out, op1))
18829 tmp = gen_reg_rtx (mode);
18830
18831 if (mode == DImode)
18832 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
18833 else
18834 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
18835 flags, compare_op));
18836 }
18837 else
18838 {
18839 if (code == GT || code == GE)
18840 code = reverse_condition (code);
18841 else
18842 {
18843 HOST_WIDE_INT tmp = ct;
18844 ct = cf;
18845 cf = tmp;
18846 diff = ct - cf;
18847 }
18848 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
18849 }
18850
18851 if (diff == 1)
18852 {
18853 /*
18854 * cmpl op0,op1
18855 * sbbl dest,dest
18856 * [addl dest, ct]
18857 *
18858 * Size 5 - 8.
18859 */
18860 if (ct)
18861 tmp = expand_simple_binop (mode, PLUS,
18862 tmp, GEN_INT (ct),
18863 copy_rtx (tmp), 1, OPTAB_DIRECT);
18864 }
18865 else if (cf == -1)
18866 {
18867 /*
18868 * cmpl op0,op1
18869 * sbbl dest,dest
18870 * orl $ct, dest
18871 *
18872 * Size 8.
18873 */
18874 tmp = expand_simple_binop (mode, IOR,
18875 tmp, GEN_INT (ct),
18876 copy_rtx (tmp), 1, OPTAB_DIRECT);
18877 }
18878 else if (diff == -1 && ct)
18879 {
18880 /*
18881 * cmpl op0,op1
18882 * sbbl dest,dest
18883 * notl dest
18884 * [addl dest, cf]
18885 *
18886 * Size 8 - 11.
18887 */
18888 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18889 if (cf)
18890 tmp = expand_simple_binop (mode, PLUS,
18891 copy_rtx (tmp), GEN_INT (cf),
18892 copy_rtx (tmp), 1, OPTAB_DIRECT);
18893 }
18894 else
18895 {
18896 /*
18897 * cmpl op0,op1
18898 * sbbl dest,dest
18899 * [notl dest]
18900 * andl cf - ct, dest
18901 * [addl dest, ct]
18902 *
18903 * Size 8 - 11.
18904 */
18905
18906 if (cf == 0)
18907 {
18908 cf = ct;
18909 ct = 0;
18910 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18911 }
18912
18913 tmp = expand_simple_binop (mode, AND,
18914 copy_rtx (tmp),
18915 gen_int_mode (cf - ct, mode),
18916 copy_rtx (tmp), 1, OPTAB_DIRECT);
18917 if (ct)
18918 tmp = expand_simple_binop (mode, PLUS,
18919 copy_rtx (tmp), GEN_INT (ct),
18920 copy_rtx (tmp), 1, OPTAB_DIRECT);
18921 }
18922
18923 if (!rtx_equal_p (tmp, out))
18924 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
18925
18926 return true;
18927 }
18928
18929 if (diff < 0)
18930 {
18931 enum machine_mode cmp_mode = GET_MODE (op0);
18932
18933 HOST_WIDE_INT tmp;
18934 tmp = ct, ct = cf, cf = tmp;
18935 diff = -diff;
18936
18937 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18938 {
18939 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18940
18941 /* We may be reversing an unordered compare to a normal compare, which
18942 is not valid in general (we may convert a non-trapping condition
18943 into a trapping one); however, on i386 we currently emit all
18944 comparisons unordered. */
18945 compare_code = reverse_condition_maybe_unordered (compare_code);
18946 code = reverse_condition_maybe_unordered (code);
18947 }
18948 else
18949 {
18950 compare_code = reverse_condition (compare_code);
18951 code = reverse_condition (code);
18952 }
18953 }
18954
18955 compare_code = UNKNOWN;
18956 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
18957 && CONST_INT_P (op1))
18958 {
18959 if (op1 == const0_rtx
18960 && (code == LT || code == GE))
18961 compare_code = code;
18962 else if (op1 == constm1_rtx)
18963 {
18964 if (code == LE)
18965 compare_code = LT;
18966 else if (code == GT)
18967 compare_code = GE;
18968 }
18969 }
18970
18971 /* Optimize dest = (op0 < 0) ? -1 : cf. */
18972 if (compare_code != UNKNOWN
18973 && GET_MODE (op0) == GET_MODE (out)
18974 && (cf == -1 || ct == -1))
18975 {
18976 /* If lea code below could be used, only optimize
18977 if it results in a 2 insn sequence. */
18978
18979 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
18980 || diff == 3 || diff == 5 || diff == 9)
18981 || (compare_code == LT && ct == -1)
18982 || (compare_code == GE && cf == -1))
18983 {
18984 /*
18985 * notl op1 (if necessary)
18986 * sarl $31, op1
18987 * orl cf, op1
18988 */
18989 if (ct != -1)
18990 {
18991 cf = ct;
18992 ct = -1;
18993 code = reverse_condition (code);
18994 }
18995
18996 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
18997
18998 out = expand_simple_binop (mode, IOR,
18999 out, GEN_INT (cf),
19000 out, 1, OPTAB_DIRECT);
19001 if (out != operands[0])
19002 emit_move_insn (operands[0], out);
19003
19004 return true;
19005 }
19006 }
19007
19008
19009 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19010 || diff == 3 || diff == 5 || diff == 9)
19011 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19012 && (mode != DImode
19013 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19014 {
19015 /*
19016 * xorl dest,dest
19017 * cmpl op1,op2
19018 * setcc dest
19019 * lea cf(dest*(ct-cf)),dest
19020 *
19021 * Size 14.
19022 *
19023 * This also catches the degenerate setcc-only case.
19024 */
19025
19026 rtx tmp;
19027 int nops;
19028
19029 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19030
19031 nops = 0;
19032 /* On x86_64 the lea instruction operates on Pmode, so we need
19033 to get the arithmetic done in the proper mode to match. */
19034 if (diff == 1)
19035 tmp = copy_rtx (out);
19036 else
19037 {
19038 rtx out1;
19039 out1 = copy_rtx (out);
19040 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19041 nops++;
19042 if (diff & 1)
19043 {
19044 tmp = gen_rtx_PLUS (mode, tmp, out1);
19045 nops++;
19046 }
19047 }
19048 if (cf != 0)
19049 {
19050 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19051 nops++;
19052 }
19053 if (!rtx_equal_p (tmp, out))
19054 {
19055 if (nops == 1)
19056 out = force_operand (tmp, copy_rtx (out));
19057 else
19058 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19059 }
19060 if (!rtx_equal_p (out, operands[0]))
19061 emit_move_insn (operands[0], copy_rtx (out));
19062
19063 return true;
19064 }
19065
19066 /*
19067 * General case: Jumpful:
19068 * xorl dest,dest cmpl op1, op2
19069 * cmpl op1, op2 movl ct, dest
19070 * setcc dest jcc 1f
19071 * decl dest movl cf, dest
19072 * andl (cf-ct),dest 1:
19073 * addl ct,dest
19074 *
19075 * Size 20. Size 14.
19076 *
19077 * This is reasonably steep, but branch mispredict costs are
19078 * high on modern cpus, so consider failing only if optimizing
19079 * for space.
19080 */
19081
19082 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19083 && BRANCH_COST (optimize_insn_for_speed_p (),
19084 false) >= 2)
19085 {
19086 if (cf == 0)
19087 {
19088 enum machine_mode cmp_mode = GET_MODE (op0);
19089
19090 cf = ct;
19091 ct = 0;
19092
19093 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19094 {
19095 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19096
19097 /* We may be reversing an unordered compare to a normal compare,
19098 which is not valid in general (we may convert a non-trapping
19099 condition into a trapping one); however, on i386 we currently
19100 emit all comparisons unordered. */
19101 code = reverse_condition_maybe_unordered (code);
19102 }
19103 else
19104 {
19105 code = reverse_condition (code);
19106 if (compare_code != UNKNOWN)
19107 compare_code = reverse_condition (compare_code);
19108 }
19109 }
19110
19111 if (compare_code != UNKNOWN)
19112 {
19113 /* notl op1 (if needed)
19114 sarl $31, op1
19115 andl (cf-ct), op1
19116 addl ct, op1
19117
19118 For x < 0 (resp. x <= -1) there will be no notl,
19119 so if possible swap the constants to get rid of the
19120 complement.
19121 True/false will be -1/0 while code below (store flag
19122 followed by decrement) is 0/-1, so the constants need
19123 to be exchanged once more. */
19124
19125 if (compare_code == GE || !cf)
19126 {
19127 code = reverse_condition (code);
19128 compare_code = LT;
19129 }
19130 else
19131 {
19132 HOST_WIDE_INT tmp = cf;
19133 cf = ct;
19134 ct = tmp;
19135 }
19136
19137 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19138 }
19139 else
19140 {
19141 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19142
19143 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19144 constm1_rtx,
19145 copy_rtx (out), 1, OPTAB_DIRECT);
19146 }
19147
19148 out = expand_simple_binop (mode, AND, copy_rtx (out),
19149 gen_int_mode (cf - ct, mode),
19150 copy_rtx (out), 1, OPTAB_DIRECT);
19151 if (ct)
19152 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19153 copy_rtx (out), 1, OPTAB_DIRECT);
19154 if (!rtx_equal_p (out, operands[0]))
19155 emit_move_insn (operands[0], copy_rtx (out));
19156
19157 return true;
19158 }
19159 }
19160
19161 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19162 {
19163 /* Try a few things more with specific constants and a variable. */
19164
19165 optab op;
19166 rtx var, orig_out, out, tmp;
19167
19168 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19169 return false;
19170
19171 /* If one of the two operands is an interesting constant, emit a constant-only
19172 conditional move first (the recursion below) and mask the variable in with a logical operation. */
19173
19174 if (CONST_INT_P (operands[2]))
19175 {
19176 var = operands[3];
19177 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19178 operands[3] = constm1_rtx, op = and_optab;
19179 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19180 operands[3] = const0_rtx, op = ior_optab;
19181 else
19182 return false;
19183 }
19184 else if (CONST_INT_P (operands[3]))
19185 {
19186 var = operands[2];
19187 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19188 operands[2] = constm1_rtx, op = and_optab;
19189 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
19190 operands[2] = const0_rtx, op = ior_optab;
19191 else
19192 return false;
19193 }
19194 else
19195 return false;
19196
19197 orig_out = operands[0];
19198 tmp = gen_reg_rtx (mode);
19199 operands[0] = tmp;
19200
19201 /* Recurse to get the constant loaded. */
19202 if (ix86_expand_int_movcc (operands) == 0)
19203 return false;
19204
19205 /* Mask in the interesting variable. */
19206 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19207 OPTAB_WIDEN);
19208 if (!rtx_equal_p (out, orig_out))
19209 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19210
19211 return true;
19212 }
19213
19214 /*
19215 * For comparison with above,
19216 *
19217 * movl cf,dest
19218 * movl ct,tmp
19219 * cmpl op1,op2
19220 * cmovcc tmp,dest
19221 *
19222 * Size 15.
19223 */
19224
19225 if (! nonimmediate_operand (operands[2], mode))
19226 operands[2] = force_reg (mode, operands[2]);
19227 if (! nonimmediate_operand (operands[3], mode))
19228 operands[3] = force_reg (mode, operands[3]);
19229
19230 if (! register_operand (operands[2], VOIDmode)
19231 && (mode == QImode
19232 || ! register_operand (operands[3], VOIDmode)))
19233 operands[2] = force_reg (mode, operands[2]);
19234
19235 if (mode == QImode
19236 && ! register_operand (operands[3], VOIDmode))
19237 operands[3] = force_reg (mode, operands[3]);
19238
19239 emit_insn (compare_seq);
19240 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19241 gen_rtx_IF_THEN_ELSE (mode,
19242 compare_op, operands[2],
19243 operands[3])));
19244 return true;
19245 }
19246
19247 /* Swap, force into registers, or otherwise massage the two operands
19248 to an sse comparison with a mask result. Thus we differ a bit from
19249 ix86_prepare_fp_compare_args which expects to produce a flags result.
19250
19251 The DEST operand exists to help determine whether to commute commutative
19252 operators. The POP0/POP1 operands are updated in place. The new
19253 comparison code is returned, or UNKNOWN if not implementable. */
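/* For example, before AVX there is no direct GE compare, so "a >= b" is
   rewritten below as "b <= a" (LE), which cmpleps/cmplepd implement
   directly. */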
19254
19255 static enum rtx_code
19256 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19257 rtx *pop0, rtx *pop1)
19258 {
19259 rtx tmp;
19260
19261 switch (code)
19262 {
19263 case LTGT:
19264 case UNEQ:
19265 /* AVX supports all the needed comparisons. */
19266 if (TARGET_AVX)
19267 break;
19268 /* We have no LTGT as an operator. We could implement it with
19269 NE & ORDERED, but this requires an extra temporary. It's
19270 not clear that it's worth it. */
19271 return UNKNOWN;
19272
19273 case LT:
19274 case LE:
19275 case UNGT:
19276 case UNGE:
19277 /* These are supported directly. */
19278 break;
19279
19280 case EQ:
19281 case NE:
19282 case UNORDERED:
19283 case ORDERED:
19284 /* AVX has 3 operand comparisons, no need to swap anything. */
19285 if (TARGET_AVX)
19286 break;
19287 /* For commutative operators, try to canonicalize the destination
19288 operand to be first in the comparison - this helps reload to
19289 avoid extra moves. */
19290 if (!dest || !rtx_equal_p (dest, *pop1))
19291 break;
19292 /* FALLTHRU */
19293
19294 case GE:
19295 case GT:
19296 case UNLE:
19297 case UNLT:
19298 /* These are not supported directly before AVX, and furthermore
19299 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19300 comparison operands to transform into something that is
19301 supported. */
19302 tmp = *pop0;
19303 *pop0 = *pop1;
19304 *pop1 = tmp;
19305 code = swap_condition (code);
19306 break;
19307
19308 default:
19309 gcc_unreachable ();
19310 }
19311
19312 return code;
19313 }
19314
19315 /* Detect conditional moves that exactly match min/max operational
19316 semantics. Note that this is IEEE safe, as long as we don't
19317 interchange the operands.
19318
19319 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19320 and TRUE if the operation is successful and instructions are emitted. */
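/* For example, "a < b ? a : b" is a min.  When NaNs or signed zeros may
   matter, the expansion below wraps it in UNSPEC_IEEE_MIN/MAX so that later
   passes cannot commute the operands; the hardware min/max returns its
   second source operand for unordered inputs, so operand order matters. */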
19321
19322 static bool
19323 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19324 rtx cmp_op1, rtx if_true, rtx if_false)
19325 {
19326 enum machine_mode mode;
19327 bool is_min;
19328 rtx tmp;
19329
19330 if (code == LT)
19331 ;
19332 else if (code == UNGE)
19333 {
19334 tmp = if_true;
19335 if_true = if_false;
19336 if_false = tmp;
19337 }
19338 else
19339 return false;
19340
19341 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19342 is_min = true;
19343 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19344 is_min = false;
19345 else
19346 return false;
19347
19348 mode = GET_MODE (dest);
19349
19350 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19351 but MODE may be a vector mode and thus not appropriate. */
19352 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19353 {
19354 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19355 rtvec v;
19356
19357 if_true = force_reg (mode, if_true);
19358 v = gen_rtvec (2, if_true, if_false);
19359 tmp = gen_rtx_UNSPEC (mode, v, u);
19360 }
19361 else
19362 {
19363 code = is_min ? SMIN : SMAX;
19364 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19365 }
19366
19367 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19368 return true;
19369 }
19370
19371 /* Expand an sse vector comparison. Return the register with the result. */
19372
19373 static rtx
19374 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19375 rtx op_true, rtx op_false)
19376 {
19377 enum machine_mode mode = GET_MODE (dest);
19378 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19379 rtx x;
19380
19381 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19382 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19383 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19384
19385 if (optimize
19386 || reg_overlap_mentioned_p (dest, op_true)
19387 || reg_overlap_mentioned_p (dest, op_false))
19388 dest = gen_reg_rtx (mode);
19389
19390 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19391 if (cmp_mode != mode)
19392 {
19393 x = force_reg (cmp_mode, x);
19394 convert_move (dest, x, false);
19395 }
19396 else
19397 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19398
19399 return dest;
19400 }
19401
19402 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19403 operations. This is used for both scalar and vector conditional moves. */
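/* When no blend instruction applies, this boils down to the classic mask
   trick DEST = (CMP & OP_TRUE) | (~CMP & OP_FALSE), assuming CMP is an
   element-wise all-ones/all-zeros mask; the special cases below merely drop
   redundant operations. */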
19404
19405 static void
19406 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19407 {
19408 enum machine_mode mode = GET_MODE (dest);
19409 rtx t2, t3, x;
19410
19411 if (vector_all_ones_operand (op_true, mode)
19412 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19413 {
19414 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19415 }
19416 else if (op_false == CONST0_RTX (mode))
19417 {
19418 op_true = force_reg (mode, op_true);
19419 x = gen_rtx_AND (mode, cmp, op_true);
19420 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19421 }
19422 else if (op_true == CONST0_RTX (mode))
19423 {
19424 op_false = force_reg (mode, op_false);
19425 x = gen_rtx_NOT (mode, cmp);
19426 x = gen_rtx_AND (mode, x, op_false);
19427 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19428 }
19429 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19430 {
19431 op_false = force_reg (mode, op_false);
19432 x = gen_rtx_IOR (mode, cmp, op_false);
19433 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19434 }
19435 else if (TARGET_XOP)
19436 {
19437 op_true = force_reg (mode, op_true);
19438
19439 if (!nonimmediate_operand (op_false, mode))
19440 op_false = force_reg (mode, op_false);
19441
19442 emit_insn (gen_rtx_SET (mode, dest,
19443 gen_rtx_IF_THEN_ELSE (mode, cmp,
19444 op_true,
19445 op_false)));
19446 }
19447 else
19448 {
19449 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19450
19451 if (!nonimmediate_operand (op_true, mode))
19452 op_true = force_reg (mode, op_true);
19453
19454 op_false = force_reg (mode, op_false);
19455
19456 switch (mode)
19457 {
19458 case V4SFmode:
19459 if (TARGET_SSE4_1)
19460 gen = gen_sse4_1_blendvps;
19461 break;
19462 case V2DFmode:
19463 if (TARGET_SSE4_1)
19464 gen = gen_sse4_1_blendvpd;
19465 break;
19466 case V16QImode:
19467 case V8HImode:
19468 case V4SImode:
19469 case V2DImode:
19470 if (TARGET_SSE4_1)
19471 {
19472 gen = gen_sse4_1_pblendvb;
19473 dest = gen_lowpart (V16QImode, dest);
19474 op_false = gen_lowpart (V16QImode, op_false);
19475 op_true = gen_lowpart (V16QImode, op_true);
19476 cmp = gen_lowpart (V16QImode, cmp);
19477 }
19478 break;
19479 case V8SFmode:
19480 if (TARGET_AVX)
19481 gen = gen_avx_blendvps256;
19482 break;
19483 case V4DFmode:
19484 if (TARGET_AVX)
19485 gen = gen_avx_blendvpd256;
19486 break;
19487 case V32QImode:
19488 case V16HImode:
19489 case V8SImode:
19490 case V4DImode:
19491 if (TARGET_AVX2)
19492 {
19493 gen = gen_avx2_pblendvb;
19494 dest = gen_lowpart (V32QImode, dest);
19495 op_false = gen_lowpart (V32QImode, op_false);
19496 op_true = gen_lowpart (V32QImode, op_true);
19497 cmp = gen_lowpart (V32QImode, cmp);
19498 }
19499 break;
19500 default:
19501 break;
19502 }
19503
19504 if (gen != NULL)
19505 emit_insn (gen (dest, op_false, op_true, cmp));
19506 else
19507 {
19508 op_true = force_reg (mode, op_true);
19509
19510 t2 = gen_reg_rtx (mode);
19511 if (optimize)
19512 t3 = gen_reg_rtx (mode);
19513 else
19514 t3 = dest;
19515
19516 x = gen_rtx_AND (mode, op_true, cmp);
19517 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19518
19519 x = gen_rtx_NOT (mode, cmp);
19520 x = gen_rtx_AND (mode, x, op_false);
19521 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19522
19523 x = gen_rtx_IOR (mode, t3, t2);
19524 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19525 }
19526 }
19527 }
19528
19529 /* Expand a floating-point conditional move. Return true if successful. */
19530
19531 bool
19532 ix86_expand_fp_movcc (rtx operands[])
19533 {
19534 enum machine_mode mode = GET_MODE (operands[0]);
19535 enum rtx_code code = GET_CODE (operands[1]);
19536 rtx tmp, compare_op;
19537 rtx op0 = XEXP (operands[1], 0);
19538 rtx op1 = XEXP (operands[1], 1);
19539
19540 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19541 {
19542 enum machine_mode cmode;
19543
19544 /* Since we've no cmove for sse registers, don't force bad register
19545 allocation just to gain access to it. Deny movcc when the
19546 comparison mode doesn't match the move mode. */
19547 cmode = GET_MODE (op0);
19548 if (cmode == VOIDmode)
19549 cmode = GET_MODE (op1);
19550 if (cmode != mode)
19551 return false;
19552
19553 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19554 if (code == UNKNOWN)
19555 return false;
19556
19557 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19558 operands[2], operands[3]))
19559 return true;
19560
19561 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19562 operands[2], operands[3]);
19563 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19564 return true;
19565 }
19566
19567 /* The floating point conditional move instructions don't directly
19568 support conditions resulting from a signed integer comparison. */
19569
19570 compare_op = ix86_expand_compare (code, op0, op1);
19571 if (!fcmov_comparison_operator (compare_op, VOIDmode))
19572 {
19573 tmp = gen_reg_rtx (QImode);
19574 ix86_expand_setcc (tmp, code, op0, op1);
19575
19576 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19577 }
19578
19579 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19580 gen_rtx_IF_THEN_ELSE (mode, compare_op,
19581 operands[2], operands[3])));
19582
19583 return true;
19584 }
19585
19586 /* Expand a floating-point vector conditional move; a vcond operation
19587 rather than a movcc operation. */
19588
19589 bool
19590 ix86_expand_fp_vcond (rtx operands[])
19591 {
19592 enum rtx_code code = GET_CODE (operands[3]);
19593 rtx cmp;
19594
19595 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19596 &operands[4], &operands[5]);
19597 if (code == UNKNOWN)
19598 {
19599 rtx temp;
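/* Neither LTGT nor UNEQ is a single SSE compare; build them from two:
   LTGT = ORDERED & NE, UNEQ = UNORDERED | EQ. */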
19600 switch (GET_CODE (operands[3]))
19601 {
19602 case LTGT:
19603 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19604 operands[5], operands[0], operands[0]);
19605 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
19606 operands[5], operands[1], operands[2]);
19607 code = AND;
19608 break;
19609 case UNEQ:
19610 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
19611 operands[5], operands[0], operands[0]);
19612 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
19613 operands[5], operands[1], operands[2]);
19614 code = IOR;
19615 break;
19616 default:
19617 gcc_unreachable ();
19618 }
19619 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
19620 OPTAB_DIRECT);
19621 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19622 return true;
19623 }
19624
19625 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
19626 operands[5], operands[1], operands[2]))
19627 return true;
19628
19629 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
19630 operands[1], operands[2]);
19631 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19632 return true;
19633 }
19634
19635 /* Expand a signed/unsigned integral vector conditional move. */
19636
19637 bool
19638 ix86_expand_int_vcond (rtx operands[])
19639 {
19640 enum machine_mode data_mode = GET_MODE (operands[0]);
19641 enum machine_mode mode = GET_MODE (operands[4]);
19642 enum rtx_code code = GET_CODE (operands[3]);
19643 bool negate = false;
19644 rtx x, cop0, cop1;
19645
19646 cop0 = operands[4];
19647 cop1 = operands[5];
19648
19649 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
19650 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
19651 if ((code == LT || code == GE)
19652 && data_mode == mode
19653 && cop1 == CONST0_RTX (mode)
19654 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
19655 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
19656 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
19657 && (GET_MODE_SIZE (data_mode) == 16
19658 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
19659 {
19660 rtx negop = operands[2 - (code == LT)];
19661 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
19662 if (negop == CONST1_RTX (data_mode))
19663 {
19664 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
19665 operands[0], 1, OPTAB_DIRECT);
19666 if (res != operands[0])
19667 emit_move_insn (operands[0], res);
19668 return true;
19669 }
19670 else if (GET_MODE_INNER (data_mode) != DImode
19671 && vector_all_ones_operand (negop, data_mode))
19672 {
19673 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
19674 operands[0], 0, OPTAB_DIRECT);
19675 if (res != operands[0])
19676 emit_move_insn (operands[0], res);
19677 return true;
19678 }
19679 }
19680
19681 if (!nonimmediate_operand (cop1, mode))
19682 cop1 = force_reg (mode, cop1);
19683 if (!general_operand (operands[1], data_mode))
19684 operands[1] = force_reg (data_mode, operands[1]);
19685 if (!general_operand (operands[2], data_mode))
19686 operands[2] = force_reg (data_mode, operands[2]);
19687
19688 /* XOP supports all of the comparisons on all 128-bit vector int types. */
19689 if (TARGET_XOP
19690 && (mode == V16QImode || mode == V8HImode
19691 || mode == V4SImode || mode == V2DImode))
19692 ;
19693 else
19694 {
19695 /* Canonicalize the comparison to EQ, GT, GTU. */
19696 switch (code)
19697 {
19698 case EQ:
19699 case GT:
19700 case GTU:
19701 break;
19702
19703 case NE:
19704 case LE:
19705 case LEU:
19706 code = reverse_condition (code);
19707 negate = true;
19708 break;
19709
19710 case GE:
19711 case GEU:
19712 code = reverse_condition (code);
19713 negate = true;
19714 /* FALLTHRU */
19715
19716 case LT:
19717 case LTU:
19718 code = swap_condition (code);
19719 x = cop0, cop0 = cop1, cop1 = x;
19720 break;
19721
19722 default:
19723 gcc_unreachable ();
19724 }
19725
19726 /* Only SSE4.1/SSE4.2 supports V2DImode. */
19727 if (mode == V2DImode)
19728 {
19729 switch (code)
19730 {
19731 case EQ:
19732 /* SSE4.1 supports EQ. */
19733 if (!TARGET_SSE4_1)
19734 return false;
19735 break;
19736
19737 case GT:
19738 case GTU:
19739 /* SSE4.2 supports GT/GTU. */
19740 if (!TARGET_SSE4_2)
19741 return false;
19742 break;
19743
19744 default:
19745 gcc_unreachable ();
19746 }
19747 }
19748
19749 /* Unsigned parallel compare is not supported by the hardware.
19750 Play some tricks to turn this into a signed comparison or a
19751 comparison against 0. */
19752 if (code == GTU)
19753 {
19754 cop0 = force_reg (mode, cop0);
19755
19756 switch (mode)
19757 {
19758 case V8SImode:
19759 case V4DImode:
19760 case V4SImode:
19761 case V2DImode:
19762 {
19763 rtx t1, t2, mask;
19764 rtx (*gen_sub3) (rtx, rtx, rtx);
19765
19766 switch (mode)
19767 {
19768 case V8SImode: gen_sub3 = gen_subv8si3; break;
19769 case V4DImode: gen_sub3 = gen_subv4di3; break;
19770 case V4SImode: gen_sub3 = gen_subv4si3; break;
19771 case V2DImode: gen_sub3 = gen_subv2di3; break;
19772 default:
19773 gcc_unreachable ();
19774 }
19775 /* Subtract (-(INT MAX) - 1) from both operands to make
19776 them signed. */
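/* Subtracting the bias is equivalent to flipping the sign bit, so unsigned
   order becomes signed order: a >u b <==> (a ^ 0x80..0) >s (b ^ 0x80..0). */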
19777 mask = ix86_build_signbit_mask (mode, true, false);
19778 t1 = gen_reg_rtx (mode);
19779 emit_insn (gen_sub3 (t1, cop0, mask));
19780
19781 t2 = gen_reg_rtx (mode);
19782 emit_insn (gen_sub3 (t2, cop1, mask));
19783
19784 cop0 = t1;
19785 cop1 = t2;
19786 code = GT;
19787 }
19788 break;
19789
19790 case V32QImode:
19791 case V16HImode:
19792 case V16QImode:
19793 case V8HImode:
19794 /* Perform a parallel unsigned saturating subtraction. */
19795 x = gen_reg_rtx (mode);
19796 emit_insn (gen_rtx_SET (VOIDmode, x,
19797 gen_rtx_US_MINUS (mode, cop0, cop1)));
19798
19799 cop0 = x;
19800 cop1 = CONST0_RTX (mode);
19801 code = EQ;
19802 negate = !negate;
19803 break;
19804
19805 default:
19806 gcc_unreachable ();
19807 }
19808 }
19809 }
19810
19811 /* Allow the comparison to be done in one mode, but the movcc to
19812 happen in another mode. */
19813 if (data_mode == mode)
19814 {
19815 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
19816 operands[1+negate], operands[2-negate]);
19817 }
19818 else
19819 {
19820 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
19821 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
19822 code, cop0, cop1,
19823 operands[1+negate], operands[2-negate]);
19824 x = gen_lowpart (data_mode, x);
19825 }
19826
19827 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
19828 operands[2-negate]);
19829 return true;
19830 }
19831
19832 /* Expand a variable vector permutation. */
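/* As with the vec_perm patterns, each element of TARGET is taken from the
   concatenation of OP0 and OP1, selected by the corresponding element of
   MASK interpreted modulo twice the number of elements. */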
19833
19834 void
19835 ix86_expand_vec_perm (rtx operands[])
19836 {
19837 rtx target = operands[0];
19838 rtx op0 = operands[1];
19839 rtx op1 = operands[2];
19840 rtx mask = operands[3];
19841 rtx t1, t2, t3, t4, vt, vt2, vec[32];
19842 enum machine_mode mode = GET_MODE (op0);
19843 enum machine_mode maskmode = GET_MODE (mask);
19844 int w, e, i;
19845 bool one_operand_shuffle = rtx_equal_p (op0, op1);
19846
19847 /* Number of elements in the vector. */
19848 w = GET_MODE_NUNITS (mode);
19849 e = GET_MODE_UNIT_SIZE (mode);
19850 gcc_assert (w <= 32);
19851
19852 if (TARGET_AVX2)
19853 {
19854 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
19855 {
19856 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
19857 a constant shuffle operand.  With a tiny bit of effort we can
19858 use VPERMD instead.  A re-interpretation stall for V4DFmode is
19859 unfortunate but there's no avoiding it.
19860 Similarly, for V16HImode we don't have instructions for variable
19861 shuffling, while for V32QImode we can, after preparing suitable
19862 masks, use vpshufb; vpshufb; vpermq; vpor. */
19863
19864 if (mode == V16HImode)
19865 {
19866 maskmode = mode = V32QImode;
19867 w = 32;
19868 e = 1;
19869 }
19870 else
19871 {
19872 maskmode = mode = V8SImode;
19873 w = 8;
19874 e = 4;
19875 }
19876 t1 = gen_reg_rtx (maskmode);
19877
19878 /* Replicate the low bits of the V4DImode mask into V8SImode:
19879 mask = { A B C D }
19880 t1 = { A A B B C C D D }. */
19881 for (i = 0; i < w / 2; ++i)
19882 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
19883 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19884 vt = force_reg (maskmode, vt);
19885 mask = gen_lowpart (maskmode, mask);
19886 if (maskmode == V8SImode)
19887 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
19888 else
19889 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
19890
19891 /* Multiply the shuffle indices by two. */
19892 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
19893 OPTAB_DIRECT);
19894
19895 /* Add one to the odd shuffle indices:
19896 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
19897 for (i = 0; i < w / 2; ++i)
19898 {
19899 vec[i * 2] = const0_rtx;
19900 vec[i * 2 + 1] = const1_rtx;
19901 }
19902 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19903 vt = force_const_mem (maskmode, vt);
19904 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
19905 OPTAB_DIRECT);
19906
19907 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
19908 operands[3] = mask = t1;
19909 target = gen_lowpart (mode, target);
19910 op0 = gen_lowpart (mode, op0);
19911 op1 = gen_lowpart (mode, op1);
19912 }
19913
19914 switch (mode)
19915 {
19916 case V8SImode:
19917 /* The VPERMD and VPERMPS instructions already properly ignore
19918 the high bits of the shuffle elements. No need for us to
19919 perform an AND ourselves. */
19920 if (one_operand_shuffle)
19921 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
19922 else
19923 {
19924 t1 = gen_reg_rtx (V8SImode);
19925 t2 = gen_reg_rtx (V8SImode);
19926 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
19927 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
19928 goto merge_two;
19929 }
19930 return;
19931
19932 case V8SFmode:
19933 mask = gen_lowpart (V8SFmode, mask);
19934 if (one_operand_shuffle)
19935 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
19936 else
19937 {
19938 t1 = gen_reg_rtx (V8SFmode);
19939 t2 = gen_reg_rtx (V8SFmode);
19940 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
19941 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
19942 goto merge_two;
19943 }
19944 return;
19945
19946 case V4SImode:
19947 /* By combining the two 128-bit input vectors into one 256-bit
19948 input vector, we can use VPERMD and VPERMPS for the full
19949 two-operand shuffle. */
19950 t1 = gen_reg_rtx (V8SImode);
19951 t2 = gen_reg_rtx (V8SImode);
19952 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
19953 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
19954 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
19955 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
19956 return;
19957
19958 case V4SFmode:
19959 t1 = gen_reg_rtx (V8SFmode);
19960 t2 = gen_reg_rtx (V8SImode);
19961 mask = gen_lowpart (V4SImode, mask);
19962 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
19963 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
19964 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
19965 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
19966 return;
19967
19968 case V32QImode:
19969 t1 = gen_reg_rtx (V32QImode);
19970 t2 = gen_reg_rtx (V32QImode);
19971 t3 = gen_reg_rtx (V32QImode);
19972 vt2 = GEN_INT (128);
19973 for (i = 0; i < 32; i++)
19974 vec[i] = vt2;
19975 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19976 vt = force_reg (V32QImode, vt);
19977 for (i = 0; i < 32; i++)
19978 vec[i] = i < 16 ? vt2 : const0_rtx;
19979 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19980 vt2 = force_reg (V32QImode, vt2);
19981 /* From mask create two adjusted masks, which contain the same
19982 bits as mask in the low 7 bits of each vector element.
19983 The first mask will have the most significant bit clear
19984 if it requests element from the same 128-bit lane
19985 and MSB set if it requests element from the other 128-bit lane.
19986 The second mask will have the opposite values of the MSB,
19987 and additionally will have its 128-bit lanes swapped.
19988 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
19989 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
19990 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
19991 stands for the other 12 bytes. */
19992 /* Whether an element comes from the same lane or from the other
19993 lane is given by bit 4 of the index, so shift it up by 3 to the MSB position. */
19994 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
19995 gen_lowpart (V4DImode, mask),
19996 GEN_INT (3)));
19997 /* Clear MSB bits from the mask just in case it had them set. */
19998 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
19999 /* After this t1 will have MSB set for elements from other lane. */
20000 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20001 /* Clear bits other than MSB. */
20002 emit_insn (gen_andv32qi3 (t1, t1, vt));
20003 /* Or in the lower bits from mask into t3. */
20004 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20005 /* And invert MSB bits in t1, so MSB is set for elements from the same
20006 lane. */
20007 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20008 /* Swap 128-bit lanes in t3. */
20009 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20010 gen_lowpart (V4DImode, t3),
20011 const2_rtx, GEN_INT (3),
20012 const0_rtx, const1_rtx));
20013 /* And or in the lower bits from mask into t1. */
20014 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20015 if (one_operand_shuffle)
20016 {
20017 /* Each of these shuffles will put 0s in places where
20018 element from the other 128-bit lane is needed, otherwise
20019 will shuffle in the requested value. */
20020 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20021 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20022 /* For t3 the 128-bit lanes are swapped again. */
20023 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20024 gen_lowpart (V4DImode, t3),
20025 const2_rtx, GEN_INT (3),
20026 const0_rtx, const1_rtx));
20027 /* And oring both together leads to the result. */
20028 emit_insn (gen_iorv32qi3 (target, t1, t3));
20029 return;
20030 }
20031
20032 t4 = gen_reg_rtx (V32QImode);
20033 /* Similarly to the one_operand_shuffle code above, but repeated
20034 for each operand.  The merge_two: code below will merge the
20035 two results together. */
20036 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20037 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20038 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20039 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20040 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20041 gen_lowpart (V4DImode, t4),
20042 const2_rtx, GEN_INT (3),
20043 const0_rtx, const1_rtx));
20044 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20045 gen_lowpart (V4DImode, t3),
20046 const2_rtx, GEN_INT (3),
20047 const0_rtx, const1_rtx));
20048 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20049 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20050 t1 = t4;
20051 t2 = t3;
20052 goto merge_two;
20053
20054 default:
20055 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20056 break;
20057 }
20058 }
20059
20060 if (TARGET_XOP)
20061 {
20062 /* The XOP VPPERM insn supports three inputs. By ignoring the
20063 one_operand_shuffle special case, we avoid creating another
20064 set of constant vectors in memory. */
20065 one_operand_shuffle = false;
20066
20067 /* mask = mask & {2*w-1, ...} */
20068 vt = GEN_INT (2*w - 1);
20069 }
20070 else
20071 {
20072 /* mask = mask & {w-1, ...} */
20073 vt = GEN_INT (w - 1);
20074 }
20075
20076 for (i = 0; i < w; i++)
20077 vec[i] = vt;
20078 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20079 mask = expand_simple_binop (maskmode, AND, mask, vt,
20080 NULL_RTX, 0, OPTAB_DIRECT);
20081
20082 /* For non-QImode operations, convert the word permutation control
20083 into a byte permutation control. */
20084 if (mode != V16QImode)
20085 {
20086 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20087 GEN_INT (exact_log2 (e)),
20088 NULL_RTX, 0, OPTAB_DIRECT);
20089
20090 /* Convert mask to vector of chars. */
20091 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20092
20093 /* Replicate each of the input bytes into byte positions:
20094 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20095 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20096 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20097 for (i = 0; i < 16; ++i)
20098 vec[i] = GEN_INT (i/e * e);
20099 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20100 vt = force_const_mem (V16QImode, vt);
20101 if (TARGET_XOP)
20102 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20103 else
20104 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20105
20106 /* Convert it into the byte positions by doing
20107 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
20108 for (i = 0; i < 16; ++i)
20109 vec[i] = GEN_INT (i % e);
20110 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20111 vt = force_const_mem (V16QImode, vt);
20112 emit_insn (gen_addv16qi3 (mask, mask, vt));
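	      /* For example, with V8HImode (e == 2) a word index of 3 is
		 first shifted left to 6, the pshufb with { 0,0, 2,2, ... }
		 copies that 6 into both bytes of its element, and the add of
		 { 0,1, 0,1, ... } turns it into the byte indices { 6, 7 }.  */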
20113 }
20114
20115 /* The actual shuffle operations all operate on V16QImode. */
20116 op0 = gen_lowpart (V16QImode, op0);
20117 op1 = gen_lowpart (V16QImode, op1);
20118 target = gen_lowpart (V16QImode, target);
20119
20120 if (TARGET_XOP)
20121 {
20122 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20123 }
20124 else if (one_operand_shuffle)
20125 {
20126 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20127 }
20128 else
20129 {
20130 rtx xops[6];
20131 bool ok;
20132
20133 /* Shuffle the two input vectors independently. */
20134 t1 = gen_reg_rtx (V16QImode);
20135 t2 = gen_reg_rtx (V16QImode);
20136 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20137 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20138
20139 merge_two:
20140 /* Then merge them together. The key is whether any given control
20141 element contained a bit set that indicates the second word. */
20142 mask = operands[3];
20143 vt = GEN_INT (w);
20144 if (maskmode == V2DImode && !TARGET_SSE4_1)
20145 {
20146 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
20147 more shuffle to convert the V2DI input mask into a V4SI
 20148 	      input mask.  At which point the masking that expand_int_vcond
 20149 	      performs will work as desired.  */
20150 rtx t3 = gen_reg_rtx (V4SImode);
20151 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20152 const0_rtx, const0_rtx,
20153 const2_rtx, const2_rtx));
20154 mask = t3;
20155 maskmode = V4SImode;
20156 e = w = 4;
20157 }
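	  /* For example, a V2DImode mask of { 1, 3 } becomes { 1,1, 3,3 }
	     after this pshufd; ANDed with { 2,2,2,2 } and compared for
	     equality below, both 32-bit halves of a 64-bit element become
	     all-ones exactly when the original index had the "second
	     operand" bit (value 2) set.  */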
20158
20159 for (i = 0; i < w; i++)
20160 vec[i] = vt;
20161 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20162 vt = force_reg (maskmode, vt);
20163 mask = expand_simple_binop (maskmode, AND, mask, vt,
20164 NULL_RTX, 0, OPTAB_DIRECT);
20165
20166 xops[0] = gen_lowpart (mode, operands[0]);
20167 xops[1] = gen_lowpart (mode, t2);
20168 xops[2] = gen_lowpart (mode, t1);
20169 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20170 xops[4] = mask;
20171 xops[5] = vt;
20172 ok = ix86_expand_int_vcond (xops);
20173 gcc_assert (ok);
20174 }
20175 }
20176
20177 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
20178 true if we should do zero extension, else sign extension. HIGH_P is
20179 true if we want the N/2 high elements, else the low elements. */
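/* For example, with a V8HImode operands[1], UNSIGNED_P and HIGH_P set, the
   expansion produces a V4SImode vector holding the zero-extended elements
   4..7 of the input.  */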
20180
20181 void
20182 ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
20183 {
20184 enum machine_mode imode = GET_MODE (operands[1]);
20185 rtx tmp, dest;
20186
20187 if (TARGET_SSE4_1)
20188 {
20189 rtx (*unpack)(rtx, rtx);
20190 rtx (*extract)(rtx, rtx) = NULL;
20191 enum machine_mode halfmode = BLKmode;
20192
20193 switch (imode)
20194 {
20195 case V32QImode:
20196 if (unsigned_p)
20197 unpack = gen_avx2_zero_extendv16qiv16hi2;
20198 else
20199 unpack = gen_avx2_sign_extendv16qiv16hi2;
20200 halfmode = V16QImode;
20201 extract
20202 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20203 break;
20204 case V16HImode:
20205 if (unsigned_p)
20206 unpack = gen_avx2_zero_extendv8hiv8si2;
20207 else
20208 unpack = gen_avx2_sign_extendv8hiv8si2;
20209 halfmode = V8HImode;
20210 extract
20211 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20212 break;
20213 case V8SImode:
20214 if (unsigned_p)
20215 unpack = gen_avx2_zero_extendv4siv4di2;
20216 else
20217 unpack = gen_avx2_sign_extendv4siv4di2;
20218 halfmode = V4SImode;
20219 extract
20220 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20221 break;
20222 case V16QImode:
20223 if (unsigned_p)
20224 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20225 else
20226 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20227 break;
20228 case V8HImode:
20229 if (unsigned_p)
20230 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20231 else
20232 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20233 break;
20234 case V4SImode:
20235 if (unsigned_p)
20236 unpack = gen_sse4_1_zero_extendv2siv2di2;
20237 else
20238 unpack = gen_sse4_1_sign_extendv2siv2di2;
20239 break;
20240 default:
20241 gcc_unreachable ();
20242 }
20243
20244 if (GET_MODE_SIZE (imode) == 32)
20245 {
20246 tmp = gen_reg_rtx (halfmode);
20247 emit_insn (extract (tmp, operands[1]));
20248 }
20249 else if (high_p)
20250 {
20251 /* Shift higher 8 bytes to lower 8 bytes. */
20252 tmp = gen_reg_rtx (imode);
20253 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20254 gen_lowpart (V1TImode, operands[1]),
20255 GEN_INT (64)));
20256 }
20257 else
20258 tmp = operands[1];
20259
20260 emit_insn (unpack (operands[0], tmp));
20261 }
20262 else
20263 {
20264 rtx (*unpack)(rtx, rtx, rtx);
20265
20266 switch (imode)
20267 {
20268 case V16QImode:
20269 if (high_p)
20270 unpack = gen_vec_interleave_highv16qi;
20271 else
20272 unpack = gen_vec_interleave_lowv16qi;
20273 break;
20274 case V8HImode:
20275 if (high_p)
20276 unpack = gen_vec_interleave_highv8hi;
20277 else
20278 unpack = gen_vec_interleave_lowv8hi;
20279 break;
20280 case V4SImode:
20281 if (high_p)
20282 unpack = gen_vec_interleave_highv4si;
20283 else
20284 unpack = gen_vec_interleave_lowv4si;
20285 break;
20286 default:
20287 gcc_unreachable ();
20288 }
20289
20290 dest = gen_lowpart (imode, operands[0]);
20291
20292 if (unsigned_p)
20293 tmp = force_reg (imode, CONST0_RTX (imode));
20294 else
20295 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20296 operands[1], pc_rtx, pc_rtx);
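      /* For the signed case TMP now holds 0 > operands[1] per element, i.e.
	 all-ones for negative elements and zero otherwise, so interleaving
	 operands[1] with TMP yields the sign-extended values; for the
	 unsigned case the interleave with zeros performs the zero extension.
	 E.g. the high interleave of the V4SImode vector { 1, -2, 3, -4 }
	 with { 0, -1, 0, -1 } is { 3, 0, -4, -1 }, which read as V2DImode
	 is { 3, -4 }.  */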
20297
20298 emit_insn (unpack (dest, operands[1], tmp));
20299 }
20300 }
20301
 20302 /* Expand conditional increment or decrement using adc/sbb instructions.
 20303    The default case, using setcc followed by a conditional move, can be
 20304    done by generic code.  */
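/* For instance, an unsigned conditional increment such as x += (a < b) can
   be emitted as "cmp a, b; adc x, 0": the comparison sets the carry flag
   exactly when a < b, and adc adds it into x.  */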
20305 bool
20306 ix86_expand_int_addcc (rtx operands[])
20307 {
20308 enum rtx_code code = GET_CODE (operands[1]);
20309 rtx flags;
20310 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20311 rtx compare_op;
20312 rtx val = const0_rtx;
20313 bool fpcmp = false;
20314 enum machine_mode mode;
20315 rtx op0 = XEXP (operands[1], 0);
20316 rtx op1 = XEXP (operands[1], 1);
20317
20318 if (operands[3] != const1_rtx
20319 && operands[3] != constm1_rtx)
20320 return false;
20321 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20322 return false;
20323 code = GET_CODE (compare_op);
20324
20325 flags = XEXP (compare_op, 0);
20326
20327 if (GET_MODE (flags) == CCFPmode
20328 || GET_MODE (flags) == CCFPUmode)
20329 {
20330 fpcmp = true;
20331 code = ix86_fp_compare_code_to_integer (code);
20332 }
20333
20334 if (code != LTU)
20335 {
20336 val = constm1_rtx;
20337 if (fpcmp)
20338 PUT_CODE (compare_op,
20339 reverse_condition_maybe_unordered
20340 (GET_CODE (compare_op)));
20341 else
20342 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20343 }
20344
20345 mode = GET_MODE (operands[0]);
20346
20347 /* Construct either adc or sbb insn. */
20348 if ((code == LTU) == (operands[3] == constm1_rtx))
20349 {
20350 switch (mode)
20351 {
20352 case QImode:
20353 insn = gen_subqi3_carry;
20354 break;
20355 case HImode:
20356 insn = gen_subhi3_carry;
20357 break;
20358 case SImode:
20359 insn = gen_subsi3_carry;
20360 break;
20361 case DImode:
20362 insn = gen_subdi3_carry;
20363 break;
20364 default:
20365 gcc_unreachable ();
20366 }
20367 }
20368 else
20369 {
20370 switch (mode)
20371 {
20372 case QImode:
20373 insn = gen_addqi3_carry;
20374 break;
20375 case HImode:
20376 insn = gen_addhi3_carry;
20377 break;
20378 case SImode:
20379 insn = gen_addsi3_carry;
20380 break;
20381 case DImode:
20382 insn = gen_adddi3_carry;
20383 break;
20384 default:
20385 gcc_unreachable ();
20386 }
20387 }
20388 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20389
20390 return true;
20391 }
20392
20393
 20394 /* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
 20395    but works for floating-point parameters and non-offsettable memories.
 20396    For pushes, it returns just stack offsets; the values will be saved
 20397    in the right order.  At most four parts are generated.  */
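/* For example, on a 32-bit target an XFmode long double is split into three
   SImode parts and a TFmode value into four, while on a 64-bit target a
   TFmode value is split into two DImode parts.  */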
20398
20399 static int
20400 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20401 {
20402 int size;
20403
20404 if (!TARGET_64BIT)
20405 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20406 else
20407 size = (GET_MODE_SIZE (mode) + 4) / 8;
20408
20409 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20410 gcc_assert (size >= 2 && size <= 4);
20411
 20412   /* Optimize constant pool references into immediates.  This is used by fp
 20413      moves, which force all constants to memory to allow combining.  */
20414 if (MEM_P (operand) && MEM_READONLY_P (operand))
20415 {
20416 rtx tmp = maybe_get_pool_constant (operand);
20417 if (tmp)
20418 operand = tmp;
20419 }
20420
20421 if (MEM_P (operand) && !offsettable_memref_p (operand))
20422 {
 20423       /* The only non-offsettable memories we handle are pushes.  */
20424 int ok = push_operand (operand, VOIDmode);
20425
20426 gcc_assert (ok);
20427
20428 operand = copy_rtx (operand);
20429 PUT_MODE (operand, word_mode);
20430 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20431 return size;
20432 }
20433
20434 if (GET_CODE (operand) == CONST_VECTOR)
20435 {
20436 enum machine_mode imode = int_mode_for_mode (mode);
20437 /* Caution: if we looked through a constant pool memory above,
20438 the operand may actually have a different mode now. That's
20439 ok, since we want to pun this all the way back to an integer. */
20440 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20441 gcc_assert (operand != NULL);
20442 mode = imode;
20443 }
20444
20445 if (!TARGET_64BIT)
20446 {
20447 if (mode == DImode)
20448 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20449 else
20450 {
20451 int i;
20452
20453 if (REG_P (operand))
20454 {
20455 gcc_assert (reload_completed);
20456 for (i = 0; i < size; i++)
20457 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20458 }
20459 else if (offsettable_memref_p (operand))
20460 {
20461 operand = adjust_address (operand, SImode, 0);
20462 parts[0] = operand;
20463 for (i = 1; i < size; i++)
20464 parts[i] = adjust_address (operand, SImode, 4 * i);
20465 }
20466 else if (GET_CODE (operand) == CONST_DOUBLE)
20467 {
20468 REAL_VALUE_TYPE r;
20469 long l[4];
20470
20471 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20472 switch (mode)
20473 {
20474 case TFmode:
20475 real_to_target (l, &r, mode);
20476 parts[3] = gen_int_mode (l[3], SImode);
20477 parts[2] = gen_int_mode (l[2], SImode);
20478 break;
20479 case XFmode:
20480 REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
20481 parts[2] = gen_int_mode (l[2], SImode);
20482 break;
20483 case DFmode:
20484 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20485 break;
20486 default:
20487 gcc_unreachable ();
20488 }
20489 parts[1] = gen_int_mode (l[1], SImode);
20490 parts[0] = gen_int_mode (l[0], SImode);
20491 }
20492 else
20493 gcc_unreachable ();
20494 }
20495 }
20496 else
20497 {
20498 if (mode == TImode)
20499 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20500 if (mode == XFmode || mode == TFmode)
20501 {
20502 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
20503 if (REG_P (operand))
20504 {
20505 gcc_assert (reload_completed);
20506 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20507 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20508 }
20509 else if (offsettable_memref_p (operand))
20510 {
20511 operand = adjust_address (operand, DImode, 0);
20512 parts[0] = operand;
20513 parts[1] = adjust_address (operand, upper_mode, 8);
20514 }
20515 else if (GET_CODE (operand) == CONST_DOUBLE)
20516 {
20517 REAL_VALUE_TYPE r;
20518 long l[4];
20519
20520 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20521 real_to_target (l, &r, mode);
20522
20523 /* Do not use shift by 32 to avoid warning on 32bit systems. */
20524 if (HOST_BITS_PER_WIDE_INT >= 64)
20525 parts[0]
20526 = gen_int_mode
20527 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20528 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20529 DImode);
20530 else
20531 parts[0] = immed_double_const (l[0], l[1], DImode);
20532
20533 if (upper_mode == SImode)
20534 parts[1] = gen_int_mode (l[2], SImode);
20535 else if (HOST_BITS_PER_WIDE_INT >= 64)
20536 parts[1]
20537 = gen_int_mode
20538 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20539 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20540 DImode);
20541 else
20542 parts[1] = immed_double_const (l[2], l[3], DImode);
20543 }
20544 else
20545 gcc_unreachable ();
20546 }
20547 }
20548
20549 return size;
20550 }
20551
20552 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
20553 Return false when normal moves are needed; true when all required
 20554    insns have been emitted.  Operands 2-4 contain the input values
 20555    in the correct order; operands 5-7 contain the output values.  */
20556
20557 void
20558 ix86_split_long_move (rtx operands[])
20559 {
20560 rtx part[2][4];
20561 int nparts, i, j;
20562 int push = 0;
20563 int collisions = 0;
20564 enum machine_mode mode = GET_MODE (operands[0]);
20565 bool collisionparts[4];
20566
 20567   /* The DFmode expanders may ask us to move a double.
 20568      For a 64-bit target this is a single move.  By hiding that fact
 20569      here we simplify the i386.md splitters.  */
20570 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20571 {
 20572       /* Optimize constant pool references into immediates.  This is used by
 20573          fp moves, which force all constants to memory to allow combining.  */
20574
20575 if (MEM_P (operands[1])
20576 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20577 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20578 operands[1] = get_pool_constant (XEXP (operands[1], 0));
20579 if (push_operand (operands[0], VOIDmode))
20580 {
20581 operands[0] = copy_rtx (operands[0]);
20582 PUT_MODE (operands[0], word_mode);
20583 }
20584 else
20585 operands[0] = gen_lowpart (DImode, operands[0]);
20586 operands[1] = gen_lowpart (DImode, operands[1]);
20587 emit_move_insn (operands[0], operands[1]);
20588 return;
20589 }
20590
20591 /* The only non-offsettable memory we handle is push. */
20592 if (push_operand (operands[0], VOIDmode))
20593 push = 1;
20594 else
20595 gcc_assert (!MEM_P (operands[0])
20596 || offsettable_memref_p (operands[0]));
20597
20598 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20599 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20600
 20601   /* When emitting a push, take care of source operands on the stack.  */
20602 if (push && MEM_P (operands[1])
20603 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20604 {
20605 rtx src_base = XEXP (part[1][nparts - 1], 0);
20606
20607 /* Compensate for the stack decrement by 4. */
20608 if (!TARGET_64BIT && nparts == 3
20609 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
20610 src_base = plus_constant (Pmode, src_base, 4);
20611
20612 /* src_base refers to the stack pointer and is
20613 automatically decreased by emitted push. */
20614 for (i = 0; i < nparts; i++)
20615 part[1][i] = change_address (part[1][i],
20616 GET_MODE (part[1][i]), src_base);
20617 }
20618
 20619   /* We need to do the copy in the right order in case an address register
 20620      of the source overlaps the destination.  */
20621 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
20622 {
20623 rtx tmp;
20624
20625 for (i = 0; i < nparts; i++)
20626 {
20627 collisionparts[i]
20628 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
20629 if (collisionparts[i])
20630 collisions++;
20631 }
20632
20633 /* Collision in the middle part can be handled by reordering. */
20634 if (collisions == 1 && nparts == 3 && collisionparts [1])
20635 {
20636 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20637 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20638 }
20639 else if (collisions == 1
20640 && nparts == 4
20641 && (collisionparts [1] || collisionparts [2]))
20642 {
20643 if (collisionparts [1])
20644 {
20645 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20646 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20647 }
20648 else
20649 {
20650 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
20651 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
20652 }
20653 }
20654
20655 /* If there are more collisions, we can't handle it by reordering.
20656 Do an lea to the last part and use only one colliding move. */
20657 else if (collisions > 1)
20658 {
20659 rtx base;
20660
20661 collisions = 1;
20662
20663 base = part[0][nparts - 1];
20664
20665 /* Handle the case when the last part isn't valid for lea.
20666 Happens in 64-bit mode storing the 12-byte XFmode. */
20667 if (GET_MODE (base) != Pmode)
20668 base = gen_rtx_REG (Pmode, REGNO (base));
20669
20670 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
20671 part[1][0] = replace_equiv_address (part[1][0], base);
20672 for (i = 1; i < nparts; i++)
20673 {
20674 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
20675 part[1][i] = replace_equiv_address (part[1][i], tmp);
20676 }
20677 }
20678 }
20679
20680 if (push)
20681 {
20682 if (!TARGET_64BIT)
20683 {
20684 if (nparts == 3)
20685 {
20686 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
20687 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
20688 stack_pointer_rtx, GEN_INT (-4)));
20689 emit_move_insn (part[0][2], part[1][2]);
20690 }
20691 else if (nparts == 4)
20692 {
20693 emit_move_insn (part[0][3], part[1][3]);
20694 emit_move_insn (part[0][2], part[1][2]);
20695 }
20696 }
20697 else
20698 {
 20699 	  /* In 64-bit mode we don't have a 32-bit push available.  If this is a
 20700 	     register, that is OK - we will just use the larger counterpart.  We also
 20701 	     retype memory - this comes from an attempt to avoid a REX prefix when
 20702 	     moving the second half of a TFmode value.  */
20703 if (GET_MODE (part[1][1]) == SImode)
20704 {
20705 switch (GET_CODE (part[1][1]))
20706 {
20707 case MEM:
20708 part[1][1] = adjust_address (part[1][1], DImode, 0);
20709 break;
20710
20711 case REG:
20712 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
20713 break;
20714
20715 default:
20716 gcc_unreachable ();
20717 }
20718
20719 if (GET_MODE (part[1][0]) == SImode)
20720 part[1][0] = part[1][1];
20721 }
20722 }
20723 emit_move_insn (part[0][1], part[1][1]);
20724 emit_move_insn (part[0][0], part[1][0]);
20725 return;
20726 }
20727
 20728   /* Choose the correct order so we do not overwrite the source before it is copied.  */
20729 if ((REG_P (part[0][0])
20730 && REG_P (part[1][1])
20731 && (REGNO (part[0][0]) == REGNO (part[1][1])
20732 || (nparts == 3
20733 && REGNO (part[0][0]) == REGNO (part[1][2]))
20734 || (nparts == 4
20735 && REGNO (part[0][0]) == REGNO (part[1][3]))))
20736 || (collisions > 0
20737 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
20738 {
20739 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
20740 {
20741 operands[2 + i] = part[0][j];
20742 operands[6 + i] = part[1][j];
20743 }
20744 }
20745 else
20746 {
20747 for (i = 0; i < nparts; i++)
20748 {
20749 operands[2 + i] = part[0][i];
20750 operands[6 + i] = part[1][i];
20751 }
20752 }
20753
20754 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
20755 if (optimize_insn_for_size_p ())
20756 {
20757 for (j = 0; j < nparts - 1; j++)
20758 if (CONST_INT_P (operands[6 + j])
20759 && operands[6 + j] != const0_rtx
20760 && REG_P (operands[2 + j]))
20761 for (i = j; i < nparts - 1; i++)
20762 if (CONST_INT_P (operands[7 + i])
20763 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
20764 operands[7 + i] = operands[2 + j];
20765 }
20766
20767 for (i = 0; i < nparts; i++)
20768 emit_move_insn (operands[2 + i], operands[6 + i]);
20769
20770 return;
20771 }
20772
20773 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
20774 left shift by a constant, either using a single shift or
20775 a sequence of add instructions. */
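/* For example, a left shift by 2 may be emitted as two "add x, x"
   instructions when two adds cost no more than one shift by a constant and
   we are not optimizing for size; otherwise a single shift insn is used.  */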
20776
20777 static void
20778 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
20779 {
20780 rtx (*insn)(rtx, rtx, rtx);
20781
20782 if (count == 1
20783 || (count * ix86_cost->add <= ix86_cost->shift_const
20784 && !optimize_insn_for_size_p ()))
20785 {
20786 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
20787 while (count-- > 0)
20788 emit_insn (insn (operand, operand, operand));
20789 }
20790 else
20791 {
20792 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20793 emit_insn (insn (operand, operand, GEN_INT (count)));
20794 }
20795 }
20796
20797 void
20798 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
20799 {
20800 rtx (*gen_ashl3)(rtx, rtx, rtx);
20801 rtx (*gen_shld)(rtx, rtx, rtx);
20802 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20803
20804 rtx low[2], high[2];
20805 int count;
20806
20807 if (CONST_INT_P (operands[2]))
20808 {
20809 split_double_mode (mode, operands, 2, low, high);
20810 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20811
20812 if (count >= half_width)
20813 {
20814 emit_move_insn (high[0], low[1]);
20815 emit_move_insn (low[0], const0_rtx);
20816
20817 if (count > half_width)
20818 ix86_expand_ashl_const (high[0], count - half_width, mode);
20819 }
20820 else
20821 {
20822 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20823
20824 if (!rtx_equal_p (operands[0], operands[1]))
20825 emit_move_insn (operands[0], operands[1]);
20826
20827 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
20828 ix86_expand_ashl_const (low[0], count, mode);
20829 }
20830 return;
20831 }
20832
20833 split_double_mode (mode, operands, 1, low, high);
20834
20835 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20836
20837 if (operands[1] == const1_rtx)
20838 {
 20839       /* Assuming we've chosen QImode-capable registers, then 1 << N
 20840 	 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
20841 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
20842 {
20843 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
20844
20845 ix86_expand_clear (low[0]);
20846 ix86_expand_clear (high[0]);
20847 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
20848
20849 d = gen_lowpart (QImode, low[0]);
20850 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20851 s = gen_rtx_EQ (QImode, flags, const0_rtx);
20852 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20853
20854 d = gen_lowpart (QImode, high[0]);
20855 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20856 s = gen_rtx_NE (QImode, flags, const0_rtx);
20857 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20858 }
20859
20860 /* Otherwise, we can get the same results by manually performing
20861 a bit extract operation on bit 5/6, and then performing the two
20862 shifts. The two methods of getting 0/1 into low/high are exactly
20863 the same size. Avoiding the shift in the bit extract case helps
20864 pentium4 a bit; no one else seems to care much either way. */
20865 else
20866 {
20867 enum machine_mode half_mode;
20868 rtx (*gen_lshr3)(rtx, rtx, rtx);
20869 rtx (*gen_and3)(rtx, rtx, rtx);
20870 rtx (*gen_xor3)(rtx, rtx, rtx);
20871 HOST_WIDE_INT bits;
20872 rtx x;
20873
20874 if (mode == DImode)
20875 {
20876 half_mode = SImode;
20877 gen_lshr3 = gen_lshrsi3;
20878 gen_and3 = gen_andsi3;
20879 gen_xor3 = gen_xorsi3;
20880 bits = 5;
20881 }
20882 else
20883 {
20884 half_mode = DImode;
20885 gen_lshr3 = gen_lshrdi3;
20886 gen_and3 = gen_anddi3;
20887 gen_xor3 = gen_xordi3;
20888 bits = 6;
20889 }
20890
20891 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
20892 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
20893 else
20894 x = gen_lowpart (half_mode, operands[2]);
20895 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
20896
20897 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
20898 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
20899 emit_move_insn (low[0], high[0]);
20900 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
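	  /* Worked example: for DImode with a variable count of 37, bit 5 of
	     the count is set, so high becomes 1 and low becomes 0 here; the
	     SImode shifts below use the count modulo 32, giving
	     high = 1 << 5 and low = 0, i.e. 1 << 37 overall.  */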
20901 }
20902
20903 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20904 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
20905 return;
20906 }
20907
20908 if (operands[1] == constm1_rtx)
20909 {
20910 /* For -1 << N, we can avoid the shld instruction, because we
20911 know that we're shifting 0...31/63 ones into a -1. */
20912 emit_move_insn (low[0], constm1_rtx);
20913 if (optimize_insn_for_size_p ())
20914 emit_move_insn (high[0], low[0]);
20915 else
20916 emit_move_insn (high[0], constm1_rtx);
20917 }
20918 else
20919 {
20920 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20921
20922 if (!rtx_equal_p (operands[0], operands[1]))
20923 emit_move_insn (operands[0], operands[1]);
20924
20925 split_double_mode (mode, operands, 1, low, high);
20926 emit_insn (gen_shld (high[0], low[0], operands[2]));
20927 }
20928
20929 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20930
20931 if (TARGET_CMOVE && scratch)
20932 {
20933 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20934 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20935
20936 ix86_expand_clear (scratch);
20937 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
20938 }
20939 else
20940 {
20941 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20942 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20943
20944 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
20945 }
20946 }
20947
20948 void
20949 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
20950 {
20951 rtx (*gen_ashr3)(rtx, rtx, rtx)
20952 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
20953 rtx (*gen_shrd)(rtx, rtx, rtx);
20954 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20955
20956 rtx low[2], high[2];
20957 int count;
20958
20959 if (CONST_INT_P (operands[2]))
20960 {
20961 split_double_mode (mode, operands, 2, low, high);
20962 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20963
20964 if (count == GET_MODE_BITSIZE (mode) - 1)
20965 {
20966 emit_move_insn (high[0], high[1]);
20967 emit_insn (gen_ashr3 (high[0], high[0],
20968 GEN_INT (half_width - 1)));
20969 emit_move_insn (low[0], high[0]);
20970
20971 }
20972 else if (count >= half_width)
20973 {
20974 emit_move_insn (low[0], high[1]);
20975 emit_move_insn (high[0], low[0]);
20976 emit_insn (gen_ashr3 (high[0], high[0],
20977 GEN_INT (half_width - 1)));
20978
20979 if (count > half_width)
20980 emit_insn (gen_ashr3 (low[0], low[0],
20981 GEN_INT (count - half_width)));
20982 }
20983 else
20984 {
20985 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20986
20987 if (!rtx_equal_p (operands[0], operands[1]))
20988 emit_move_insn (operands[0], operands[1]);
20989
20990 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20991 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
20992 }
20993 }
20994 else
20995 {
20996 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20997
20998 if (!rtx_equal_p (operands[0], operands[1]))
20999 emit_move_insn (operands[0], operands[1]);
21000
21001 split_double_mode (mode, operands, 1, low, high);
21002
21003 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21004 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21005
21006 if (TARGET_CMOVE && scratch)
21007 {
21008 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21009 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21010
21011 emit_move_insn (scratch, high[0]);
21012 emit_insn (gen_ashr3 (scratch, scratch,
21013 GEN_INT (half_width - 1)));
21014 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21015 scratch));
21016 }
21017 else
21018 {
21019 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21020 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21021
21022 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
21023 }
21024 }
21025 }
21026
21027 void
21028 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21029 {
21030 rtx (*gen_lshr3)(rtx, rtx, rtx)
21031 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21032 rtx (*gen_shrd)(rtx, rtx, rtx);
21033 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21034
21035 rtx low[2], high[2];
21036 int count;
21037
21038 if (CONST_INT_P (operands[2]))
21039 {
21040 split_double_mode (mode, operands, 2, low, high);
21041 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21042
21043 if (count >= half_width)
21044 {
21045 emit_move_insn (low[0], high[1]);
21046 ix86_expand_clear (high[0]);
21047
21048 if (count > half_width)
21049 emit_insn (gen_lshr3 (low[0], low[0],
21050 GEN_INT (count - half_width)));
21051 }
21052 else
21053 {
21054 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21055
21056 if (!rtx_equal_p (operands[0], operands[1]))
21057 emit_move_insn (operands[0], operands[1]);
21058
21059 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21060 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21061 }
21062 }
21063 else
21064 {
21065 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21066
21067 if (!rtx_equal_p (operands[0], operands[1]))
21068 emit_move_insn (operands[0], operands[1]);
21069
21070 split_double_mode (mode, operands, 1, low, high);
21071
21072 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21073 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21074
21075 if (TARGET_CMOVE && scratch)
21076 {
21077 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21078 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21079
21080 ix86_expand_clear (scratch);
21081 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21082 scratch));
21083 }
21084 else
21085 {
21086 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21087 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21088
21089 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21090 }
21091 }
21092 }
21093
 21094 /* Predict the just-emitted jump instruction to be taken with probability PROB.  */
21095 static void
21096 predict_jump (int prob)
21097 {
21098 rtx insn = get_last_insn ();
21099 gcc_assert (JUMP_P (insn));
21100 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21101 }
21102
 21103 /* Helper function for the string operations below.  Test VARIABLE whether
 21104    it is aligned to VALUE bytes.  If so, jump to the label.  */
21105 static rtx
21106 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21107 {
21108 rtx label = gen_label_rtx ();
21109 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21110 if (GET_MODE (variable) == DImode)
21111 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21112 else
21113 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21114 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21115 1, label);
21116 if (epilogue)
21117 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21118 else
21119 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21120 return label;
21121 }
21122
 21123 /* Adjust COUNTREG by VALUE (decrease it by VALUE).  */
21124 static void
21125 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21126 {
21127 rtx (*gen_add)(rtx, rtx, rtx)
21128 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21129
21130 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21131 }
21132
21133 /* Zero extend possibly SImode EXP to Pmode register. */
21134 rtx
21135 ix86_zero_extend_to_Pmode (rtx exp)
21136 {
21137 if (GET_MODE (exp) != Pmode)
21138 exp = convert_to_mode (Pmode, exp, 1);
21139 return force_reg (Pmode, exp);
21140 }
21141
21142 /* Divide COUNTREG by SCALE. */
21143 static rtx
21144 scale_counter (rtx countreg, int scale)
21145 {
21146 rtx sc;
21147
21148 if (scale == 1)
21149 return countreg;
21150 if (CONST_INT_P (countreg))
21151 return GEN_INT (INTVAL (countreg) / scale);
21152 gcc_assert (REG_P (countreg));
21153
21154 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21155 GEN_INT (exact_log2 (scale)),
21156 NULL, 1, OPTAB_DIRECT);
21157 return sc;
21158 }
21159
21160 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21161 DImode for constant loop counts. */
21162
21163 static enum machine_mode
21164 counter_mode (rtx count_exp)
21165 {
21166 if (GET_MODE (count_exp) != VOIDmode)
21167 return GET_MODE (count_exp);
21168 if (!CONST_INT_P (count_exp))
21169 return Pmode;
21170 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21171 return DImode;
21172 return SImode;
21173 }
21174
 21175 /* When SRCPTR is non-NULL, output a simple loop to move memory pointed
 21176    to by SRCPTR to DESTPTR via chunks of MODE, unrolled UNROLL times; the
 21177    overall size is COUNT, specified in bytes.  When SRCPTR is NULL, output the
 21178    equivalent loop to set memory to VALUE (which is expected to be in MODE).
 21179 
 21180    The size is rounded down to a whole number of chunks moved at once.
 21181    SRCMEM and DESTMEM provide the MEM rtxes to feed proper aliasing info.  */
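/* A rough sketch of the code emitted for the move case, assuming
   MODE == SImode and UNROLL == 4 (illustrative only):

       size = count & ~15;  iter = 0;
     top:
       copy 16 bytes from srcptr + iter to destptr + iter (four SImode moves);
       iter += 16;
       if (iter < size) goto top;
       destptr += iter;  srcptr += iter;  */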
21182
21183
21184 static void
21185 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21186 rtx destptr, rtx srcptr, rtx value,
21187 rtx count, enum machine_mode mode, int unroll,
21188 int expected_size)
21189 {
21190 rtx out_label, top_label, iter, tmp;
21191 enum machine_mode iter_mode = counter_mode (count);
21192 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21193 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21194 rtx size;
21195 rtx x_addr;
21196 rtx y_addr;
21197 int i;
21198
21199 top_label = gen_label_rtx ();
21200 out_label = gen_label_rtx ();
21201 iter = gen_reg_rtx (iter_mode);
21202
21203 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21204 NULL, 1, OPTAB_DIRECT);
21205 /* Those two should combine. */
21206 if (piece_size == const1_rtx)
21207 {
21208 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21209 true, out_label);
21210 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21211 }
21212 emit_move_insn (iter, const0_rtx);
21213
21214 emit_label (top_label);
21215
21216 tmp = convert_modes (Pmode, iter_mode, iter, true);
21217 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21218 destmem = change_address (destmem, mode, x_addr);
21219
21220 if (srcmem)
21221 {
21222 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21223 srcmem = change_address (srcmem, mode, y_addr);
21224
 21225       /* When unrolling for chips that reorder memory reads and writes,
 21226 	 we can save registers by using a single temporary.
 21227 	 Also, using 4 temporaries is overkill in 32-bit mode.  */
21228 if (!TARGET_64BIT && 0)
21229 {
21230 for (i = 0; i < unroll; i++)
21231 {
21232 if (i)
21233 {
21234 destmem =
21235 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21236 srcmem =
21237 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21238 }
21239 emit_move_insn (destmem, srcmem);
21240 }
21241 }
21242 else
21243 {
21244 rtx tmpreg[4];
21245 gcc_assert (unroll <= 4);
21246 for (i = 0; i < unroll; i++)
21247 {
21248 tmpreg[i] = gen_reg_rtx (mode);
21249 if (i)
21250 {
21251 srcmem =
21252 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21253 }
21254 emit_move_insn (tmpreg[i], srcmem);
21255 }
21256 for (i = 0; i < unroll; i++)
21257 {
21258 if (i)
21259 {
21260 destmem =
21261 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21262 }
21263 emit_move_insn (destmem, tmpreg[i]);
21264 }
21265 }
21266 }
21267 else
21268 for (i = 0; i < unroll; i++)
21269 {
21270 if (i)
21271 destmem =
21272 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21273 emit_move_insn (destmem, value);
21274 }
21275
21276 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21277 true, OPTAB_LIB_WIDEN);
21278 if (tmp != iter)
21279 emit_move_insn (iter, tmp);
21280
21281 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21282 true, top_label);
21283 if (expected_size != -1)
21284 {
21285 expected_size /= GET_MODE_SIZE (mode) * unroll;
21286 if (expected_size == 0)
21287 predict_jump (0);
21288 else if (expected_size > REG_BR_PROB_BASE)
21289 predict_jump (REG_BR_PROB_BASE - 1);
21290 else
21291 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21292 }
21293 else
21294 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21295 iter = ix86_zero_extend_to_Pmode (iter);
21296 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21297 true, OPTAB_LIB_WIDEN);
21298 if (tmp != destptr)
21299 emit_move_insn (destptr, tmp);
21300 if (srcptr)
21301 {
21302 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21303 true, OPTAB_LIB_WIDEN);
21304 if (tmp != srcptr)
21305 emit_move_insn (srcptr, tmp);
21306 }
21307 emit_label (out_label);
21308 }
21309
 21310 /* Output a "rep; mov" instruction.
 21311    Arguments have the same meaning as for the previous function.  */
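/* Roughly, this emits "rep movsb", "rep movsd" or "rep movsq" with the count
   register set to COUNT divided by the chunk size; DESTEXP and SRCEXP
   describe the final pointer values (pointer plus count times chunk size)
   for the RTL pattern.  */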
21312 static void
21313 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21314 rtx destptr, rtx srcptr,
21315 rtx count,
21316 enum machine_mode mode)
21317 {
21318 rtx destexp;
21319 rtx srcexp;
21320 rtx countreg;
21321 HOST_WIDE_INT rounded_count;
21322
21323 /* If the size is known, it is shorter to use rep movs. */
21324 if (mode == QImode && CONST_INT_P (count)
21325 && !(INTVAL (count) & 3))
21326 mode = SImode;
21327
21328 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21329 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21330 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21331 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21332 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21333 if (mode != QImode)
21334 {
21335 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21336 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21337 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21338 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21339 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21340 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21341 }
21342 else
21343 {
21344 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21345 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21346 }
21347 if (CONST_INT_P (count))
21348 {
21349 rounded_count = (INTVAL (count)
21350 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21351 destmem = shallow_copy_rtx (destmem);
21352 srcmem = shallow_copy_rtx (srcmem);
21353 set_mem_size (destmem, rounded_count);
21354 set_mem_size (srcmem, rounded_count);
21355 }
21356 else
21357 {
21358 if (MEM_SIZE_KNOWN_P (destmem))
21359 clear_mem_size (destmem);
21360 if (MEM_SIZE_KNOWN_P (srcmem))
21361 clear_mem_size (srcmem);
21362 }
21363 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21364 destexp, srcexp));
21365 }
21366
 21367 /* Output a "rep; stos" instruction.
 21368    Arguments have the same meaning as for the previous function.  */
21369 static void
21370 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21371 rtx count, enum machine_mode mode,
21372 rtx orig_value)
21373 {
21374 rtx destexp;
21375 rtx countreg;
21376 HOST_WIDE_INT rounded_count;
21377
21378 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21379 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21380 value = force_reg (mode, gen_lowpart (mode, value));
21381 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21382 if (mode != QImode)
21383 {
21384 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21385 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21386 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21387 }
21388 else
21389 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21390 if (orig_value == const0_rtx && CONST_INT_P (count))
21391 {
21392 rounded_count = (INTVAL (count)
21393 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21394 destmem = shallow_copy_rtx (destmem);
21395 set_mem_size (destmem, rounded_count);
21396 }
21397 else if (MEM_SIZE_KNOWN_P (destmem))
21398 clear_mem_size (destmem);
21399 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21400 }
21401
21402 static void
21403 emit_strmov (rtx destmem, rtx srcmem,
21404 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21405 {
21406 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21407 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21408 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21409 }
21410
21411 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
21412 static void
21413 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21414 rtx destptr, rtx srcptr, rtx count, int max_size)
21415 {
21416 rtx src, dest;
21417 if (CONST_INT_P (count))
21418 {
21419 HOST_WIDE_INT countval = INTVAL (count);
21420 int offset = 0;
21421
21422 if ((countval & 0x10) && max_size > 16)
21423 {
21424 if (TARGET_64BIT)
21425 {
21426 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21427 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21428 }
21429 else
21430 gcc_unreachable ();
21431 offset += 16;
21432 }
21433 if ((countval & 0x08) && max_size > 8)
21434 {
21435 if (TARGET_64BIT)
21436 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21437 else
21438 {
21439 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21440 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21441 }
21442 offset += 8;
21443 }
21444 if ((countval & 0x04) && max_size > 4)
21445 {
21446 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21447 offset += 4;
21448 }
21449 if ((countval & 0x02) && max_size > 2)
21450 {
21451 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21452 offset += 2;
21453 }
21454 if ((countval & 0x01) && max_size > 1)
21455 {
21456 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21457 offset += 1;
21458 }
21459 return;
21460 }
21461 if (max_size > 8)
21462 {
21463 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21464 count, 1, OPTAB_DIRECT);
21465 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21466 count, QImode, 1, 4);
21467 return;
21468 }
21469
 21470   /* When single string operations are available, we can cheaply increase
 21471      the dest and src pointers.  Otherwise we save code size by maintaining
 21472      an offset (zero is readily available from the preceding rep operation)
 21473      and using x86 addressing modes.  */
21474 if (TARGET_SINGLE_STRINGOP)
21475 {
21476 if (max_size > 4)
21477 {
21478 rtx label = ix86_expand_aligntest (count, 4, true);
21479 src = change_address (srcmem, SImode, srcptr);
21480 dest = change_address (destmem, SImode, destptr);
21481 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21482 emit_label (label);
21483 LABEL_NUSES (label) = 1;
21484 }
21485 if (max_size > 2)
21486 {
21487 rtx label = ix86_expand_aligntest (count, 2, true);
21488 src = change_address (srcmem, HImode, srcptr);
21489 dest = change_address (destmem, HImode, destptr);
21490 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21491 emit_label (label);
21492 LABEL_NUSES (label) = 1;
21493 }
21494 if (max_size > 1)
21495 {
21496 rtx label = ix86_expand_aligntest (count, 1, true);
21497 src = change_address (srcmem, QImode, srcptr);
21498 dest = change_address (destmem, QImode, destptr);
21499 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21500 emit_label (label);
21501 LABEL_NUSES (label) = 1;
21502 }
21503 }
21504 else
21505 {
21506 rtx offset = force_reg (Pmode, const0_rtx);
21507 rtx tmp;
21508
21509 if (max_size > 4)
21510 {
21511 rtx label = ix86_expand_aligntest (count, 4, true);
21512 src = change_address (srcmem, SImode, srcptr);
21513 dest = change_address (destmem, SImode, destptr);
21514 emit_move_insn (dest, src);
21515 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21516 true, OPTAB_LIB_WIDEN);
21517 if (tmp != offset)
21518 emit_move_insn (offset, tmp);
21519 emit_label (label);
21520 LABEL_NUSES (label) = 1;
21521 }
21522 if (max_size > 2)
21523 {
21524 rtx label = ix86_expand_aligntest (count, 2, true);
21525 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21526 src = change_address (srcmem, HImode, tmp);
21527 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21528 dest = change_address (destmem, HImode, tmp);
21529 emit_move_insn (dest, src);
21530 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21531 true, OPTAB_LIB_WIDEN);
21532 if (tmp != offset)
21533 emit_move_insn (offset, tmp);
21534 emit_label (label);
21535 LABEL_NUSES (label) = 1;
21536 }
21537 if (max_size > 1)
21538 {
21539 rtx label = ix86_expand_aligntest (count, 1, true);
21540 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21541 src = change_address (srcmem, QImode, tmp);
21542 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21543 dest = change_address (destmem, QImode, tmp);
21544 emit_move_insn (dest, src);
21545 emit_label (label);
21546 LABEL_NUSES (label) = 1;
21547 }
21548 }
21549 }
21550
 21551 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
21552 static void
21553 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21554 rtx count, int max_size)
21555 {
21556 count =
21557 expand_simple_binop (counter_mode (count), AND, count,
21558 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21559 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21560 gen_lowpart (QImode, value), count, QImode,
21561 1, max_size / 2);
21562 }
21563
 21564 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
21565 static void
21566 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
21567 {
21568 rtx dest;
21569
21570 if (CONST_INT_P (count))
21571 {
21572 HOST_WIDE_INT countval = INTVAL (count);
21573 int offset = 0;
21574
21575 if ((countval & 0x10) && max_size > 16)
21576 {
21577 if (TARGET_64BIT)
21578 {
21579 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21580 emit_insn (gen_strset (destptr, dest, value));
21581 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
21582 emit_insn (gen_strset (destptr, dest, value));
21583 }
21584 else
21585 gcc_unreachable ();
21586 offset += 16;
21587 }
21588 if ((countval & 0x08) && max_size > 8)
21589 {
21590 if (TARGET_64BIT)
21591 {
21592 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21593 emit_insn (gen_strset (destptr, dest, value));
21594 }
21595 else
21596 {
21597 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21598 emit_insn (gen_strset (destptr, dest, value));
21599 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
21600 emit_insn (gen_strset (destptr, dest, value));
21601 }
21602 offset += 8;
21603 }
21604 if ((countval & 0x04) && max_size > 4)
21605 {
21606 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21607 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21608 offset += 4;
21609 }
21610 if ((countval & 0x02) && max_size > 2)
21611 {
21612 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
21613 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21614 offset += 2;
21615 }
21616 if ((countval & 0x01) && max_size > 1)
21617 {
21618 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
21619 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21620 offset += 1;
21621 }
21622 return;
21623 }
21624 if (max_size > 32)
21625 {
21626 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
21627 return;
21628 }
21629 if (max_size > 16)
21630 {
21631 rtx label = ix86_expand_aligntest (count, 16, true);
21632 if (TARGET_64BIT)
21633 {
21634 dest = change_address (destmem, DImode, destptr);
21635 emit_insn (gen_strset (destptr, dest, value));
21636 emit_insn (gen_strset (destptr, dest, value));
21637 }
21638 else
21639 {
21640 dest = change_address (destmem, SImode, destptr);
21641 emit_insn (gen_strset (destptr, dest, value));
21642 emit_insn (gen_strset (destptr, dest, value));
21643 emit_insn (gen_strset (destptr, dest, value));
21644 emit_insn (gen_strset (destptr, dest, value));
21645 }
21646 emit_label (label);
21647 LABEL_NUSES (label) = 1;
21648 }
21649 if (max_size > 8)
21650 {
21651 rtx label = ix86_expand_aligntest (count, 8, true);
21652 if (TARGET_64BIT)
21653 {
21654 dest = change_address (destmem, DImode, destptr);
21655 emit_insn (gen_strset (destptr, dest, value));
21656 }
21657 else
21658 {
21659 dest = change_address (destmem, SImode, destptr);
21660 emit_insn (gen_strset (destptr, dest, value));
21661 emit_insn (gen_strset (destptr, dest, value));
21662 }
21663 emit_label (label);
21664 LABEL_NUSES (label) = 1;
21665 }
21666 if (max_size > 4)
21667 {
21668 rtx label = ix86_expand_aligntest (count, 4, true);
21669 dest = change_address (destmem, SImode, destptr);
21670 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21671 emit_label (label);
21672 LABEL_NUSES (label) = 1;
21673 }
21674 if (max_size > 2)
21675 {
21676 rtx label = ix86_expand_aligntest (count, 2, true);
21677 dest = change_address (destmem, HImode, destptr);
21678 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21679 emit_label (label);
21680 LABEL_NUSES (label) = 1;
21681 }
21682 if (max_size > 1)
21683 {
21684 rtx label = ix86_expand_aligntest (count, 1, true);
21685 dest = change_address (destmem, QImode, destptr);
21686 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21687 emit_label (label);
21688 LABEL_NUSES (label) = 1;
21689 }
21690 }
21691
 21692 /* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN,
 21693    to DESIRED_ALIGNMENT.  */
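/* For example, with ALIGN == 1 and DESIRED_ALIGNMENT == 8, at most one
   1-byte, one 2-byte and one 4-byte copy are emitted, each guarded by an
   alignment test on DESTPTR, and COUNT is decreased accordingly.  */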
21694 static void
21695 expand_movmem_prologue (rtx destmem, rtx srcmem,
21696 rtx destptr, rtx srcptr, rtx count,
21697 int align, int desired_alignment)
21698 {
21699 if (align <= 1 && desired_alignment > 1)
21700 {
21701 rtx label = ix86_expand_aligntest (destptr, 1, false);
21702 srcmem = change_address (srcmem, QImode, srcptr);
21703 destmem = change_address (destmem, QImode, destptr);
21704 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21705 ix86_adjust_counter (count, 1);
21706 emit_label (label);
21707 LABEL_NUSES (label) = 1;
21708 }
21709 if (align <= 2 && desired_alignment > 2)
21710 {
21711 rtx label = ix86_expand_aligntest (destptr, 2, false);
21712 srcmem = change_address (srcmem, HImode, srcptr);
21713 destmem = change_address (destmem, HImode, destptr);
21714 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21715 ix86_adjust_counter (count, 2);
21716 emit_label (label);
21717 LABEL_NUSES (label) = 1;
21718 }
21719 if (align <= 4 && desired_alignment > 4)
21720 {
21721 rtx label = ix86_expand_aligntest (destptr, 4, false);
21722 srcmem = change_address (srcmem, SImode, srcptr);
21723 destmem = change_address (destmem, SImode, destptr);
21724 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21725 ix86_adjust_counter (count, 4);
21726 emit_label (label);
21727 LABEL_NUSES (label) = 1;
21728 }
21729 gcc_assert (desired_alignment <= 8);
21730 }
21731
 21732 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
 21733    ALIGN_BYTES is how many bytes need to be copied.  */
21734 static rtx
21735 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
21736 int desired_align, int align_bytes)
21737 {
21738 rtx src = *srcp;
21739 rtx orig_dst = dst;
21740 rtx orig_src = src;
21741 int off = 0;
21742 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
21743 if (src_align_bytes >= 0)
21744 src_align_bytes = desired_align - src_align_bytes;
21745 if (align_bytes & 1)
21746 {
21747 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21748 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
21749 off = 1;
21750 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21751 }
21752 if (align_bytes & 2)
21753 {
21754 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21755 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
21756 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21757 set_mem_align (dst, 2 * BITS_PER_UNIT);
21758 if (src_align_bytes >= 0
21759 && (src_align_bytes & 1) == (align_bytes & 1)
21760 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
21761 set_mem_align (src, 2 * BITS_PER_UNIT);
21762 off = 2;
21763 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21764 }
21765 if (align_bytes & 4)
21766 {
21767 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21768 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
21769 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21770 set_mem_align (dst, 4 * BITS_PER_UNIT);
21771 if (src_align_bytes >= 0)
21772 {
21773 unsigned int src_align = 0;
21774 if ((src_align_bytes & 3) == (align_bytes & 3))
21775 src_align = 4;
21776 else if ((src_align_bytes & 1) == (align_bytes & 1))
21777 src_align = 2;
21778 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21779 set_mem_align (src, src_align * BITS_PER_UNIT);
21780 }
21781 off = 4;
21782 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21783 }
21784 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21785 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
21786 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21787 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21788 if (src_align_bytes >= 0)
21789 {
21790 unsigned int src_align = 0;
21791 if ((src_align_bytes & 7) == (align_bytes & 7))
21792 src_align = 8;
21793 else if ((src_align_bytes & 3) == (align_bytes & 3))
21794 src_align = 4;
21795 else if ((src_align_bytes & 1) == (align_bytes & 1))
21796 src_align = 2;
21797 if (src_align > (unsigned int) desired_align)
21798 src_align = desired_align;
21799 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21800 set_mem_align (src, src_align * BITS_PER_UNIT);
21801 }
21802 if (MEM_SIZE_KNOWN_P (orig_dst))
21803 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21804 if (MEM_SIZE_KNOWN_P (orig_src))
21805 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
21806 *srcp = src;
21807 return dst;
21808 }
21809
 21810 /* Store enough to DEST to align DEST, known to be aligned by ALIGN,
 21811    to DESIRED_ALIGNMENT.  */
21812 static void
21813 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
21814 int align, int desired_alignment)
21815 {
21816 if (align <= 1 && desired_alignment > 1)
21817 {
21818 rtx label = ix86_expand_aligntest (destptr, 1, false);
21819 destmem = change_address (destmem, QImode, destptr);
21820 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
21821 ix86_adjust_counter (count, 1);
21822 emit_label (label);
21823 LABEL_NUSES (label) = 1;
21824 }
21825 if (align <= 2 && desired_alignment > 2)
21826 {
21827 rtx label = ix86_expand_aligntest (destptr, 2, false);
21828 destmem = change_address (destmem, HImode, destptr);
21829 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
21830 ix86_adjust_counter (count, 2);
21831 emit_label (label);
21832 LABEL_NUSES (label) = 1;
21833 }
21834 if (align <= 4 && desired_alignment > 4)
21835 {
21836 rtx label = ix86_expand_aligntest (destptr, 4, false);
21837 destmem = change_address (destmem, SImode, destptr);
21838 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
21839 ix86_adjust_counter (count, 4);
21840 emit_label (label);
21841 LABEL_NUSES (label) = 1;
21842 }
21843 gcc_assert (desired_alignment <= 8);
21844 }
21845
21846 /* Set enough bytes from DST to align DST, which is known to be aligned by
21847 ALIGN, up to DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
21848 static rtx
21849 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
21850 int desired_align, int align_bytes)
21851 {
21852 int off = 0;
21853 rtx orig_dst = dst;
21854 if (align_bytes & 1)
21855 {
21856 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21857 off = 1;
21858 emit_insn (gen_strset (destreg, dst,
21859 gen_lowpart (QImode, value)));
21860 }
21861 if (align_bytes & 2)
21862 {
21863 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21864 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21865 set_mem_align (dst, 2 * BITS_PER_UNIT);
21866 off = 2;
21867 emit_insn (gen_strset (destreg, dst,
21868 gen_lowpart (HImode, value)));
21869 }
21870 if (align_bytes & 4)
21871 {
21872 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21873 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21874 set_mem_align (dst, 4 * BITS_PER_UNIT);
21875 off = 4;
21876 emit_insn (gen_strset (destreg, dst,
21877 gen_lowpart (SImode, value)));
21878 }
21879 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21880 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21881 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21882 if (MEM_SIZE_KNOWN_P (orig_dst))
21883 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21884 return dst;
21885 }
21886
21887 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
21888 static enum stringop_alg
21889 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
21890 int *dynamic_check)
21891 {
21892 const struct stringop_algs * algs;
21893 bool optimize_for_speed;
21894 /* Algorithms using the rep prefix want at least edi and ecx;
21895 additionally, memset wants eax and memcpy wants esi. Don't
21896 consider such algorithms if the user has appropriated those
21897 registers for their own purposes. */
21898 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
21899 || (memset
21900 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
21901
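/* An algorithm is usable only if it does not need a rep prefix that the
   register constraints computed above rule out.  */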
21902 #define ALG_USABLE_P(alg) (rep_prefix_usable \
21903 || (alg != rep_prefix_1_byte \
21904 && alg != rep_prefix_4_byte \
21905 && alg != rep_prefix_8_byte))
21906 const struct processor_costs *cost;
21907
21908 /* Even if the string operation call is cold, we still might spend a lot
21909 of time processing large blocks. */
21910 if (optimize_function_for_size_p (cfun)
21911 || (optimize_insn_for_size_p ()
21912 && expected_size != -1 && expected_size < 256))
21913 optimize_for_speed = false;
21914 else
21915 optimize_for_speed = true;
21916
21917 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
21918
21919 *dynamic_check = -1;
21920 if (memset)
21921 algs = &cost->memset[TARGET_64BIT != 0];
21922 else
21923 algs = &cost->memcpy[TARGET_64BIT != 0];
21924 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
21925 return ix86_stringop_alg;
21926 /* rep; movq or rep; movl is the smallest variant. */
21927 else if (!optimize_for_speed)
21928 {
21929 if (!count || (count & 3))
21930 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
21931 else
21932 return rep_prefix_usable ? rep_prefix_4_byte : loop;
21933 }
21934 /* Very tiny blocks are best handled via the loop; REP is expensive to set up. */
21936 else if (expected_size != -1 && expected_size < 4)
21937 return loop_1_byte;
21938 else if (expected_size != -1)
21939 {
21940 unsigned int i;
21941 enum stringop_alg alg = libcall;
21942 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21943 {
21944 /* We get here if the algorithms that were not libcall-based
21945 were rep-prefix based and we are unable to use rep prefixes
21946 based on global register usage. Break out of the loop and
21947 use the heuristic below. */
21948 if (algs->size[i].max == 0)
21949 break;
21950 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
21951 {
21952 enum stringop_alg candidate = algs->size[i].alg;
21953
21954 if (candidate != libcall && ALG_USABLE_P (candidate))
21955 alg = candidate;
21956 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
21957 the last non-libcall inline algorithm. */
21958 if (TARGET_INLINE_ALL_STRINGOPS)
21959 {
21960 /* When the current size is best copied by a libcall,
21961 but we are still forced to inline, run the heuristic below
21962 that will pick code for medium-sized blocks. */
21963 if (alg != libcall)
21964 return alg;
21965 break;
21966 }
21967 else if (ALG_USABLE_P (candidate))
21968 return candidate;
21969 }
21970 }
21971 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
21972 }
21973 /* When asked to inline the call anyway, try to pick a meaningful choice.
21974 We look for the maximal size of block that is faster to copy by hand and
21975 take blocks of at most that size, guessing that the average size will
21976 be roughly half of the block.
21977
21978 If this turns out to be bad, we might simply specify the preferred
21979 choice in ix86_costs. */
21980 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21981 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
21982 {
21983 int max = -1;
21984 enum stringop_alg alg;
21985 int i;
21986 bool any_alg_usable_p = true;
21987
21988 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21989 {
21990 enum stringop_alg candidate = algs->size[i].alg;
21991 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
21992
21993 if (candidate != libcall && candidate
21994 && ALG_USABLE_P (candidate))
21995 max = algs->size[i].max;
21996 }
21997 /* If there aren't any usable algorithms, then recursing on
21998 smaller sizes isn't going to find anything. Just return the
21999 simple byte-at-a-time copy loop. */
22000 if (!any_alg_usable_p)
22001 {
22002 /* Pick something reasonable. */
22003 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22004 *dynamic_check = 128;
22005 return loop_1_byte;
22006 }
22007 if (max == -1)
22008 max = 4096;
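  /* Re-run the selection assuming an average block of half the largest
     size that is still worth handling inline.  */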
22009 alg = decide_alg (count, max / 2, memset, dynamic_check);
22010 gcc_assert (*dynamic_check == -1);
22011 gcc_assert (alg != libcall);
22012 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22013 *dynamic_check = max;
22014 return alg;
22015 }
22016 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22017 #undef ALG_USABLE_P
22018 }
22019
22020 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22021 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22022 static int
22023 decide_alignment (int align,
22024 enum stringop_alg alg,
22025 int expected_size)
22026 {
22027 int desired_align = 0;
22028 switch (alg)
22029 {
22030 case no_stringop:
22031 gcc_unreachable ();
22032 case loop:
22033 case unrolled_loop:
22034 desired_align = GET_MODE_SIZE (Pmode);
22035 break;
22036 case rep_prefix_8_byte:
22037 desired_align = 8;
22038 break;
22039 case rep_prefix_4_byte:
22040 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
22041 copying a whole cache line at once. */
22042 if (TARGET_PENTIUMPRO)
22043 desired_align = 8;
22044 else
22045 desired_align = 4;
22046 break;
22047 case rep_prefix_1_byte:
22048 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
22049 copying a whole cache line at once. */
22050 if (TARGET_PENTIUMPRO)
22051 desired_align = 8;
22052 else
22053 desired_align = 1;
22054 break;
22055 case loop_1_byte:
22056 desired_align = 1;
22057 break;
22058 case libcall:
22059 return 0;
22060 }
22061
22062 if (optimize_size)
22063 desired_align = 1;
22064 if (desired_align < align)
22065 desired_align = align;
22066 if (expected_size != -1 && expected_size < 4)
22067 desired_align = align;
22068 return desired_align;
22069 }
22070
22071 /* Return the smallest power of 2 greater than VAL. */
22072 static int
22073 smallest_pow2_greater_than (int val)
22074 {
22075 int ret = 1;
22076 while (ret <= val)
22077 ret <<= 1;
22078 return ret;
22079 }
22080
22081 /* Expand string move (memcpy) operation. Use i386 string operations
22082 when profitable. expand_setmem contains similar code. The code
22083 depends upon architecture, block size and alignment, but always has
22084 the same overall structure:
22085
22086 1) Prologue guard: Conditional that jumps up to the epilogues for small
22087 blocks that can be handled by the epilogue alone. This is faster
22088 but also needed for correctness, since the prologue assumes the block
22089 is larger than the desired alignment.
22090
22091 An optional dynamic check for size and a libcall for large
22092 blocks are emitted here too, with -minline-stringops-dynamically.
22093
22094 2) Prologue: copy the first few bytes in order to get the destination
22095 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22096 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22097 copied. We emit either a jump tree on power-of-two-sized
22098 blocks, or a byte loop.
22099
22100 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22101 with the specified algorithm.
22102
22103 4) Epilogue: code copying the tail of the block that is too small to be
22104 handled by the main body (or up to the size guarded by the prologue guard). */
22105
22106 bool
22107 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22108 rtx expected_align_exp, rtx expected_size_exp)
22109 {
22110 rtx destreg;
22111 rtx srcreg;
22112 rtx label = NULL;
22113 rtx tmp;
22114 rtx jump_around_label = NULL;
22115 HOST_WIDE_INT align = 1;
22116 unsigned HOST_WIDE_INT count = 0;
22117 HOST_WIDE_INT expected_size = -1;
22118 int size_needed = 0, epilogue_size_needed;
22119 int desired_align = 0, align_bytes = 0;
22120 enum stringop_alg alg;
22121 int dynamic_check;
22122 bool need_zero_guard = false;
22123
22124 if (CONST_INT_P (align_exp))
22125 align = INTVAL (align_exp);
22126 /* i386 can do misaligned access at a reasonably increased cost. */
22127 if (CONST_INT_P (expected_align_exp)
22128 && INTVAL (expected_align_exp) > align)
22129 align = INTVAL (expected_align_exp);
22130 /* ALIGN is the minimum of destination and source alignment, but we care here
22131 just about destination alignment. */
22132 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22133 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22134
22135 if (CONST_INT_P (count_exp))
22136 count = expected_size = INTVAL (count_exp);
22137 if (CONST_INT_P (expected_size_exp) && count == 0)
22138 expected_size = INTVAL (expected_size_exp);
22139
22140 /* Make sure we don't need to care about overflow later on. */
22141 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22142 return false;
22143
22144 /* Step 0: Decide on preferred algorithm, desired alignment and
22145 size of chunks to be copied by main loop. */
22146
22147 alg = decide_alg (count, expected_size, false, &dynamic_check);
22148 desired_align = decide_alignment (align, alg, expected_size);
22149
22150 if (!TARGET_ALIGN_STRINGOPS)
22151 align = desired_align;
22152
22153 if (alg == libcall)
22154 return false;
22155 gcc_assert (alg != no_stringop);
22156 if (!count)
22157 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22158 destreg = copy_addr_to_reg (XEXP (dst, 0));
22159 srcreg = copy_addr_to_reg (XEXP (src, 0));
22160 switch (alg)
22161 {
22162 case libcall:
22163 case no_stringop:
22164 gcc_unreachable ();
22165 case loop:
22166 need_zero_guard = true;
22167 size_needed = GET_MODE_SIZE (word_mode);
22168 break;
22169 case unrolled_loop:
22170 need_zero_guard = true;
22171 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22172 break;
22173 case rep_prefix_8_byte:
22174 size_needed = 8;
22175 break;
22176 case rep_prefix_4_byte:
22177 size_needed = 4;
22178 break;
22179 case rep_prefix_1_byte:
22180 size_needed = 1;
22181 break;
22182 case loop_1_byte:
22183 need_zero_guard = true;
22184 size_needed = 1;
22185 break;
22186 }
22187
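  /* Initially assume the epilogue must handle up to one main-loop chunk;
     this is adjusted below once the alignment prologue is decided.  */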
22188 epilogue_size_needed = size_needed;
22189
22190 /* Step 1: Prologue guard. */
22191
22192 /* Alignment code needs count to be in a register. */
22193 if (CONST_INT_P (count_exp) && desired_align > align)
22194 {
22195 if (INTVAL (count_exp) > desired_align
22196 && INTVAL (count_exp) > size_needed)
22197 {
22198 align_bytes
22199 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22200 if (align_bytes <= 0)
22201 align_bytes = 0;
22202 else
22203 align_bytes = desired_align - align_bytes;
22204 }
22205 if (align_bytes == 0)
22206 count_exp = force_reg (counter_mode (count_exp), count_exp);
22207 }
22208 gcc_assert (desired_align >= 1 && align >= 1);
22209
22210 /* Ensure that alignment prologue won't copy past end of block. */
22211 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22212 {
22213 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22214 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22215 Make sure it is a power of 2. */
22216 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22217
22218 if (count)
22219 {
22220 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22221 {
22222 /* If the main algorithm works on QImode, no epilogue is needed.
22223 For small sizes just don't align anything. */
22224 if (size_needed == 1)
22225 desired_align = align;
22226 else
22227 goto epilogue;
22228 }
22229 }
22230 else
22231 {
22232 label = gen_label_rtx ();
22233 emit_cmp_and_jump_insns (count_exp,
22234 GEN_INT (epilogue_size_needed),
22235 LTU, 0, counter_mode (count_exp), 1, label);
22236 if (expected_size == -1 || expected_size < epilogue_size_needed)
22237 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22238 else
22239 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22240 }
22241 }
22242
22243 /* Emit code to decide at run time whether a library call or inline code
22244 should be used. */
22245 if (dynamic_check != -1)
22246 {
22247 if (CONST_INT_P (count_exp))
22248 {
22249 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22250 {
22251 emit_block_move_via_libcall (dst, src, count_exp, false);
22252 count_exp = const0_rtx;
22253 goto epilogue;
22254 }
22255 }
22256 else
22257 {
22258 rtx hot_label = gen_label_rtx ();
22259 jump_around_label = gen_label_rtx ();
22260 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22261 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22262 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22263 emit_block_move_via_libcall (dst, src, count_exp, false);
22264 emit_jump (jump_around_label);
22265 emit_label (hot_label);
22266 }
22267 }
22268
22269 /* Step 2: Alignment prologue. */
22270
22271 if (desired_align > align)
22272 {
22273 if (align_bytes == 0)
22274 {
22275 /* Except for the first move in the epilogue, we no longer know
22276 the constant offset in the aliasing info. It doesn't seem worth
22277 the pain to maintain it for the first move, so throw away
22278 the info early. */
22279 src = change_address (src, BLKmode, srcreg);
22280 dst = change_address (dst, BLKmode, destreg);
22281 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22282 desired_align);
22283 }
22284 else
22285 {
22286 /* If we know how many bytes need to be stored before dst is
22287 sufficiently aligned, maintain aliasing info accurately. */
22288 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22289 desired_align, align_bytes);
22290 count_exp = plus_constant (counter_mode (count_exp),
22291 count_exp, -align_bytes);
22292 count -= align_bytes;
22293 }
22294 if (need_zero_guard
22295 && (count < (unsigned HOST_WIDE_INT) size_needed
22296 || (align_bytes == 0
22297 && count < ((unsigned HOST_WIDE_INT) size_needed
22298 + desired_align - align))))
22299 {
22300 /* It is possible that we copied enough so the main loop will not
22301 execute. */
22302 gcc_assert (size_needed > 1);
22303 if (label == NULL_RTX)
22304 label = gen_label_rtx ();
22305 emit_cmp_and_jump_insns (count_exp,
22306 GEN_INT (size_needed),
22307 LTU, 0, counter_mode (count_exp), 1, label);
22308 if (expected_size == -1
22309 || expected_size < (desired_align - align) / 2 + size_needed)
22310 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22311 else
22312 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22313 }
22314 }
22315 if (label && size_needed == 1)
22316 {
22317 emit_label (label);
22318 LABEL_NUSES (label) = 1;
22319 label = NULL;
22320 epilogue_size_needed = 1;
22321 }
22322 else if (label == NULL_RTX)
22323 epilogue_size_needed = size_needed;
22324
22325 /* Step 3: Main loop. */
22326
22327 switch (alg)
22328 {
22329 case libcall:
22330 case no_stringop:
22331 gcc_unreachable ();
22332 case loop_1_byte:
22333 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22334 count_exp, QImode, 1, expected_size);
22335 break;
22336 case loop:
22337 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22338 count_exp, word_mode, 1, expected_size);
22339 break;
22340 case unrolled_loop:
22341 /* Unroll only by a factor of 2 in 32-bit mode, since we don't have enough
22342 registers for 4 temporaries anyway. */
22343 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22344 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22345 expected_size);
22346 break;
22347 case rep_prefix_8_byte:
22348 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22349 DImode);
22350 break;
22351 case rep_prefix_4_byte:
22352 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22353 SImode);
22354 break;
22355 case rep_prefix_1_byte:
22356 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22357 QImode);
22358 break;
22359 }
22360 /* Properly adjust the offsets of the src and dest memory for aliasing. */
22361 if (CONST_INT_P (count_exp))
22362 {
22363 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22364 (count / size_needed) * size_needed);
22365 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22366 (count / size_needed) * size_needed);
22367 }
22368 else
22369 {
22370 src = change_address (src, BLKmode, srcreg);
22371 dst = change_address (dst, BLKmode, destreg);
22372 }
22373
22374 /* Step 4: Epilogue to copy the remaining bytes. */
22375 epilogue:
22376 if (label)
22377 {
22378 /* When the main loop is done, COUNT_EXP might hold the original count,
22379 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22380 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22381 bytes. Compensate if needed. */
22382
22383 if (size_needed < epilogue_size_needed)
22384 {
22385 tmp =
22386 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22387 GEN_INT (size_needed - 1), count_exp, 1,
22388 OPTAB_DIRECT);
22389 if (tmp != count_exp)
22390 emit_move_insn (count_exp, tmp);
22391 }
22392 emit_label (label);
22393 LABEL_NUSES (label) = 1;
22394 }
22395
22396 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22397 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22398 epilogue_size_needed);
22399 if (jump_around_label)
22400 emit_label (jump_around_label);
22401 return true;
22402 }
22403
22404 /* Helper function for memset. For QImode value 0xXY produce
22405 0xXYXYXYXY of the width specified by MODE. This is essentially
22406 a * 0x01010101, but we can do slightly better than
22407 synth_mult by unwinding the sequence by hand on CPUs with
22408 slow multiply. */
22409 static rtx
22410 promote_duplicated_reg (enum machine_mode mode, rtx val)
22411 {
22412 enum machine_mode valmode = GET_MODE (val);
22413 rtx tmp;
22414 int nops = mode == DImode ? 3 : 2;
22415
22416 gcc_assert (mode == SImode || mode == DImode);
22417 if (val == const0_rtx)
22418 return copy_to_mode_reg (mode, const0_rtx);
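  /* For a compile-time constant, replicate the low byte arithmetically.  */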
22419 if (CONST_INT_P (val))
22420 {
22421 HOST_WIDE_INT v = INTVAL (val) & 255;
22422
22423 v |= v << 8;
22424 v |= v << 16;
22425 if (mode == DImode)
22426 v |= (v << 16) << 16;
22427 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22428 }
22429
22430 if (valmode == VOIDmode)
22431 valmode = QImode;
22432 if (valmode != QImode)
22433 val = gen_lowpart (QImode, val);
22434 if (mode == QImode)
22435 return val;
22436 if (!TARGET_PARTIAL_REG_STALL)
22437 nops--;
22438 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22439 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22440 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22441 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22442 {
22443 rtx reg = convert_modes (mode, QImode, val, true);
22444 tmp = promote_duplicated_reg (mode, const1_rtx);
22445 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22446 OPTAB_DIRECT);
22447 }
22448 else
22449 {
22450 rtx reg = convert_modes (mode, QImode, val, true);
22451
22452 if (!TARGET_PARTIAL_REG_STALL)
22453 if (mode == SImode)
22454 emit_insn (gen_movsi_insv_1 (reg, reg));
22455 else
22456 emit_insn (gen_movdi_insv_1 (reg, reg));
22457 else
22458 {
22459 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22460 NULL, 1, OPTAB_DIRECT);
22461 reg =
22462 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22463 }
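  /* The low 16 bits of REG now hold the byte duplicated twice; widen the
     pattern to 32 bits, and to 64 bits for DImode.  */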
22464 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22465 NULL, 1, OPTAB_DIRECT);
22466 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22467 if (mode == SImode)
22468 return reg;
22469 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22470 NULL, 1, OPTAB_DIRECT);
22471 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22472 return reg;
22473 }
22474 }
22475
22476 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
22477 will be needed by the main loop copying SIZE_NEEDED chunks and by the
22478 prologue raising the alignment from ALIGN to DESIRED_ALIGN. */
22479 static rtx
22480 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22481 {
22482 rtx promoted_val;
22483
22484 if (TARGET_64BIT
22485 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22486 promoted_val = promote_duplicated_reg (DImode, val);
22487 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22488 promoted_val = promote_duplicated_reg (SImode, val);
22489 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22490 promoted_val = promote_duplicated_reg (HImode, val);
22491 else
22492 promoted_val = val;
22493
22494 return promoted_val;
22495 }
22496
22497 /* Expand string clear operation (memset). Use i386 string operations
22498 when profitable. See the ix86_expand_movmem comment for an explanation
22499 of the individual steps performed. */
22500 bool
22501 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22502 rtx expected_align_exp, rtx expected_size_exp)
22503 {
22504 rtx destreg;
22505 rtx label = NULL;
22506 rtx tmp;
22507 rtx jump_around_label = NULL;
22508 HOST_WIDE_INT align = 1;
22509 unsigned HOST_WIDE_INT count = 0;
22510 HOST_WIDE_INT expected_size = -1;
22511 int size_needed = 0, epilogue_size_needed;
22512 int desired_align = 0, align_bytes = 0;
22513 enum stringop_alg alg;
22514 rtx promoted_val = NULL;
22515 bool force_loopy_epilogue = false;
22516 int dynamic_check;
22517 bool need_zero_guard = false;
22518
22519 if (CONST_INT_P (align_exp))
22520 align = INTVAL (align_exp);
22521 /* i386 can do misaligned access at a reasonably increased cost. */
22522 if (CONST_INT_P (expected_align_exp)
22523 && INTVAL (expected_align_exp) > align)
22524 align = INTVAL (expected_align_exp);
22525 if (CONST_INT_P (count_exp))
22526 count = expected_size = INTVAL (count_exp);
22527 if (CONST_INT_P (expected_size_exp) && count == 0)
22528 expected_size = INTVAL (expected_size_exp);
22529
22530 /* Make sure we don't need to care about overflow later on. */
22531 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22532 return false;
22533
22534 /* Step 0: Decide on preferred algorithm, desired alignment and
22535 size of chunks to be copied by main loop. */
22536
22537 alg = decide_alg (count, expected_size, true, &dynamic_check);
22538 desired_align = decide_alignment (align, alg, expected_size);
22539
22540 if (!TARGET_ALIGN_STRINGOPS)
22541 align = desired_align;
22542
22543 if (alg == libcall)
22544 return false;
22545 gcc_assert (alg != no_stringop);
22546 if (!count)
22547 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22548 destreg = copy_addr_to_reg (XEXP (dst, 0));
22549 switch (alg)
22550 {
22551 case libcall:
22552 case no_stringop:
22553 gcc_unreachable ();
22554 case loop:
22555 need_zero_guard = true;
22556 size_needed = GET_MODE_SIZE (word_mode);
22557 break;
22558 case unrolled_loop:
22559 need_zero_guard = true;
22560 size_needed = GET_MODE_SIZE (word_mode) * 4;
22561 break;
22562 case rep_prefix_8_byte:
22563 size_needed = 8;
22564 break;
22565 case rep_prefix_4_byte:
22566 size_needed = 4;
22567 break;
22568 case rep_prefix_1_byte:
22569 size_needed = 1;
22570 break;
22571 case loop_1_byte:
22572 need_zero_guard = true;
22573 size_needed = 1;
22574 break;
22575 }
22576 epilogue_size_needed = size_needed;
22577
22578 /* Step 1: Prologue guard. */
22579
22580 /* Alignment code needs count to be in a register. */
22581 if (CONST_INT_P (count_exp) && desired_align > align)
22582 {
22583 if (INTVAL (count_exp) > desired_align
22584 && INTVAL (count_exp) > size_needed)
22585 {
22586 align_bytes
22587 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22588 if (align_bytes <= 0)
22589 align_bytes = 0;
22590 else
22591 align_bytes = desired_align - align_bytes;
22592 }
22593 if (align_bytes == 0)
22594 {
22595 enum machine_mode mode = SImode;
22596 if (TARGET_64BIT && (count & ~0xffffffff))
22597 mode = DImode;
22598 count_exp = force_reg (mode, count_exp);
22599 }
22600 }
22601 /* Do the cheap promotion to allow better CSE across the
22602 main loop and epilogue (i.e. one load of the big constant in
22603 front of all the code). */
22604 if (CONST_INT_P (val_exp))
22605 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22606 desired_align, align);
22607 /* Ensure that alignment prologue won't copy past end of block. */
22608 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22609 {
22610 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22611 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22612 Make sure it is a power of 2. */
22613 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22614
22615 /* To improve performance of small blocks, we jump around the VAL
22616 promoting code. This means that if the promoted VAL is not constant,
22617 we might not use it in the epilogue and have to use the byte
22618 loop variant. */
22619 if (epilogue_size_needed > 2 && !promoted_val)
22620 force_loopy_epilogue = true;
22621 if (count)
22622 {
22623 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22624 {
22625 /* If the main algorithm works on QImode, no epilogue is needed.
22626 For small sizes just don't align anything. */
22627 if (size_needed == 1)
22628 desired_align = align;
22629 else
22630 goto epilogue;
22631 }
22632 }
22633 else
22634 {
22635 label = gen_label_rtx ();
22636 emit_cmp_and_jump_insns (count_exp,
22637 GEN_INT (epilogue_size_needed),
22638 LTU, 0, counter_mode (count_exp), 1, label);
22639 if (expected_size == -1 || expected_size <= epilogue_size_needed)
22640 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22641 else
22642 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22643 }
22644 }
22645 if (dynamic_check != -1)
22646 {
22647 rtx hot_label = gen_label_rtx ();
22648 jump_around_label = gen_label_rtx ();
22649 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22650 LEU, 0, counter_mode (count_exp), 1, hot_label);
22651 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22652 set_storage_via_libcall (dst, count_exp, val_exp, false);
22653 emit_jump (jump_around_label);
22654 emit_label (hot_label);
22655 }
22656
22657 /* Step 2: Alignment prologue. */
22658
22659 /* Do the expensive promotion once we have branched off the small blocks. */
22660 if (!promoted_val)
22661 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22662 desired_align, align);
22663 gcc_assert (desired_align >= 1 && align >= 1);
22664
22665 if (desired_align > align)
22666 {
22667 if (align_bytes == 0)
22668 {
22669 /* Except for the first move in the epilogue, we no longer know
22670 the constant offset in the aliasing info. It doesn't seem worth
22671 the pain to maintain it for the first move, so throw away
22672 the info early. */
22673 dst = change_address (dst, BLKmode, destreg);
22674 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
22675 desired_align);
22676 }
22677 else
22678 {
22679 /* If we know how many bytes need to be stored before dst is
22680 sufficiently aligned, maintain aliasing info accurately. */
22681 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
22682 desired_align, align_bytes);
22683 count_exp = plus_constant (counter_mode (count_exp),
22684 count_exp, -align_bytes);
22685 count -= align_bytes;
22686 }
22687 if (need_zero_guard
22688 && (count < (unsigned HOST_WIDE_INT) size_needed
22689 || (align_bytes == 0
22690 && count < ((unsigned HOST_WIDE_INT) size_needed
22691 + desired_align - align))))
22692 {
22693 /* It is possible that we copied enough so the main loop will not
22694 execute. */
22695 gcc_assert (size_needed > 1);
22696 if (label == NULL_RTX)
22697 label = gen_label_rtx ();
22698 emit_cmp_and_jump_insns (count_exp,
22699 GEN_INT (size_needed),
22700 LTU, 0, counter_mode (count_exp), 1, label);
22701 if (expected_size == -1
22702 || expected_size < (desired_align - align) / 2 + size_needed)
22703 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22704 else
22705 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22706 }
22707 }
22708 if (label && size_needed == 1)
22709 {
22710 emit_label (label);
22711 LABEL_NUSES (label) = 1;
22712 label = NULL;
22713 promoted_val = val_exp;
22714 epilogue_size_needed = 1;
22715 }
22716 else if (label == NULL_RTX)
22717 epilogue_size_needed = size_needed;
22718
22719 /* Step 3: Main loop. */
22720
22721 switch (alg)
22722 {
22723 case libcall:
22724 case no_stringop:
22725 gcc_unreachable ();
22726 case loop_1_byte:
22727 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22728 count_exp, QImode, 1, expected_size);
22729 break;
22730 case loop:
22731 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22732 count_exp, word_mode, 1, expected_size);
22733 break;
22734 case unrolled_loop:
22735 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22736 count_exp, word_mode, 4, expected_size);
22737 break;
22738 case rep_prefix_8_byte:
22739 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22740 DImode, val_exp);
22741 break;
22742 case rep_prefix_4_byte:
22743 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22744 SImode, val_exp);
22745 break;
22746 case rep_prefix_1_byte:
22747 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22748 QImode, val_exp);
22749 break;
22750 }
22751 /* Properly adjust the offset of the dest memory for aliasing. */
22752 if (CONST_INT_P (count_exp))
22753 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22754 (count / size_needed) * size_needed);
22755 else
22756 dst = change_address (dst, BLKmode, destreg);
22757
22758 /* Step 4: Epilogue to copy the remaining bytes. */
22759
22760 if (label)
22761 {
22762 /* When the main loop is done, COUNT_EXP might hold the original count,
22763 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22764 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22765 bytes. Compensate if needed. */
22766
22767 if (size_needed < epilogue_size_needed)
22768 {
22769 tmp =
22770 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22771 GEN_INT (size_needed - 1), count_exp, 1,
22772 OPTAB_DIRECT);
22773 if (tmp != count_exp)
22774 emit_move_insn (count_exp, tmp);
22775 }
22776 emit_label (label);
22777 LABEL_NUSES (label) = 1;
22778 }
22779 epilogue:
22780 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22781 {
22782 if (force_loopy_epilogue)
22783 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
22784 epilogue_size_needed);
22785 else
22786 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
22787 epilogue_size_needed);
22788 }
22789 if (jump_around_label)
22790 emit_label (jump_around_label);
22791 return true;
22792 }
22793
22794 /* Expand the appropriate insns for doing strlen if not just doing
22795 repnz; scasb
22796
22797 out = result, initialized with the start address
22798 align_rtx = alignment of the address.
22799 scratch = scratch register, initialized with the start address when
22800 not aligned, otherwise undefined
22801
22802 This is just the body. It needs the initializations mentioned above and
22803 some address computing at the end. These things are done in i386.md. */
22804
22805 static void
22806 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
22807 {
22808 int align;
22809 rtx tmp;
22810 rtx align_2_label = NULL_RTX;
22811 rtx align_3_label = NULL_RTX;
22812 rtx align_4_label = gen_label_rtx ();
22813 rtx end_0_label = gen_label_rtx ();
22814 rtx mem;
22815 rtx tmpreg = gen_reg_rtx (SImode);
22816 rtx scratch = gen_reg_rtx (SImode);
22817 rtx cmp;
22818
22819 align = 0;
22820 if (CONST_INT_P (align_rtx))
22821 align = INTVAL (align_rtx);
22822
22823 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
22824
22825 /* Is there a known alignment and is it less than 4? */
22826 if (align < 4)
22827 {
22828 rtx scratch1 = gen_reg_rtx (Pmode);
22829 emit_move_insn (scratch1, out);
22830 /* Is there a known alignment and is it not 2? */
22831 if (align != 2)
22832 {
22833 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
22834 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
22835
22836 /* Leave just the 3 lower bits. */
22837 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
22838 NULL_RTX, 0, OPTAB_WIDEN);
22839
22840 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22841 Pmode, 1, align_4_label);
22842 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
22843 Pmode, 1, align_2_label);
22844 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
22845 Pmode, 1, align_3_label);
22846 }
22847 else
22848 {
22849 /* Since the alignment is 2, we have to check 2 or 0 bytes;
22850 check whether it is aligned to a 4-byte boundary. */
22851
22852 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
22853 NULL_RTX, 0, OPTAB_WIDEN);
22854
22855 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22856 Pmode, 1, align_4_label);
22857 }
22858
22859 mem = change_address (src, QImode, out);
22860
22861 /* Now compare the bytes. */
22862
22863 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
22864 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
22865 QImode, 1, end_0_label);
22866
22867 /* Increment the address. */
22868 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22869
22870 /* Not needed with an alignment of 2. */
22871 if (align != 2)
22872 {
22873 emit_label (align_2_label);
22874
22875 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22876 end_0_label);
22877
22878 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22879
22880 emit_label (align_3_label);
22881 }
22882
22883 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22884 end_0_label);
22885
22886 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22887 }
22888
22889 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
22890 align this loop; it only makes the program larger and does not help
22891 speed. */
22892 emit_label (align_4_label);
22893
22894 mem = change_address (src, SImode, out);
22895 emit_move_insn (scratch, mem);
22896 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
22897
22898 /* This formula yields a nonzero result iff one of the bytes is zero.
22899 This saves three branches inside the loop and many cycles. */
22900
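  /* That is, (x - 0x01010101) & ~x & 0x80808080 is nonzero iff some byte
     of x is zero.  */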
22901 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
22902 emit_insn (gen_one_cmplsi2 (scratch, scratch));
22903 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
22904 emit_insn (gen_andsi3 (tmpreg, tmpreg,
22905 gen_int_mode (0x80808080, SImode)));
22906 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
22907 align_4_label);
22908
22909 if (TARGET_CMOVE)
22910 {
22911 rtx reg = gen_reg_rtx (SImode);
22912 rtx reg2 = gen_reg_rtx (Pmode);
22913 emit_move_insn (reg, tmpreg);
22914 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
22915
22916 /* If zero is not in the first two bytes, move two bytes forward. */
22917 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22918 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22919 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22920 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
22921 gen_rtx_IF_THEN_ELSE (SImode, tmp,
22922 reg,
22923 tmpreg)));
22924 /* Emit the lea manually to avoid clobbering the flags. */
22925 emit_insn (gen_rtx_SET (SImode, reg2,
22926 gen_rtx_PLUS (Pmode, out, const2_rtx)));
22927
22928 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22929 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22930 emit_insn (gen_rtx_SET (VOIDmode, out,
22931 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
22932 reg2,
22933 out)));
22934 }
22935 else
22936 {
22937 rtx end_2_label = gen_label_rtx ();
22938 /* Is zero in the first two bytes? */
22939
22940 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22941 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22942 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
22943 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22944 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
22945 pc_rtx);
22946 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
22947 JUMP_LABEL (tmp) = end_2_label;
22948
22949 /* Not in the first two. Move two bytes forward. */
22950 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
22951 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
22952
22953 emit_label (end_2_label);
22954
22955 }
22956
22957 /* Avoid a branch when fixing up the byte. */
22958 tmpreg = gen_lowpart (QImode, tmpreg);
22959 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
22960 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
22961 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
22962 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
22963
22964 emit_label (end_0_label);
22965 }
22966
22967 /* Expand strlen. */
22968
22969 bool
22970 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
22971 {
22972 rtx addr, scratch1, scratch2, scratch3, scratch4;
22973
22974 /* The generic case of the strlen expander is long. Avoid expanding
22975 it unless TARGET_INLINE_ALL_STRINGOPS. */
22976
22977 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22978 && !TARGET_INLINE_ALL_STRINGOPS
22979 && !optimize_insn_for_size_p ()
22980 && (!CONST_INT_P (align) || INTVAL (align) < 4))
22981 return false;
22982
22983 addr = force_reg (Pmode, XEXP (src, 0));
22984 scratch1 = gen_reg_rtx (Pmode);
22985
22986 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22987 && !optimize_insn_for_size_p ())
22988 {
22989 /* Well, it seems that some optimizer does not combine a call like
22990 foo(strlen(bar), strlen(bar));
22991 when the move and the subtraction are done here. It does calculate
22992 the length just once when these instructions are done inside of
22993 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
22994 often used and I use one fewer register for the lifetime of
22995 output_strlen_unroll(), this is better. */
22996
22997 emit_move_insn (out, addr);
22998
22999 ix86_expand_strlensi_unroll_1 (out, src, align);
23000
23001 /* strlensi_unroll_1 returns the address of the zero at the end of
23002 the string, like memchr(), so compute the length by subtracting
23003 the start address. */
23004 emit_insn (ix86_gen_sub3 (out, out, addr));
23005 }
23006 else
23007 {
23008 rtx unspec;
23009
23010 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23011 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23012 return false;
23013
23014 scratch2 = gen_reg_rtx (Pmode);
23015 scratch3 = gen_reg_rtx (Pmode);
23016 scratch4 = force_reg (Pmode, constm1_rtx);
23017
23018 emit_move_insn (scratch3, addr);
23019 eoschar = force_reg (QImode, eoschar);
23020
23021 src = replace_equiv_address_nv (src, scratch3);
23022
23023 /* If .md starts supporting :P, this can be done in .md. */
23024 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23025 scratch4), UNSPEC_SCAS);
23026 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23027 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23028 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23029 }
23030 return true;
23031 }
23032
23033 /* For a given symbol (function), construct code to compute the address of
23034 its PLT entry in the large x86-64 PIC model. */
23035 rtx
23036 construct_plt_address (rtx symbol)
23037 {
23038 rtx tmp, unspec;
23039
23040 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23041 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23042 gcc_assert (Pmode == DImode);
23043
23044 tmp = gen_reg_rtx (Pmode);
23045 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23046
23047 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23048 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23049 return tmp;
23050 }
23051
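/* Expand a call to the function at address FNADDR.  RETVAL, if nonnull,
   receives the return value; POP, if nonnull, is the amount added to the
   stack pointer after the call returns; SIBCALL is true for sibling calls.
   If TARGET_64BIT and CALLARG2 is nonnegative, CALLARG2 is loaded into AL
   before the call.  Returns the emitted call insn.  */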
23052 rtx
23053 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23054 rtx callarg2,
23055 rtx pop, bool sibcall)
23056 {
23057 /* We need to represent that the XMM6-XMM15, SI and DI registers are
23058 clobbered by SYSV calls made from 64-bit MS ABI code. */
23059 static int clobbered_registers[] = {
23060 XMM6_REG, XMM7_REG, XMM8_REG,
23061 XMM9_REG, XMM10_REG, XMM11_REG,
23062 XMM12_REG, XMM13_REG, XMM14_REG,
23063 XMM15_REG, SI_REG, DI_REG
23064 };
23065 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23066 rtx use = NULL, call;
23067 unsigned int vec_len;
23068
23069 if (pop == const0_rtx)
23070 pop = NULL;
23071 gcc_assert (!TARGET_64BIT || !pop);
23072
23073 if (TARGET_MACHO && !TARGET_64BIT)
23074 {
23075 #if TARGET_MACHO
23076 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23077 fnaddr = machopic_indirect_call_target (fnaddr);
23078 #endif
23079 }
23080 else
23081 {
23082 /* Static functions and indirect calls don't need the pic register. */
23083 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23084 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23085 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23086 use_reg (&use, pic_offset_table_rtx);
23087 }
23088
23089 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23090 {
23091 rtx al = gen_rtx_REG (QImode, AX_REG);
23092 emit_move_insn (al, callarg2);
23093 use_reg (&use, al);
23094 }
23095
23096 if (ix86_cmodel == CM_LARGE_PIC
23097 && MEM_P (fnaddr)
23098 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23099 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23100 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23101 else if (sibcall
23102 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23103 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23104 {
23105 fnaddr = XEXP (fnaddr, 0);
23106 if (GET_MODE (fnaddr) != word_mode)
23107 fnaddr = convert_to_mode (word_mode, fnaddr, 1);
23108 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23109 }
23110
23111 vec_len = 0;
23112 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23113 if (retval)
23114 call = gen_rtx_SET (VOIDmode, retval, call);
23115 vec[vec_len++] = call;
23116
23117 if (pop)
23118 {
23119 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23120 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23121 vec[vec_len++] = pop;
23122 }
23123
23124 if (TARGET_64BIT_MS_ABI
23125 && (!callarg2 || INTVAL (callarg2) != -2))
23126 {
23127 unsigned i;
23128
23129 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23130 UNSPEC_MS_TO_SYSV_CALL);
23131
23132 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23133 vec[vec_len++]
23134 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
23135 ? TImode : DImode,
23136 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23137 ? TImode : DImode,
23138 clobbered_registers[i]));
23139 }
23140
23141 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
23142 if (TARGET_VZEROUPPER)
23143 {
23144 int avx256;
23145 if (cfun->machine->callee_pass_avx256_p)
23146 {
23147 if (cfun->machine->callee_return_avx256_p)
23148 avx256 = callee_return_pass_avx256;
23149 else
23150 avx256 = callee_pass_avx256;
23151 }
23152 else if (cfun->machine->callee_return_avx256_p)
23153 avx256 = callee_return_avx256;
23154 else
23155 avx256 = call_no_avx256;
23156
23157 if (reload_completed)
23158 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
23159 else
23160 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
23161 gen_rtvec (1, GEN_INT (avx256)),
23162 UNSPEC_CALL_NEEDS_VZEROUPPER);
23163 }
23164
23165 if (vec_len > 1)
23166 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23167 call = emit_call_insn (call);
23168 if (use)
23169 CALL_INSN_FUNCTION_USAGE (call) = use;
23170
23171 return call;
23172 }
23173
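/* Split INSN, a call decorated with UNSPEC_CALL_NEEDS_VZEROUPPER, into a
   vzeroupper insn (with operand VZEROUPPER) followed by the call itself.  */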
23174 void
23175 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
23176 {
23177 rtx pat = PATTERN (insn);
23178 rtvec vec = XVEC (pat, 0);
23179 int len = GET_NUM_ELEM (vec) - 1;
23180
23181 /* Strip off the last entry of the parallel. */
23182 gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
23183 gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
23184 if (len == 1)
23185 pat = RTVEC_ELT (vec, 0);
23186 else
23187 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
23188
23189 emit_insn (gen_avx_vzeroupper (vzeroupper));
23190 emit_call_insn (pat);
23191 }
23192
23193 /* Output the assembly for a call instruction. */
23194
23195 const char *
23196 ix86_output_call_insn (rtx insn, rtx call_op)
23197 {
23198 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23199 bool seh_nop_p = false;
23200 const char *xasm;
23201
23202 if (SIBLING_CALL_P (insn))
23203 {
23204 if (direct_p)
23205 xasm = "jmp\t%P0";
23206 /* SEH epilogue detection requires the indirect branch case
23207 to include REX.W. */
23208 else if (TARGET_SEH)
23209 xasm = "rex.W jmp %A0";
23210 else
23211 xasm = "jmp\t%A0";
23212
23213 output_asm_insn (xasm, &call_op);
23214 return "";
23215 }
23216
23217 /* SEH unwinding can require an extra nop to be emitted in several
23218 circumstances. Determine if we have one of those. */
23219 if (TARGET_SEH)
23220 {
23221 rtx i;
23222
23223 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23224 {
23225 /* If we get to another real insn, we don't need the nop. */
23226 if (INSN_P (i))
23227 break;
23228
23229 /* If we get to the epilogue note, prevent a catch region from
23230 being adjacent to the standard epilogue sequence. With non-call
23231 exceptions, we'll have done this during epilogue emission. */
23232 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23233 && !flag_non_call_exceptions
23234 && !can_throw_internal (insn))
23235 {
23236 seh_nop_p = true;
23237 break;
23238 }
23239 }
23240
23241 /* If we didn't find a real insn following the call, prevent the
23242 unwinder from looking into the next function. */
23243 if (i == NULL)
23244 seh_nop_p = true;
23245 }
23246
23247 if (direct_p)
23248 xasm = "call\t%P0";
23249 else
23250 xasm = "call\t%A0";
23251
23252 output_asm_insn (xasm, &call_op);
23253
23254 if (seh_nop_p)
23255 return "nop";
23256
23257 return "";
23258 }
23259 \f
23260 /* Clear stack slot assignments remembered from previous functions.
23261 This is called from INIT_EXPANDERS once before RTL is emitted for each
23262 function. */
23263
23264 static struct machine_function *
23265 ix86_init_machine_status (void)
23266 {
23267 struct machine_function *f;
23268
23269 f = ggc_alloc_cleared_machine_function ();
23270 f->use_fast_prologue_epilogue_nregs = -1;
23271 f->tls_descriptor_call_expanded_p = 0;
23272 f->call_abi = ix86_abi;
23273
23274 return f;
23275 }
23276
23277 /* Return a MEM corresponding to a stack slot with mode MODE.
23278 Allocate a new slot if necessary.
23279
23280 The RTL for a function can have several slots available: N is
23281 which slot to use. */
23282
23283 rtx
23284 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23285 {
23286 struct stack_local_entry *s;
23287
23288 gcc_assert (n < MAX_386_STACK_LOCALS);
23289
23290 /* Virtual slot is valid only before vregs are instantiated. */
23291 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
23292
23293 for (s = ix86_stack_locals; s; s = s->next)
23294 if (s->mode == mode && s->n == n)
23295 return validize_mem (copy_rtx (s->rtl));
23296
23297 s = ggc_alloc_stack_local_entry ();
23298 s->n = n;
23299 s->mode = mode;
23300 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23301
23302 s->next = ix86_stack_locals;
23303 ix86_stack_locals = s;
23304 return validize_mem (s->rtl);
23305 }
23306 \f
23307 /* Calculate the length of the memory address in the instruction encoding.
23308 Includes the addr32 prefix, but does not include the one-byte modrm, opcode,
23309 or other prefixes. */
23310
23311 int
23312 memory_address_length (rtx addr)
23313 {
23314 struct ix86_address parts;
23315 rtx base, index, disp;
23316 int len;
23317 int ok;
23318
23319 if (GET_CODE (addr) == PRE_DEC
23320 || GET_CODE (addr) == POST_INC
23321 || GET_CODE (addr) == PRE_MODIFY
23322 || GET_CODE (addr) == POST_MODIFY)
23323 return 0;
23324
23325 ok = ix86_decompose_address (addr, &parts);
23326 gcc_assert (ok);
23327
23328 if (parts.base && GET_CODE (parts.base) == SUBREG)
23329 parts.base = SUBREG_REG (parts.base);
23330 if (parts.index && GET_CODE (parts.index) == SUBREG)
23331 parts.index = SUBREG_REG (parts.index);
23332
23333 base = parts.base;
23334 index = parts.index;
23335 disp = parts.disp;
23336
23337 /* Add length of addr32 prefix. */
23338 len = (GET_CODE (addr) == ZERO_EXTEND
23339 || GET_CODE (addr) == AND);
23340
23341 /* Rule of thumb:
23342 - esp as the base always wants an index,
23343 - ebp as the base always wants a displacement,
23344 - r12 as the base always wants an index,
23345 - r13 as the base always wants a displacement. */
23346
23347 /* Register Indirect. */
23348 if (base && !index && !disp)
23349 {
23350 /* esp (for its index) and ebp (for its displacement) need
23351 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23352 code. */
23353 if (REG_P (addr)
23354 && (addr == arg_pointer_rtx
23355 || addr == frame_pointer_rtx
23356 || REGNO (addr) == SP_REG
23357 || REGNO (addr) == BP_REG
23358 || REGNO (addr) == R12_REG
23359 || REGNO (addr) == R13_REG))
23360 len = 1;
23361 }
23362
23363 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23364 is not disp32, but disp32(%rip), so for disp32 a
23365 SIB byte is needed, unless print_operand_address
23366 optimizes it into disp32(%rip) or (%rip) is implied
23367 by UNSPEC. */
23368 else if (disp && !base && !index)
23369 {
23370 len = 4;
23371 if (TARGET_64BIT)
23372 {
23373 rtx symbol = disp;
23374
23375 if (GET_CODE (disp) == CONST)
23376 symbol = XEXP (disp, 0);
23377 if (GET_CODE (symbol) == PLUS
23378 && CONST_INT_P (XEXP (symbol, 1)))
23379 symbol = XEXP (symbol, 0);
23380
23381 if (GET_CODE (symbol) != LABEL_REF
23382 && (GET_CODE (symbol) != SYMBOL_REF
23383 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23384 && (GET_CODE (symbol) != UNSPEC
23385 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23386 && XINT (symbol, 1) != UNSPEC_PCREL
23387 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23388 len += 1;
23389 }
23390 }
23391
23392 else
23393 {
23394 /* Find the length of the displacement constant. */
23395 if (disp)
23396 {
23397 if (base && satisfies_constraint_K (disp))
23398 len = 1;
23399 else
23400 len = 4;
23401 }
23402 /* ebp always wants a displacement. Similarly r13. */
23403 else if (base && REG_P (base)
23404 && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23405 len = 1;
23406
23407 /* An index requires the two-byte modrm form.... */
23408 if (index
23409 /* ...like esp (or r12), which always wants an index. */
23410 || base == arg_pointer_rtx
23411 || base == frame_pointer_rtx
23412 || (base && REG_P (base)
23413 && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23414 len += 1;
23415 }
23416
23417 switch (parts.seg)
23418 {
23419 case SEG_FS:
23420 case SEG_GS:
23421 len += 1;
23422 break;
23423 default:
23424 break;
23425 }
23426
23427 return len;
23428 }
23429
23430 /* Compute default value for "length_immediate" attribute. When SHORTFORM
23431 is set, expect that the insn has an 8-bit immediate alternative. */
23432 int
23433 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23434 {
23435 int len = 0;
23436 int i;
23437 extract_insn_cached (insn);
23438 for (i = recog_data.n_operands - 1; i >= 0; --i)
23439 if (CONSTANT_P (recog_data.operand[i]))
23440 {
23441 enum attr_mode mode = get_attr_mode (insn);
23442
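      /* Only one constant operand is expected to contribute to the length;
         LEN must still be zero here.  */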
23443 gcc_assert (!len);
23444 if (shortform && CONST_INT_P (recog_data.operand[i]))
23445 {
23446 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23447 switch (mode)
23448 {
23449 case MODE_QI:
23450 len = 1;
23451 continue;
23452 case MODE_HI:
23453 ival = trunc_int_for_mode (ival, HImode);
23454 break;
23455 case MODE_SI:
23456 ival = trunc_int_for_mode (ival, SImode);
23457 break;
23458 default:
23459 break;
23460 }
23461 if (IN_RANGE (ival, -128, 127))
23462 {
23463 len = 1;
23464 continue;
23465 }
23466 }
23467 switch (mode)
23468 {
23469 case MODE_QI:
23470 len = 1;
23471 break;
23472 case MODE_HI:
23473 len = 2;
23474 break;
23475 case MODE_SI:
23476 len = 4;
23477 break;
23478 /* Immediates for DImode instructions are encoded as 32-bit sign-extended values. */
23479 case MODE_DI:
23480 len = 4;
23481 break;
23482 default:
23483 fatal_insn ("unknown insn mode", insn);
23484 }
23485 }
23486 return len;
23487 }
23488 /* Compute default value for "length_address" attribute. */
23489 int
23490 ix86_attr_length_address_default (rtx insn)
23491 {
23492 int i;
23493
23494 if (get_attr_type (insn) == TYPE_LEA)
23495 {
23496 rtx set = PATTERN (insn), addr;
23497
23498 if (GET_CODE (set) == PARALLEL)
23499 set = XVECEXP (set, 0, 0);
23500
23501 gcc_assert (GET_CODE (set) == SET);
23502
23503 addr = SET_SRC (set);
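      /* In 64-bit mode an SImode LEA source may be wrapped in a zero_extend
         or a subreg; look through them to find the address.  */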
23504 if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI)
23505 {
23506 if (GET_CODE (addr) == ZERO_EXTEND)
23507 addr = XEXP (addr, 0);
23508 if (GET_CODE (addr) == SUBREG)
23509 addr = SUBREG_REG (addr);
23510 }
23511
23512 return memory_address_length (addr);
23513 }
23514
23515 extract_insn_cached (insn);
23516 for (i = recog_data.n_operands - 1; i >= 0; --i)
23517 if (MEM_P (recog_data.operand[i]))
23518 {
23519 constrain_operands_cached (reload_completed);
23520 if (which_alternative != -1)
23521 {
23522 const char *constraints = recog_data.constraints[i];
23523 int alt = which_alternative;
23524
23525 while (*constraints == '=' || *constraints == '+')
23526 constraints++;
23527 while (alt-- > 0)
23528 while (*constraints++ != ',')
23529 ;
23530 /* Skip ignored operands. */
23531 if (*constraints == 'X')
23532 continue;
23533 }
23534 return memory_address_length (XEXP (recog_data.operand[i], 0));
23535 }
23536 return 0;
23537 }
23538
23539 /* Compute default value for "length_vex" attribute. It includes
23540 2 or 3 byte VEX prefix and 1 opcode byte. */
23541
23542 int
23543 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23544 {
23545 int i;
23546
23547 /* Only the 0f opcode can use the 2-byte VEX prefix, and the VEX.W bit
23548 requires the 3-byte VEX prefix. */
23549 if (!has_0f_opcode || has_vex_w)
23550 return 3 + 1;
23551
23552 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
23553 if (!TARGET_64BIT)
23554 return 2 + 1;
23555
23556 extract_insn_cached (insn);
23557
23558 for (i = recog_data.n_operands - 1; i >= 0; --i)
23559 if (REG_P (recog_data.operand[i]))
23560 {
23561 /* REX.W bit uses 3 byte VEX prefix. */
23562 if (GET_MODE (recog_data.operand[i]) == DImode
23563 && GENERAL_REG_P (recog_data.operand[i]))
23564 return 3 + 1;
23565 }
23566 else
23567 {
23568 /* REX.X or REX.B bits use 3 byte VEX prefix. */
23569 if (MEM_P (recog_data.operand[i])
23570 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23571 return 3 + 1;
23572 }
23573
23574 return 2 + 1;
23575 }
23576 \f
23577 /* Return the maximum number of instructions a cpu can issue. */
23578
23579 static int
23580 ix86_issue_rate (void)
23581 {
23582 switch (ix86_tune)
23583 {
23584 case PROCESSOR_PENTIUM:
23585 case PROCESSOR_ATOM:
23586 case PROCESSOR_K6:
23587 return 2;
23588
23589 case PROCESSOR_PENTIUMPRO:
23590 case PROCESSOR_PENTIUM4:
23591 case PROCESSOR_CORE2_32:
23592 case PROCESSOR_CORE2_64:
23593 case PROCESSOR_COREI7_32:
23594 case PROCESSOR_COREI7_64:
23595 case PROCESSOR_ATHLON:
23596 case PROCESSOR_K8:
23597 case PROCESSOR_AMDFAM10:
23598 case PROCESSOR_NOCONA:
23599 case PROCESSOR_GENERIC32:
23600 case PROCESSOR_GENERIC64:
23601 case PROCESSOR_BDVER1:
23602 case PROCESSOR_BDVER2:
23603 case PROCESSOR_BTVER1:
23604 return 3;
23605
23606 default:
23607 return 1;
23608 }
23609 }
23610
23611 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
23612    by DEP_INSN and nothing else set by DEP_INSN.  */
23613
23614 static bool
23615 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
23616 {
23617 rtx set, set2;
23618
23619 /* Simplify the test for uninteresting insns. */
23620 if (insn_type != TYPE_SETCC
23621 && insn_type != TYPE_ICMOV
23622 && insn_type != TYPE_FCMOV
23623 && insn_type != TYPE_IBR)
23624 return false;
23625
23626 if ((set = single_set (dep_insn)) != 0)
23627 {
23628 set = SET_DEST (set);
23629 set2 = NULL_RTX;
23630 }
23631 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
23632 && XVECLEN (PATTERN (dep_insn), 0) == 2
23633 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
23634 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
23635 {
23636 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
23637       set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
23638 }
23639 else
23640 return false;
23641
23642 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
23643 return false;
23644
23645 /* This test is true if the dependent insn reads the flags but
23646 not any other potentially set register. */
23647 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
23648 return false;
23649
23650 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
23651 return false;
23652
23653 return true;
23654 }
23655
23656 /* Return true iff USE_INSN has a memory address with operands set by
23657 SET_INSN. */
23658
23659 bool
23660 ix86_agi_dependent (rtx set_insn, rtx use_insn)
23661 {
23662 int i;
23663 extract_insn_cached (use_insn);
23664 for (i = recog_data.n_operands - 1; i >= 0; --i)
23665 if (MEM_P (recog_data.operand[i]))
23666 {
23667 rtx addr = XEXP (recog_data.operand[i], 0);
23668 return modified_in_p (addr, set_insn) != 0;
23669 }
23670 return false;
23671 }
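
/* A minimal illustrative sketch (not used by the compiler) of the address
   generation interlock test above in its simplest form: a stall occurs when
   a memory operand's address uses a register written by the dependence
   producer.  Register numbers here are hypothetical.  */

static ATTRIBUTE_UNUSED bool
example_agi_stall_p (int producer_dest_regno,
                     const int *addr_regnos, int n_addr_regnos)
{
  int i;

  for (i = 0; i < n_addr_regnos; i++)
    if (addr_regnos[i] == producer_dest_regno)
      return true;
  return false;
}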
23672
23673 static int
23674 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
23675 {
23676 enum attr_type insn_type, dep_insn_type;
23677 enum attr_memory memory;
23678 rtx set, set2;
23679 int dep_insn_code_number;
23680
23681 /* Anti and output dependencies have zero cost on all CPUs. */
23682 if (REG_NOTE_KIND (link) != 0)
23683 return 0;
23684
23685 dep_insn_code_number = recog_memoized (dep_insn);
23686
23687 /* If we can't recognize the insns, we can't really do anything. */
23688 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
23689 return cost;
23690
23691 insn_type = get_attr_type (insn);
23692 dep_insn_type = get_attr_type (dep_insn);
23693
23694 switch (ix86_tune)
23695 {
23696 case PROCESSOR_PENTIUM:
23697 /* Address Generation Interlock adds a cycle of latency. */
23698 if (insn_type == TYPE_LEA)
23699 {
23700 rtx addr = PATTERN (insn);
23701
23702 if (GET_CODE (addr) == PARALLEL)
23703 addr = XVECEXP (addr, 0, 0);
23704
23705 gcc_assert (GET_CODE (addr) == SET);
23706
23707 addr = SET_SRC (addr);
23708 if (modified_in_p (addr, dep_insn))
23709 cost += 1;
23710 }
23711 else if (ix86_agi_dependent (dep_insn, insn))
23712 cost += 1;
23713
23714 /* ??? Compares pair with jump/setcc. */
23715 if (ix86_flags_dependent (insn, dep_insn, insn_type))
23716 cost = 0;
23717
23718 /* Floating point stores require value to be ready one cycle earlier. */
23719 if (insn_type == TYPE_FMOV
23720 && get_attr_memory (insn) == MEMORY_STORE
23721 && !ix86_agi_dependent (dep_insn, insn))
23722 cost += 1;
23723 break;
23724
23725 case PROCESSOR_PENTIUMPRO:
23726 memory = get_attr_memory (insn);
23727
23728 /* INT->FP conversion is expensive. */
23729 if (get_attr_fp_int_src (dep_insn))
23730 cost += 5;
23731
23732 /* There is one cycle extra latency between an FP op and a store. */
23733 if (insn_type == TYPE_FMOV
23734 && (set = single_set (dep_insn)) != NULL_RTX
23735 && (set2 = single_set (insn)) != NULL_RTX
23736 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
23737 && MEM_P (SET_DEST (set2)))
23738 cost += 1;
23739
23740       /* Show the ability of the reorder buffer to hide the latency of a
23741          load by executing it in parallel with the previous instruction when
23742          the previous instruction is not needed to compute the address.  */
23743 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23744 && !ix86_agi_dependent (dep_insn, insn))
23745 {
23746           /* Claim moves to take one cycle, as the core can issue one load
23747              at a time and the next load can start a cycle later.  */
23748 if (dep_insn_type == TYPE_IMOV
23749 || dep_insn_type == TYPE_FMOV)
23750 cost = 1;
23751 else if (cost > 1)
23752 cost--;
23753 }
23754 break;
23755
23756 case PROCESSOR_K6:
23757 memory = get_attr_memory (insn);
23758
23759 /* The esp dependency is resolved before the instruction is really
23760 finished. */
23761 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
23762 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
23763 return 1;
23764
23765 /* INT->FP conversion is expensive. */
23766 if (get_attr_fp_int_src (dep_insn))
23767 cost += 5;
23768
23769       /* Show the ability of the reorder buffer to hide the latency of a
23770          load by executing it in parallel with the previous instruction when
23771          the previous instruction is not needed to compute the address.  */
23772 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23773 && !ix86_agi_dependent (dep_insn, insn))
23774 {
23775           /* Claim moves to take one cycle, as the core can issue one load
23776              at a time and the next load can start a cycle later.  */
23777 if (dep_insn_type == TYPE_IMOV
23778 || dep_insn_type == TYPE_FMOV)
23779 cost = 1;
23780 else if (cost > 2)
23781 cost -= 2;
23782 else
23783 cost = 1;
23784 }
23785 break;
23786
23787 case PROCESSOR_ATHLON:
23788 case PROCESSOR_K8:
23789 case PROCESSOR_AMDFAM10:
23790 case PROCESSOR_BDVER1:
23791 case PROCESSOR_BDVER2:
23792 case PROCESSOR_BTVER1:
23793 case PROCESSOR_ATOM:
23794 case PROCESSOR_GENERIC32:
23795 case PROCESSOR_GENERIC64:
23796 memory = get_attr_memory (insn);
23797
23798       /* Show the ability of the reorder buffer to hide the latency of a
23799          load by executing it in parallel with the previous instruction when
23800          the previous instruction is not needed to compute the address.  */
23801 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23802 && !ix86_agi_dependent (dep_insn, insn))
23803 {
23804 enum attr_unit unit = get_attr_unit (insn);
23805 int loadcost = 3;
23806
23807 /* Because of the difference between the length of integer and
23808 floating unit pipeline preparation stages, the memory operands
23809 for floating point are cheaper.
23810
23811          ??? For Athlon the difference is most probably 2.  */
23812 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
23813 loadcost = 3;
23814 else
23815 loadcost = TARGET_ATHLON ? 2 : 0;
23816
23817 if (cost >= loadcost)
23818 cost -= loadcost;
23819 else
23820 cost = 0;
23821 }
23822
23823 default:
23824 break;
23825 }
23826
23827 return cost;
23828 }
23829
23830 /* How many alternative schedules to try. This should be as wide as the
23831 scheduling freedom in the DFA, but no wider. Making this value too
23832    large results in extra work for the scheduler.  */
23833
23834 static int
23835 ia32_multipass_dfa_lookahead (void)
23836 {
23837 switch (ix86_tune)
23838 {
23839 case PROCESSOR_PENTIUM:
23840 return 2;
23841
23842 case PROCESSOR_PENTIUMPRO:
23843 case PROCESSOR_K6:
23844 return 1;
23845
23846 case PROCESSOR_CORE2_32:
23847 case PROCESSOR_CORE2_64:
23848 case PROCESSOR_COREI7_32:
23849 case PROCESSOR_COREI7_64:
23850       /* Generally, we want haifa-sched:max_issue() to look ahead as far
23851          as the number of instructions that can be executed in a cycle, i.e.,
23852          issue_rate.  I wonder why tuning for many CPUs does not do this.  */
23853 return ix86_issue_rate ();
23854
23855 default:
23856 return 0;
23857 }
23858 }
23859
23860 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
23861    execution.  The reordering is applied if
23862    (1) an IMUL instruction is at the top of the ready list;
23863    (2) the ready list contains a producer of an independent IMUL
23864        instruction, and that IMUL has no other producer;
23865    (3) if both hold, the found producer is moved to the top of the ready list.
23866    Returns the issue rate.  */
23867
23868 static int
23869 ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
23870 int clock_var ATTRIBUTE_UNUSED)
23871 {
23872 static int issue_rate = -1;
23873 int n_ready = *pn_ready;
23874 rtx insn, insn1, insn2;
23875 int i;
23876 sd_iterator_def sd_it;
23877 dep_t dep;
23878 int index = -1;
23879
23880 /* Set up issue rate. */
23881 issue_rate = ix86_issue_rate();
23882
23883   /* Do reordering for Atom only.  */
23884 if (ix86_tune != PROCESSOR_ATOM)
23885 return issue_rate;
23886 /* Nothing to do if ready list contains only 1 instruction. */
23887 if (n_ready <= 1)
23888 return issue_rate;
23889
23890 /* Check that IMUL instruction is on the top of ready list. */
23891 insn = ready[n_ready - 1];
23892 if (!NONDEBUG_INSN_P (insn))
23893 return issue_rate;
23894 insn = PATTERN (insn);
23895 if (GET_CODE (insn) == PARALLEL)
23896 insn = XVECEXP (insn, 0, 0);
23897 if (GET_CODE (insn) != SET)
23898 return issue_rate;
23899 if (!(GET_CODE (SET_SRC (insn)) == MULT
23900 && GET_MODE (SET_SRC (insn)) == SImode))
23901 return issue_rate;
23902
23903 /* Search for producer of independent IMUL instruction. */
23904   for (i = n_ready - 2; i >= 0; i--)
23905 {
23906 insn = ready[i];
23907 if (!NONDEBUG_INSN_P (insn))
23908 continue;
23909 /* Skip IMUL instruction. */
23910 insn2 = PATTERN (insn);
23911 if (GET_CODE (insn2) == PARALLEL)
23912 insn2 = XVECEXP (insn2, 0, 0);
23913 if (GET_CODE (insn2) == SET
23914 && GET_CODE (SET_SRC (insn2)) == MULT
23915 && GET_MODE (SET_SRC (insn2)) == SImode)
23916 continue;
23917
23918 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
23919 {
23920 rtx con;
23921 con = DEP_CON (dep);
23922 if (!NONDEBUG_INSN_P (con))
23923 continue;
23924 insn1 = PATTERN (con);
23925 if (GET_CODE (insn1) == PARALLEL)
23926 insn1 = XVECEXP (insn1, 0, 0);
23927
23928 if (GET_CODE (insn1) == SET
23929 && GET_CODE (SET_SRC (insn1)) == MULT
23930 && GET_MODE (SET_SRC (insn1)) == SImode)
23931 {
23932 sd_iterator_def sd_it1;
23933 dep_t dep1;
23934               /* Check that this IMUL has no producer other than INSN.  */
23935 index = i;
23936 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
23937 {
23938 rtx pro;
23939 pro = DEP_PRO (dep1);
23940 if (!NONDEBUG_INSN_P (pro))
23941 continue;
23942 if (pro != insn)
23943 index = -1;
23944 }
23945 if (index >= 0)
23946 break;
23947 }
23948 }
23949 if (index >= 0)
23950 break;
23951 }
23952 if (index < 0)
23953 return issue_rate; /* Didn't find IMUL producer. */
23954
23955 if (sched_verbose > 1)
23956 fprintf(dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
23957 INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
23958
23959 /* Put IMUL producer (ready[index]) at the top of ready list. */
23960   insn1 = ready[index];
23961 for (i = index; i < n_ready - 1; i++)
23962 ready[i] = ready[i + 1];
23963 ready[n_ready - 1] = insn1;
23964
23965 return issue_rate;
23966 }
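
/* A minimal illustrative sketch (not used by the compiler) of the list
   manipulation done at the end of ix86_sched_reorder: the element at
   position INDEX is moved to the last slot (the scheduler issues from the
   end of the ready list) and the intervening elements are shifted down by
   one.  Names are hypothetical and the element type is plain int here.  */

static ATTRIBUTE_UNUSED void
example_move_to_top_of_ready (int *ready, int n_ready, int index)
{
  int picked = ready[index];
  int i;

  for (i = index; i < n_ready - 1; i++)
    ready[i] = ready[i + 1];
  ready[n_ready - 1] = picked;
}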
23967
23968 \f
23969
23970 /* Model decoder of Core 2/i7.
23971 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
23972 track the instruction fetch block boundaries and make sure that long
23973 (9+ bytes) instructions are assigned to D0. */
23974
23975 /* Maximum length of an insn that can be handled by
23976 a secondary decoder unit. '8' for Core 2/i7. */
23977 static int core2i7_secondary_decoder_max_insn_size;
23978
23979 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
23980 '16' for Core 2/i7. */
23981 static int core2i7_ifetch_block_size;
23982
23983 /* Maximum number of instructions decoder can handle per cycle.
23984 '6' for Core 2/i7. */
23985 static int core2i7_ifetch_block_max_insns;
23986
23987 typedef struct ix86_first_cycle_multipass_data_ *
23988 ix86_first_cycle_multipass_data_t;
23989 typedef const struct ix86_first_cycle_multipass_data_ *
23990 const_ix86_first_cycle_multipass_data_t;
23991
23992 /* A variable to store target state across calls to max_issue within
23993 one cycle. */
23994 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
23995 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
23996
23997 /* Initialize DATA. */
23998 static void
23999 core2i7_first_cycle_multipass_init (void *_data)
24000 {
24001 ix86_first_cycle_multipass_data_t data
24002 = (ix86_first_cycle_multipass_data_t) _data;
24003
24004 data->ifetch_block_len = 0;
24005 data->ifetch_block_n_insns = 0;
24006 data->ready_try_change = NULL;
24007 data->ready_try_change_size = 0;
24008 }
24009
24010 /* Advancing the cycle; reset ifetch block counts. */
24011 static void
24012 core2i7_dfa_post_advance_cycle (void)
24013 {
24014 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24015
24016 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24017
24018 data->ifetch_block_len = 0;
24019 data->ifetch_block_n_insns = 0;
24020 }
24021
24022 static int min_insn_size (rtx);
24023
24024 /* Filter out insns from ready_try that the core will not be able to issue
24025 on current cycle due to decoder. */
24026 static void
24027 core2i7_first_cycle_multipass_filter_ready_try
24028 (const_ix86_first_cycle_multipass_data_t data,
24029 char *ready_try, int n_ready, bool first_cycle_insn_p)
24030 {
24031 while (n_ready--)
24032 {
24033 rtx insn;
24034 int insn_size;
24035
24036 if (ready_try[n_ready])
24037 continue;
24038
24039 insn = get_ready_element (n_ready);
24040 insn_size = min_insn_size (insn);
24041
24042       if (/* If this insn is too long for a secondary decoder ...  */
24043 (!first_cycle_insn_p
24044 && insn_size > core2i7_secondary_decoder_max_insn_size)
24045 /* ... or it would not fit into the ifetch block ... */
24046 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24047 /* ... or the decoder is full already ... */
24048 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24049 /* ... mask the insn out. */
24050 {
24051 ready_try[n_ready] = 1;
24052
24053 if (data->ready_try_change)
24054 SET_BIT (data->ready_try_change, n_ready);
24055 }
24056 }
24057 }
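
/* A minimal illustrative sketch (not used by the compiler) of the test
   applied by the filter above, with the Core 2/i7 parameters written out:
   an insn can still be issued in the current cycle only if it fits the
   8-byte secondary decoder limit (unless it is the first insn of the
   cycle), the 16-byte ifetch block, and the 6-insn decode limit.  Names
   are hypothetical.  */

static ATTRIBUTE_UNUSED bool
example_core2i7_can_issue_p (int block_len, int block_n_insns,
                             int insn_size, bool first_cycle_insn_p)
{
  if (!first_cycle_insn_p && insn_size > 8)
    return false;   /* Too long for a secondary decoder.  */
  if (block_len + insn_size > 16)
    return false;   /* Would not fit into the ifetch block.  */
  if (block_n_insns + 1 > 6)
    return false;   /* The decoders are already full.  */
  return true;
}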
24058
24059 /* Prepare for a new round of multipass lookahead scheduling. */
24060 static void
24061 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24062 bool first_cycle_insn_p)
24063 {
24064 ix86_first_cycle_multipass_data_t data
24065 = (ix86_first_cycle_multipass_data_t) _data;
24066 const_ix86_first_cycle_multipass_data_t prev_data
24067 = ix86_first_cycle_multipass_data;
24068
24069 /* Restore the state from the end of the previous round. */
24070 data->ifetch_block_len = prev_data->ifetch_block_len;
24071 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24072
24073 /* Filter instructions that cannot be issued on current cycle due to
24074 decoder restrictions. */
24075 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24076 first_cycle_insn_p);
24077 }
24078
24079 /* INSN is being issued in current solution. Account for its impact on
24080 the decoder model. */
24081 static void
24082 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24083 rtx insn, const void *_prev_data)
24084 {
24085 ix86_first_cycle_multipass_data_t data
24086 = (ix86_first_cycle_multipass_data_t) _data;
24087 const_ix86_first_cycle_multipass_data_t prev_data
24088 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24089
24090 int insn_size = min_insn_size (insn);
24091
24092 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24093 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24094 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24095 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24096
24097 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
24098 if (!data->ready_try_change)
24099 {
24100 data->ready_try_change = sbitmap_alloc (n_ready);
24101 data->ready_try_change_size = n_ready;
24102 }
24103 else if (data->ready_try_change_size < n_ready)
24104 {
24105 data->ready_try_change = sbitmap_resize (data->ready_try_change,
24106 n_ready, 0);
24107 data->ready_try_change_size = n_ready;
24108 }
24109 sbitmap_zero (data->ready_try_change);
24110
24111 /* Filter out insns from ready_try that the core will not be able to issue
24112 on current cycle due to decoder. */
24113 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24114 false);
24115 }
24116
24117 /* Revert the effect on ready_try. */
24118 static void
24119 core2i7_first_cycle_multipass_backtrack (const void *_data,
24120 char *ready_try,
24121 int n_ready ATTRIBUTE_UNUSED)
24122 {
24123 const_ix86_first_cycle_multipass_data_t data
24124 = (const_ix86_first_cycle_multipass_data_t) _data;
24125 unsigned int i = 0;
24126 sbitmap_iterator sbi;
24127
24128 gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
24129 EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
24130 {
24131 ready_try[i] = 0;
24132 }
24133 }
24134
24135 /* Save the result of multipass lookahead scheduling for the next round. */
24136 static void
24137 core2i7_first_cycle_multipass_end (const void *_data)
24138 {
24139 const_ix86_first_cycle_multipass_data_t data
24140 = (const_ix86_first_cycle_multipass_data_t) _data;
24141 ix86_first_cycle_multipass_data_t next_data
24142 = ix86_first_cycle_multipass_data;
24143
24144 if (data != NULL)
24145 {
24146 next_data->ifetch_block_len = data->ifetch_block_len;
24147 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24148 }
24149 }
24150
24151 /* Deallocate target data. */
24152 static void
24153 core2i7_first_cycle_multipass_fini (void *_data)
24154 {
24155 ix86_first_cycle_multipass_data_t data
24156 = (ix86_first_cycle_multipass_data_t) _data;
24157
24158 if (data->ready_try_change)
24159 {
24160 sbitmap_free (data->ready_try_change);
24161 data->ready_try_change = NULL;
24162 data->ready_try_change_size = 0;
24163 }
24164 }
24165
24166 /* Prepare for scheduling pass. */
24167 static void
24168 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24169 int verbose ATTRIBUTE_UNUSED,
24170 int max_uid ATTRIBUTE_UNUSED)
24171 {
24172 /* Install scheduling hooks for current CPU. Some of these hooks are used
24173 in time-critical parts of the scheduler, so we only set them up when
24174 they are actually used. */
24175 switch (ix86_tune)
24176 {
24177 case PROCESSOR_CORE2_32:
24178 case PROCESSOR_CORE2_64:
24179 case PROCESSOR_COREI7_32:
24180 case PROCESSOR_COREI7_64:
24181 targetm.sched.dfa_post_advance_cycle
24182 = core2i7_dfa_post_advance_cycle;
24183 targetm.sched.first_cycle_multipass_init
24184 = core2i7_first_cycle_multipass_init;
24185 targetm.sched.first_cycle_multipass_begin
24186 = core2i7_first_cycle_multipass_begin;
24187 targetm.sched.first_cycle_multipass_issue
24188 = core2i7_first_cycle_multipass_issue;
24189 targetm.sched.first_cycle_multipass_backtrack
24190 = core2i7_first_cycle_multipass_backtrack;
24191 targetm.sched.first_cycle_multipass_end
24192 = core2i7_first_cycle_multipass_end;
24193 targetm.sched.first_cycle_multipass_fini
24194 = core2i7_first_cycle_multipass_fini;
24195
24196 /* Set decoder parameters. */
24197 core2i7_secondary_decoder_max_insn_size = 8;
24198 core2i7_ifetch_block_size = 16;
24199 core2i7_ifetch_block_max_insns = 6;
24200 break;
24201
24202 default:
24203 targetm.sched.dfa_post_advance_cycle = NULL;
24204 targetm.sched.first_cycle_multipass_init = NULL;
24205 targetm.sched.first_cycle_multipass_begin = NULL;
24206 targetm.sched.first_cycle_multipass_issue = NULL;
24207 targetm.sched.first_cycle_multipass_backtrack = NULL;
24208 targetm.sched.first_cycle_multipass_end = NULL;
24209 targetm.sched.first_cycle_multipass_fini = NULL;
24210 break;
24211 }
24212 }
24213
24214 \f
24215 /* Compute the alignment given to a constant that is being placed in memory.
24216 EXP is the constant and ALIGN is the alignment that the object would
24217 ordinarily have.
24218 The value of this function is used instead of that alignment to align
24219 the object. */
24220
24221 int
24222 ix86_constant_alignment (tree exp, int align)
24223 {
24224 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24225 || TREE_CODE (exp) == INTEGER_CST)
24226 {
24227 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24228 return 64;
24229 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24230 return 128;
24231 }
24232 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24233 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24234 return BITS_PER_WORD;
24235
24236 return align;
24237 }
24238
24239 /* Compute the alignment for a static variable.
24240 TYPE is the data type, and ALIGN is the alignment that
24241 the object would ordinarily have. The value of this function is used
24242 instead of that alignment to align the object. */
24243
24244 int
24245 ix86_data_alignment (tree type, int align)
24246 {
24247 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24248
24249 if (AGGREGATE_TYPE_P (type)
24250 && TYPE_SIZE (type)
24251 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24252 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24253 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24254 && align < max_align)
24255 align = max_align;
24256
24257   /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
24258      to a 16-byte boundary.  */
24259 if (TARGET_64BIT)
24260 {
24261 if (AGGREGATE_TYPE_P (type)
24262 && TYPE_SIZE (type)
24263 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24264 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24265 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24266 return 128;
24267 }
24268
24269 if (TREE_CODE (type) == ARRAY_TYPE)
24270 {
24271 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24272 return 64;
24273 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24274 return 128;
24275 }
24276 else if (TREE_CODE (type) == COMPLEX_TYPE)
24277 {
24278
24279 if (TYPE_MODE (type) == DCmode && align < 64)
24280 return 64;
24281 if ((TYPE_MODE (type) == XCmode
24282 || TYPE_MODE (type) == TCmode) && align < 128)
24283 return 128;
24284 }
24285 else if ((TREE_CODE (type) == RECORD_TYPE
24286 || TREE_CODE (type) == UNION_TYPE
24287 || TREE_CODE (type) == QUAL_UNION_TYPE)
24288 && TYPE_FIELDS (type))
24289 {
24290 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24291 return 64;
24292 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24293 return 128;
24294 }
24295 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24296 || TREE_CODE (type) == INTEGER_TYPE)
24297 {
24298 if (TYPE_MODE (type) == DFmode && align < 64)
24299 return 64;
24300 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24301 return 128;
24302 }
24303
24304 return align;
24305 }
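
/* A minimal illustrative sketch (not used by the compiler) of the x86-64
   rule handled above, with the tree machinery stripped away: an aggregate
   whose size is known and is at least 16 bytes (128 bits) is given at
   least 128-bit alignment, which is what allows aligned SSE accesses to
   such arrays.  Sizes are in bits to match the code above; names are
   hypothetical.  */

static ATTRIBUTE_UNUSED int
example_x86_64_aggregate_alignment (bool is_aggregate,
                                    unsigned HOST_WIDE_INT size_in_bits,
                                    int align)
{
  if (is_aggregate && size_in_bits >= 128 && align < 128)
    return 128;
  return align;
}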
24306
24307 /* Compute the alignment for a local variable or a stack slot. EXP is
24308 the data type or decl itself, MODE is the widest mode available and
24309 ALIGN is the alignment that the object would ordinarily have. The
24310 value of this macro is used instead of that alignment to align the
24311 object. */
24312
24313 unsigned int
24314 ix86_local_alignment (tree exp, enum machine_mode mode,
24315 unsigned int align)
24316 {
24317 tree type, decl;
24318
24319 if (exp && DECL_P (exp))
24320 {
24321 type = TREE_TYPE (exp);
24322 decl = exp;
24323 }
24324 else
24325 {
24326 type = exp;
24327 decl = NULL;
24328 }
24329
24330 /* Don't do dynamic stack realignment for long long objects with
24331 -mpreferred-stack-boundary=2. */
24332 if (!TARGET_64BIT
24333 && align == 64
24334 && ix86_preferred_stack_boundary < 64
24335 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
24336 && (!type || !TYPE_USER_ALIGN (type))
24337 && (!decl || !DECL_USER_ALIGN (decl)))
24338 align = 32;
24339
24340 /* If TYPE is NULL, we are allocating a stack slot for caller-save
24341 register in MODE. We will return the largest alignment of XF
24342 and DF. */
24343 if (!type)
24344 {
24345 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
24346 align = GET_MODE_ALIGNMENT (DFmode);
24347 return align;
24348 }
24349
24350   /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
24351      to a 16-byte boundary.  The exact wording is:
24352
24353 An array uses the same alignment as its elements, except that a local or
24354 global array variable of length at least 16 bytes or
24355 a C99 variable-length array variable always has alignment of at least 16 bytes.
24356
24357      This rule was added to allow the use of aligned SSE instructions on
24358      arrays.  It is meant for static storage (where the compiler cannot do
24359      the analysis by itself).  We follow it for automatic variables only
24360      when convenient: we fully control everything in the function being
24361      compiled, and functions from other units cannot rely on the alignment.
24362
24363      Exclude the va_list type.  It is the common case of a local array for
24364      which we cannot benefit from the alignment.  */
24365 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
24366 && TARGET_SSE)
24367 {
24368 if (AGGREGATE_TYPE_P (type)
24369 && (va_list_type_node == NULL_TREE
24370 || (TYPE_MAIN_VARIANT (type)
24371 != TYPE_MAIN_VARIANT (va_list_type_node)))
24372 && TYPE_SIZE (type)
24373 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24374 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
24375 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24376 return 128;
24377 }
24378 if (TREE_CODE (type) == ARRAY_TYPE)
24379 {
24380 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24381 return 64;
24382 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24383 return 128;
24384 }
24385 else if (TREE_CODE (type) == COMPLEX_TYPE)
24386 {
24387 if (TYPE_MODE (type) == DCmode && align < 64)
24388 return 64;
24389 if ((TYPE_MODE (type) == XCmode
24390 || TYPE_MODE (type) == TCmode) && align < 128)
24391 return 128;
24392 }
24393 else if ((TREE_CODE (type) == RECORD_TYPE
24394 || TREE_CODE (type) == UNION_TYPE
24395 || TREE_CODE (type) == QUAL_UNION_TYPE)
24396 && TYPE_FIELDS (type))
24397 {
24398 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24399 return 64;
24400 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24401 return 128;
24402 }
24403 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24404 || TREE_CODE (type) == INTEGER_TYPE)
24405 {
24406
24407 if (TYPE_MODE (type) == DFmode && align < 64)
24408 return 64;
24409 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24410 return 128;
24411 }
24412 return align;
24413 }
24414
24415 /* Compute the minimum required alignment for dynamic stack realignment
24416 purposes for a local variable, parameter or a stack slot. EXP is
24417 the data type or decl itself, MODE is its mode and ALIGN is the
24418 alignment that the object would ordinarily have. */
24419
24420 unsigned int
24421 ix86_minimum_alignment (tree exp, enum machine_mode mode,
24422 unsigned int align)
24423 {
24424 tree type, decl;
24425
24426 if (exp && DECL_P (exp))
24427 {
24428 type = TREE_TYPE (exp);
24429 decl = exp;
24430 }
24431 else
24432 {
24433 type = exp;
24434 decl = NULL;
24435 }
24436
24437 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
24438 return align;
24439
24440 /* Don't do dynamic stack realignment for long long objects with
24441 -mpreferred-stack-boundary=2. */
24442 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
24443 && (!type || !TYPE_USER_ALIGN (type))
24444 && (!decl || !DECL_USER_ALIGN (decl)))
24445 return 32;
24446
24447 return align;
24448 }
24449 \f
24450 /* Find a location for the static chain incoming to a nested function.
24451 This is a register, unless all free registers are used by arguments. */
24452
24453 static rtx
24454 ix86_static_chain (const_tree fndecl, bool incoming_p)
24455 {
24456 unsigned regno;
24457
24458 if (!DECL_STATIC_CHAIN (fndecl))
24459 return NULL;
24460
24461 if (TARGET_64BIT)
24462 {
24463 /* We always use R10 in 64-bit mode. */
24464 regno = R10_REG;
24465 }
24466 else
24467 {
24468 tree fntype;
24469 unsigned int ccvt;
24470
24471 /* By default in 32-bit mode we use ECX to pass the static chain. */
24472 regno = CX_REG;
24473
24474 fntype = TREE_TYPE (fndecl);
24475 ccvt = ix86_get_callcvt (fntype);
24476 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
24477 {
24478 /* Fastcall functions use ecx/edx for arguments, which leaves
24479 us with EAX for the static chain.
24480 Thiscall functions use ecx for arguments, which also
24481 leaves us with EAX for the static chain. */
24482 regno = AX_REG;
24483 }
24484 else if (ix86_function_regparm (fntype, fndecl) == 3)
24485 {
24486 /* For regparm 3, we have no free call-clobbered registers in
24487 which to store the static chain. In order to implement this,
24488 we have the trampoline push the static chain to the stack.
24489 However, we can't push a value below the return address when
24490 we call the nested function directly, so we have to use an
24491 alternate entry point. For this we use ESI, and have the
24492 alternate entry point push ESI, so that things appear the
24493 same once we're executing the nested function. */
24494 if (incoming_p)
24495 {
24496 if (fndecl == current_function_decl)
24497 ix86_static_chain_on_stack = true;
24498 return gen_frame_mem (SImode,
24499 plus_constant (Pmode,
24500 arg_pointer_rtx, -8));
24501 }
24502 regno = SI_REG;
24503 }
24504 }
24505
24506 return gen_rtx_REG (Pmode, regno);
24507 }
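
/* A minimal illustrative sketch (not used by the compiler) of the 32-bit
   register choice made above: fastcall and thiscall functions already use
   ECX (and EDX) for arguments, so the static chain moves to EAX; regparm(3)
   functions have no free call-clobbered register at all, so the chain is
   passed on the stack by the trampoline; otherwise ECX is used.  The
   returned strings are only for illustration.  */

static ATTRIBUTE_UNUSED const char *
example_ia32_static_chain_location (bool fastcall_or_thiscall_p, int regparm)
{
  if (fastcall_or_thiscall_p)
    return "eax";
  if (regparm == 3)
    return "stack";
  return "ecx";
}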
24508
24509 /* Emit RTL insns to initialize the variable parts of a trampoline.
24510 FNDECL is the decl of the target address; M_TRAMP is a MEM for
24511 the trampoline, and CHAIN_VALUE is an RTX for the static chain
24512 to be passed to the target function. */
24513
24514 static void
24515 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
24516 {
24517 rtx mem, fnaddr;
24518 int opcode;
24519 int offset = 0;
24520
24521 fnaddr = XEXP (DECL_RTL (fndecl), 0);
24522
24523 if (TARGET_64BIT)
24524 {
24525 int size;
24526
24527 /* Load the function address to r11. Try to load address using
24528 the shorter movl instead of movabs. We may want to support
24529 movq for kernel mode, but kernel does not use trampolines at
24530 the moment. FNADDR is a 32bit address and may not be in
24531 DImode when ptr_mode == SImode. Always use movl in this
24532 case. */
24533 if (ptr_mode == SImode
24534 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
24535 {
24536 fnaddr = copy_addr_to_reg (fnaddr);
24537
24538 mem = adjust_address (m_tramp, HImode, offset);
24539 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
24540
24541 mem = adjust_address (m_tramp, SImode, offset + 2);
24542 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
24543 offset += 6;
24544 }
24545 else
24546 {
24547 mem = adjust_address (m_tramp, HImode, offset);
24548 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
24549
24550 mem = adjust_address (m_tramp, DImode, offset + 2);
24551 emit_move_insn (mem, fnaddr);
24552 offset += 10;
24553 }
24554
24555 /* Load static chain using movabs to r10. Use the shorter movl
24556 instead of movabs when ptr_mode == SImode. */
24557 if (ptr_mode == SImode)
24558 {
24559 opcode = 0xba41;
24560 size = 6;
24561 }
24562 else
24563 {
24564 opcode = 0xba49;
24565 size = 10;
24566 }
24567
24568 mem = adjust_address (m_tramp, HImode, offset);
24569 emit_move_insn (mem, gen_int_mode (opcode, HImode));
24570
24571 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
24572 emit_move_insn (mem, chain_value);
24573 offset += size;
24574
24575 /* Jump to r11; the last (unused) byte is a nop, only there to
24576 pad the write out to a single 32-bit store. */
24577 mem = adjust_address (m_tramp, SImode, offset);
24578 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
24579 offset += 4;
24580 }
24581 else
24582 {
24583 rtx disp, chain;
24584
24585 /* Depending on the static chain location, either load a register
24586 with a constant, or push the constant to the stack. All of the
24587 instructions are the same size. */
24588 chain = ix86_static_chain (fndecl, true);
24589 if (REG_P (chain))
24590 {
24591 switch (REGNO (chain))
24592 {
24593 case AX_REG:
24594 opcode = 0xb8; break;
24595 case CX_REG:
24596 opcode = 0xb9; break;
24597 default:
24598 gcc_unreachable ();
24599 }
24600 }
24601 else
24602 opcode = 0x68;
24603
24604 mem = adjust_address (m_tramp, QImode, offset);
24605 emit_move_insn (mem, gen_int_mode (opcode, QImode));
24606
24607 mem = adjust_address (m_tramp, SImode, offset + 1);
24608 emit_move_insn (mem, chain_value);
24609 offset += 5;
24610
24611 mem = adjust_address (m_tramp, QImode, offset);
24612 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
24613
24614 mem = adjust_address (m_tramp, SImode, offset + 1);
24615
24616 /* Compute offset from the end of the jmp to the target function.
24617 In the case in which the trampoline stores the static chain on
24618 the stack, we need to skip the first insn which pushes the
24619 (call-saved) register static chain; this push is 1 byte. */
24620 offset += 5;
24621 disp = expand_binop (SImode, sub_optab, fnaddr,
24622 plus_constant (Pmode, XEXP (m_tramp, 0),
24623 offset - (MEM_P (chain) ? 1 : 0)),
24624 NULL_RTX, 1, OPTAB_DIRECT);
24625 emit_move_insn (mem, disp);
24626 }
24627
24628 gcc_assert (offset <= TRAMPOLINE_SIZE);
24629
24630 #ifdef HAVE_ENABLE_EXECUTE_STACK
24631 #ifdef CHECK_EXECUTE_STACK_ENABLED
24632 if (CHECK_EXECUTE_STACK_ENABLED)
24633 #endif
24634 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
24635 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
24636 #endif
24637 }
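
/* A minimal illustrative sketch (not part of the emitted code) of the
   64-bit trampoline laid out above when movabs is needed for both the
   function address and the static chain.  The address fields, shown here
   as zeros, are filled in at run time:

        49 bb <8-byte fnaddr>   movabs $fnaddr, %r11
        49 ba <8-byte chain>    movabs $chain,  %r10
        49 ff e3                jmp *%r11  (with REX.W+B prefix)
        90                      nop (pads the final 32-bit store)  */

static const unsigned char example_x86_64_trampoline[24] ATTRIBUTE_UNUSED =
{
  0x49, 0xbb, 0, 0, 0, 0, 0, 0, 0, 0,   /* movabs $fnaddr, %r11 */
  0x49, 0xba, 0, 0, 0, 0, 0, 0, 0, 0,   /* movabs $chain, %r10 */
  0x49, 0xff, 0xe3,                     /* jmp *%r11 */
  0x90                                  /* nop */
};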
24638 \f
24639 /* The following file contains several enumerations and data structures
24640 built from the definitions in i386-builtin-types.def. */
24641
24642 #include "i386-builtin-types.inc"
24643
24644 /* Table for the ix86 builtin non-function types. */
24645 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
24646
24647 /* Retrieve an element from the above table, building some of
24648 the types lazily. */
24649
24650 static tree
24651 ix86_get_builtin_type (enum ix86_builtin_type tcode)
24652 {
24653 unsigned int index;
24654 tree type, itype;
24655
24656 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
24657
24658 type = ix86_builtin_type_tab[(int) tcode];
24659 if (type != NULL)
24660 return type;
24661
24662 gcc_assert (tcode > IX86_BT_LAST_PRIM);
24663 if (tcode <= IX86_BT_LAST_VECT)
24664 {
24665 enum machine_mode mode;
24666
24667 index = tcode - IX86_BT_LAST_PRIM - 1;
24668 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
24669 mode = ix86_builtin_type_vect_mode[index];
24670
24671 type = build_vector_type_for_mode (itype, mode);
24672 }
24673 else
24674 {
24675 int quals;
24676
24677 index = tcode - IX86_BT_LAST_VECT - 1;
24678 if (tcode <= IX86_BT_LAST_PTR)
24679 quals = TYPE_UNQUALIFIED;
24680 else
24681 quals = TYPE_QUAL_CONST;
24682
24683 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
24684 if (quals != TYPE_UNQUALIFIED)
24685 itype = build_qualified_type (itype, quals);
24686
24687 type = build_pointer_type (itype);
24688 }
24689
24690 ix86_builtin_type_tab[(int) tcode] = type;
24691 return type;
24692 }
24693
24694 /* Table for the ix86 builtin function types. */
24695 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
24696
24697 /* Retrieve an element from the above table, building some of
24698 the types lazily. */
24699
24700 static tree
24701 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
24702 {
24703 tree type;
24704
24705 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
24706
24707 type = ix86_builtin_func_type_tab[(int) tcode];
24708 if (type != NULL)
24709 return type;
24710
24711 if (tcode <= IX86_BT_LAST_FUNC)
24712 {
24713 unsigned start = ix86_builtin_func_start[(int) tcode];
24714 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
24715 tree rtype, atype, args = void_list_node;
24716 unsigned i;
24717
24718 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
24719 for (i = after - 1; i > start; --i)
24720 {
24721 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
24722 args = tree_cons (NULL, atype, args);
24723 }
24724
24725 type = build_function_type (rtype, args);
24726 }
24727 else
24728 {
24729 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
24730 enum ix86_builtin_func_type icode;
24731
24732 icode = ix86_builtin_func_alias_base[index];
24733 type = ix86_get_builtin_func_type (icode);
24734 }
24735
24736 ix86_builtin_func_type_tab[(int) tcode] = type;
24737 return type;
24738 }
24739
24740
24741 /* Codes for all the SSE/MMX builtins. */
24742 enum ix86_builtins
24743 {
24744 IX86_BUILTIN_ADDPS,
24745 IX86_BUILTIN_ADDSS,
24746 IX86_BUILTIN_DIVPS,
24747 IX86_BUILTIN_DIVSS,
24748 IX86_BUILTIN_MULPS,
24749 IX86_BUILTIN_MULSS,
24750 IX86_BUILTIN_SUBPS,
24751 IX86_BUILTIN_SUBSS,
24752
24753 IX86_BUILTIN_CMPEQPS,
24754 IX86_BUILTIN_CMPLTPS,
24755 IX86_BUILTIN_CMPLEPS,
24756 IX86_BUILTIN_CMPGTPS,
24757 IX86_BUILTIN_CMPGEPS,
24758 IX86_BUILTIN_CMPNEQPS,
24759 IX86_BUILTIN_CMPNLTPS,
24760 IX86_BUILTIN_CMPNLEPS,
24761 IX86_BUILTIN_CMPNGTPS,
24762 IX86_BUILTIN_CMPNGEPS,
24763 IX86_BUILTIN_CMPORDPS,
24764 IX86_BUILTIN_CMPUNORDPS,
24765 IX86_BUILTIN_CMPEQSS,
24766 IX86_BUILTIN_CMPLTSS,
24767 IX86_BUILTIN_CMPLESS,
24768 IX86_BUILTIN_CMPNEQSS,
24769 IX86_BUILTIN_CMPNLTSS,
24770 IX86_BUILTIN_CMPNLESS,
24771 IX86_BUILTIN_CMPNGTSS,
24772 IX86_BUILTIN_CMPNGESS,
24773 IX86_BUILTIN_CMPORDSS,
24774 IX86_BUILTIN_CMPUNORDSS,
24775
24776 IX86_BUILTIN_COMIEQSS,
24777 IX86_BUILTIN_COMILTSS,
24778 IX86_BUILTIN_COMILESS,
24779 IX86_BUILTIN_COMIGTSS,
24780 IX86_BUILTIN_COMIGESS,
24781 IX86_BUILTIN_COMINEQSS,
24782 IX86_BUILTIN_UCOMIEQSS,
24783 IX86_BUILTIN_UCOMILTSS,
24784 IX86_BUILTIN_UCOMILESS,
24785 IX86_BUILTIN_UCOMIGTSS,
24786 IX86_BUILTIN_UCOMIGESS,
24787 IX86_BUILTIN_UCOMINEQSS,
24788
24789 IX86_BUILTIN_CVTPI2PS,
24790 IX86_BUILTIN_CVTPS2PI,
24791 IX86_BUILTIN_CVTSI2SS,
24792 IX86_BUILTIN_CVTSI642SS,
24793 IX86_BUILTIN_CVTSS2SI,
24794 IX86_BUILTIN_CVTSS2SI64,
24795 IX86_BUILTIN_CVTTPS2PI,
24796 IX86_BUILTIN_CVTTSS2SI,
24797 IX86_BUILTIN_CVTTSS2SI64,
24798
24799 IX86_BUILTIN_MAXPS,
24800 IX86_BUILTIN_MAXSS,
24801 IX86_BUILTIN_MINPS,
24802 IX86_BUILTIN_MINSS,
24803
24804 IX86_BUILTIN_LOADUPS,
24805 IX86_BUILTIN_STOREUPS,
24806 IX86_BUILTIN_MOVSS,
24807
24808 IX86_BUILTIN_MOVHLPS,
24809 IX86_BUILTIN_MOVLHPS,
24810 IX86_BUILTIN_LOADHPS,
24811 IX86_BUILTIN_LOADLPS,
24812 IX86_BUILTIN_STOREHPS,
24813 IX86_BUILTIN_STORELPS,
24814
24815 IX86_BUILTIN_MASKMOVQ,
24816 IX86_BUILTIN_MOVMSKPS,
24817 IX86_BUILTIN_PMOVMSKB,
24818
24819 IX86_BUILTIN_MOVNTPS,
24820 IX86_BUILTIN_MOVNTQ,
24821
24822 IX86_BUILTIN_LOADDQU,
24823 IX86_BUILTIN_STOREDQU,
24824
24825 IX86_BUILTIN_PACKSSWB,
24826 IX86_BUILTIN_PACKSSDW,
24827 IX86_BUILTIN_PACKUSWB,
24828
24829 IX86_BUILTIN_PADDB,
24830 IX86_BUILTIN_PADDW,
24831 IX86_BUILTIN_PADDD,
24832 IX86_BUILTIN_PADDQ,
24833 IX86_BUILTIN_PADDSB,
24834 IX86_BUILTIN_PADDSW,
24835 IX86_BUILTIN_PADDUSB,
24836 IX86_BUILTIN_PADDUSW,
24837 IX86_BUILTIN_PSUBB,
24838 IX86_BUILTIN_PSUBW,
24839 IX86_BUILTIN_PSUBD,
24840 IX86_BUILTIN_PSUBQ,
24841 IX86_BUILTIN_PSUBSB,
24842 IX86_BUILTIN_PSUBSW,
24843 IX86_BUILTIN_PSUBUSB,
24844 IX86_BUILTIN_PSUBUSW,
24845
24846 IX86_BUILTIN_PAND,
24847 IX86_BUILTIN_PANDN,
24848 IX86_BUILTIN_POR,
24849 IX86_BUILTIN_PXOR,
24850
24851 IX86_BUILTIN_PAVGB,
24852 IX86_BUILTIN_PAVGW,
24853
24854 IX86_BUILTIN_PCMPEQB,
24855 IX86_BUILTIN_PCMPEQW,
24856 IX86_BUILTIN_PCMPEQD,
24857 IX86_BUILTIN_PCMPGTB,
24858 IX86_BUILTIN_PCMPGTW,
24859 IX86_BUILTIN_PCMPGTD,
24860
24861 IX86_BUILTIN_PMADDWD,
24862
24863 IX86_BUILTIN_PMAXSW,
24864 IX86_BUILTIN_PMAXUB,
24865 IX86_BUILTIN_PMINSW,
24866 IX86_BUILTIN_PMINUB,
24867
24868 IX86_BUILTIN_PMULHUW,
24869 IX86_BUILTIN_PMULHW,
24870 IX86_BUILTIN_PMULLW,
24871
24872 IX86_BUILTIN_PSADBW,
24873 IX86_BUILTIN_PSHUFW,
24874
24875 IX86_BUILTIN_PSLLW,
24876 IX86_BUILTIN_PSLLD,
24877 IX86_BUILTIN_PSLLQ,
24878 IX86_BUILTIN_PSRAW,
24879 IX86_BUILTIN_PSRAD,
24880 IX86_BUILTIN_PSRLW,
24881 IX86_BUILTIN_PSRLD,
24882 IX86_BUILTIN_PSRLQ,
24883 IX86_BUILTIN_PSLLWI,
24884 IX86_BUILTIN_PSLLDI,
24885 IX86_BUILTIN_PSLLQI,
24886 IX86_BUILTIN_PSRAWI,
24887 IX86_BUILTIN_PSRADI,
24888 IX86_BUILTIN_PSRLWI,
24889 IX86_BUILTIN_PSRLDI,
24890 IX86_BUILTIN_PSRLQI,
24891
24892 IX86_BUILTIN_PUNPCKHBW,
24893 IX86_BUILTIN_PUNPCKHWD,
24894 IX86_BUILTIN_PUNPCKHDQ,
24895 IX86_BUILTIN_PUNPCKLBW,
24896 IX86_BUILTIN_PUNPCKLWD,
24897 IX86_BUILTIN_PUNPCKLDQ,
24898
24899 IX86_BUILTIN_SHUFPS,
24900
24901 IX86_BUILTIN_RCPPS,
24902 IX86_BUILTIN_RCPSS,
24903 IX86_BUILTIN_RSQRTPS,
24904 IX86_BUILTIN_RSQRTPS_NR,
24905 IX86_BUILTIN_RSQRTSS,
24906 IX86_BUILTIN_RSQRTF,
24907 IX86_BUILTIN_SQRTPS,
24908 IX86_BUILTIN_SQRTPS_NR,
24909 IX86_BUILTIN_SQRTSS,
24910
24911 IX86_BUILTIN_UNPCKHPS,
24912 IX86_BUILTIN_UNPCKLPS,
24913
24914 IX86_BUILTIN_ANDPS,
24915 IX86_BUILTIN_ANDNPS,
24916 IX86_BUILTIN_ORPS,
24917 IX86_BUILTIN_XORPS,
24918
24919 IX86_BUILTIN_EMMS,
24920 IX86_BUILTIN_LDMXCSR,
24921 IX86_BUILTIN_STMXCSR,
24922 IX86_BUILTIN_SFENCE,
24923
24924 /* 3DNow! Original */
24925 IX86_BUILTIN_FEMMS,
24926 IX86_BUILTIN_PAVGUSB,
24927 IX86_BUILTIN_PF2ID,
24928 IX86_BUILTIN_PFACC,
24929 IX86_BUILTIN_PFADD,
24930 IX86_BUILTIN_PFCMPEQ,
24931 IX86_BUILTIN_PFCMPGE,
24932 IX86_BUILTIN_PFCMPGT,
24933 IX86_BUILTIN_PFMAX,
24934 IX86_BUILTIN_PFMIN,
24935 IX86_BUILTIN_PFMUL,
24936 IX86_BUILTIN_PFRCP,
24937 IX86_BUILTIN_PFRCPIT1,
24938 IX86_BUILTIN_PFRCPIT2,
24939 IX86_BUILTIN_PFRSQIT1,
24940 IX86_BUILTIN_PFRSQRT,
24941 IX86_BUILTIN_PFSUB,
24942 IX86_BUILTIN_PFSUBR,
24943 IX86_BUILTIN_PI2FD,
24944 IX86_BUILTIN_PMULHRW,
24945
24946 /* 3DNow! Athlon Extensions */
24947 IX86_BUILTIN_PF2IW,
24948 IX86_BUILTIN_PFNACC,
24949 IX86_BUILTIN_PFPNACC,
24950 IX86_BUILTIN_PI2FW,
24951 IX86_BUILTIN_PSWAPDSI,
24952 IX86_BUILTIN_PSWAPDSF,
24953
24954 /* SSE2 */
24955 IX86_BUILTIN_ADDPD,
24956 IX86_BUILTIN_ADDSD,
24957 IX86_BUILTIN_DIVPD,
24958 IX86_BUILTIN_DIVSD,
24959 IX86_BUILTIN_MULPD,
24960 IX86_BUILTIN_MULSD,
24961 IX86_BUILTIN_SUBPD,
24962 IX86_BUILTIN_SUBSD,
24963
24964 IX86_BUILTIN_CMPEQPD,
24965 IX86_BUILTIN_CMPLTPD,
24966 IX86_BUILTIN_CMPLEPD,
24967 IX86_BUILTIN_CMPGTPD,
24968 IX86_BUILTIN_CMPGEPD,
24969 IX86_BUILTIN_CMPNEQPD,
24970 IX86_BUILTIN_CMPNLTPD,
24971 IX86_BUILTIN_CMPNLEPD,
24972 IX86_BUILTIN_CMPNGTPD,
24973 IX86_BUILTIN_CMPNGEPD,
24974 IX86_BUILTIN_CMPORDPD,
24975 IX86_BUILTIN_CMPUNORDPD,
24976 IX86_BUILTIN_CMPEQSD,
24977 IX86_BUILTIN_CMPLTSD,
24978 IX86_BUILTIN_CMPLESD,
24979 IX86_BUILTIN_CMPNEQSD,
24980 IX86_BUILTIN_CMPNLTSD,
24981 IX86_BUILTIN_CMPNLESD,
24982 IX86_BUILTIN_CMPORDSD,
24983 IX86_BUILTIN_CMPUNORDSD,
24984
24985 IX86_BUILTIN_COMIEQSD,
24986 IX86_BUILTIN_COMILTSD,
24987 IX86_BUILTIN_COMILESD,
24988 IX86_BUILTIN_COMIGTSD,
24989 IX86_BUILTIN_COMIGESD,
24990 IX86_BUILTIN_COMINEQSD,
24991 IX86_BUILTIN_UCOMIEQSD,
24992 IX86_BUILTIN_UCOMILTSD,
24993 IX86_BUILTIN_UCOMILESD,
24994 IX86_BUILTIN_UCOMIGTSD,
24995 IX86_BUILTIN_UCOMIGESD,
24996 IX86_BUILTIN_UCOMINEQSD,
24997
24998 IX86_BUILTIN_MAXPD,
24999 IX86_BUILTIN_MAXSD,
25000 IX86_BUILTIN_MINPD,
25001 IX86_BUILTIN_MINSD,
25002
25003 IX86_BUILTIN_ANDPD,
25004 IX86_BUILTIN_ANDNPD,
25005 IX86_BUILTIN_ORPD,
25006 IX86_BUILTIN_XORPD,
25007
25008 IX86_BUILTIN_SQRTPD,
25009 IX86_BUILTIN_SQRTSD,
25010
25011 IX86_BUILTIN_UNPCKHPD,
25012 IX86_BUILTIN_UNPCKLPD,
25013
25014 IX86_BUILTIN_SHUFPD,
25015
25016 IX86_BUILTIN_LOADUPD,
25017 IX86_BUILTIN_STOREUPD,
25018 IX86_BUILTIN_MOVSD,
25019
25020 IX86_BUILTIN_LOADHPD,
25021 IX86_BUILTIN_LOADLPD,
25022
25023 IX86_BUILTIN_CVTDQ2PD,
25024 IX86_BUILTIN_CVTDQ2PS,
25025
25026 IX86_BUILTIN_CVTPD2DQ,
25027 IX86_BUILTIN_CVTPD2PI,
25028 IX86_BUILTIN_CVTPD2PS,
25029 IX86_BUILTIN_CVTTPD2DQ,
25030 IX86_BUILTIN_CVTTPD2PI,
25031
25032 IX86_BUILTIN_CVTPI2PD,
25033 IX86_BUILTIN_CVTSI2SD,
25034 IX86_BUILTIN_CVTSI642SD,
25035
25036 IX86_BUILTIN_CVTSD2SI,
25037 IX86_BUILTIN_CVTSD2SI64,
25038 IX86_BUILTIN_CVTSD2SS,
25039 IX86_BUILTIN_CVTSS2SD,
25040 IX86_BUILTIN_CVTTSD2SI,
25041 IX86_BUILTIN_CVTTSD2SI64,
25042
25043 IX86_BUILTIN_CVTPS2DQ,
25044 IX86_BUILTIN_CVTPS2PD,
25045 IX86_BUILTIN_CVTTPS2DQ,
25046
25047 IX86_BUILTIN_MOVNTI,
25048 IX86_BUILTIN_MOVNTI64,
25049 IX86_BUILTIN_MOVNTPD,
25050 IX86_BUILTIN_MOVNTDQ,
25051
25052 IX86_BUILTIN_MOVQ128,
25053
25054 /* SSE2 MMX */
25055 IX86_BUILTIN_MASKMOVDQU,
25056 IX86_BUILTIN_MOVMSKPD,
25057 IX86_BUILTIN_PMOVMSKB128,
25058
25059 IX86_BUILTIN_PACKSSWB128,
25060 IX86_BUILTIN_PACKSSDW128,
25061 IX86_BUILTIN_PACKUSWB128,
25062
25063 IX86_BUILTIN_PADDB128,
25064 IX86_BUILTIN_PADDW128,
25065 IX86_BUILTIN_PADDD128,
25066 IX86_BUILTIN_PADDQ128,
25067 IX86_BUILTIN_PADDSB128,
25068 IX86_BUILTIN_PADDSW128,
25069 IX86_BUILTIN_PADDUSB128,
25070 IX86_BUILTIN_PADDUSW128,
25071 IX86_BUILTIN_PSUBB128,
25072 IX86_BUILTIN_PSUBW128,
25073 IX86_BUILTIN_PSUBD128,
25074 IX86_BUILTIN_PSUBQ128,
25075 IX86_BUILTIN_PSUBSB128,
25076 IX86_BUILTIN_PSUBSW128,
25077 IX86_BUILTIN_PSUBUSB128,
25078 IX86_BUILTIN_PSUBUSW128,
25079
25080 IX86_BUILTIN_PAND128,
25081 IX86_BUILTIN_PANDN128,
25082 IX86_BUILTIN_POR128,
25083 IX86_BUILTIN_PXOR128,
25084
25085 IX86_BUILTIN_PAVGB128,
25086 IX86_BUILTIN_PAVGW128,
25087
25088 IX86_BUILTIN_PCMPEQB128,
25089 IX86_BUILTIN_PCMPEQW128,
25090 IX86_BUILTIN_PCMPEQD128,
25091 IX86_BUILTIN_PCMPGTB128,
25092 IX86_BUILTIN_PCMPGTW128,
25093 IX86_BUILTIN_PCMPGTD128,
25094
25095 IX86_BUILTIN_PMADDWD128,
25096
25097 IX86_BUILTIN_PMAXSW128,
25098 IX86_BUILTIN_PMAXUB128,
25099 IX86_BUILTIN_PMINSW128,
25100 IX86_BUILTIN_PMINUB128,
25101
25102 IX86_BUILTIN_PMULUDQ,
25103 IX86_BUILTIN_PMULUDQ128,
25104 IX86_BUILTIN_PMULHUW128,
25105 IX86_BUILTIN_PMULHW128,
25106 IX86_BUILTIN_PMULLW128,
25107
25108 IX86_BUILTIN_PSADBW128,
25109 IX86_BUILTIN_PSHUFHW,
25110 IX86_BUILTIN_PSHUFLW,
25111 IX86_BUILTIN_PSHUFD,
25112
25113 IX86_BUILTIN_PSLLDQI128,
25114 IX86_BUILTIN_PSLLWI128,
25115 IX86_BUILTIN_PSLLDI128,
25116 IX86_BUILTIN_PSLLQI128,
25117 IX86_BUILTIN_PSRAWI128,
25118 IX86_BUILTIN_PSRADI128,
25119 IX86_BUILTIN_PSRLDQI128,
25120 IX86_BUILTIN_PSRLWI128,
25121 IX86_BUILTIN_PSRLDI128,
25122 IX86_BUILTIN_PSRLQI128,
25123
25124 IX86_BUILTIN_PSLLDQ128,
25125 IX86_BUILTIN_PSLLW128,
25126 IX86_BUILTIN_PSLLD128,
25127 IX86_BUILTIN_PSLLQ128,
25128 IX86_BUILTIN_PSRAW128,
25129 IX86_BUILTIN_PSRAD128,
25130 IX86_BUILTIN_PSRLW128,
25131 IX86_BUILTIN_PSRLD128,
25132 IX86_BUILTIN_PSRLQ128,
25133
25134 IX86_BUILTIN_PUNPCKHBW128,
25135 IX86_BUILTIN_PUNPCKHWD128,
25136 IX86_BUILTIN_PUNPCKHDQ128,
25137 IX86_BUILTIN_PUNPCKHQDQ128,
25138 IX86_BUILTIN_PUNPCKLBW128,
25139 IX86_BUILTIN_PUNPCKLWD128,
25140 IX86_BUILTIN_PUNPCKLDQ128,
25141 IX86_BUILTIN_PUNPCKLQDQ128,
25142
25143 IX86_BUILTIN_CLFLUSH,
25144 IX86_BUILTIN_MFENCE,
25145 IX86_BUILTIN_LFENCE,
25146 IX86_BUILTIN_PAUSE,
25147
25148 IX86_BUILTIN_BSRSI,
25149 IX86_BUILTIN_BSRDI,
25150 IX86_BUILTIN_RDPMC,
25151 IX86_BUILTIN_RDTSC,
25152 IX86_BUILTIN_RDTSCP,
25153 IX86_BUILTIN_ROLQI,
25154 IX86_BUILTIN_ROLHI,
25155 IX86_BUILTIN_RORQI,
25156 IX86_BUILTIN_RORHI,
25157
25158 /* SSE3. */
25159 IX86_BUILTIN_ADDSUBPS,
25160 IX86_BUILTIN_HADDPS,
25161 IX86_BUILTIN_HSUBPS,
25162 IX86_BUILTIN_MOVSHDUP,
25163 IX86_BUILTIN_MOVSLDUP,
25164 IX86_BUILTIN_ADDSUBPD,
25165 IX86_BUILTIN_HADDPD,
25166 IX86_BUILTIN_HSUBPD,
25167 IX86_BUILTIN_LDDQU,
25168
25169 IX86_BUILTIN_MONITOR,
25170 IX86_BUILTIN_MWAIT,
25171
25172 /* SSSE3. */
25173 IX86_BUILTIN_PHADDW,
25174 IX86_BUILTIN_PHADDD,
25175 IX86_BUILTIN_PHADDSW,
25176 IX86_BUILTIN_PHSUBW,
25177 IX86_BUILTIN_PHSUBD,
25178 IX86_BUILTIN_PHSUBSW,
25179 IX86_BUILTIN_PMADDUBSW,
25180 IX86_BUILTIN_PMULHRSW,
25181 IX86_BUILTIN_PSHUFB,
25182 IX86_BUILTIN_PSIGNB,
25183 IX86_BUILTIN_PSIGNW,
25184 IX86_BUILTIN_PSIGND,
25185 IX86_BUILTIN_PALIGNR,
25186 IX86_BUILTIN_PABSB,
25187 IX86_BUILTIN_PABSW,
25188 IX86_BUILTIN_PABSD,
25189
25190 IX86_BUILTIN_PHADDW128,
25191 IX86_BUILTIN_PHADDD128,
25192 IX86_BUILTIN_PHADDSW128,
25193 IX86_BUILTIN_PHSUBW128,
25194 IX86_BUILTIN_PHSUBD128,
25195 IX86_BUILTIN_PHSUBSW128,
25196 IX86_BUILTIN_PMADDUBSW128,
25197 IX86_BUILTIN_PMULHRSW128,
25198 IX86_BUILTIN_PSHUFB128,
25199 IX86_BUILTIN_PSIGNB128,
25200 IX86_BUILTIN_PSIGNW128,
25201 IX86_BUILTIN_PSIGND128,
25202 IX86_BUILTIN_PALIGNR128,
25203 IX86_BUILTIN_PABSB128,
25204 IX86_BUILTIN_PABSW128,
25205 IX86_BUILTIN_PABSD128,
25206
25207 /* AMDFAM10 - SSE4A New Instructions. */
25208 IX86_BUILTIN_MOVNTSD,
25209 IX86_BUILTIN_MOVNTSS,
25210 IX86_BUILTIN_EXTRQI,
25211 IX86_BUILTIN_EXTRQ,
25212 IX86_BUILTIN_INSERTQI,
25213 IX86_BUILTIN_INSERTQ,
25214
25215 /* SSE4.1. */
25216 IX86_BUILTIN_BLENDPD,
25217 IX86_BUILTIN_BLENDPS,
25218 IX86_BUILTIN_BLENDVPD,
25219 IX86_BUILTIN_BLENDVPS,
25220 IX86_BUILTIN_PBLENDVB128,
25221 IX86_BUILTIN_PBLENDW128,
25222
25223 IX86_BUILTIN_DPPD,
25224 IX86_BUILTIN_DPPS,
25225
25226 IX86_BUILTIN_INSERTPS128,
25227
25228 IX86_BUILTIN_MOVNTDQA,
25229 IX86_BUILTIN_MPSADBW128,
25230 IX86_BUILTIN_PACKUSDW128,
25231 IX86_BUILTIN_PCMPEQQ,
25232 IX86_BUILTIN_PHMINPOSUW128,
25233
25234 IX86_BUILTIN_PMAXSB128,
25235 IX86_BUILTIN_PMAXSD128,
25236 IX86_BUILTIN_PMAXUD128,
25237 IX86_BUILTIN_PMAXUW128,
25238
25239 IX86_BUILTIN_PMINSB128,
25240 IX86_BUILTIN_PMINSD128,
25241 IX86_BUILTIN_PMINUD128,
25242 IX86_BUILTIN_PMINUW128,
25243
25244 IX86_BUILTIN_PMOVSXBW128,
25245 IX86_BUILTIN_PMOVSXBD128,
25246 IX86_BUILTIN_PMOVSXBQ128,
25247 IX86_BUILTIN_PMOVSXWD128,
25248 IX86_BUILTIN_PMOVSXWQ128,
25249 IX86_BUILTIN_PMOVSXDQ128,
25250
25251 IX86_BUILTIN_PMOVZXBW128,
25252 IX86_BUILTIN_PMOVZXBD128,
25253 IX86_BUILTIN_PMOVZXBQ128,
25254 IX86_BUILTIN_PMOVZXWD128,
25255 IX86_BUILTIN_PMOVZXWQ128,
25256 IX86_BUILTIN_PMOVZXDQ128,
25257
25258 IX86_BUILTIN_PMULDQ128,
25259 IX86_BUILTIN_PMULLD128,
25260
25261 IX86_BUILTIN_ROUNDSD,
25262 IX86_BUILTIN_ROUNDSS,
25263
25264 IX86_BUILTIN_ROUNDPD,
25265 IX86_BUILTIN_ROUNDPS,
25266
25267 IX86_BUILTIN_FLOORPD,
25268 IX86_BUILTIN_CEILPD,
25269 IX86_BUILTIN_TRUNCPD,
25270 IX86_BUILTIN_RINTPD,
25271 IX86_BUILTIN_ROUNDPD_AZ,
25272
25273 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
25274 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
25275 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
25276
25277 IX86_BUILTIN_FLOORPS,
25278 IX86_BUILTIN_CEILPS,
25279 IX86_BUILTIN_TRUNCPS,
25280 IX86_BUILTIN_RINTPS,
25281 IX86_BUILTIN_ROUNDPS_AZ,
25282
25283 IX86_BUILTIN_FLOORPS_SFIX,
25284 IX86_BUILTIN_CEILPS_SFIX,
25285 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
25286
25287 IX86_BUILTIN_PTESTZ,
25288 IX86_BUILTIN_PTESTC,
25289 IX86_BUILTIN_PTESTNZC,
25290
25291 IX86_BUILTIN_VEC_INIT_V2SI,
25292 IX86_BUILTIN_VEC_INIT_V4HI,
25293 IX86_BUILTIN_VEC_INIT_V8QI,
25294 IX86_BUILTIN_VEC_EXT_V2DF,
25295 IX86_BUILTIN_VEC_EXT_V2DI,
25296 IX86_BUILTIN_VEC_EXT_V4SF,
25297 IX86_BUILTIN_VEC_EXT_V4SI,
25298 IX86_BUILTIN_VEC_EXT_V8HI,
25299 IX86_BUILTIN_VEC_EXT_V2SI,
25300 IX86_BUILTIN_VEC_EXT_V4HI,
25301 IX86_BUILTIN_VEC_EXT_V16QI,
25302 IX86_BUILTIN_VEC_SET_V2DI,
25303 IX86_BUILTIN_VEC_SET_V4SF,
25304 IX86_BUILTIN_VEC_SET_V4SI,
25305 IX86_BUILTIN_VEC_SET_V8HI,
25306 IX86_BUILTIN_VEC_SET_V4HI,
25307 IX86_BUILTIN_VEC_SET_V16QI,
25308
25309 IX86_BUILTIN_VEC_PACK_SFIX,
25310 IX86_BUILTIN_VEC_PACK_SFIX256,
25311
25312 /* SSE4.2. */
25313 IX86_BUILTIN_CRC32QI,
25314 IX86_BUILTIN_CRC32HI,
25315 IX86_BUILTIN_CRC32SI,
25316 IX86_BUILTIN_CRC32DI,
25317
25318 IX86_BUILTIN_PCMPESTRI128,
25319 IX86_BUILTIN_PCMPESTRM128,
25320 IX86_BUILTIN_PCMPESTRA128,
25321 IX86_BUILTIN_PCMPESTRC128,
25322 IX86_BUILTIN_PCMPESTRO128,
25323 IX86_BUILTIN_PCMPESTRS128,
25324 IX86_BUILTIN_PCMPESTRZ128,
25325 IX86_BUILTIN_PCMPISTRI128,
25326 IX86_BUILTIN_PCMPISTRM128,
25327 IX86_BUILTIN_PCMPISTRA128,
25328 IX86_BUILTIN_PCMPISTRC128,
25329 IX86_BUILTIN_PCMPISTRO128,
25330 IX86_BUILTIN_PCMPISTRS128,
25331 IX86_BUILTIN_PCMPISTRZ128,
25332
25333 IX86_BUILTIN_PCMPGTQ,
25334
25335 /* AES instructions */
25336 IX86_BUILTIN_AESENC128,
25337 IX86_BUILTIN_AESENCLAST128,
25338 IX86_BUILTIN_AESDEC128,
25339 IX86_BUILTIN_AESDECLAST128,
25340 IX86_BUILTIN_AESIMC128,
25341 IX86_BUILTIN_AESKEYGENASSIST128,
25342
25343 /* PCLMUL instruction */
25344 IX86_BUILTIN_PCLMULQDQ128,
25345
25346 /* AVX */
25347 IX86_BUILTIN_ADDPD256,
25348 IX86_BUILTIN_ADDPS256,
25349 IX86_BUILTIN_ADDSUBPD256,
25350 IX86_BUILTIN_ADDSUBPS256,
25351 IX86_BUILTIN_ANDPD256,
25352 IX86_BUILTIN_ANDPS256,
25353 IX86_BUILTIN_ANDNPD256,
25354 IX86_BUILTIN_ANDNPS256,
25355 IX86_BUILTIN_BLENDPD256,
25356 IX86_BUILTIN_BLENDPS256,
25357 IX86_BUILTIN_BLENDVPD256,
25358 IX86_BUILTIN_BLENDVPS256,
25359 IX86_BUILTIN_DIVPD256,
25360 IX86_BUILTIN_DIVPS256,
25361 IX86_BUILTIN_DPPS256,
25362 IX86_BUILTIN_HADDPD256,
25363 IX86_BUILTIN_HADDPS256,
25364 IX86_BUILTIN_HSUBPD256,
25365 IX86_BUILTIN_HSUBPS256,
25366 IX86_BUILTIN_MAXPD256,
25367 IX86_BUILTIN_MAXPS256,
25368 IX86_BUILTIN_MINPD256,
25369 IX86_BUILTIN_MINPS256,
25370 IX86_BUILTIN_MULPD256,
25371 IX86_BUILTIN_MULPS256,
25372 IX86_BUILTIN_ORPD256,
25373 IX86_BUILTIN_ORPS256,
25374 IX86_BUILTIN_SHUFPD256,
25375 IX86_BUILTIN_SHUFPS256,
25376 IX86_BUILTIN_SUBPD256,
25377 IX86_BUILTIN_SUBPS256,
25378 IX86_BUILTIN_XORPD256,
25379 IX86_BUILTIN_XORPS256,
25380 IX86_BUILTIN_CMPSD,
25381 IX86_BUILTIN_CMPSS,
25382 IX86_BUILTIN_CMPPD,
25383 IX86_BUILTIN_CMPPS,
25384 IX86_BUILTIN_CMPPD256,
25385 IX86_BUILTIN_CMPPS256,
25386 IX86_BUILTIN_CVTDQ2PD256,
25387 IX86_BUILTIN_CVTDQ2PS256,
25388 IX86_BUILTIN_CVTPD2PS256,
25389 IX86_BUILTIN_CVTPS2DQ256,
25390 IX86_BUILTIN_CVTPS2PD256,
25391 IX86_BUILTIN_CVTTPD2DQ256,
25392 IX86_BUILTIN_CVTPD2DQ256,
25393 IX86_BUILTIN_CVTTPS2DQ256,
25394 IX86_BUILTIN_EXTRACTF128PD256,
25395 IX86_BUILTIN_EXTRACTF128PS256,
25396 IX86_BUILTIN_EXTRACTF128SI256,
25397 IX86_BUILTIN_VZEROALL,
25398 IX86_BUILTIN_VZEROUPPER,
25399 IX86_BUILTIN_VPERMILVARPD,
25400 IX86_BUILTIN_VPERMILVARPS,
25401 IX86_BUILTIN_VPERMILVARPD256,
25402 IX86_BUILTIN_VPERMILVARPS256,
25403 IX86_BUILTIN_VPERMILPD,
25404 IX86_BUILTIN_VPERMILPS,
25405 IX86_BUILTIN_VPERMILPD256,
25406 IX86_BUILTIN_VPERMILPS256,
25407 IX86_BUILTIN_VPERMIL2PD,
25408 IX86_BUILTIN_VPERMIL2PS,
25409 IX86_BUILTIN_VPERMIL2PD256,
25410 IX86_BUILTIN_VPERMIL2PS256,
25411 IX86_BUILTIN_VPERM2F128PD256,
25412 IX86_BUILTIN_VPERM2F128PS256,
25413 IX86_BUILTIN_VPERM2F128SI256,
25414 IX86_BUILTIN_VBROADCASTSS,
25415 IX86_BUILTIN_VBROADCASTSD256,
25416 IX86_BUILTIN_VBROADCASTSS256,
25417 IX86_BUILTIN_VBROADCASTPD256,
25418 IX86_BUILTIN_VBROADCASTPS256,
25419 IX86_BUILTIN_VINSERTF128PD256,
25420 IX86_BUILTIN_VINSERTF128PS256,
25421 IX86_BUILTIN_VINSERTF128SI256,
25422 IX86_BUILTIN_LOADUPD256,
25423 IX86_BUILTIN_LOADUPS256,
25424 IX86_BUILTIN_STOREUPD256,
25425 IX86_BUILTIN_STOREUPS256,
25426 IX86_BUILTIN_LDDQU256,
25427 IX86_BUILTIN_MOVNTDQ256,
25428 IX86_BUILTIN_MOVNTPD256,
25429 IX86_BUILTIN_MOVNTPS256,
25430 IX86_BUILTIN_LOADDQU256,
25431 IX86_BUILTIN_STOREDQU256,
25432 IX86_BUILTIN_MASKLOADPD,
25433 IX86_BUILTIN_MASKLOADPS,
25434 IX86_BUILTIN_MASKSTOREPD,
25435 IX86_BUILTIN_MASKSTOREPS,
25436 IX86_BUILTIN_MASKLOADPD256,
25437 IX86_BUILTIN_MASKLOADPS256,
25438 IX86_BUILTIN_MASKSTOREPD256,
25439 IX86_BUILTIN_MASKSTOREPS256,
25440 IX86_BUILTIN_MOVSHDUP256,
25441 IX86_BUILTIN_MOVSLDUP256,
25442 IX86_BUILTIN_MOVDDUP256,
25443
25444 IX86_BUILTIN_SQRTPD256,
25445 IX86_BUILTIN_SQRTPS256,
25446 IX86_BUILTIN_SQRTPS_NR256,
25447 IX86_BUILTIN_RSQRTPS256,
25448 IX86_BUILTIN_RSQRTPS_NR256,
25449
25450 IX86_BUILTIN_RCPPS256,
25451
25452 IX86_BUILTIN_ROUNDPD256,
25453 IX86_BUILTIN_ROUNDPS256,
25454
25455 IX86_BUILTIN_FLOORPD256,
25456 IX86_BUILTIN_CEILPD256,
25457 IX86_BUILTIN_TRUNCPD256,
25458 IX86_BUILTIN_RINTPD256,
25459 IX86_BUILTIN_ROUNDPD_AZ256,
25460
25461 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
25462 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
25463 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
25464
25465 IX86_BUILTIN_FLOORPS256,
25466 IX86_BUILTIN_CEILPS256,
25467 IX86_BUILTIN_TRUNCPS256,
25468 IX86_BUILTIN_RINTPS256,
25469 IX86_BUILTIN_ROUNDPS_AZ256,
25470
25471 IX86_BUILTIN_FLOORPS_SFIX256,
25472 IX86_BUILTIN_CEILPS_SFIX256,
25473 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
25474
25475 IX86_BUILTIN_UNPCKHPD256,
25476 IX86_BUILTIN_UNPCKLPD256,
25477 IX86_BUILTIN_UNPCKHPS256,
25478 IX86_BUILTIN_UNPCKLPS256,
25479
25480 IX86_BUILTIN_SI256_SI,
25481 IX86_BUILTIN_PS256_PS,
25482 IX86_BUILTIN_PD256_PD,
25483 IX86_BUILTIN_SI_SI256,
25484 IX86_BUILTIN_PS_PS256,
25485 IX86_BUILTIN_PD_PD256,
25486
25487 IX86_BUILTIN_VTESTZPD,
25488 IX86_BUILTIN_VTESTCPD,
25489 IX86_BUILTIN_VTESTNZCPD,
25490 IX86_BUILTIN_VTESTZPS,
25491 IX86_BUILTIN_VTESTCPS,
25492 IX86_BUILTIN_VTESTNZCPS,
25493 IX86_BUILTIN_VTESTZPD256,
25494 IX86_BUILTIN_VTESTCPD256,
25495 IX86_BUILTIN_VTESTNZCPD256,
25496 IX86_BUILTIN_VTESTZPS256,
25497 IX86_BUILTIN_VTESTCPS256,
25498 IX86_BUILTIN_VTESTNZCPS256,
25499 IX86_BUILTIN_PTESTZ256,
25500 IX86_BUILTIN_PTESTC256,
25501 IX86_BUILTIN_PTESTNZC256,
25502
25503 IX86_BUILTIN_MOVMSKPD256,
25504 IX86_BUILTIN_MOVMSKPS256,
25505
25506 /* AVX2 */
25507 IX86_BUILTIN_MPSADBW256,
25508 IX86_BUILTIN_PABSB256,
25509 IX86_BUILTIN_PABSW256,
25510 IX86_BUILTIN_PABSD256,
25511 IX86_BUILTIN_PACKSSDW256,
25512 IX86_BUILTIN_PACKSSWB256,
25513 IX86_BUILTIN_PACKUSDW256,
25514 IX86_BUILTIN_PACKUSWB256,
25515 IX86_BUILTIN_PADDB256,
25516 IX86_BUILTIN_PADDW256,
25517 IX86_BUILTIN_PADDD256,
25518 IX86_BUILTIN_PADDQ256,
25519 IX86_BUILTIN_PADDSB256,
25520 IX86_BUILTIN_PADDSW256,
25521 IX86_BUILTIN_PADDUSB256,
25522 IX86_BUILTIN_PADDUSW256,
25523 IX86_BUILTIN_PALIGNR256,
25524 IX86_BUILTIN_AND256I,
25525 IX86_BUILTIN_ANDNOT256I,
25526 IX86_BUILTIN_PAVGB256,
25527 IX86_BUILTIN_PAVGW256,
25528 IX86_BUILTIN_PBLENDVB256,
25529 IX86_BUILTIN_PBLENDVW256,
25530 IX86_BUILTIN_PCMPEQB256,
25531 IX86_BUILTIN_PCMPEQW256,
25532 IX86_BUILTIN_PCMPEQD256,
25533 IX86_BUILTIN_PCMPEQQ256,
25534 IX86_BUILTIN_PCMPGTB256,
25535 IX86_BUILTIN_PCMPGTW256,
25536 IX86_BUILTIN_PCMPGTD256,
25537 IX86_BUILTIN_PCMPGTQ256,
25538 IX86_BUILTIN_PHADDW256,
25539 IX86_BUILTIN_PHADDD256,
25540 IX86_BUILTIN_PHADDSW256,
25541 IX86_BUILTIN_PHSUBW256,
25542 IX86_BUILTIN_PHSUBD256,
25543 IX86_BUILTIN_PHSUBSW256,
25544 IX86_BUILTIN_PMADDUBSW256,
25545 IX86_BUILTIN_PMADDWD256,
25546 IX86_BUILTIN_PMAXSB256,
25547 IX86_BUILTIN_PMAXSW256,
25548 IX86_BUILTIN_PMAXSD256,
25549 IX86_BUILTIN_PMAXUB256,
25550 IX86_BUILTIN_PMAXUW256,
25551 IX86_BUILTIN_PMAXUD256,
25552 IX86_BUILTIN_PMINSB256,
25553 IX86_BUILTIN_PMINSW256,
25554 IX86_BUILTIN_PMINSD256,
25555 IX86_BUILTIN_PMINUB256,
25556 IX86_BUILTIN_PMINUW256,
25557 IX86_BUILTIN_PMINUD256,
25558 IX86_BUILTIN_PMOVMSKB256,
25559 IX86_BUILTIN_PMOVSXBW256,
25560 IX86_BUILTIN_PMOVSXBD256,
25561 IX86_BUILTIN_PMOVSXBQ256,
25562 IX86_BUILTIN_PMOVSXWD256,
25563 IX86_BUILTIN_PMOVSXWQ256,
25564 IX86_BUILTIN_PMOVSXDQ256,
25565 IX86_BUILTIN_PMOVZXBW256,
25566 IX86_BUILTIN_PMOVZXBD256,
25567 IX86_BUILTIN_PMOVZXBQ256,
25568 IX86_BUILTIN_PMOVZXWD256,
25569 IX86_BUILTIN_PMOVZXWQ256,
25570 IX86_BUILTIN_PMOVZXDQ256,
25571 IX86_BUILTIN_PMULDQ256,
25572 IX86_BUILTIN_PMULHRSW256,
25573 IX86_BUILTIN_PMULHUW256,
25574 IX86_BUILTIN_PMULHW256,
25575 IX86_BUILTIN_PMULLW256,
25576 IX86_BUILTIN_PMULLD256,
25577 IX86_BUILTIN_PMULUDQ256,
25578 IX86_BUILTIN_POR256,
25579 IX86_BUILTIN_PSADBW256,
25580 IX86_BUILTIN_PSHUFB256,
25581 IX86_BUILTIN_PSHUFD256,
25582 IX86_BUILTIN_PSHUFHW256,
25583 IX86_BUILTIN_PSHUFLW256,
25584 IX86_BUILTIN_PSIGNB256,
25585 IX86_BUILTIN_PSIGNW256,
25586 IX86_BUILTIN_PSIGND256,
25587 IX86_BUILTIN_PSLLDQI256,
25588 IX86_BUILTIN_PSLLWI256,
25589 IX86_BUILTIN_PSLLW256,
25590 IX86_BUILTIN_PSLLDI256,
25591 IX86_BUILTIN_PSLLD256,
25592 IX86_BUILTIN_PSLLQI256,
25593 IX86_BUILTIN_PSLLQ256,
25594 IX86_BUILTIN_PSRAWI256,
25595 IX86_BUILTIN_PSRAW256,
25596 IX86_BUILTIN_PSRADI256,
25597 IX86_BUILTIN_PSRAD256,
25598 IX86_BUILTIN_PSRLDQI256,
25599 IX86_BUILTIN_PSRLWI256,
25600 IX86_BUILTIN_PSRLW256,
25601 IX86_BUILTIN_PSRLDI256,
25602 IX86_BUILTIN_PSRLD256,
25603 IX86_BUILTIN_PSRLQI256,
25604 IX86_BUILTIN_PSRLQ256,
25605 IX86_BUILTIN_PSUBB256,
25606 IX86_BUILTIN_PSUBW256,
25607 IX86_BUILTIN_PSUBD256,
25608 IX86_BUILTIN_PSUBQ256,
25609 IX86_BUILTIN_PSUBSB256,
25610 IX86_BUILTIN_PSUBSW256,
25611 IX86_BUILTIN_PSUBUSB256,
25612 IX86_BUILTIN_PSUBUSW256,
25613 IX86_BUILTIN_PUNPCKHBW256,
25614 IX86_BUILTIN_PUNPCKHWD256,
25615 IX86_BUILTIN_PUNPCKHDQ256,
25616 IX86_BUILTIN_PUNPCKHQDQ256,
25617 IX86_BUILTIN_PUNPCKLBW256,
25618 IX86_BUILTIN_PUNPCKLWD256,
25619 IX86_BUILTIN_PUNPCKLDQ256,
25620 IX86_BUILTIN_PUNPCKLQDQ256,
25621 IX86_BUILTIN_PXOR256,
25622 IX86_BUILTIN_MOVNTDQA256,
25623 IX86_BUILTIN_VBROADCASTSS_PS,
25624 IX86_BUILTIN_VBROADCASTSS_PS256,
25625 IX86_BUILTIN_VBROADCASTSD_PD256,
25626 IX86_BUILTIN_VBROADCASTSI256,
25627 IX86_BUILTIN_PBLENDD256,
25628 IX86_BUILTIN_PBLENDD128,
25629 IX86_BUILTIN_PBROADCASTB256,
25630 IX86_BUILTIN_PBROADCASTW256,
25631 IX86_BUILTIN_PBROADCASTD256,
25632 IX86_BUILTIN_PBROADCASTQ256,
25633 IX86_BUILTIN_PBROADCASTB128,
25634 IX86_BUILTIN_PBROADCASTW128,
25635 IX86_BUILTIN_PBROADCASTD128,
25636 IX86_BUILTIN_PBROADCASTQ128,
25637 IX86_BUILTIN_VPERMVARSI256,
25638 IX86_BUILTIN_VPERMDF256,
25639 IX86_BUILTIN_VPERMVARSF256,
25640 IX86_BUILTIN_VPERMDI256,
25641 IX86_BUILTIN_VPERMTI256,
25642 IX86_BUILTIN_VEXTRACT128I256,
25643 IX86_BUILTIN_VINSERT128I256,
25644 IX86_BUILTIN_MASKLOADD,
25645 IX86_BUILTIN_MASKLOADQ,
25646 IX86_BUILTIN_MASKLOADD256,
25647 IX86_BUILTIN_MASKLOADQ256,
25648 IX86_BUILTIN_MASKSTORED,
25649 IX86_BUILTIN_MASKSTOREQ,
25650 IX86_BUILTIN_MASKSTORED256,
25651 IX86_BUILTIN_MASKSTOREQ256,
25652 IX86_BUILTIN_PSLLVV4DI,
25653 IX86_BUILTIN_PSLLVV2DI,
25654 IX86_BUILTIN_PSLLVV8SI,
25655 IX86_BUILTIN_PSLLVV4SI,
25656 IX86_BUILTIN_PSRAVV8SI,
25657 IX86_BUILTIN_PSRAVV4SI,
25658 IX86_BUILTIN_PSRLVV4DI,
25659 IX86_BUILTIN_PSRLVV2DI,
25660 IX86_BUILTIN_PSRLVV8SI,
25661 IX86_BUILTIN_PSRLVV4SI,
25662
25663 IX86_BUILTIN_GATHERSIV2DF,
25664 IX86_BUILTIN_GATHERSIV4DF,
25665 IX86_BUILTIN_GATHERDIV2DF,
25666 IX86_BUILTIN_GATHERDIV4DF,
25667 IX86_BUILTIN_GATHERSIV4SF,
25668 IX86_BUILTIN_GATHERSIV8SF,
25669 IX86_BUILTIN_GATHERDIV4SF,
25670 IX86_BUILTIN_GATHERDIV8SF,
25671 IX86_BUILTIN_GATHERSIV2DI,
25672 IX86_BUILTIN_GATHERSIV4DI,
25673 IX86_BUILTIN_GATHERDIV2DI,
25674 IX86_BUILTIN_GATHERDIV4DI,
25675 IX86_BUILTIN_GATHERSIV4SI,
25676 IX86_BUILTIN_GATHERSIV8SI,
25677 IX86_BUILTIN_GATHERDIV4SI,
25678 IX86_BUILTIN_GATHERDIV8SI,
25679
25680 /* Alternate 4 element gather for the vectorizer where
25681 all operands are 32-byte wide. */
25682 IX86_BUILTIN_GATHERALTSIV4DF,
25683 IX86_BUILTIN_GATHERALTDIV8SF,
25684 IX86_BUILTIN_GATHERALTSIV4DI,
25685 IX86_BUILTIN_GATHERALTDIV8SI,
25686
25687 /* TFmode support builtins. */
25688 IX86_BUILTIN_INFQ,
25689 IX86_BUILTIN_HUGE_VALQ,
25690 IX86_BUILTIN_FABSQ,
25691 IX86_BUILTIN_COPYSIGNQ,
25692
25693 /* Vectorizer support builtins. */
25694 IX86_BUILTIN_CPYSGNPS,
25695 IX86_BUILTIN_CPYSGNPD,
25696 IX86_BUILTIN_CPYSGNPS256,
25697 IX86_BUILTIN_CPYSGNPD256,
25698
25699 /* FMA4 instructions. */
25700 IX86_BUILTIN_VFMADDSS,
25701 IX86_BUILTIN_VFMADDSD,
25702 IX86_BUILTIN_VFMADDPS,
25703 IX86_BUILTIN_VFMADDPD,
25704 IX86_BUILTIN_VFMADDPS256,
25705 IX86_BUILTIN_VFMADDPD256,
25706 IX86_BUILTIN_VFMADDSUBPS,
25707 IX86_BUILTIN_VFMADDSUBPD,
25708 IX86_BUILTIN_VFMADDSUBPS256,
25709 IX86_BUILTIN_VFMADDSUBPD256,
25710
25711 /* FMA3 instructions. */
25712 IX86_BUILTIN_VFMADDSS3,
25713 IX86_BUILTIN_VFMADDSD3,
25714
25715 /* XOP instructions. */
25716 IX86_BUILTIN_VPCMOV,
25717 IX86_BUILTIN_VPCMOV_V2DI,
25718 IX86_BUILTIN_VPCMOV_V4SI,
25719 IX86_BUILTIN_VPCMOV_V8HI,
25720 IX86_BUILTIN_VPCMOV_V16QI,
25721 IX86_BUILTIN_VPCMOV_V4SF,
25722 IX86_BUILTIN_VPCMOV_V2DF,
25723 IX86_BUILTIN_VPCMOV256,
25724 IX86_BUILTIN_VPCMOV_V4DI256,
25725 IX86_BUILTIN_VPCMOV_V8SI256,
25726 IX86_BUILTIN_VPCMOV_V16HI256,
25727 IX86_BUILTIN_VPCMOV_V32QI256,
25728 IX86_BUILTIN_VPCMOV_V8SF256,
25729 IX86_BUILTIN_VPCMOV_V4DF256,
25730
25731 IX86_BUILTIN_VPPERM,
25732
25733 IX86_BUILTIN_VPMACSSWW,
25734 IX86_BUILTIN_VPMACSWW,
25735 IX86_BUILTIN_VPMACSSWD,
25736 IX86_BUILTIN_VPMACSWD,
25737 IX86_BUILTIN_VPMACSSDD,
25738 IX86_BUILTIN_VPMACSDD,
25739 IX86_BUILTIN_VPMACSSDQL,
25740 IX86_BUILTIN_VPMACSSDQH,
25741 IX86_BUILTIN_VPMACSDQL,
25742 IX86_BUILTIN_VPMACSDQH,
25743 IX86_BUILTIN_VPMADCSSWD,
25744 IX86_BUILTIN_VPMADCSWD,
25745
25746 IX86_BUILTIN_VPHADDBW,
25747 IX86_BUILTIN_VPHADDBD,
25748 IX86_BUILTIN_VPHADDBQ,
25749 IX86_BUILTIN_VPHADDWD,
25750 IX86_BUILTIN_VPHADDWQ,
25751 IX86_BUILTIN_VPHADDDQ,
25752 IX86_BUILTIN_VPHADDUBW,
25753 IX86_BUILTIN_VPHADDUBD,
25754 IX86_BUILTIN_VPHADDUBQ,
25755 IX86_BUILTIN_VPHADDUWD,
25756 IX86_BUILTIN_VPHADDUWQ,
25757 IX86_BUILTIN_VPHADDUDQ,
25758 IX86_BUILTIN_VPHSUBBW,
25759 IX86_BUILTIN_VPHSUBWD,
25760 IX86_BUILTIN_VPHSUBDQ,
25761
25762 IX86_BUILTIN_VPROTB,
25763 IX86_BUILTIN_VPROTW,
25764 IX86_BUILTIN_VPROTD,
25765 IX86_BUILTIN_VPROTQ,
25766 IX86_BUILTIN_VPROTB_IMM,
25767 IX86_BUILTIN_VPROTW_IMM,
25768 IX86_BUILTIN_VPROTD_IMM,
25769 IX86_BUILTIN_VPROTQ_IMM,
25770
25771 IX86_BUILTIN_VPSHLB,
25772 IX86_BUILTIN_VPSHLW,
25773 IX86_BUILTIN_VPSHLD,
25774 IX86_BUILTIN_VPSHLQ,
25775 IX86_BUILTIN_VPSHAB,
25776 IX86_BUILTIN_VPSHAW,
25777 IX86_BUILTIN_VPSHAD,
25778 IX86_BUILTIN_VPSHAQ,
25779
25780 IX86_BUILTIN_VFRCZSS,
25781 IX86_BUILTIN_VFRCZSD,
25782 IX86_BUILTIN_VFRCZPS,
25783 IX86_BUILTIN_VFRCZPD,
25784 IX86_BUILTIN_VFRCZPS256,
25785 IX86_BUILTIN_VFRCZPD256,
25786
25787 IX86_BUILTIN_VPCOMEQUB,
25788 IX86_BUILTIN_VPCOMNEUB,
25789 IX86_BUILTIN_VPCOMLTUB,
25790 IX86_BUILTIN_VPCOMLEUB,
25791 IX86_BUILTIN_VPCOMGTUB,
25792 IX86_BUILTIN_VPCOMGEUB,
25793 IX86_BUILTIN_VPCOMFALSEUB,
25794 IX86_BUILTIN_VPCOMTRUEUB,
25795
25796 IX86_BUILTIN_VPCOMEQUW,
25797 IX86_BUILTIN_VPCOMNEUW,
25798 IX86_BUILTIN_VPCOMLTUW,
25799 IX86_BUILTIN_VPCOMLEUW,
25800 IX86_BUILTIN_VPCOMGTUW,
25801 IX86_BUILTIN_VPCOMGEUW,
25802 IX86_BUILTIN_VPCOMFALSEUW,
25803 IX86_BUILTIN_VPCOMTRUEUW,
25804
25805 IX86_BUILTIN_VPCOMEQUD,
25806 IX86_BUILTIN_VPCOMNEUD,
25807 IX86_BUILTIN_VPCOMLTUD,
25808 IX86_BUILTIN_VPCOMLEUD,
25809 IX86_BUILTIN_VPCOMGTUD,
25810 IX86_BUILTIN_VPCOMGEUD,
25811 IX86_BUILTIN_VPCOMFALSEUD,
25812 IX86_BUILTIN_VPCOMTRUEUD,
25813
25814 IX86_BUILTIN_VPCOMEQUQ,
25815 IX86_BUILTIN_VPCOMNEUQ,
25816 IX86_BUILTIN_VPCOMLTUQ,
25817 IX86_BUILTIN_VPCOMLEUQ,
25818 IX86_BUILTIN_VPCOMGTUQ,
25819 IX86_BUILTIN_VPCOMGEUQ,
25820 IX86_BUILTIN_VPCOMFALSEUQ,
25821 IX86_BUILTIN_VPCOMTRUEUQ,
25822
25823 IX86_BUILTIN_VPCOMEQB,
25824 IX86_BUILTIN_VPCOMNEB,
25825 IX86_BUILTIN_VPCOMLTB,
25826 IX86_BUILTIN_VPCOMLEB,
25827 IX86_BUILTIN_VPCOMGTB,
25828 IX86_BUILTIN_VPCOMGEB,
25829 IX86_BUILTIN_VPCOMFALSEB,
25830 IX86_BUILTIN_VPCOMTRUEB,
25831
25832 IX86_BUILTIN_VPCOMEQW,
25833 IX86_BUILTIN_VPCOMNEW,
25834 IX86_BUILTIN_VPCOMLTW,
25835 IX86_BUILTIN_VPCOMLEW,
25836 IX86_BUILTIN_VPCOMGTW,
25837 IX86_BUILTIN_VPCOMGEW,
25838 IX86_BUILTIN_VPCOMFALSEW,
25839 IX86_BUILTIN_VPCOMTRUEW,
25840
25841 IX86_BUILTIN_VPCOMEQD,
25842 IX86_BUILTIN_VPCOMNED,
25843 IX86_BUILTIN_VPCOMLTD,
25844 IX86_BUILTIN_VPCOMLED,
25845 IX86_BUILTIN_VPCOMGTD,
25846 IX86_BUILTIN_VPCOMGED,
25847 IX86_BUILTIN_VPCOMFALSED,
25848 IX86_BUILTIN_VPCOMTRUED,
25849
25850 IX86_BUILTIN_VPCOMEQQ,
25851 IX86_BUILTIN_VPCOMNEQ,
25852 IX86_BUILTIN_VPCOMLTQ,
25853 IX86_BUILTIN_VPCOMLEQ,
25854 IX86_BUILTIN_VPCOMGTQ,
25855 IX86_BUILTIN_VPCOMGEQ,
25856 IX86_BUILTIN_VPCOMFALSEQ,
25857 IX86_BUILTIN_VPCOMTRUEQ,
25858
25859 /* LWP instructions. */
25860 IX86_BUILTIN_LLWPCB,
25861 IX86_BUILTIN_SLWPCB,
25862 IX86_BUILTIN_LWPVAL32,
25863 IX86_BUILTIN_LWPVAL64,
25864 IX86_BUILTIN_LWPINS32,
25865 IX86_BUILTIN_LWPINS64,
25866
25867 IX86_BUILTIN_CLZS,
25868
25869 /* RTM */
25870 IX86_BUILTIN_XBEGIN,
25871 IX86_BUILTIN_XEND,
25872 IX86_BUILTIN_XABORT,
25873 IX86_BUILTIN_XTEST,
25874
25875 /* BMI instructions. */
25876 IX86_BUILTIN_BEXTR32,
25877 IX86_BUILTIN_BEXTR64,
25878 IX86_BUILTIN_CTZS,
25879
25880 /* TBM instructions. */
25881 IX86_BUILTIN_BEXTRI32,
25882 IX86_BUILTIN_BEXTRI64,
25883
25884 /* BMI2 instructions. */
25885 IX86_BUILTIN_BZHI32,
25886 IX86_BUILTIN_BZHI64,
25887 IX86_BUILTIN_PDEP32,
25888 IX86_BUILTIN_PDEP64,
25889 IX86_BUILTIN_PEXT32,
25890 IX86_BUILTIN_PEXT64,
25891
25892 /* FSGSBASE instructions. */
25893 IX86_BUILTIN_RDFSBASE32,
25894 IX86_BUILTIN_RDFSBASE64,
25895 IX86_BUILTIN_RDGSBASE32,
25896 IX86_BUILTIN_RDGSBASE64,
25897 IX86_BUILTIN_WRFSBASE32,
25898 IX86_BUILTIN_WRFSBASE64,
25899 IX86_BUILTIN_WRGSBASE32,
25900 IX86_BUILTIN_WRGSBASE64,
25901
25902 /* RDRND instructions. */
25903 IX86_BUILTIN_RDRAND16_STEP,
25904 IX86_BUILTIN_RDRAND32_STEP,
25905 IX86_BUILTIN_RDRAND64_STEP,
25906
25907 /* F16C instructions. */
25908 IX86_BUILTIN_CVTPH2PS,
25909 IX86_BUILTIN_CVTPH2PS256,
25910 IX86_BUILTIN_CVTPS2PH,
25911 IX86_BUILTIN_CVTPS2PH256,
25912
25913 /* CFString built-in for darwin */
25914 IX86_BUILTIN_CFSTRING,
25915
25916 /* Builtins to get CPU type and supported features. */
25917 IX86_BUILTIN_CPU_INIT,
25918 IX86_BUILTIN_CPU_IS,
25919 IX86_BUILTIN_CPU_SUPPORTS,
25920
25921 IX86_BUILTIN_MAX
25922 };
25923
25924 /* Table for the ix86 builtin decls. */
25925 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
25926
25927 /* Table of all of the builtin functions that are possible with different ISA's
25928 but are waiting to be built until a function is declared to use that
25929 ISA. */
25930 struct builtin_isa {
25931 const char *name; /* function name */
25932 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
25933 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
25934 bool const_p; /* true if the declaration is constant */
25935   bool set_and_not_built_p;		/* true if the builtin is recorded here but its decl has not been built yet */
25936 };
25937
25938 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
25939
25940
25941 /* Add an ix86 target builtin function with CODE, NAME and TCODE.  Save the MASK
25942    of isa_flags to use in the ix86_builtins_isa array.  Store the
25943    function decl in the ix86_builtins array.  Return the function decl, or
25944    NULL_TREE if the builtin was not added.
25945
25946 If the front end has a special hook for builtin functions, delay adding
25947 builtin functions that aren't in the current ISA until the ISA is changed
25948    with function specific optimization.  Doing so can save about 300K for the
25949 default compiler. When the builtin is expanded, check at that time whether
25950 it is valid.
25951
25952    If the front end doesn't have a special hook, record all builtins, even
25953    those whose ISA isn't currently enabled, in case the user later selects a
25954    different ISA with function specific options.  That way we don't get scope
25955    errors if a builtin is added in the middle of a function scope.  */
25956
25957 static inline tree
25958 def_builtin (HOST_WIDE_INT mask, const char *name,
25959 enum ix86_builtin_func_type tcode,
25960 enum ix86_builtins code)
25961 {
25962 tree decl = NULL_TREE;
25963
25964 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
25965 {
25966 ix86_builtins_isa[(int) code].isa = mask;
25967
25968 mask &= ~OPTION_MASK_ISA_64BIT;
25969 if (mask == 0
25970 || (mask & ix86_isa_flags) != 0
25971 || (lang_hooks.builtin_function
25972 == lang_hooks.builtin_function_ext_scope))
25973
25974 {
25975 tree type = ix86_get_builtin_func_type (tcode);
25976 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
25977 NULL, NULL_TREE);
25978 ix86_builtins[(int) code] = decl;
25979 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
25980 }
25981 else
25982 {
25983 ix86_builtins[(int) code] = NULL_TREE;
25984 ix86_builtins_isa[(int) code].tcode = tcode;
25985 ix86_builtins_isa[(int) code].name = name;
25986 ix86_builtins_isa[(int) code].const_p = false;
25987 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
25988 }
25989 }
25990
25991 return decl;
25992 }
25993
25994 /* Like def_builtin, but also marks the function decl "const". */
25995
25996 static inline tree
25997 def_builtin_const (HOST_WIDE_INT mask, const char *name,
25998 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
25999 {
26000 tree decl = def_builtin (mask, name, tcode, code);
26001 if (decl)
26002 TREE_READONLY (decl) = 1;
26003 else
26004 ix86_builtins_isa[(int) code].const_p = true;
26005
26006 return decl;
26007 }
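/* Illustrative usage (a sketch, not part of the original file): the
   description tables below, e.g. bdesc_args, are walked at builtin
   initialization time and each entry is registered roughly as

     def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_addps",
			V4SF_FTYPE_V4SF_V4SF, IX86_BUILTIN_ADDPS);

   If the requested ISA is not currently enabled and the front end supports
   ext-scope builtins, the decl is merely recorded in ix86_builtins_isa and
   is materialized later by ix86_add_new_builtins.  */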
26008
26009 /* Add any new builtin functions for a given ISA that may not have been
26010    declared.  This saves a bit of space compared to adding all of the
26011    declarations to the tree up front, whether or not they end up being used.  */
26012
26013 static void
26014 ix86_add_new_builtins (HOST_WIDE_INT isa)
26015 {
26016 int i;
26017
26018 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
26019 {
26020 if ((ix86_builtins_isa[i].isa & isa) != 0
26021 && ix86_builtins_isa[i].set_and_not_built_p)
26022 {
26023 tree decl, type;
26024
26025 /* Don't define the builtin again. */
26026 ix86_builtins_isa[i].set_and_not_built_p = false;
26027
26028 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26029 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26030 type, i, BUILT_IN_MD, NULL,
26031 NULL_TREE);
26032
26033 ix86_builtins[i] = decl;
26034 if (ix86_builtins_isa[i].const_p)
26035 TREE_READONLY (decl) = 1;
26036 }
26037 }
26038 }
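/* Illustrative trigger (a sketch, not part of the original file): when a
   function specific target such as

     __attribute__((target ("avx"))) void f (void);

   enables ISA bits beyond the command-line defaults, this function is
   expected to be called with the newly enabled bits so that the deferred
   builtin decls (e.g. the AVX ones) become visible inside F.  */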
26039
26040 /* Bits for builtin_description.flag. */
26041
26042 /* Set when we don't support the comparison natively, and should
26043    swap the comparison operands in order to support it.  */
26044 #define BUILTIN_DESC_SWAP_OPERANDS 1
26045
26046 struct builtin_description
26047 {
26048 const HOST_WIDE_INT mask;
26049 const enum insn_code icode;
26050 const char *const name;
26051 const enum ix86_builtins code;
26052 const enum rtx_code comparison;
26053 const int flag;
26054 };
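/* Illustrative note (a sketch, not part of the original file): greater-than
   style vector compares are generally implemented by reusing the less-than
   pattern with swapped operands; e.g. "__builtin_ia32_cmpgtps" in bdesc_args
   below is listed with the LT comparison and a ..._SWAP function type so
   the expander reverses the operands.  */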
26055
26056 static const struct builtin_description bdesc_comi[] =
26057 {
26058 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
26059 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
26060 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
26061 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
26062 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
26063 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
26064 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
26065 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
26066 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
26067 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
26068 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
26069 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
26070 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
26071 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
26072 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
26073 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
26074 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
26075 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26076 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26077 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26078 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26079 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26080 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26081 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
26082 };
26083
26084 static const struct builtin_description bdesc_pcmpestr[] =
26085 {
26086 /* SSE4.2 */
26087 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26088 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26089 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26090 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26091 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26092 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26093 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
26094 };
26095
26096 static const struct builtin_description bdesc_pcmpistr[] =
26097 {
26098 /* SSE4.2 */
26099 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26100 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26101 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26102 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26103 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26104 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26105 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
26106 };
26107
26108 /* Special builtins with variable number of arguments. */
26109 static const struct builtin_description bdesc_special_args[] =
26110 {
26111 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26112 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26113 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26114
26115 /* MMX */
26116 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26117
26118 /* 3DNow! */
26119 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26120
26121 /* SSE */
26122 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26123 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26124 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26125
26126 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26127 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26128 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26129 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26130
26131 /* SSE or 3DNow!A */
26132 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26133 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26134
26135 /* SSE2 */
26136 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26137 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26138 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26139 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26140 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26141 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26142 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26143 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26144 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26145 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26146
26147 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26148 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26149
26150 /* SSE3 */
26151 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26152
26153 /* SSE4.1 */
26154 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26155
26156 /* SSE4A */
26157 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26158 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26159
26160 /* AVX */
26161 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26162 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26163
26164 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26165 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26166 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26167 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26168 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26169
26170 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26171 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26172 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26173 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26174 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26175 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26176 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26177
26178 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26179 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26180 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26181
26182 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26183 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26184 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26185 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26186 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26187 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26188 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26189 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26190
26191 /* AVX2 */
26192 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26193 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26194 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26195 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26196 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26197 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26198 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26199 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26200 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26201
26202 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26203 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26204 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26205 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26206 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26207 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26208
26209 /* FSGSBASE */
26210 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26211 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26212 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26213 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26214 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26215 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26216 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26217 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26218
26219 /* RTM */
26220 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26221 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26222 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
26223 };
26224
26225 /* Builtins with variable number of arguments. */
26226 static const struct builtin_description bdesc_args[] =
26227 {
26228 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26229 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26230 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26231 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26232 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26233 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26234 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26235
26236 /* MMX */
26237 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26238 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26239 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26240 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26241 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26242 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26243
26244 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26245 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26246 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26247 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26248 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26249 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26250 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26251 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26252
26253 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26254 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26255
26256 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26257 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26258 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26259 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26260
26261 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26262 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26263 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26264 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26265 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26266 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26267
26268 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26269 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26270 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26271 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26272 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
26273 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
26274
26275 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26276 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
26277 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26278
26279 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
26280
26281 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26282 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26283 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26284 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26285 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26286 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26287
26288 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26289 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26290 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26291 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26292 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26293 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26294
26295 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26296 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26297 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26298 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26299
26300 /* 3DNow! */
26301 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26302 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26303 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26304 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26305
26306 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26307 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26308 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26309 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26310 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26311 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26312 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26313 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26314 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26315 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26316 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26317 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26318 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26319 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26320 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26321
26322 /* 3DNow!A */
26323 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26324 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26325 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26326 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26327 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26328 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26329
26330 /* SSE */
26331 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
26332 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26333 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26334 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26335 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26336 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26337 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26338 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26339 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26340 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26341 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26342 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26343
26344 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26345
26346 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26347 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26348 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26349 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26350 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26351 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26352 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26353 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26354
26355 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26356 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26357 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26358 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26359 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26360 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26361 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26362 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26363 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26364 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26365 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
26366 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26367 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26368 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26369 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26370 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26371 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26372 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26373 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26374 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26375 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26376 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26377
26378 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26379 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26380 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26381 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26382
26383 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26384 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26385 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26386 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26387
26388 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26389
26390 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26391 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26392 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26393 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26394 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26395
26396 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
26397 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
26398   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
26399
26400 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
26401
26402 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26403 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26404 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26405
26406 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
26407 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
26408
26409   /* SSE, MMX or 3DNow!A */
26410 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26411 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26412 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26413
26414 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26415 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26416 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26417 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26418
26419 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
26420 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
26421
26422 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
26423
26424 /* SSE2 */
26425 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26426
26427 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
26428 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
26429 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26430 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
26431 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
26432
26433 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26434 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26435 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
26436 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26437 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26438
26439 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
26440
26441 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26442 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26443 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26444 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26445
26446 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26447 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
26448 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26449
26450 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26451 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26452 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26453 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26454 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26455 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26456 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26457 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26458
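/* Compare builtins.  Only the EQ/LT/LE/UNORD predicates and their negations exist in hardware; the _SWAP argument types mark entries whose operands are exchanged at expansion time, so GT/GE are implemented with the LT/LE masks.  */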
26459 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26460 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26461 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26462 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26463 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP},
26464 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26465 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26466 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26467 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26468 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26469 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26470 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26471 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26472 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26473 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26474 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26475 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26476 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26477 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26478 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26479
26480 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26481 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26482 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26483 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26484
26485 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26486 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26487 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26488 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26489
26490 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26491
26492 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26493 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26494 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26495
26496 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26497
26498 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26499 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26500 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26501 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26502 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26503 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26504 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26505 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26506
26507 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26508 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26509 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26510 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26511 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26512 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26513 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26514 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26515
26516 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26517 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26518
26519 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26520 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26521 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26522 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26523
26524 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26525 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26526
26527 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26528 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26529 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26530 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26531 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26532 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26533
26534 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26535 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26536 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26537 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26538
26539 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26540 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26541 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26542 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26543 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26544 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26545 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26546 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26547
26548 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26549 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26550 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26551
26552 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26553 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
26554
26555 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
26556 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26557
26558 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
26559
26560 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
26561 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
26562 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
26563 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
26564
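/* Shift builtins.  The *_SI_COUNT entries take a scalar shift count (the psllwi/pslldi/psllqi forms), the *_V*_COUNT entries take the count in a vector register, and the whole-register byte shifts map onto the V1TImode ashl/lshr patterns via _INT_CONVERT.  */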
26565 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26566 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26567 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26568 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26569 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26570 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26571 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26572
26573 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26574 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26575 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26576 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26577 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26578 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26579 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26580
26581 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26582 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26583 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26584 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26585
26586 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
26587 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26588 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26589
26590 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
26591
26592 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26593
26594 /* SSE2 MMX */
26595 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26596 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26597
26598 /* SSE3 */
26599 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26600 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26601
26602 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26603 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26604 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26605 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26606 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26607 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26608
26609 /* SSSE3 */
26610 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26611 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
26612 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26613 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
26614 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26615 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26616
26617 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26618 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26619 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26620 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26621 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26622 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26623 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26624 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26625 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26626 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26627 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26628 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26629 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
26630 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
26631 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26632 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26633 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26634 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26635 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26636 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26637 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26638 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26639 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26640 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26641
26642 /* SSSE3. */
26643 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
26644 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
26645
26646 /* SSE4.1 */
26647 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26648 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26649 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
26650 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
26651 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26652 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26653 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26654 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
26655 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
26656 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
26657
26658 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26659 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26660 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26661 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26662 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26663 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26664 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26665 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26666 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26667 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26668 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26669 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26670 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26671
26672 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26673 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26674 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26675 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26676 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26677 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26678 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26679 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26680 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26681 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26682 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26683 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26684
26685 /* SSE4.1 */
26686 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26687 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26688 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26689 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26690
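/* The floor/ceil/trunc/rint entries reuse the roundpd/roundps patterns; the ROUND_* value stored in the comparison field supplies the rounding-mode immediate at expansion time.  */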
26691 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
26692 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
26693 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
26694 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
26695
26696 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26697 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26698
26699 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26700 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26701
26702 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
26703 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
26704 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
26705 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
26706
26707 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
26708 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
26709
26710 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26711 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26712
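/* ptestz/ptestc/ptestnzc report the ZF, CF and "neither flag set" results of PTEST; the EQ, LTU and GTU codes select which flag combination is tested.  */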
26713 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26714 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26715 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26716
26717 /* SSE4.2 */
26718 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26719 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
26720 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
26721 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26722 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26723
26724 /* SSE4A */
26725 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
26726 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
26727 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
26728 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26729
26730 /* AES */
26731 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
26732 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26733
26734 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26735 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26736 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26737 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26738
26739 /* PCLMUL */
26740 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
26741
26742 /* AVX */
26743 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26744 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26745 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26746 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26747 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26748 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26749 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26750 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26751 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26752 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26753 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26754 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26755 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26756 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26757 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26758 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26759 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26760 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26761 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26762 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26763 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26764 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26765 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26766 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26767 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26768 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26769
26770 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
26771 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
26772 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
26773 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
26774
26775 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26776 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26777 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
26778 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
26779 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26780 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26781 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26782 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26783 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26784 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26785 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26786 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26787 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26788 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
26789 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
26790 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
26791 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
26792 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
26793 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
26794 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26795 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
26796 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26797 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26798 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26799 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26800 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26801 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26802 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26803 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26804 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26805 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26806 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
26807 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
26808 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
26809
26810 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26811 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26812 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26813
26814 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26815 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26816 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26817 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26818 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26819
26820 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26821
26822 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26823 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26824
26825 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
26826 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
26827 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
26828 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
26829
26830 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26831 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26832
26833 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26834 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26835
26836 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
26837 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
26838 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
26839 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
26840
26841 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
26842 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
26843
26844 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26845 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26846
26847 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26848 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26849 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26850 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26851
26852 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26853 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26854 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26855 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
26856 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
26857 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
26858
26859 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26860 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26861 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26862 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26863 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26864 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26865 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26866 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26867 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26868 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26869 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26870 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26871 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26872 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26873 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26874
26875 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
26876 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
26877
26878 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26879 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26880
26881 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256 ", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26882
26883 /* AVX2 */
26884 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
26885 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
26886 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
26887 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
26888 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26889 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26890 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26891 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26892 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26893 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26894 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26895 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26896 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26897 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26898 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26899 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26900 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
26901 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26902 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26903 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26904 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26905 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
26906 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
26907 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26908 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26909 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26910 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26911 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26912 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26913 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26914 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26915 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26916 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26917 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26918 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26919 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26920 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26921 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26922 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
26923 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26924 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26925 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3, "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26926 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26927 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26928 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3, "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26929 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26930 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26931 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3, "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26932 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26933 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26934 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3, "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26935 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
26936 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26937 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2, "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26938 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2, "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26939 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2, "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26940 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2, "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26941 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2, "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26942 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26943 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2, "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26944 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2, "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26945 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2, "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26946 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2, "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26947 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2, "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26948 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26949 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3, "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26950 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256", IX86_BUILTIN_PMULHUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26951 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256", IX86_BUILTIN_PMULHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26952 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256", IX86_BUILTIN_PMULLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26953 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256", IX86_BUILTIN_PMULLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26954 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26955 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26956 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26957 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26958 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
26959 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26960 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26961 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26962 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26963 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3, "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26964 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26965 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26966 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26967 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26968 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26969 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26970 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26971 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26972 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26973 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26974 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26975 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26976 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26977 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26978 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26979 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26980 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26981 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26982 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26983 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26984 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26985 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26986 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26987 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26988 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26989 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26990 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26991 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26992 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26993 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26994 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26995 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26996 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26997 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26998 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26999 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27000 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27001 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27002 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27003 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27004 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27005 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27006 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27007 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27008 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27009 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27010 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27011 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27012 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27013 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27014 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27015 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27016 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27017 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27018 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27019 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27020 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27021 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27022 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27023 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27024 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27025 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27026 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27027 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27028 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27029 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27030
27031 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27032
27033 /* BMI */
27034 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27035 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27036 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27037
27038 /* TBM */
27039 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27040 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27041
27042 /* F16C */
27043 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
27044 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
27045 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
27046 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
27047
27048 /* BMI2 */
27049 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27050 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27051 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27052 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27053 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27054 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27055 };
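
/* Illustrative note (not from the original sources): each row in the table
   above is registered below by ix86_init_mmx_sse_builtins via
   def_builtin_const, using the row's ISA mask, insn code, name, builtin
   code and function type.  E.g. the "__builtin_ia32_pmaxsd256" row makes
   the following possible once AVX2 is enabled (v8si and max_epi32 are just
   example names):

       typedef int v8si __attribute__ ((vector_size (32)));

       v8si
       max_epi32 (v8si a, v8si b)
       {
         return __builtin_ia32_pmaxsd256 (a, b);
       }
*/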
27056
27057 /* FMA4 and XOP. */
27058 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
27059 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
27060 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
27061 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
27062 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
27063 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
27064 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
27065 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
27066 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
27067 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
27068 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
27069 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
27070 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
27071 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
27072 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
27073 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
27074 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
27075 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
27076 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
27077 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
27078 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
27079 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
27080 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
27081 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
27082 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
27083 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
27084 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
27085 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
27086 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
27087 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
27088 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
27089 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
27090 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
27091 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
27092 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
27093 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
27094 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
27095 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
27096 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
27097 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
27098 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
27099 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
27100 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
27101 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
27102 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
27103 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
27104 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
27105 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
27106 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
27107 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
27108 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
27109 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
27110
27111 static const struct builtin_description bdesc_multi_arg[] =
27112 {
27113 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27114 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27115 UNKNOWN, (int)MULTI_ARG_3_SF },
27116 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27117 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27118 UNKNOWN, (int)MULTI_ARG_3_DF },
27119
27120 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27121 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27122 UNKNOWN, (int)MULTI_ARG_3_SF },
27123 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27124 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27125 UNKNOWN, (int)MULTI_ARG_3_DF },
27126
27127 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27128 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27129 UNKNOWN, (int)MULTI_ARG_3_SF },
27130 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27131 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27132 UNKNOWN, (int)MULTI_ARG_3_DF },
27133 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27134 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27135 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27136 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27137 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27138 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27139
27140 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27141 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27142 UNKNOWN, (int)MULTI_ARG_3_SF },
27143 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27144 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27145 UNKNOWN, (int)MULTI_ARG_3_DF },
27146 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27147 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27148 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27149 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27150 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27151 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27152
27153 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
27154 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
27155 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
27156 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
27157 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi", IX86_BUILTIN_VPCMOV_V16QI, UNKNOWN, (int)MULTI_ARG_3_QI },
27158 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
27159 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
27160
27161 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27162 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27163 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
27164 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
27165 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
27166 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
27167 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27168
27169 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27170
27171 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27172 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27173 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27174 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27175 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27176 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27177 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27178 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27179 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27180 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27181 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27182 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27183
27184 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27185 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27186 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27187 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27188 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27189 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27190 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27191 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27192 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27193 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27194 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27195 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27196 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27197 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27198 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27199 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27200
27201 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
27202 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
27203 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27204 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27205 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27206 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27207
27208 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27209 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27210 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27211 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27212 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27213 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27214 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27215 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27216 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27217 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27218 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27219 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27220 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27221 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27222 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27223
27224 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27225 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27226 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27227 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27228 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27229 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27230 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
27231
27232 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
27233 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27234 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27235 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27236 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
27237 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
27238 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
27239
27240 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
27241 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27242 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27243 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
27244 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
27245 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
27246 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
27247
27248 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27249 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27250 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27251 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
27252 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
27253 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
27254 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
27255
27256 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3, "__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
27257 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3, "__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27258 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3, "__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27259 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
27260 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
27261 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
27262 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
27263
27264 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
27265 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27266 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27267 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
27268 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
27269 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
27270 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
27271
27272 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
27273 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27274 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27275 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
27276 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
27277 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
27278 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
27279
27280 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27281 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27282 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27283 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
27284 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
27285 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
27286 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
27287
27288 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27289 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27290 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27291 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27292 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub", IX86_BUILTIN_VPCOMFALSEUB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27293 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw", IX86_BUILTIN_VPCOMFALSEUW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27294 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud", IX86_BUILTIN_VPCOMFALSEUD, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27295 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq", IX86_BUILTIN_VPCOMFALSEUQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27296
27297 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27298 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27299 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27300 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27301 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27302 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27303 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27304 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27305
27306 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
27307 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
27308 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
27309 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
27310
27311 };
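
/* Illustrative note: several of the XOP comparison builtins above share one
   insn code (e.g. CODE_FOR_xop_maskcmpv16qi3) and differ only in the rtx
   comparison code recorded in the table (EQ, NE, LT, GTU, ...); the builtin
   expander uses that code to build the comparison operand of the pattern.
   A sketch of the user-level view of the LT row, with example names:

       typedef char v16qi __attribute__ ((vector_size (16)));

       v16qi
       lt_bytes (v16qi a, v16qi b)
       {
         return __builtin_ia32_vpcomltb (a, b);
       }
*/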
27312 \f
27313 /* TM vector builtins. */
27314
27315 /* Reuse the existing x86-specific `struct builtin_description' because
27316 we're lazy.  Add casts to make them fit.  */
27317 static const struct builtin_description bdesc_tm[] =
27318 {
27319 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27320 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27321 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27322 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27323 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27324 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27325 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27326
27327 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27328 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27329 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27330 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27331 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27332 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27333 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27334
27335 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27336 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27337 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27338 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27339 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27340 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27341 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27342
27343 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
27344 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
27345 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
27346 };
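
/* Note: the names above are deliberately spelled "__builtin__ITM_*".  When
   ix86_init_tm_builtins registers them below, it also passes the name with
   the "__builtin_" prefix stripped as the library name, so each entry is
   callable directly under its libitm ABI name as well, e.g. _ITM_WM64,
   _ITM_RM128 or _ITM_LM256.  */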
27347
27348 /* TM callbacks. */
27349
27350 /* Return the builtin decl needed to load a vector of TYPE. */
27351
27352 static tree
27353 ix86_builtin_tm_load (tree type)
27354 {
27355 if (TREE_CODE (type) == VECTOR_TYPE)
27356 {
27357 switch (tree_low_cst (TYPE_SIZE (type), 1))
27358 {
27359 case 64:
27360 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
27361 case 128:
27362 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
27363 case 256:
27364 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
27365 }
27366 }
27367 return NULL_TREE;
27368 }
27369
27370 /* Return the builtin decl needed to store a vector of TYPE. */
27371
27372 static tree
27373 ix86_builtin_tm_store (tree type)
27374 {
27375 if (TREE_CODE (type) == VECTOR_TYPE)
27376 {
27377 switch (tree_low_cst (TYPE_SIZE (type), 1))
27378 {
27379 case 64:
27380 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
27381 case 128:
27382 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
27383 case 256:
27384 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
27385 }
27386 }
27387 return NULL_TREE;
27388 }
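
/* For example (illustrative): a type such as
   "float __attribute__ ((vector_size (16)))" has TYPE_SIZE 128, so
   ix86_builtin_tm_load returns the BUILT_IN_TM_LOAD_M128 decl and
   ix86_builtin_tm_store the BUILT_IN_TM_STORE_M128 decl; vector sizes other
   than 64, 128 and 256 bits yield NULL_TREE.  */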
27389 \f
27390 /* Initialize the transactional memory vector load/store builtins. */
27391
27392 static void
27393 ix86_init_tm_builtins (void)
27394 {
27395 enum ix86_builtin_func_type ftype;
27396 const struct builtin_description *d;
27397 size_t i;
27398 tree decl;
27399 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
27400 tree attrs_log, attrs_type_log;
27401
27402 if (!flag_tm)
27403 return;
27404
27405 /* If there are no builtins defined, we must be compiling in a
27406 language without trans-mem support. */
27407 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
27408 return;
27409
27410 /* Use whatever attributes a normal TM load has. */
27411 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
27412 attrs_load = DECL_ATTRIBUTES (decl);
27413 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27414 /* Use whatever attributes a normal TM store has. */
27415 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
27416 attrs_store = DECL_ATTRIBUTES (decl);
27417 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27418 /* Use whatever attributes a normal TM log has. */
27419 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
27420 attrs_log = DECL_ATTRIBUTES (decl);
27421 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27422
27423 for (i = 0, d = bdesc_tm;
27424 i < ARRAY_SIZE (bdesc_tm);
27425 i++, d++)
27426 {
27427 if ((d->mask & ix86_isa_flags) != 0
27428 || (lang_hooks.builtin_function
27429 == lang_hooks.builtin_function_ext_scope))
27430 {
27431 tree type, attrs, attrs_type;
27432 enum built_in_function code = (enum built_in_function) d->code;
27433
27434 ftype = (enum ix86_builtin_func_type) d->flag;
27435 type = ix86_get_builtin_func_type (ftype);
27436
27437 if (BUILTIN_TM_LOAD_P (code))
27438 {
27439 attrs = attrs_load;
27440 attrs_type = attrs_type_load;
27441 }
27442 else if (BUILTIN_TM_STORE_P (code))
27443 {
27444 attrs = attrs_store;
27445 attrs_type = attrs_type_store;
27446 }
27447 else
27448 {
27449 attrs = attrs_log;
27450 attrs_type = attrs_type_log;
27451 }
27452 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
27453 /* The library name: the builtin name without the
27454 "__builtin_" prefix, so it can be called directly. */
27455 d->name + strlen ("__builtin_"),
27456 attrs);
27457 /* add_builtin_function () will set the DECL_ATTRIBUTES; now
27458 set the TYPE_ATTRIBUTES. */
27459 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
27460
27461 set_builtin_decl (code, decl, false);
27462 }
27463 }
27464 }
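
/* The intended effect (a sketch, assuming the generic transactional-memory
   instrumentation picks these builtins up for vector-sized accesses) is
   that something like

       __transaction_atomic { *p = v; }

   where v is a 16-byte vector can be logged or written back through a
   single _ITM_WM128-style call instead of several word-sized accesses.  */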
27465
27466 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
27467 not in the current target ISA, so that the user can compile particular
27468 modules with target-specific options that differ from the command-line
27469 options. */
27470 static void
27471 ix86_init_mmx_sse_builtins (void)
27472 {
27473 const struct builtin_description * d;
27474 enum ix86_builtin_func_type ftype;
27475 size_t i;
27476
27477 /* Add all special builtins with variable number of operands. */
27478 for (i = 0, d = bdesc_special_args;
27479 i < ARRAY_SIZE (bdesc_special_args);
27480 i++, d++)
27481 {
27482 if (d->name == 0)
27483 continue;
27484
27485 ftype = (enum ix86_builtin_func_type) d->flag;
27486 def_builtin (d->mask, d->name, ftype, d->code);
27487 }
27488
27489 /* Add all builtins with variable number of operands. */
27490 for (i = 0, d = bdesc_args;
27491 i < ARRAY_SIZE (bdesc_args);
27492 i++, d++)
27493 {
27494 if (d->name == 0)
27495 continue;
27496
27497 ftype = (enum ix86_builtin_func_type) d->flag;
27498 def_builtin_const (d->mask, d->name, ftype, d->code);
27499 }
27500
27501 /* pcmpestr[im] insns. */
27502 for (i = 0, d = bdesc_pcmpestr;
27503 i < ARRAY_SIZE (bdesc_pcmpestr);
27504 i++, d++)
27505 {
27506 if (d->code == IX86_BUILTIN_PCMPESTRM128)
27507 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
27508 else
27509 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
27510 def_builtin_const (d->mask, d->name, ftype, d->code);
27511 }
27512
27513 /* pcmpistr[im] insns. */
27514 for (i = 0, d = bdesc_pcmpistr;
27515 i < ARRAY_SIZE (bdesc_pcmpistr);
27516 i++, d++)
27517 {
27518 if (d->code == IX86_BUILTIN_PCMPISTRM128)
27519 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
27520 else
27521 ftype = INT_FTYPE_V16QI_V16QI_INT;
27522 def_builtin_const (d->mask, d->name, ftype, d->code);
27523 }
27524
27525 /* comi/ucomi insns. */
27526 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
27527 {
27528 if (d->mask == OPTION_MASK_ISA_SSE2)
27529 ftype = INT_FTYPE_V2DF_V2DF;
27530 else
27531 ftype = INT_FTYPE_V4SF_V4SF;
27532 def_builtin_const (d->mask, d->name, ftype, d->code);
27533 }
27534
27535 /* SSE */
27536 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
27537 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
27538 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
27539 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
27540
27541 /* SSE or 3DNow!A */
27542 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27543 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
27544 IX86_BUILTIN_MASKMOVQ);
27545
27546 /* SSE2 */
27547 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
27548 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
27549
27550 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
27551 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
27552 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
27553 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
27554
27555 /* SSE3. */
27556 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
27557 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
27558 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
27559 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
27560
27561 /* AES */
27562 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
27563 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
27564 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
27565 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
27566 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
27567 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
27568 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
27569 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
27570 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
27571 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
27572 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
27573 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
27574
27575 /* PCLMUL */
27576 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
27577 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
27578
27579 /* RDRND */
27580 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
27581 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
27582 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
27583 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
27584 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
27585 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
27586 IX86_BUILTIN_RDRAND64_STEP);
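
/* Illustrative usage (a sketch of user code, not part of this file): with
   the INT_FTYPE_PUNSIGNED signature registered just above, the 32-bit
   RDRAND step builtin is typically called like this; the int return value
   is nonzero only when the hardware delivered a random value.

       unsigned int val;
       if (__builtin_ia32_rdrand32_step (&val))
         consume (val);   (consume is a hypothetical user function)
*/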
27587
27588 /* AVX2 */
27589 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
27590 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
27591 IX86_BUILTIN_GATHERSIV2DF);
27592
27593 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
27594 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
27595 IX86_BUILTIN_GATHERSIV4DF);
27596
27597 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
27598 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
27599 IX86_BUILTIN_GATHERDIV2DF);
27600
27601 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
27602 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
27603 IX86_BUILTIN_GATHERDIV4DF);
27604
27605 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
27606 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
27607 IX86_BUILTIN_GATHERSIV4SF);
27608
27609 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
27610 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
27611 IX86_BUILTIN_GATHERSIV8SF);
27612
27613 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
27614 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
27615 IX86_BUILTIN_GATHERDIV4SF);
27616
27617 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
27618 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
27619 IX86_BUILTIN_GATHERDIV8SF);
27620
27621 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
27622 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
27623 IX86_BUILTIN_GATHERSIV2DI);
27624
27625 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
27626 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
27627 IX86_BUILTIN_GATHERSIV4DI);
27628
27629 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
27630 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
27631 IX86_BUILTIN_GATHERDIV2DI);
27632
27633 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
27634 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
27635 IX86_BUILTIN_GATHERDIV4DI);
27636
27637 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
27638 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
27639 IX86_BUILTIN_GATHERSIV4SI);
27640
27641 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
27642 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
27643 IX86_BUILTIN_GATHERSIV8SI);
27644
27645 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
27646 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
27647 IX86_BUILTIN_GATHERDIV4SI);
27648
27649 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
27650 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
27651 IX86_BUILTIN_GATHERDIV8SI);
27652
27653 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
27654 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
27655 IX86_BUILTIN_GATHERALTSIV4DF);
27656
27657 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
27658 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
27659 IX86_BUILTIN_GATHERALTDIV8SF);
27660
27661 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
27662 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
27663 IX86_BUILTIN_GATHERALTSIV4DI);
27664
27665 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
27666 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
27667 IX86_BUILTIN_GATHERALTDIV8SI);
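
/* For illustration (a sketch, not part of this file): these AVX2 gather
   builtins are normally reached through the avx2intrin.h wrappers.
   Roughly, _mm256_i32gather_pd expands to something like the following,
   matching the V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT signature of
   __builtin_ia32_gathersiv4df (source, base pointer, index, all-ones
   mask, scale):

       __v4df src = _mm256_setzero_pd ();
       __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);
       return (__m256d) __builtin_ia32_gathersiv4df (src, base,
                                                     (__v4si) index,
                                                     mask, scale);
*/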
27668
27669 /* RTM. */
27670 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
27671 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
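
/* Illustrative only (a sketch, not part of this file): rtmintrin.h exposes
   this builtin as _xabort; the argument is expected to be a constant 8-bit
   status code, e.g.

       __builtin_ia32_xabort (0xff);
*/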
27672
27673 /* MMX access to the vec_init patterns. */
27674 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
27675 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
27676
27677 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
27678 V4HI_FTYPE_HI_HI_HI_HI,
27679 IX86_BUILTIN_VEC_INIT_V4HI);
27680
27681 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
27682 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
27683 IX86_BUILTIN_VEC_INIT_V8QI);
27684
27685 /* Access to the vec_extract patterns. */
27686 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
27687 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
27688 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
27689 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
27690 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
27691 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
27692 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
27693 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
27694 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
27695 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
27696
27697 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27698 "__builtin_ia32_vec_ext_v4hi",
27699 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
27700
27701 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
27702 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
27703
27704 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
27705 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
27706
27707 /* Access to the vec_set patterns. */
27708 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
27709 "__builtin_ia32_vec_set_v2di",
27710 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
27711
27712 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
27713 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
27714
27715 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
27716 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
27717
27718 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
27719 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
27720
27721 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27722 "__builtin_ia32_vec_set_v4hi",
27723 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
27724
27725 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
27726 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
27727
27728 /* Add FMA4 multi-arg instructions. */
27729 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
27730 {
27731 if (d->name == 0)
27732 continue;
27733
27734 ftype = (enum ix86_builtin_func_type) d->flag;
27735 def_builtin_const (d->mask, d->name, ftype, d->code);
27736 }
27737 }
27738
27739 /* This builds the processor_model struct type defined in
27740 libgcc/config/i386/cpuinfo.c */
27741
27742 static tree
27743 build_processor_model_struct (void)
27744 {
27745 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
27746 "__cpu_features"};
27747 tree field = NULL_TREE, field_chain = NULL_TREE;
27748 int i;
27749 tree type = make_node (RECORD_TYPE);
27750
27751 /* The first 3 fields are unsigned int. */
27752 for (i = 0; i < 3; ++i)
27753 {
27754 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
27755 get_identifier (field_name[i]), unsigned_type_node);
27756 if (field_chain != NULL_TREE)
27757 DECL_CHAIN (field) = field_chain;
27758 field_chain = field;
27759 }
27760
27761 /* The last field is an array of unsigned integers of size one. */
27762 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
27763 get_identifier (field_name[3]),
27764 build_array_type (unsigned_type_node,
27765 build_index_type (size_one_node)));
27766 if (field_chain != NULL_TREE)
27767 DECL_CHAIN (field) = field_chain;
27768 field_chain = field;
27769
27770 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
27771 return type;
27772 }
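
/* For reference (a sketch, not generated code): the record built above is
   laid out to match the __processor_model struct that libgcc defines in
   config/i386/cpuinfo.c, i.e. roughly:

       struct __processor_model
       {
         unsigned int __cpu_vendor;
         unsigned int __cpu_type;
         unsigned int __cpu_subtype;
         unsigned int __cpu_features[1];
       };
*/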
27773
27774 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
27775
27776 static tree
27777 make_var_decl (tree type, const char *name)
27778 {
27779 tree new_decl;
27780
27781 new_decl = build_decl (UNKNOWN_LOCATION,
27782 VAR_DECL,
27783 get_identifier (name),
27784 type);
27785
27786 DECL_EXTERNAL (new_decl) = 1;
27787 TREE_STATIC (new_decl) = 1;
27788 TREE_PUBLIC (new_decl) = 1;
27789 DECL_INITIAL (new_decl) = 0;
27790 DECL_ARTIFICIAL (new_decl) = 0;
27791 DECL_PRESERVE_P (new_decl) = 1;
27792
27793 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
27794 assemble_variable (new_decl, 0, 0, 0);
27795
27796 return new_decl;
27797 }
27798
27799 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
27800 into an integer defined in libgcc/config/i386/cpuinfo.c */
27801
27802 static tree
27803 fold_builtin_cpu (tree fndecl, tree *args)
27804 {
27805 unsigned int i;
27806 enum ix86_builtins fn_code = (enum ix86_builtins)
27807 DECL_FUNCTION_CODE (fndecl);
27808 tree param_string_cst = NULL;
27809
27810 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
27811 enum processor_features
27812 {
27813 F_CMOV = 0,
27814 F_MMX,
27815 F_POPCNT,
27816 F_SSE,
27817 F_SSE2,
27818 F_SSE3,
27819 F_SSSE3,
27820 F_SSE4_1,
27821 F_SSE4_2,
27822 F_AVX,
27823 F_AVX2,
27824 F_MAX
27825 };
27826
27827 /* These are the values for vendor types and cpu types and subtypes
27828 in cpuinfo.c.  Before use, cpu type and subtype values must have
27829 the corresponding start value subtracted from them. */
27830 enum processor_model
27831 {
27832 M_INTEL = 1,
27833 M_AMD,
27834 M_CPU_TYPE_START,
27835 M_INTEL_ATOM,
27836 M_INTEL_CORE2,
27837 M_INTEL_COREI7,
27838 M_AMDFAM10H,
27839 M_AMDFAM15H,
27840 M_CPU_SUBTYPE_START,
27841 M_INTEL_COREI7_NEHALEM,
27842 M_INTEL_COREI7_WESTMERE,
27843 M_INTEL_COREI7_SANDYBRIDGE,
27844 M_AMDFAM10H_BARCELONA,
27845 M_AMDFAM10H_SHANGHAI,
27846 M_AMDFAM10H_ISTANBUL,
27847 M_AMDFAM15H_BDVER1,
27848 M_AMDFAM15H_BDVER2
27849 };
27850
27851 static struct _arch_names_table
27852 {
27853 const char *const name;
27854 const enum processor_model model;
27855 }
27856 const arch_names_table[] =
27857 {
27858 {"amd", M_AMD},
27859 {"intel", M_INTEL},
27860 {"atom", M_INTEL_ATOM},
27861 {"core2", M_INTEL_CORE2},
27862 {"corei7", M_INTEL_COREI7},
27863 {"nehalem", M_INTEL_COREI7_NEHALEM},
27864 {"westmere", M_INTEL_COREI7_WESTMERE},
27865 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
27866 {"amdfam10h", M_AMDFAM10H},
27867 {"barcelona", M_AMDFAM10H_BARCELONA},
27868 {"shanghai", M_AMDFAM10H_SHANGHAI},
27869 {"istanbul", M_AMDFAM10H_ISTANBUL},
27870 {"amdfam15h", M_AMDFAM15H},
27871 {"bdver1", M_AMDFAM15H_BDVER1},
27872 {"bdver2", M_AMDFAM15H_BDVER2},
27873 };
27874
27875 static struct _isa_names_table
27876 {
27877 const char *const name;
27878 const enum processor_features feature;
27879 }
27880 const isa_names_table[] =
27881 {
27882 {"cmov", F_CMOV},
27883 {"mmx", F_MMX},
27884 {"popcnt", F_POPCNT},
27885 {"sse", F_SSE},
27886 {"sse2", F_SSE2},
27887 {"sse3", F_SSE3},
27888 {"ssse3", F_SSSE3},
27889 {"sse4.1", F_SSE4_1},
27890 {"sse4.2", F_SSE4_2},
27891 {"avx", F_AVX},
27892 {"avx2", F_AVX2}
27893 };
27894
27895 static tree __processor_model_type = NULL_TREE;
27896 static tree __cpu_model_var = NULL_TREE;
27897
27898 if (__processor_model_type == NULL_TREE)
27899 __processor_model_type = build_processor_model_struct ();
27900
27901 if (__cpu_model_var == NULL_TREE)
27902 __cpu_model_var = make_var_decl (__processor_model_type,
27903 "__cpu_model");
27904
27905 gcc_assert ((args != NULL) && (*args != NULL));
27906
27907 param_string_cst = *args;
27908 while (param_string_cst
27909 && TREE_CODE (param_string_cst) != STRING_CST)
27910 {
27911 /* *args must be an expr that can contain other EXPRs leading to a
27912 STRING_CST. */
27913 if (!EXPR_P (param_string_cst))
27914 {
27915 error ("Parameter to builtin must be a string constant or literal");
27916 return integer_zero_node;
27917 }
27918 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
27919 }
27920
27921 gcc_assert (param_string_cst);
27922
27923 if (fn_code == IX86_BUILTIN_CPU_IS)
27924 {
27925 tree ref;
27926 tree field;
27927 unsigned int field_val = 0;
27928 unsigned int NUM_ARCH_NAMES
27929 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
27930
27931 for (i = 0; i < NUM_ARCH_NAMES; i++)
27932 if (strcmp (arch_names_table[i].name,
27933 TREE_STRING_POINTER (param_string_cst)) == 0)
27934 break;
27935
27936 if (i == NUM_ARCH_NAMES)
27937 {
27938 error ("Parameter to builtin not valid: %s",
27939 TREE_STRING_POINTER (param_string_cst));
27940 return integer_zero_node;
27941 }
27942
27943 field = TYPE_FIELDS (__processor_model_type);
27944 field_val = arch_names_table[i].model;
27945
27946 /* CPU types are stored in the next field. */
27947 if (field_val > M_CPU_TYPE_START
27948 && field_val < M_CPU_SUBTYPE_START)
27949 {
27950 field = DECL_CHAIN (field);
27951 field_val -= M_CPU_TYPE_START;
27952 }
27953
27954 /* CPU subtypes are stored in the next field. */
27955 if (field_val > M_CPU_SUBTYPE_START)
27956 {
27957 field = DECL_CHAIN (DECL_CHAIN (field));
27958 field_val -= M_CPU_SUBTYPE_START;
27959 }
27960
27961 /* Get the appropriate field in __cpu_model. */
27962 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
27963 field, NULL_TREE);
27964
27965 /* Check the value. */
27966 return build2 (EQ_EXPR, unsigned_type_node, ref,
27967 build_int_cstu (unsigned_type_node, field_val));
27968 }
27969 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
27970 {
27971 tree ref;
27972 tree array_elt;
27973 tree field;
27974 unsigned int field_val = 0;
27975 unsigned int NUM_ISA_NAMES
27976 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
27977
27978 for (i = 0; i < NUM_ISA_NAMES; i++)
27979 if (strcmp (isa_names_table[i].name,
27980 TREE_STRING_POINTER (param_string_cst)) == 0)
27981 break;
27982
27983 if (i == NUM_ISA_NAMES)
27984 {
27985 error ("Parameter to builtin not valid: %s",
27986 TREE_STRING_POINTER (param_string_cst));
27987 return integer_zero_node;
27988 }
27989
27990 field = TYPE_FIELDS (__processor_model_type);
27991 /* Get the last field, which is __cpu_features. */
27992 while (DECL_CHAIN (field))
27993 field = DECL_CHAIN (field);
27994
27995 /* Get the appropriate field: __cpu_model.__cpu_features */
27996 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
27997 field, NULL_TREE);
27998
27999 /* Access the 0th element of __cpu_features array. */
28000 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
28001 integer_zero_node, NULL_TREE, NULL_TREE);
28002
28003 field_val = (1 << isa_names_table[i].feature);
28004 /* Return __cpu_model.__cpu_features[0] & field_val */
28005 return build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
28006 build_int_cstu (unsigned_type_node, field_val));
28007 }
28008 gcc_unreachable ();
28009 }
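
/* Worked example (a sketch of the folding above, not additional code):
   with the tables above, __builtin_cpu_is ("corei7") folds to a comparison
   against the __cpu_type field, and __builtin_cpu_supports ("avx2") folds
   to a bit test on the feature word, conceptually:

       __cpu_model.__cpu_type == (M_INTEL_COREI7 - M_CPU_TYPE_START)
       __cpu_model.__cpu_features[0] & (1 << F_AVX2)
*/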
28010
28011 static tree
28012 ix86_fold_builtin (tree fndecl, int n_args,
28013 tree *args, bool ignore ATTRIBUTE_UNUSED)
28014 {
28015 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
28016 {
28017 enum ix86_builtins fn_code = (enum ix86_builtins)
28018 DECL_FUNCTION_CODE (fndecl);
28019 if (fn_code == IX86_BUILTIN_CPU_IS
28020 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
28021 {
28022 gcc_assert (n_args == 1);
28023 return fold_builtin_cpu (fndecl, args);
28024 }
28025 }
28026
28027 return NULL_TREE;
28028 }
28029
28030 /* Make builtins to detect cpu type and features supported. NAME is
28031 the builtin name, CODE is the builtin code, and FTYPE is the function
28032 type of the builtin. */
28033
28034 static void
28035 make_cpu_type_builtin (const char* name, int code,
28036 enum ix86_builtin_func_type ftype, bool is_const)
28037 {
28038 tree decl;
28039 tree type;
28040
28041 type = ix86_get_builtin_func_type (ftype);
28042 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
28043 NULL, NULL_TREE);
28044 gcc_assert (decl != NULL_TREE);
28045 ix86_builtins[(int) code] = decl;
28046 TREE_READONLY (decl) = is_const;
28047 }
28048
28049 /* Make builtins to get CPU type and features supported. The created
28050 builtins are:
28051
28052 __builtin_cpu_init (), to detect cpu type and features,
28053 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
28054 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
28055 */
28056
28057 static void
28058 ix86_init_platform_type_builtins (void)
28059 {
28060 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
28061 INT_FTYPE_VOID, false);
28062 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
28063 INT_FTYPE_PCCHAR, true);
28064 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
28065 INT_FTYPE_PCCHAR, true);
28066 }
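
/* Illustrative user-level usage of the builtins created above (a sketch,
   not part of this file).  A __builtin_cpu_init call is required only when
   the queries run before constructors, e.g. in an ifunc resolver, and is
   harmless otherwise:

       __builtin_cpu_init ();
       if (__builtin_cpu_is ("corei7") && __builtin_cpu_supports ("avx"))
         use_avx_path ();   (use_avx_path is a hypothetical function)
*/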
28067
28068 /* Internal method for ix86_init_builtins. */
28069
28070 static void
28071 ix86_init_builtins_va_builtins_abi (void)
28072 {
28073 tree ms_va_ref, sysv_va_ref;
28074 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
28075 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
28076 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
28077 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
28078
28079 if (!TARGET_64BIT)
28080 return;
28081 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
28082 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
28083 ms_va_ref = build_reference_type (ms_va_list_type_node);
28084 sysv_va_ref =
28085 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
28086
28087 fnvoid_va_end_ms =
28088 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
28089 fnvoid_va_start_ms =
28090 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
28091 fnvoid_va_end_sysv =
28092 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
28093 fnvoid_va_start_sysv =
28094 build_varargs_function_type_list (void_type_node, sysv_va_ref,
28095 NULL_TREE);
28096 fnvoid_va_copy_ms =
28097 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
28098 NULL_TREE);
28099 fnvoid_va_copy_sysv =
28100 build_function_type_list (void_type_node, sysv_va_ref,
28101 sysv_va_ref, NULL_TREE);
28102
28103 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
28104 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
28105 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
28106 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
28107 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
28108 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
28109 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
28110 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28111 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
28112 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28113 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
28114 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28115 }
28116
28117 static void
28118 ix86_init_builtin_types (void)
28119 {
28120 tree float128_type_node, float80_type_node;
28121
28122 /* The __float80 type. */
28123 float80_type_node = long_double_type_node;
28124 if (TYPE_MODE (float80_type_node) != XFmode)
28125 {
28126 /* The __float80 type. */
28127 float80_type_node = make_node (REAL_TYPE);
28128
28129 TYPE_PRECISION (float80_type_node) = 80;
28130 layout_type (float80_type_node);
28131 }
28132 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
28133
28134 /* The __float128 type. */
28135 float128_type_node = make_node (REAL_TYPE);
28136 TYPE_PRECISION (float128_type_node) = 128;
28137 layout_type (float128_type_node);
28138 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
28139
28140 /* This macro is built by i386-builtin-types.awk. */
28141 DEFINE_BUILTIN_PRIMITIVE_TYPES;
28142 }
28143
28144 static void
28145 ix86_init_builtins (void)
28146 {
28147 tree t;
28148
28149 ix86_init_builtin_types ();
28150
28151 /* Builtins to get CPU type and features. */
28152 ix86_init_platform_type_builtins ();
28153
28154 /* TFmode support builtins. */
28155 def_builtin_const (0, "__builtin_infq",
28156 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
28157 def_builtin_const (0, "__builtin_huge_valq",
28158 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
28159
28160 /* We will expand them to a normal call if SSE isn't available, since
28161 they are used by libgcc. */
28162 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
28163 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
28164 BUILT_IN_MD, "__fabstf2", NULL_TREE);
28165 TREE_READONLY (t) = 1;
28166 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
28167
28168 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
28169 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
28170 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
28171 TREE_READONLY (t) = 1;
28172 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
28173
28174 ix86_init_tm_builtins ();
28175 ix86_init_mmx_sse_builtins ();
28176
28177 if (TARGET_LP64)
28178 ix86_init_builtins_va_builtins_abi ();
28179
28180 #ifdef SUBTARGET_INIT_BUILTINS
28181 SUBTARGET_INIT_BUILTINS;
28182 #endif
28183 }
28184
28185 /* Return the ix86 builtin for CODE. */
28186
28187 static tree
28188 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
28189 {
28190 if (code >= IX86_BUILTIN_MAX)
28191 return error_mark_node;
28192
28193 return ix86_builtins[code];
28194 }
28195
28196 /* Errors in the source file can cause expand_expr to return const0_rtx
28197 where we expect a vector. To avoid crashing, use one of the vector
28198 clear instructions. */
28199 static rtx
28200 safe_vector_operand (rtx x, enum machine_mode mode)
28201 {
28202 if (x == const0_rtx)
28203 x = CONST0_RTX (mode);
28204 return x;
28205 }
28206
28207 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
28208
28209 static rtx
28210 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
28211 {
28212 rtx pat;
28213 tree arg0 = CALL_EXPR_ARG (exp, 0);
28214 tree arg1 = CALL_EXPR_ARG (exp, 1);
28215 rtx op0 = expand_normal (arg0);
28216 rtx op1 = expand_normal (arg1);
28217 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28218 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
28219 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
28220
28221 if (VECTOR_MODE_P (mode0))
28222 op0 = safe_vector_operand (op0, mode0);
28223 if (VECTOR_MODE_P (mode1))
28224 op1 = safe_vector_operand (op1, mode1);
28225
28226 if (optimize || !target
28227 || GET_MODE (target) != tmode
28228 || !insn_data[icode].operand[0].predicate (target, tmode))
28229 target = gen_reg_rtx (tmode);
28230
28231 if (GET_MODE (op1) == SImode && mode1 == TImode)
28232 {
28233 rtx x = gen_reg_rtx (V4SImode);
28234 emit_insn (gen_sse2_loadd (x, op1));
28235 op1 = gen_lowpart (TImode, x);
28236 }
28237
28238 if (!insn_data[icode].operand[1].predicate (op0, mode0))
28239 op0 = copy_to_mode_reg (mode0, op0);
28240 if (!insn_data[icode].operand[2].predicate (op1, mode1))
28241 op1 = copy_to_mode_reg (mode1, op1);
28242
28243 pat = GEN_FCN (icode) (target, op0, op1);
28244 if (! pat)
28245 return 0;
28246
28247 emit_insn (pat);
28248
28249 return target;
28250 }
28251
28252 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
28253
28254 static rtx
28255 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
28256 enum ix86_builtin_func_type m_type,
28257 enum rtx_code sub_code)
28258 {
28259 rtx pat;
28260 int i;
28261 int nargs;
28262 bool comparison_p = false;
28263 bool tf_p = false;
28264 bool last_arg_constant = false;
28265 int num_memory = 0;
28266 struct {
28267 rtx op;
28268 enum machine_mode mode;
28269 } args[4];
28270
28271 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28272
28273 switch (m_type)
28274 {
28275 case MULTI_ARG_4_DF2_DI_I:
28276 case MULTI_ARG_4_DF2_DI_I1:
28277 case MULTI_ARG_4_SF2_SI_I:
28278 case MULTI_ARG_4_SF2_SI_I1:
28279 nargs = 4;
28280 last_arg_constant = true;
28281 break;
28282
28283 case MULTI_ARG_3_SF:
28284 case MULTI_ARG_3_DF:
28285 case MULTI_ARG_3_SF2:
28286 case MULTI_ARG_3_DF2:
28287 case MULTI_ARG_3_DI:
28288 case MULTI_ARG_3_SI:
28289 case MULTI_ARG_3_SI_DI:
28290 case MULTI_ARG_3_HI:
28291 case MULTI_ARG_3_HI_SI:
28292 case MULTI_ARG_3_QI:
28293 case MULTI_ARG_3_DI2:
28294 case MULTI_ARG_3_SI2:
28295 case MULTI_ARG_3_HI2:
28296 case MULTI_ARG_3_QI2:
28297 nargs = 3;
28298 break;
28299
28300 case MULTI_ARG_2_SF:
28301 case MULTI_ARG_2_DF:
28302 case MULTI_ARG_2_DI:
28303 case MULTI_ARG_2_SI:
28304 case MULTI_ARG_2_HI:
28305 case MULTI_ARG_2_QI:
28306 nargs = 2;
28307 break;
28308
28309 case MULTI_ARG_2_DI_IMM:
28310 case MULTI_ARG_2_SI_IMM:
28311 case MULTI_ARG_2_HI_IMM:
28312 case MULTI_ARG_2_QI_IMM:
28313 nargs = 2;
28314 last_arg_constant = true;
28315 break;
28316
28317 case MULTI_ARG_1_SF:
28318 case MULTI_ARG_1_DF:
28319 case MULTI_ARG_1_SF2:
28320 case MULTI_ARG_1_DF2:
28321 case MULTI_ARG_1_DI:
28322 case MULTI_ARG_1_SI:
28323 case MULTI_ARG_1_HI:
28324 case MULTI_ARG_1_QI:
28325 case MULTI_ARG_1_SI_DI:
28326 case MULTI_ARG_1_HI_DI:
28327 case MULTI_ARG_1_HI_SI:
28328 case MULTI_ARG_1_QI_DI:
28329 case MULTI_ARG_1_QI_SI:
28330 case MULTI_ARG_1_QI_HI:
28331 nargs = 1;
28332 break;
28333
28334 case MULTI_ARG_2_DI_CMP:
28335 case MULTI_ARG_2_SI_CMP:
28336 case MULTI_ARG_2_HI_CMP:
28337 case MULTI_ARG_2_QI_CMP:
28338 nargs = 2;
28339 comparison_p = true;
28340 break;
28341
28342 case MULTI_ARG_2_SF_TF:
28343 case MULTI_ARG_2_DF_TF:
28344 case MULTI_ARG_2_DI_TF:
28345 case MULTI_ARG_2_SI_TF:
28346 case MULTI_ARG_2_HI_TF:
28347 case MULTI_ARG_2_QI_TF:
28348 nargs = 2;
28349 tf_p = true;
28350 break;
28351
28352 default:
28353 gcc_unreachable ();
28354 }
28355
28356 if (optimize || !target
28357 || GET_MODE (target) != tmode
28358 || !insn_data[icode].operand[0].predicate (target, tmode))
28359 target = gen_reg_rtx (tmode);
28360
28361 gcc_assert (nargs <= 4);
28362
28363 for (i = 0; i < nargs; i++)
28364 {
28365 tree arg = CALL_EXPR_ARG (exp, i);
28366 rtx op = expand_normal (arg);
28367 int adjust = (comparison_p) ? 1 : 0;
28368 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
28369
28370 if (last_arg_constant && i == nargs - 1)
28371 {
28372 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
28373 {
28374 enum insn_code new_icode = icode;
28375 switch (icode)
28376 {
28377 case CODE_FOR_xop_vpermil2v2df3:
28378 case CODE_FOR_xop_vpermil2v4sf3:
28379 case CODE_FOR_xop_vpermil2v4df3:
28380 case CODE_FOR_xop_vpermil2v8sf3:
28381 error ("the last argument must be a 2-bit immediate");
28382 return gen_reg_rtx (tmode);
28383 case CODE_FOR_xop_rotlv2di3:
28384 new_icode = CODE_FOR_rotlv2di3;
28385 goto xop_rotl;
28386 case CODE_FOR_xop_rotlv4si3:
28387 new_icode = CODE_FOR_rotlv4si3;
28388 goto xop_rotl;
28389 case CODE_FOR_xop_rotlv8hi3:
28390 new_icode = CODE_FOR_rotlv8hi3;
28391 goto xop_rotl;
28392 case CODE_FOR_xop_rotlv16qi3:
28393 new_icode = CODE_FOR_rotlv16qi3;
28394 xop_rotl:
28395 if (CONST_INT_P (op))
28396 {
28397 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
28398 op = GEN_INT (INTVAL (op) & mask);
28399 gcc_checking_assert
28400 (insn_data[icode].operand[i + 1].predicate (op, mode));
28401 }
28402 else
28403 {
28404 gcc_checking_assert
28405 (nargs == 2
28406 && insn_data[new_icode].operand[0].mode == tmode
28407 && insn_data[new_icode].operand[1].mode == tmode
28408 && insn_data[new_icode].operand[2].mode == mode
28409 && insn_data[new_icode].operand[0].predicate
28410 == insn_data[icode].operand[0].predicate
28411 && insn_data[new_icode].operand[1].predicate
28412 == insn_data[icode].operand[1].predicate);
28413 icode = new_icode;
28414 goto non_constant;
28415 }
28416 break;
28417 default:
28418 gcc_unreachable ();
28419 }
28420 }
28421 }
28422 else
28423 {
28424 non_constant:
28425 if (VECTOR_MODE_P (mode))
28426 op = safe_vector_operand (op, mode);
28427
28428 /* If we aren't optimizing, only allow one memory operand to be
28429 generated. */
28430 if (memory_operand (op, mode))
28431 num_memory++;
28432
28433 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
28434
28435 if (optimize
28436 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
28437 || num_memory > 1)
28438 op = force_reg (mode, op);
28439 }
28440
28441 args[i].op = op;
28442 args[i].mode = mode;
28443 }
28444
28445 switch (nargs)
28446 {
28447 case 1:
28448 pat = GEN_FCN (icode) (target, args[0].op);
28449 break;
28450
28451 case 2:
28452 if (tf_p)
28453 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
28454 GEN_INT ((int)sub_code));
28455 else if (! comparison_p)
28456 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
28457 else
28458 {
28459 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
28460 args[0].op,
28461 args[1].op);
28462
28463 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
28464 }
28465 break;
28466
28467 case 3:
28468 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
28469 break;
28470
28471 case 4:
28472 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
28473 break;
28474
28475 default:
28476 gcc_unreachable ();
28477 }
28478
28479 if (! pat)
28480 return 0;
28481
28482 emit_insn (pat);
28483 return target;
28484 }
28485
28486 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
28487 insns with vec_merge. */
28488
28489 static rtx
28490 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
28491 rtx target)
28492 {
28493 rtx pat;
28494 tree arg0 = CALL_EXPR_ARG (exp, 0);
28495 rtx op1, op0 = expand_normal (arg0);
28496 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28497 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
28498
28499 if (optimize || !target
28500 || GET_MODE (target) != tmode
28501 || !insn_data[icode].operand[0].predicate (target, tmode))
28502 target = gen_reg_rtx (tmode);
28503
28504 if (VECTOR_MODE_P (mode0))
28505 op0 = safe_vector_operand (op0, mode0);
28506
28507 if ((optimize && !register_operand (op0, mode0))
28508 || !insn_data[icode].operand[1].predicate (op0, mode0))
28509 op0 = copy_to_mode_reg (mode0, op0);
28510
28511 op1 = op0;
28512 if (!insn_data[icode].operand[2].predicate (op1, mode0))
28513 op1 = copy_to_mode_reg (mode0, op1);
28514
28515 pat = GEN_FCN (icode) (target, op0, op1);
28516 if (! pat)
28517 return 0;
28518 emit_insn (pat);
28519 return target;
28520 }
28521
28522 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
28523
28524 static rtx
28525 ix86_expand_sse_compare (const struct builtin_description *d,
28526 tree exp, rtx target, bool swap)
28527 {
28528 rtx pat;
28529 tree arg0 = CALL_EXPR_ARG (exp, 0);
28530 tree arg1 = CALL_EXPR_ARG (exp, 1);
28531 rtx op0 = expand_normal (arg0);
28532 rtx op1 = expand_normal (arg1);
28533 rtx op2;
28534 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28535 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28536 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28537 enum rtx_code comparison = d->comparison;
28538
28539 if (VECTOR_MODE_P (mode0))
28540 op0 = safe_vector_operand (op0, mode0);
28541 if (VECTOR_MODE_P (mode1))
28542 op1 = safe_vector_operand (op1, mode1);
28543
28544 /* Swap operands if we have a comparison that isn't available in
28545 hardware. */
28546 if (swap)
28547 {
28548 rtx tmp = gen_reg_rtx (mode1);
28549 emit_move_insn (tmp, op1);
28550 op1 = op0;
28551 op0 = tmp;
28552 }
28553
28554 if (optimize || !target
28555 || GET_MODE (target) != tmode
28556 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28557 target = gen_reg_rtx (tmode);
28558
28559 if ((optimize && !register_operand (op0, mode0))
28560 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
28561 op0 = copy_to_mode_reg (mode0, op0);
28562 if ((optimize && !register_operand (op1, mode1))
28563 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
28564 op1 = copy_to_mode_reg (mode1, op1);
28565
28566 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
28567 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28568 if (! pat)
28569 return 0;
28570 emit_insn (pat);
28571 return target;
28572 }
28573
28574 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
28575
28576 static rtx
28577 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
28578 rtx target)
28579 {
28580 rtx pat;
28581 tree arg0 = CALL_EXPR_ARG (exp, 0);
28582 tree arg1 = CALL_EXPR_ARG (exp, 1);
28583 rtx op0 = expand_normal (arg0);
28584 rtx op1 = expand_normal (arg1);
28585 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28586 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28587 enum rtx_code comparison = d->comparison;
28588
28589 if (VECTOR_MODE_P (mode0))
28590 op0 = safe_vector_operand (op0, mode0);
28591 if (VECTOR_MODE_P (mode1))
28592 op1 = safe_vector_operand (op1, mode1);
28593
28594 /* Swap operands if we have a comparison that isn't available in
28595 hardware. */
28596 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
28597 {
28598 rtx tmp = op1;
28599 op1 = op0;
28600 op0 = tmp;
28601 }
28602
28603 target = gen_reg_rtx (SImode);
28604 emit_move_insn (target, const0_rtx);
28605 target = gen_rtx_SUBREG (QImode, target, 0);
28606
28607 if ((optimize && !register_operand (op0, mode0))
28608 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28609 op0 = copy_to_mode_reg (mode0, op0);
28610 if ((optimize && !register_operand (op1, mode1))
28611 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28612 op1 = copy_to_mode_reg (mode1, op1);
28613
28614 pat = GEN_FCN (d->icode) (op0, op1);
28615 if (! pat)
28616 return 0;
28617 emit_insn (pat);
28618 emit_insn (gen_rtx_SET (VOIDmode,
28619 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28620 gen_rtx_fmt_ee (comparison, QImode,
28621 SET_DEST (pat),
28622 const0_rtx)));
28623
28624 return SUBREG_REG (target);
28625 }
28626
28627 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
28628
28629 static rtx
28630 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
28631 rtx target)
28632 {
28633 rtx pat;
28634 tree arg0 = CALL_EXPR_ARG (exp, 0);
28635 rtx op1, op0 = expand_normal (arg0);
28636 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28637 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28638
28639 if (optimize || target == 0
28640 || GET_MODE (target) != tmode
28641 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28642 target = gen_reg_rtx (tmode);
28643
28644 if (VECTOR_MODE_P (mode0))
28645 op0 = safe_vector_operand (op0, mode0);
28646
28647 if ((optimize && !register_operand (op0, mode0))
28648 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28649 op0 = copy_to_mode_reg (mode0, op0);
28650
28651 op1 = GEN_INT (d->comparison);
28652
28653 pat = GEN_FCN (d->icode) (target, op0, op1);
28654 if (! pat)
28655 return 0;
28656 emit_insn (pat);
28657 return target;
28658 }
28659
28660 static rtx
28661 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
28662 tree exp, rtx target)
28663 {
28664 rtx pat;
28665 tree arg0 = CALL_EXPR_ARG (exp, 0);
28666 tree arg1 = CALL_EXPR_ARG (exp, 1);
28667 rtx op0 = expand_normal (arg0);
28668 rtx op1 = expand_normal (arg1);
28669 rtx op2;
28670 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28671 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28672 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28673
28674 if (optimize || target == 0
28675 || GET_MODE (target) != tmode
28676 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28677 target = gen_reg_rtx (tmode);
28678
28679 op0 = safe_vector_operand (op0, mode0);
28680 op1 = safe_vector_operand (op1, mode1);
28681
28682 if ((optimize && !register_operand (op0, mode0))
28683 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28684 op0 = copy_to_mode_reg (mode0, op0);
28685 if ((optimize && !register_operand (op1, mode1))
28686 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28687 op1 = copy_to_mode_reg (mode1, op1);
28688
28689 op2 = GEN_INT (d->comparison);
28690
28691 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28692 if (! pat)
28693 return 0;
28694 emit_insn (pat);
28695 return target;
28696 }
28697
28698 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
28699
28700 static rtx
28701 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
28702 rtx target)
28703 {
28704 rtx pat;
28705 tree arg0 = CALL_EXPR_ARG (exp, 0);
28706 tree arg1 = CALL_EXPR_ARG (exp, 1);
28707 rtx op0 = expand_normal (arg0);
28708 rtx op1 = expand_normal (arg1);
28709 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28710 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28711 enum rtx_code comparison = d->comparison;
28712
28713 if (VECTOR_MODE_P (mode0))
28714 op0 = safe_vector_operand (op0, mode0);
28715 if (VECTOR_MODE_P (mode1))
28716 op1 = safe_vector_operand (op1, mode1);
28717
28718 target = gen_reg_rtx (SImode);
28719 emit_move_insn (target, const0_rtx);
28720 target = gen_rtx_SUBREG (QImode, target, 0);
28721
28722 if ((optimize && !register_operand (op0, mode0))
28723 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28724 op0 = copy_to_mode_reg (mode0, op0);
28725 if ((optimize && !register_operand (op1, mode1))
28726 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28727 op1 = copy_to_mode_reg (mode1, op1);
28728
28729 pat = GEN_FCN (d->icode) (op0, op1);
28730 if (! pat)
28731 return 0;
28732 emit_insn (pat);
28733 emit_insn (gen_rtx_SET (VOIDmode,
28734 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28735 gen_rtx_fmt_ee (comparison, QImode,
28736 SET_DEST (pat),
28737 const0_rtx)));
28738
28739 return SUBREG_REG (target);
28740 }
28741
28742 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
28743
28744 static rtx
28745 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
28746 tree exp, rtx target)
28747 {
28748 rtx pat;
28749 tree arg0 = CALL_EXPR_ARG (exp, 0);
28750 tree arg1 = CALL_EXPR_ARG (exp, 1);
28751 tree arg2 = CALL_EXPR_ARG (exp, 2);
28752 tree arg3 = CALL_EXPR_ARG (exp, 3);
28753 tree arg4 = CALL_EXPR_ARG (exp, 4);
28754 rtx scratch0, scratch1;
28755 rtx op0 = expand_normal (arg0);
28756 rtx op1 = expand_normal (arg1);
28757 rtx op2 = expand_normal (arg2);
28758 rtx op3 = expand_normal (arg3);
28759 rtx op4 = expand_normal (arg4);
28760 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
28761
28762 tmode0 = insn_data[d->icode].operand[0].mode;
28763 tmode1 = insn_data[d->icode].operand[1].mode;
28764 modev2 = insn_data[d->icode].operand[2].mode;
28765 modei3 = insn_data[d->icode].operand[3].mode;
28766 modev4 = insn_data[d->icode].operand[4].mode;
28767 modei5 = insn_data[d->icode].operand[5].mode;
28768 modeimm = insn_data[d->icode].operand[6].mode;
28769
28770 if (VECTOR_MODE_P (modev2))
28771 op0 = safe_vector_operand (op0, modev2);
28772 if (VECTOR_MODE_P (modev4))
28773 op2 = safe_vector_operand (op2, modev4);
28774
28775 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28776 op0 = copy_to_mode_reg (modev2, op0);
28777 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
28778 op1 = copy_to_mode_reg (modei3, op1);
28779 if ((optimize && !register_operand (op2, modev4))
28780 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
28781 op2 = copy_to_mode_reg (modev4, op2);
28782 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
28783 op3 = copy_to_mode_reg (modei5, op3);
28784
28785 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
28786 {
28787 error ("the fifth argument must be an 8-bit immediate");
28788 return const0_rtx;
28789 }
28790
28791 if (d->code == IX86_BUILTIN_PCMPESTRI128)
28792 {
28793 if (optimize || !target
28794 || GET_MODE (target) != tmode0
28795 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28796 target = gen_reg_rtx (tmode0);
28797
28798 scratch1 = gen_reg_rtx (tmode1);
28799
28800 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
28801 }
28802 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
28803 {
28804 if (optimize || !target
28805 || GET_MODE (target) != tmode1
28806 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28807 target = gen_reg_rtx (tmode1);
28808
28809 scratch0 = gen_reg_rtx (tmode0);
28810
28811 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
28812 }
28813 else
28814 {
28815 gcc_assert (d->flag);
28816
28817 scratch0 = gen_reg_rtx (tmode0);
28818 scratch1 = gen_reg_rtx (tmode1);
28819
28820 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
28821 }
28822
28823 if (! pat)
28824 return 0;
28825
28826 emit_insn (pat);
28827
28828 if (d->flag)
28829 {
28830 target = gen_reg_rtx (SImode);
28831 emit_move_insn (target, const0_rtx);
28832 target = gen_rtx_SUBREG (QImode, target, 0);
28833
28834 emit_insn
28835 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28836 gen_rtx_fmt_ee (EQ, QImode,
28837 gen_rtx_REG ((enum machine_mode) d->flag,
28838 FLAGS_REG),
28839 const0_rtx)));
28840 return SUBREG_REG (target);
28841 }
28842 else
28843 return target;
28844 }
28845
28846
28847 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
28848
28849 static rtx
28850 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
28851 tree exp, rtx target)
28852 {
28853 rtx pat;
28854 tree arg0 = CALL_EXPR_ARG (exp, 0);
28855 tree arg1 = CALL_EXPR_ARG (exp, 1);
28856 tree arg2 = CALL_EXPR_ARG (exp, 2);
28857 rtx scratch0, scratch1;
28858 rtx op0 = expand_normal (arg0);
28859 rtx op1 = expand_normal (arg1);
28860 rtx op2 = expand_normal (arg2);
28861 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
28862
28863 tmode0 = insn_data[d->icode].operand[0].mode;
28864 tmode1 = insn_data[d->icode].operand[1].mode;
28865 modev2 = insn_data[d->icode].operand[2].mode;
28866 modev3 = insn_data[d->icode].operand[3].mode;
28867 modeimm = insn_data[d->icode].operand[4].mode;
28868
28869 if (VECTOR_MODE_P (modev2))
28870 op0 = safe_vector_operand (op0, modev2);
28871 if (VECTOR_MODE_P (modev3))
28872 op1 = safe_vector_operand (op1, modev3);
28873
28874 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28875 op0 = copy_to_mode_reg (modev2, op0);
28876 if ((optimize && !register_operand (op1, modev3))
28877 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
28878 op1 = copy_to_mode_reg (modev3, op1);
28879
28880 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
28881 {
28882 error ("the third argument must be an 8-bit immediate");
28883 return const0_rtx;
28884 }
28885
28886 if (d->code == IX86_BUILTIN_PCMPISTRI128)
28887 {
28888 if (optimize || !target
28889 || GET_MODE (target) != tmode0
28890 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28891 target = gen_reg_rtx (tmode0);
28892
28893 scratch1 = gen_reg_rtx (tmode1);
28894
28895 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
28896 }
28897 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
28898 {
28899 if (optimize || !target
28900 || GET_MODE (target) != tmode1
28901 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28902 target = gen_reg_rtx (tmode1);
28903
28904 scratch0 = gen_reg_rtx (tmode0);
28905
28906 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
28907 }
28908 else
28909 {
28910 gcc_assert (d->flag);
28911
28912 scratch0 = gen_reg_rtx (tmode0);
28913 scratch1 = gen_reg_rtx (tmode1);
28914
28915 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
28916 }
28917
28918 if (! pat)
28919 return 0;
28920
28921 emit_insn (pat);
28922
28923 if (d->flag)
28924 {
28925 target = gen_reg_rtx (SImode);
28926 emit_move_insn (target, const0_rtx);
28927 target = gen_rtx_SUBREG (QImode, target, 0);
28928
28929 emit_insn
28930 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28931 gen_rtx_fmt_ee (EQ, QImode,
28932 gen_rtx_REG ((enum machine_mode) d->flag,
28933 FLAGS_REG),
28934 const0_rtx)));
28935 return SUBREG_REG (target);
28936 }
28937 else
28938 return target;
28939 }
28940
28941 /* Subroutine of ix86_expand_builtin to take care of insns with
28942 variable number of operands. */
28943
28944 static rtx
28945 ix86_expand_args_builtin (const struct builtin_description *d,
28946 tree exp, rtx target)
28947 {
28948 rtx pat, real_target;
28949 unsigned int i, nargs;
28950 unsigned int nargs_constant = 0;
28951 int num_memory = 0;
28952 struct
28953 {
28954 rtx op;
28955 enum machine_mode mode;
28956 } args[4];
28957 bool last_arg_count = false;
28958 enum insn_code icode = d->icode;
28959 const struct insn_data_d *insn_p = &insn_data[icode];
28960 enum machine_mode tmode = insn_p->operand[0].mode;
28961 enum machine_mode rmode = VOIDmode;
28962 bool swap = false;
28963 enum rtx_code comparison = d->comparison;
28964
28965 switch ((enum ix86_builtin_func_type) d->flag)
28966 {
28967 case V2DF_FTYPE_V2DF_ROUND:
28968 case V4DF_FTYPE_V4DF_ROUND:
28969 case V4SF_FTYPE_V4SF_ROUND:
28970 case V8SF_FTYPE_V8SF_ROUND:
28971 case V4SI_FTYPE_V4SF_ROUND:
28972 case V8SI_FTYPE_V8SF_ROUND:
28973 return ix86_expand_sse_round (d, exp, target);
28974 case V4SI_FTYPE_V2DF_V2DF_ROUND:
28975 case V8SI_FTYPE_V4DF_V4DF_ROUND:
28976 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
28977 case INT_FTYPE_V8SF_V8SF_PTEST:
28978 case INT_FTYPE_V4DI_V4DI_PTEST:
28979 case INT_FTYPE_V4DF_V4DF_PTEST:
28980 case INT_FTYPE_V4SF_V4SF_PTEST:
28981 case INT_FTYPE_V2DI_V2DI_PTEST:
28982 case INT_FTYPE_V2DF_V2DF_PTEST:
28983 return ix86_expand_sse_ptest (d, exp, target);
28984 case FLOAT128_FTYPE_FLOAT128:
28985 case FLOAT_FTYPE_FLOAT:
28986 case INT_FTYPE_INT:
28987 case UINT64_FTYPE_INT:
28988 case UINT16_FTYPE_UINT16:
28989 case INT64_FTYPE_INT64:
28990 case INT64_FTYPE_V4SF:
28991 case INT64_FTYPE_V2DF:
28992 case INT_FTYPE_V16QI:
28993 case INT_FTYPE_V8QI:
28994 case INT_FTYPE_V8SF:
28995 case INT_FTYPE_V4DF:
28996 case INT_FTYPE_V4SF:
28997 case INT_FTYPE_V2DF:
28998 case INT_FTYPE_V32QI:
28999 case V16QI_FTYPE_V16QI:
29000 case V8SI_FTYPE_V8SF:
29001 case V8SI_FTYPE_V4SI:
29002 case V8HI_FTYPE_V8HI:
29003 case V8HI_FTYPE_V16QI:
29004 case V8QI_FTYPE_V8QI:
29005 case V8SF_FTYPE_V8SF:
29006 case V8SF_FTYPE_V8SI:
29007 case V8SF_FTYPE_V4SF:
29008 case V8SF_FTYPE_V8HI:
29009 case V4SI_FTYPE_V4SI:
29010 case V4SI_FTYPE_V16QI:
29011 case V4SI_FTYPE_V4SF:
29012 case V4SI_FTYPE_V8SI:
29013 case V4SI_FTYPE_V8HI:
29014 case V4SI_FTYPE_V4DF:
29015 case V4SI_FTYPE_V2DF:
29016 case V4HI_FTYPE_V4HI:
29017 case V4DF_FTYPE_V4DF:
29018 case V4DF_FTYPE_V4SI:
29019 case V4DF_FTYPE_V4SF:
29020 case V4DF_FTYPE_V2DF:
29021 case V4SF_FTYPE_V4SF:
29022 case V4SF_FTYPE_V4SI:
29023 case V4SF_FTYPE_V8SF:
29024 case V4SF_FTYPE_V4DF:
29025 case V4SF_FTYPE_V8HI:
29026 case V4SF_FTYPE_V2DF:
29027 case V2DI_FTYPE_V2DI:
29028 case V2DI_FTYPE_V16QI:
29029 case V2DI_FTYPE_V8HI:
29030 case V2DI_FTYPE_V4SI:
29031 case V2DF_FTYPE_V2DF:
29032 case V2DF_FTYPE_V4SI:
29033 case V2DF_FTYPE_V4DF:
29034 case V2DF_FTYPE_V4SF:
29035 case V2DF_FTYPE_V2SI:
29036 case V2SI_FTYPE_V2SI:
29037 case V2SI_FTYPE_V4SF:
29038 case V2SI_FTYPE_V2SF:
29039 case V2SI_FTYPE_V2DF:
29040 case V2SF_FTYPE_V2SF:
29041 case V2SF_FTYPE_V2SI:
29042 case V32QI_FTYPE_V32QI:
29043 case V32QI_FTYPE_V16QI:
29044 case V16HI_FTYPE_V16HI:
29045 case V16HI_FTYPE_V8HI:
29046 case V8SI_FTYPE_V8SI:
29047 case V16HI_FTYPE_V16QI:
29048 case V8SI_FTYPE_V16QI:
29049 case V4DI_FTYPE_V16QI:
29050 case V8SI_FTYPE_V8HI:
29051 case V4DI_FTYPE_V8HI:
29052 case V4DI_FTYPE_V4SI:
29053 case V4DI_FTYPE_V2DI:
29054 nargs = 1;
29055 break;
29056 case V4SF_FTYPE_V4SF_VEC_MERGE:
29057 case V2DF_FTYPE_V2DF_VEC_MERGE:
29058 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
29059 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
29060 case V16QI_FTYPE_V16QI_V16QI:
29061 case V16QI_FTYPE_V8HI_V8HI:
29062 case V8QI_FTYPE_V8QI_V8QI:
29063 case V8QI_FTYPE_V4HI_V4HI:
29064 case V8HI_FTYPE_V8HI_V8HI:
29065 case V8HI_FTYPE_V16QI_V16QI:
29066 case V8HI_FTYPE_V4SI_V4SI:
29067 case V8SF_FTYPE_V8SF_V8SF:
29068 case V8SF_FTYPE_V8SF_V8SI:
29069 case V4SI_FTYPE_V4SI_V4SI:
29070 case V4SI_FTYPE_V8HI_V8HI:
29071 case V4SI_FTYPE_V4SF_V4SF:
29072 case V4SI_FTYPE_V2DF_V2DF:
29073 case V4HI_FTYPE_V4HI_V4HI:
29074 case V4HI_FTYPE_V8QI_V8QI:
29075 case V4HI_FTYPE_V2SI_V2SI:
29076 case V4DF_FTYPE_V4DF_V4DF:
29077 case V4DF_FTYPE_V4DF_V4DI:
29078 case V4SF_FTYPE_V4SF_V4SF:
29079 case V4SF_FTYPE_V4SF_V4SI:
29080 case V4SF_FTYPE_V4SF_V2SI:
29081 case V4SF_FTYPE_V4SF_V2DF:
29082 case V4SF_FTYPE_V4SF_DI:
29083 case V4SF_FTYPE_V4SF_SI:
29084 case V2DI_FTYPE_V2DI_V2DI:
29085 case V2DI_FTYPE_V16QI_V16QI:
29086 case V2DI_FTYPE_V4SI_V4SI:
29087 case V2DI_FTYPE_V2DI_V16QI:
29088 case V2DI_FTYPE_V2DF_V2DF:
29089 case V2SI_FTYPE_V2SI_V2SI:
29090 case V2SI_FTYPE_V4HI_V4HI:
29091 case V2SI_FTYPE_V2SF_V2SF:
29092 case V2DF_FTYPE_V2DF_V2DF:
29093 case V2DF_FTYPE_V2DF_V4SF:
29094 case V2DF_FTYPE_V2DF_V2DI:
29095 case V2DF_FTYPE_V2DF_DI:
29096 case V2DF_FTYPE_V2DF_SI:
29097 case V2SF_FTYPE_V2SF_V2SF:
29098 case V1DI_FTYPE_V1DI_V1DI:
29099 case V1DI_FTYPE_V8QI_V8QI:
29100 case V1DI_FTYPE_V2SI_V2SI:
29101 case V32QI_FTYPE_V16HI_V16HI:
29102 case V16HI_FTYPE_V8SI_V8SI:
29103 case V32QI_FTYPE_V32QI_V32QI:
29104 case V16HI_FTYPE_V32QI_V32QI:
29105 case V16HI_FTYPE_V16HI_V16HI:
29106 case V8SI_FTYPE_V4DF_V4DF:
29107 case V8SI_FTYPE_V8SI_V8SI:
29108 case V8SI_FTYPE_V16HI_V16HI:
29109 case V4DI_FTYPE_V4DI_V4DI:
29110 case V4DI_FTYPE_V8SI_V8SI:
29111 if (comparison == UNKNOWN)
29112 return ix86_expand_binop_builtin (icode, exp, target);
29113 nargs = 2;
29114 break;
29115 case V4SF_FTYPE_V4SF_V4SF_SWAP:
29116 case V2DF_FTYPE_V2DF_V2DF_SWAP:
29117 gcc_assert (comparison != UNKNOWN);
29118 nargs = 2;
29119 swap = true;
29120 break;
29121 case V16HI_FTYPE_V16HI_V8HI_COUNT:
29122 case V16HI_FTYPE_V16HI_SI_COUNT:
29123 case V8SI_FTYPE_V8SI_V4SI_COUNT:
29124 case V8SI_FTYPE_V8SI_SI_COUNT:
29125 case V4DI_FTYPE_V4DI_V2DI_COUNT:
29126 case V4DI_FTYPE_V4DI_INT_COUNT:
29127 case V8HI_FTYPE_V8HI_V8HI_COUNT:
29128 case V8HI_FTYPE_V8HI_SI_COUNT:
29129 case V4SI_FTYPE_V4SI_V4SI_COUNT:
29130 case V4SI_FTYPE_V4SI_SI_COUNT:
29131 case V4HI_FTYPE_V4HI_V4HI_COUNT:
29132 case V4HI_FTYPE_V4HI_SI_COUNT:
29133 case V2DI_FTYPE_V2DI_V2DI_COUNT:
29134 case V2DI_FTYPE_V2DI_SI_COUNT:
29135 case V2SI_FTYPE_V2SI_V2SI_COUNT:
29136 case V2SI_FTYPE_V2SI_SI_COUNT:
29137 case V1DI_FTYPE_V1DI_V1DI_COUNT:
29138 case V1DI_FTYPE_V1DI_SI_COUNT:
29139 nargs = 2;
29140 last_arg_count = true;
29141 break;
29142 case UINT64_FTYPE_UINT64_UINT64:
29143 case UINT_FTYPE_UINT_UINT:
29144 case UINT_FTYPE_UINT_USHORT:
29145 case UINT_FTYPE_UINT_UCHAR:
29146 case UINT16_FTYPE_UINT16_INT:
29147 case UINT8_FTYPE_UINT8_INT:
29148 nargs = 2;
29149 break;
29150 case V2DI_FTYPE_V2DI_INT_CONVERT:
29151 nargs = 2;
29152 rmode = V1TImode;
29153 nargs_constant = 1;
29154 break;
29155 case V4DI_FTYPE_V4DI_INT_CONVERT:
29156 nargs = 2;
29157 rmode = V2TImode;
29158 nargs_constant = 1;
29159 break;
29160 case V8HI_FTYPE_V8HI_INT:
29161 case V8HI_FTYPE_V8SF_INT:
29162 case V8HI_FTYPE_V4SF_INT:
29163 case V8SF_FTYPE_V8SF_INT:
29164 case V4SI_FTYPE_V4SI_INT:
29165 case V4SI_FTYPE_V8SI_INT:
29166 case V4HI_FTYPE_V4HI_INT:
29167 case V4DF_FTYPE_V4DF_INT:
29168 case V4SF_FTYPE_V4SF_INT:
29169 case V4SF_FTYPE_V8SF_INT:
29170 case V2DI_FTYPE_V2DI_INT:
29171 case V2DF_FTYPE_V2DF_INT:
29172 case V2DF_FTYPE_V4DF_INT:
29173 case V16HI_FTYPE_V16HI_INT:
29174 case V8SI_FTYPE_V8SI_INT:
29175 case V4DI_FTYPE_V4DI_INT:
29176 case V2DI_FTYPE_V4DI_INT:
29177 nargs = 2;
29178 nargs_constant = 1;
29179 break;
29180 case V16QI_FTYPE_V16QI_V16QI_V16QI:
29181 case V8SF_FTYPE_V8SF_V8SF_V8SF:
29182 case V4DF_FTYPE_V4DF_V4DF_V4DF:
29183 case V4SF_FTYPE_V4SF_V4SF_V4SF:
29184 case V2DF_FTYPE_V2DF_V2DF_V2DF:
29185 case V32QI_FTYPE_V32QI_V32QI_V32QI:
29186 nargs = 3;
29187 break;
29188 case V32QI_FTYPE_V32QI_V32QI_INT:
29189 case V16HI_FTYPE_V16HI_V16HI_INT:
29190 case V16QI_FTYPE_V16QI_V16QI_INT:
29191 case V4DI_FTYPE_V4DI_V4DI_INT:
29192 case V8HI_FTYPE_V8HI_V8HI_INT:
29193 case V8SI_FTYPE_V8SI_V8SI_INT:
29194 case V8SI_FTYPE_V8SI_V4SI_INT:
29195 case V8SF_FTYPE_V8SF_V8SF_INT:
29196 case V8SF_FTYPE_V8SF_V4SF_INT:
29197 case V4SI_FTYPE_V4SI_V4SI_INT:
29198 case V4DF_FTYPE_V4DF_V4DF_INT:
29199 case V4DF_FTYPE_V4DF_V2DF_INT:
29200 case V4SF_FTYPE_V4SF_V4SF_INT:
29201 case V2DI_FTYPE_V2DI_V2DI_INT:
29202 case V4DI_FTYPE_V4DI_V2DI_INT:
29203 case V2DF_FTYPE_V2DF_V2DF_INT:
29204 nargs = 3;
29205 nargs_constant = 1;
29206 break;
29207 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
29208 nargs = 3;
29209 rmode = V4DImode;
29210 nargs_constant = 1;
29211 break;
29212 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
29213 nargs = 3;
29214 rmode = V2DImode;
29215 nargs_constant = 1;
29216 break;
29217 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
29218 nargs = 3;
29219 rmode = DImode;
29220 nargs_constant = 1;
29221 break;
29222 case V2DI_FTYPE_V2DI_UINT_UINT:
29223 nargs = 3;
29224 nargs_constant = 2;
29225 break;
29226 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
29227 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
29228 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
29229 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
29230 nargs = 4;
29231 nargs_constant = 1;
29232 break;
29233 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
29234 nargs = 4;
29235 nargs_constant = 2;
29236 break;
29237 default:
29238 gcc_unreachable ();
29239 }
29240
29241 gcc_assert (nargs <= ARRAY_SIZE (args));
29242
29243 if (comparison != UNKNOWN)
29244 {
29245 gcc_assert (nargs == 2);
29246 return ix86_expand_sse_compare (d, exp, target, swap);
29247 }
29248
29249 if (rmode == VOIDmode || rmode == tmode)
29250 {
29251 if (optimize
29252 || target == 0
29253 || GET_MODE (target) != tmode
29254 || !insn_p->operand[0].predicate (target, tmode))
29255 target = gen_reg_rtx (tmode);
29256 real_target = target;
29257 }
29258 else
29259 {
29260 target = gen_reg_rtx (rmode);
29261 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
29262 }
29263
29264 for (i = 0; i < nargs; i++)
29265 {
29266 tree arg = CALL_EXPR_ARG (exp, i);
29267 rtx op = expand_normal (arg);
29268 enum machine_mode mode = insn_p->operand[i + 1].mode;
29269 bool match = insn_p->operand[i + 1].predicate (op, mode);
29270
29271 if (last_arg_count && (i + 1) == nargs)
29272 {
29273 	      /* SIMD shift insns take either an 8-bit immediate or a register
29274 		 as the count.  But builtin functions take int as the count.
29275 		 If the count doesn't match, we put it in a register.  */
29276 if (!match)
29277 {
29278 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
29279 if (!insn_p->operand[i + 1].predicate (op, mode))
29280 op = copy_to_reg (op);
29281 }
29282 }
29283 else if ((nargs - i) <= nargs_constant)
29284 {
29285 if (!match)
29286 switch (icode)
29287 {
29288 case CODE_FOR_avx2_inserti128:
29289 case CODE_FOR_avx2_extracti128:
29290 		  error ("the last argument must be a 1-bit immediate");
29291 return const0_rtx;
29292
29293 case CODE_FOR_sse4_1_roundsd:
29294 case CODE_FOR_sse4_1_roundss:
29295
29296 case CODE_FOR_sse4_1_roundpd:
29297 case CODE_FOR_sse4_1_roundps:
29298 case CODE_FOR_avx_roundpd256:
29299 case CODE_FOR_avx_roundps256:
29300
29301 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
29302 case CODE_FOR_sse4_1_roundps_sfix:
29303 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
29304 case CODE_FOR_avx_roundps_sfix256:
29305
29306 case CODE_FOR_sse4_1_blendps:
29307 case CODE_FOR_avx_blendpd256:
29308 case CODE_FOR_avx_vpermilv4df:
29309 error ("the last argument must be a 4-bit immediate");
29310 return const0_rtx;
29311
29312 case CODE_FOR_sse4_1_blendpd:
29313 case CODE_FOR_avx_vpermilv2df:
29314 case CODE_FOR_xop_vpermil2v2df3:
29315 case CODE_FOR_xop_vpermil2v4sf3:
29316 case CODE_FOR_xop_vpermil2v4df3:
29317 case CODE_FOR_xop_vpermil2v8sf3:
29318 error ("the last argument must be a 2-bit immediate");
29319 return const0_rtx;
29320
29321 case CODE_FOR_avx_vextractf128v4df:
29322 case CODE_FOR_avx_vextractf128v8sf:
29323 case CODE_FOR_avx_vextractf128v8si:
29324 case CODE_FOR_avx_vinsertf128v4df:
29325 case CODE_FOR_avx_vinsertf128v8sf:
29326 case CODE_FOR_avx_vinsertf128v8si:
29327 error ("the last argument must be a 1-bit immediate");
29328 return const0_rtx;
29329
29330 case CODE_FOR_avx_vmcmpv2df3:
29331 case CODE_FOR_avx_vmcmpv4sf3:
29332 case CODE_FOR_avx_cmpv2df3:
29333 case CODE_FOR_avx_cmpv4sf3:
29334 case CODE_FOR_avx_cmpv4df3:
29335 case CODE_FOR_avx_cmpv8sf3:
29336 error ("the last argument must be a 5-bit immediate");
29337 return const0_rtx;
29338
29339 default:
29340 switch (nargs_constant)
29341 {
29342 case 2:
29343 if ((nargs - i) == nargs_constant)
29344 {
29345 error ("the next to last argument must be an 8-bit immediate");
29346 break;
29347 }
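		      /* FALLTHRU */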
29348 case 1:
29349 error ("the last argument must be an 8-bit immediate");
29350 break;
29351 default:
29352 gcc_unreachable ();
29353 }
29354 return const0_rtx;
29355 }
29356 }
29357 else
29358 {
29359 if (VECTOR_MODE_P (mode))
29360 op = safe_vector_operand (op, mode);
29361
29362 /* If we aren't optimizing, only allow one memory operand to
29363 be generated. */
29364 if (memory_operand (op, mode))
29365 num_memory++;
29366
29367 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
29368 {
29369 if (optimize || !match || num_memory > 1)
29370 op = copy_to_mode_reg (mode, op);
29371 }
29372 else
29373 {
29374 op = copy_to_reg (op);
29375 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
29376 }
29377 }
29378
29379 args[i].op = op;
29380 args[i].mode = mode;
29381 }
29382
29383 switch (nargs)
29384 {
29385 case 1:
29386 pat = GEN_FCN (icode) (real_target, args[0].op);
29387 break;
29388 case 2:
29389 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
29390 break;
29391 case 3:
29392 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
29393 args[2].op);
29394 break;
29395 case 4:
29396 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
29397 args[2].op, args[3].op);
29398 break;
29399 default:
29400 gcc_unreachable ();
29401 }
29402
29403 if (! pat)
29404 return 0;
29405
29406 emit_insn (pat);
29407 return target;
29408 }
29409
29410 /* Subroutine of ix86_expand_builtin to take care of special insns
29411 with variable number of operands. */
29412
29413 static rtx
29414 ix86_expand_special_args_builtin (const struct builtin_description *d,
29415 tree exp, rtx target)
29416 {
29417 tree arg;
29418 rtx pat, op;
29419 unsigned int i, nargs, arg_adjust, memory;
29420 struct
29421 {
29422 rtx op;
29423 enum machine_mode mode;
29424 } args[3];
29425 enum insn_code icode = d->icode;
29426 bool last_arg_constant = false;
29427 const struct insn_data_d *insn_p = &insn_data[icode];
29428 enum machine_mode tmode = insn_p->operand[0].mode;
29429 enum { load, store } klass;
29430
29431 switch ((enum ix86_builtin_func_type) d->flag)
29432 {
29433 case VOID_FTYPE_VOID:
29434 if (icode == CODE_FOR_avx_vzeroupper)
29435 target = GEN_INT (vzeroupper_intrinsic);
29436 emit_insn (GEN_FCN (icode) (target));
29437 return 0;
29438 case VOID_FTYPE_UINT64:
29439 case VOID_FTYPE_UNSIGNED:
29440 nargs = 0;
29441 klass = store;
29442 memory = 0;
29443 break;
29444
29445 case INT_FTYPE_VOID:
29446 case UINT64_FTYPE_VOID:
29447 case UNSIGNED_FTYPE_VOID:
29448 nargs = 0;
29449 klass = load;
29450 memory = 0;
29451 break;
29452 case UINT64_FTYPE_PUNSIGNED:
29453 case V2DI_FTYPE_PV2DI:
29454 case V4DI_FTYPE_PV4DI:
29455 case V32QI_FTYPE_PCCHAR:
29456 case V16QI_FTYPE_PCCHAR:
29457 case V8SF_FTYPE_PCV4SF:
29458 case V8SF_FTYPE_PCFLOAT:
29459 case V4SF_FTYPE_PCFLOAT:
29460 case V4DF_FTYPE_PCV2DF:
29461 case V4DF_FTYPE_PCDOUBLE:
29462 case V2DF_FTYPE_PCDOUBLE:
29463 case VOID_FTYPE_PVOID:
29464 nargs = 1;
29465 klass = load;
29466 memory = 0;
29467 break;
29468 case VOID_FTYPE_PV2SF_V4SF:
29469 case VOID_FTYPE_PV4DI_V4DI:
29470 case VOID_FTYPE_PV2DI_V2DI:
29471 case VOID_FTYPE_PCHAR_V32QI:
29472 case VOID_FTYPE_PCHAR_V16QI:
29473 case VOID_FTYPE_PFLOAT_V8SF:
29474 case VOID_FTYPE_PFLOAT_V4SF:
29475 case VOID_FTYPE_PDOUBLE_V4DF:
29476 case VOID_FTYPE_PDOUBLE_V2DF:
29477 case VOID_FTYPE_PLONGLONG_LONGLONG:
29478 case VOID_FTYPE_PULONGLONG_ULONGLONG:
29479 case VOID_FTYPE_PINT_INT:
29480 nargs = 1;
29481 klass = store;
29482 /* Reserve memory operand for target. */
29483 memory = ARRAY_SIZE (args);
29484 break;
29485 case V4SF_FTYPE_V4SF_PCV2SF:
29486 case V2DF_FTYPE_V2DF_PCDOUBLE:
29487 nargs = 2;
29488 klass = load;
29489 memory = 1;
29490 break;
29491 case V8SF_FTYPE_PCV8SF_V8SI:
29492 case V4DF_FTYPE_PCV4DF_V4DI:
29493 case V4SF_FTYPE_PCV4SF_V4SI:
29494 case V2DF_FTYPE_PCV2DF_V2DI:
29495 case V8SI_FTYPE_PCV8SI_V8SI:
29496 case V4DI_FTYPE_PCV4DI_V4DI:
29497 case V4SI_FTYPE_PCV4SI_V4SI:
29498 case V2DI_FTYPE_PCV2DI_V2DI:
29499 nargs = 2;
29500 klass = load;
29501 memory = 0;
29502 break;
29503 case VOID_FTYPE_PV8SF_V8SI_V8SF:
29504 case VOID_FTYPE_PV4DF_V4DI_V4DF:
29505 case VOID_FTYPE_PV4SF_V4SI_V4SF:
29506 case VOID_FTYPE_PV2DF_V2DI_V2DF:
29507 case VOID_FTYPE_PV8SI_V8SI_V8SI:
29508 case VOID_FTYPE_PV4DI_V4DI_V4DI:
29509 case VOID_FTYPE_PV4SI_V4SI_V4SI:
29510 case VOID_FTYPE_PV2DI_V2DI_V2DI:
29511 nargs = 2;
29512 klass = store;
29513 /* Reserve memory operand for target. */
29514 memory = ARRAY_SIZE (args);
29515 break;
29516 case VOID_FTYPE_UINT_UINT_UINT:
29517 case VOID_FTYPE_UINT64_UINT_UINT:
29518 case UCHAR_FTYPE_UINT_UINT_UINT:
29519 case UCHAR_FTYPE_UINT64_UINT_UINT:
29520 nargs = 3;
29521 klass = load;
29522 memory = ARRAY_SIZE (args);
29523 last_arg_constant = true;
29524 break;
29525 default:
29526 gcc_unreachable ();
29527 }
29528
29529 gcc_assert (nargs <= ARRAY_SIZE (args));
29530
29531 if (klass == store)
29532 {
29533 arg = CALL_EXPR_ARG (exp, 0);
29534 op = expand_normal (arg);
29535 gcc_assert (target == 0);
29536 if (memory)
29537 {
29538 if (GET_MODE (op) != Pmode)
29539 op = convert_to_mode (Pmode, op, 1);
29540 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
29541 }
29542 else
29543 target = force_reg (tmode, op);
29544 arg_adjust = 1;
29545 }
29546 else
29547 {
29548 arg_adjust = 0;
29549 if (optimize
29550 || target == 0
29551 || !register_operand (target, tmode)
29552 || GET_MODE (target) != tmode)
29553 target = gen_reg_rtx (tmode);
29554 }
29555
29556 for (i = 0; i < nargs; i++)
29557 {
29558 enum machine_mode mode = insn_p->operand[i + 1].mode;
29559 bool match;
29560
29561 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
29562 op = expand_normal (arg);
29563 match = insn_p->operand[i + 1].predicate (op, mode);
29564
29565 if (last_arg_constant && (i + 1) == nargs)
29566 {
29567 if (!match)
29568 {
29569 if (icode == CODE_FOR_lwp_lwpvalsi3
29570 || icode == CODE_FOR_lwp_lwpinssi3
29571 || icode == CODE_FOR_lwp_lwpvaldi3
29572 || icode == CODE_FOR_lwp_lwpinsdi3)
29573 error ("the last argument must be a 32-bit immediate");
29574 else
29575 error ("the last argument must be an 8-bit immediate");
29576 return const0_rtx;
29577 }
29578 }
29579 else
29580 {
29581 if (i == memory)
29582 {
29583 /* This must be the memory operand. */
29584 if (GET_MODE (op) != Pmode)
29585 op = convert_to_mode (Pmode, op, 1);
29586 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
29587 gcc_assert (GET_MODE (op) == mode
29588 || GET_MODE (op) == VOIDmode);
29589 }
29590 else
29591 {
29592 	      /* This must be a register.  */
29593 if (VECTOR_MODE_P (mode))
29594 op = safe_vector_operand (op, mode);
29595
29596 gcc_assert (GET_MODE (op) == mode
29597 || GET_MODE (op) == VOIDmode);
29598 op = copy_to_mode_reg (mode, op);
29599 }
29600 }
29601
29602 args[i].op = op;
29603 args[i].mode = mode;
29604 }
29605
29606 switch (nargs)
29607 {
29608 case 0:
29609 pat = GEN_FCN (icode) (target);
29610 break;
29611 case 1:
29612 pat = GEN_FCN (icode) (target, args[0].op);
29613 break;
29614 case 2:
29615 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
29616 break;
29617 case 3:
29618 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
29619 break;
29620 default:
29621 gcc_unreachable ();
29622 }
29623
29624 if (! pat)
29625 return 0;
29626 emit_insn (pat);
29627 return klass == store ? 0 : target;
29628 }
29629
29630 /* Return the integer constant in ARG. Constrain it to be in the range
29631 of the subparts of VEC_TYPE; issue an error if not. */
29632
29633 static int
29634 get_element_number (tree vec_type, tree arg)
29635 {
29636 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
29637
29638 if (!host_integerp (arg, 1)
29639 || (elt = tree_low_cst (arg, 1), elt > max))
29640 {
29641 error ("selector must be an integer constant in the range 0..%wi", max);
29642 return 0;
29643 }
29644
29645 return elt;
29646 }
29647
29648 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29649 ix86_expand_vector_init. We DO have language-level syntax for this, in
29650 the form of (type){ init-list }. Except that since we can't place emms
29651 instructions from inside the compiler, we can't allow the use of MMX
29652 registers unless the user explicitly asks for it. So we do *not* define
29653 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
29654    we have builtins invoked by mmintrin.h that give us license to emit
29655 these sorts of instructions. */
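
/* An illustrative sketch of how these builtins are reached (the mmintrin.h
   wrapper shown here is assumed for illustration, not quoted from this file):

     __m64 _mm_set_pi32 (int __i1, int __i0)
     {
       return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
     }

   The __builtin_ia32_vec_init_v2si call reaches ix86_expand_builtin as
   IX86_BUILTIN_VEC_INIT_V2SI and is expanded by the routine below.  */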
29656
29657 static rtx
29658 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
29659 {
29660 enum machine_mode tmode = TYPE_MODE (type);
29661 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
29662 int i, n_elt = GET_MODE_NUNITS (tmode);
29663 rtvec v = rtvec_alloc (n_elt);
29664
29665 gcc_assert (VECTOR_MODE_P (tmode));
29666 gcc_assert (call_expr_nargs (exp) == n_elt);
29667
29668 for (i = 0; i < n_elt; ++i)
29669 {
29670 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
29671 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
29672 }
29673
29674 if (!target || !register_operand (target, tmode))
29675 target = gen_reg_rtx (tmode);
29676
29677 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
29678 return target;
29679 }
29680
29681 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29682 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
29683 had a language-level syntax for referencing vector elements. */
29684
29685 static rtx
29686 ix86_expand_vec_ext_builtin (tree exp, rtx target)
29687 {
29688 enum machine_mode tmode, mode0;
29689 tree arg0, arg1;
29690 int elt;
29691 rtx op0;
29692
29693 arg0 = CALL_EXPR_ARG (exp, 0);
29694 arg1 = CALL_EXPR_ARG (exp, 1);
29695
29696 op0 = expand_normal (arg0);
29697 elt = get_element_number (TREE_TYPE (arg0), arg1);
29698
29699 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29700 mode0 = TYPE_MODE (TREE_TYPE (arg0));
29701 gcc_assert (VECTOR_MODE_P (mode0));
29702
29703 op0 = force_reg (mode0, op0);
29704
29705 if (optimize || !target || !register_operand (target, tmode))
29706 target = gen_reg_rtx (tmode);
29707
29708 ix86_expand_vector_extract (true, target, op0, elt);
29709
29710 return target;
29711 }
29712
29713 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29714 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
29715 a language-level syntax for referencing vector elements. */
29716
29717 static rtx
29718 ix86_expand_vec_set_builtin (tree exp)
29719 {
29720 enum machine_mode tmode, mode1;
29721 tree arg0, arg1, arg2;
29722 int elt;
29723 rtx op0, op1, target;
29724
29725 arg0 = CALL_EXPR_ARG (exp, 0);
29726 arg1 = CALL_EXPR_ARG (exp, 1);
29727 arg2 = CALL_EXPR_ARG (exp, 2);
29728
29729 tmode = TYPE_MODE (TREE_TYPE (arg0));
29730 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29731 gcc_assert (VECTOR_MODE_P (tmode));
29732
29733 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
29734 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
29735 elt = get_element_number (TREE_TYPE (arg0), arg2);
29736
29737 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
29738 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
29739
29740 op0 = force_reg (tmode, op0);
29741 op1 = force_reg (mode1, op1);
29742
29743 /* OP0 is the source of these builtin functions and shouldn't be
29744 modified. Create a copy, use it and return it as target. */
29745 target = gen_reg_rtx (tmode);
29746 emit_move_insn (target, op0);
29747 ix86_expand_vector_set (true, target, op1, elt);
29748
29749 return target;
29750 }
29751
29752 /* Expand an expression EXP that calls a built-in function,
29753 with result going to TARGET if that's convenient
29754 (and in mode MODE if that's convenient).
29755 SUBTARGET may be used as the target for computing one of EXP's operands.
29756 IGNORE is nonzero if the value is to be ignored. */
29757
29758 static rtx
29759 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
29760 enum machine_mode mode ATTRIBUTE_UNUSED,
29761 int ignore ATTRIBUTE_UNUSED)
29762 {
29763 const struct builtin_description *d;
29764 size_t i;
29765 enum insn_code icode;
29766 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
29767 tree arg0, arg1, arg2, arg3, arg4;
29768 rtx op0, op1, op2, op3, op4, pat;
29769 enum machine_mode mode0, mode1, mode2, mode3, mode4;
29770 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
29771
29772 /* For CPU builtins that can be folded, fold first and expand the fold. */
29773 switch (fcode)
29774 {
29775 case IX86_BUILTIN_CPU_INIT:
29776 {
29777 /* Make it call __cpu_indicator_init in libgcc. */
29778 tree call_expr, fndecl, type;
29779 type = build_function_type_list (integer_type_node, NULL_TREE);
29780 fndecl = build_fn_decl ("__cpu_indicator_init", type);
29781 call_expr = build_call_expr (fndecl, 0);
29782 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
29783 }
29784 case IX86_BUILTIN_CPU_IS:
29785 case IX86_BUILTIN_CPU_SUPPORTS:
29786 {
29787 tree arg0 = CALL_EXPR_ARG (exp, 0);
29788 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
29789 gcc_assert (fold_expr != NULL_TREE);
29790 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
29791 }
29792 }
29793
29794 /* Determine whether the builtin function is available under the current ISA.
29795 Originally the builtin was not created if it wasn't applicable to the
29796 current ISA based on the command line switches. With function specific
29797 options, we need to check in the context of the function making the call
29798 whether it is supported. */
29799 if (ix86_builtins_isa[fcode].isa
29800 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
29801 {
29802 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
29803 NULL, (enum fpmath_unit) 0, false);
29804
29805 if (!opts)
29806 error ("%qE needs unknown isa option", fndecl);
29807 else
29808 {
29809 gcc_assert (opts != NULL);
29810 error ("%qE needs isa option %s", fndecl, opts);
29811 free (opts);
29812 }
29813 return const0_rtx;
29814 }
29815
29816 switch (fcode)
29817 {
29818 case IX86_BUILTIN_MASKMOVQ:
29819 case IX86_BUILTIN_MASKMOVDQU:
29820 icode = (fcode == IX86_BUILTIN_MASKMOVQ
29821 ? CODE_FOR_mmx_maskmovq
29822 : CODE_FOR_sse2_maskmovdqu);
29823 /* Note the arg order is different from the operand order. */
29824 arg1 = CALL_EXPR_ARG (exp, 0);
29825 arg2 = CALL_EXPR_ARG (exp, 1);
29826 arg0 = CALL_EXPR_ARG (exp, 2);
29827 op0 = expand_normal (arg0);
29828 op1 = expand_normal (arg1);
29829 op2 = expand_normal (arg2);
29830 mode0 = insn_data[icode].operand[0].mode;
29831 mode1 = insn_data[icode].operand[1].mode;
29832 mode2 = insn_data[icode].operand[2].mode;
29833
29834 if (GET_MODE (op0) != Pmode)
29835 op0 = convert_to_mode (Pmode, op0, 1);
29836 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
29837
29838 if (!insn_data[icode].operand[0].predicate (op0, mode0))
29839 op0 = copy_to_mode_reg (mode0, op0);
29840 if (!insn_data[icode].operand[1].predicate (op1, mode1))
29841 op1 = copy_to_mode_reg (mode1, op1);
29842 if (!insn_data[icode].operand[2].predicate (op2, mode2))
29843 op2 = copy_to_mode_reg (mode2, op2);
29844 pat = GEN_FCN (icode) (op0, op1, op2);
29845 if (! pat)
29846 return 0;
29847 emit_insn (pat);
29848 return 0;
29849
29850 case IX86_BUILTIN_LDMXCSR:
29851 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
29852 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29853 emit_move_insn (target, op0);
29854 emit_insn (gen_sse_ldmxcsr (target));
29855 return 0;
29856
29857 case IX86_BUILTIN_STMXCSR:
29858 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29859 emit_insn (gen_sse_stmxcsr (target));
29860 return copy_to_mode_reg (SImode, target);
29861
29862 case IX86_BUILTIN_CLFLUSH:
29863 arg0 = CALL_EXPR_ARG (exp, 0);
29864 op0 = expand_normal (arg0);
29865 icode = CODE_FOR_sse2_clflush;
29866 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29867 {
29868 if (GET_MODE (op0) != Pmode)
29869 op0 = convert_to_mode (Pmode, op0, 1);
29870 op0 = force_reg (Pmode, op0);
29871 }
29872
29873 emit_insn (gen_sse2_clflush (op0));
29874 return 0;
29875
29876 case IX86_BUILTIN_MONITOR:
29877 arg0 = CALL_EXPR_ARG (exp, 0);
29878 arg1 = CALL_EXPR_ARG (exp, 1);
29879 arg2 = CALL_EXPR_ARG (exp, 2);
29880 op0 = expand_normal (arg0);
29881 op1 = expand_normal (arg1);
29882 op2 = expand_normal (arg2);
29883 if (!REG_P (op0))
29884 {
29885 if (GET_MODE (op0) != Pmode)
29886 op0 = convert_to_mode (Pmode, op0, 1);
29887 op0 = force_reg (Pmode, op0);
29888 }
29889 if (!REG_P (op1))
29890 op1 = copy_to_mode_reg (SImode, op1);
29891 if (!REG_P (op2))
29892 op2 = copy_to_mode_reg (SImode, op2);
29893 emit_insn (ix86_gen_monitor (op0, op1, op2));
29894 return 0;
29895
29896 case IX86_BUILTIN_MWAIT:
29897 arg0 = CALL_EXPR_ARG (exp, 0);
29898 arg1 = CALL_EXPR_ARG (exp, 1);
29899 op0 = expand_normal (arg0);
29900 op1 = expand_normal (arg1);
29901 if (!REG_P (op0))
29902 op0 = copy_to_mode_reg (SImode, op0);
29903 if (!REG_P (op1))
29904 op1 = copy_to_mode_reg (SImode, op1);
29905 emit_insn (gen_sse3_mwait (op0, op1));
29906 return 0;
29907
29908 case IX86_BUILTIN_VEC_INIT_V2SI:
29909 case IX86_BUILTIN_VEC_INIT_V4HI:
29910 case IX86_BUILTIN_VEC_INIT_V8QI:
29911 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
29912
29913 case IX86_BUILTIN_VEC_EXT_V2DF:
29914 case IX86_BUILTIN_VEC_EXT_V2DI:
29915 case IX86_BUILTIN_VEC_EXT_V4SF:
29916 case IX86_BUILTIN_VEC_EXT_V4SI:
29917 case IX86_BUILTIN_VEC_EXT_V8HI:
29918 case IX86_BUILTIN_VEC_EXT_V2SI:
29919 case IX86_BUILTIN_VEC_EXT_V4HI:
29920 case IX86_BUILTIN_VEC_EXT_V16QI:
29921 return ix86_expand_vec_ext_builtin (exp, target);
29922
29923 case IX86_BUILTIN_VEC_SET_V2DI:
29924 case IX86_BUILTIN_VEC_SET_V4SF:
29925 case IX86_BUILTIN_VEC_SET_V4SI:
29926 case IX86_BUILTIN_VEC_SET_V8HI:
29927 case IX86_BUILTIN_VEC_SET_V4HI:
29928 case IX86_BUILTIN_VEC_SET_V16QI:
29929 return ix86_expand_vec_set_builtin (exp);
29930
29931 case IX86_BUILTIN_INFQ:
29932 case IX86_BUILTIN_HUGE_VALQ:
29933 {
29934 REAL_VALUE_TYPE inf;
29935 rtx tmp;
29936
29937 real_inf (&inf);
29938 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
29939
29940 tmp = validize_mem (force_const_mem (mode, tmp));
29941
29942 if (target == 0)
29943 target = gen_reg_rtx (mode);
29944
29945 emit_move_insn (target, tmp);
29946 return target;
29947 }
29948
29949 case IX86_BUILTIN_LLWPCB:
29950 arg0 = CALL_EXPR_ARG (exp, 0);
29951 op0 = expand_normal (arg0);
29952 icode = CODE_FOR_lwp_llwpcb;
29953 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29954 {
29955 if (GET_MODE (op0) != Pmode)
29956 op0 = convert_to_mode (Pmode, op0, 1);
29957 op0 = force_reg (Pmode, op0);
29958 }
29959 emit_insn (gen_lwp_llwpcb (op0));
29960 return 0;
29961
29962 case IX86_BUILTIN_SLWPCB:
29963 icode = CODE_FOR_lwp_slwpcb;
29964 if (!target
29965 || !insn_data[icode].operand[0].predicate (target, Pmode))
29966 target = gen_reg_rtx (Pmode);
29967 emit_insn (gen_lwp_slwpcb (target));
29968 return target;
29969
29970 case IX86_BUILTIN_BEXTRI32:
29971 case IX86_BUILTIN_BEXTRI64:
29972 arg0 = CALL_EXPR_ARG (exp, 0);
29973 arg1 = CALL_EXPR_ARG (exp, 1);
29974 op0 = expand_normal (arg0);
29975 op1 = expand_normal (arg1);
29976 icode = (fcode == IX86_BUILTIN_BEXTRI32
29977 ? CODE_FOR_tbm_bextri_si
29978 : CODE_FOR_tbm_bextri_di);
29979 if (!CONST_INT_P (op1))
29980 {
29981 error ("last argument must be an immediate");
29982 return const0_rtx;
29983 }
29984 else
29985 {
29986 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
29987 unsigned char lsb_index = INTVAL (op1) & 0xFF;
29988 op1 = GEN_INT (length);
29989 op2 = GEN_INT (lsb_index);
29990 pat = GEN_FCN (icode) (target, op0, op1, op2);
29991 if (pat)
29992 emit_insn (pat);
29993 return target;
29994 }
29995
29996 case IX86_BUILTIN_RDRAND16_STEP:
29997 icode = CODE_FOR_rdrandhi_1;
29998 mode0 = HImode;
29999 goto rdrand_step;
30000
30001 case IX86_BUILTIN_RDRAND32_STEP:
30002 icode = CODE_FOR_rdrandsi_1;
30003 mode0 = SImode;
30004 goto rdrand_step;
30005
30006 case IX86_BUILTIN_RDRAND64_STEP:
30007 icode = CODE_FOR_rdranddi_1;
30008 mode0 = DImode;
30009
30010 rdrand_step:
30011 op0 = gen_reg_rtx (mode0);
30012 emit_insn (GEN_FCN (icode) (op0));
30013
30014 arg0 = CALL_EXPR_ARG (exp, 0);
30015 op1 = expand_normal (arg0);
30016 if (!address_operand (op1, VOIDmode))
30017 {
30018 op1 = convert_memory_address (Pmode, op1);
30019 op1 = copy_addr_to_reg (op1);
30020 }
30021 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
30022
30023 op1 = gen_reg_rtx (SImode);
30024 emit_move_insn (op1, CONST1_RTX (SImode));
30025
30026 /* Emit SImode conditional move. */
30027 if (mode0 == HImode)
30028 {
30029 op2 = gen_reg_rtx (SImode);
30030 emit_insn (gen_zero_extendhisi2 (op2, op0));
30031 }
30032 else if (mode0 == SImode)
30033 op2 = op0;
30034 else
30035 op2 = gen_rtx_SUBREG (SImode, op0, 0);
30036
30037 if (target == 0)
30038 target = gen_reg_rtx (SImode);
30039
30040 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
30041 const0_rtx);
30042 emit_insn (gen_rtx_SET (VOIDmode, target,
30043 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
30044 return target;
30045
30046 case IX86_BUILTIN_GATHERSIV2DF:
30047 icode = CODE_FOR_avx2_gathersiv2df;
30048 goto gather_gen;
30049 case IX86_BUILTIN_GATHERSIV4DF:
30050 icode = CODE_FOR_avx2_gathersiv4df;
30051 goto gather_gen;
30052 case IX86_BUILTIN_GATHERDIV2DF:
30053 icode = CODE_FOR_avx2_gatherdiv2df;
30054 goto gather_gen;
30055 case IX86_BUILTIN_GATHERDIV4DF:
30056 icode = CODE_FOR_avx2_gatherdiv4df;
30057 goto gather_gen;
30058 case IX86_BUILTIN_GATHERSIV4SF:
30059 icode = CODE_FOR_avx2_gathersiv4sf;
30060 goto gather_gen;
30061 case IX86_BUILTIN_GATHERSIV8SF:
30062 icode = CODE_FOR_avx2_gathersiv8sf;
30063 goto gather_gen;
30064 case IX86_BUILTIN_GATHERDIV4SF:
30065 icode = CODE_FOR_avx2_gatherdiv4sf;
30066 goto gather_gen;
30067 case IX86_BUILTIN_GATHERDIV8SF:
30068 icode = CODE_FOR_avx2_gatherdiv8sf;
30069 goto gather_gen;
30070 case IX86_BUILTIN_GATHERSIV2DI:
30071 icode = CODE_FOR_avx2_gathersiv2di;
30072 goto gather_gen;
30073 case IX86_BUILTIN_GATHERSIV4DI:
30074 icode = CODE_FOR_avx2_gathersiv4di;
30075 goto gather_gen;
30076 case IX86_BUILTIN_GATHERDIV2DI:
30077 icode = CODE_FOR_avx2_gatherdiv2di;
30078 goto gather_gen;
30079 case IX86_BUILTIN_GATHERDIV4DI:
30080 icode = CODE_FOR_avx2_gatherdiv4di;
30081 goto gather_gen;
30082 case IX86_BUILTIN_GATHERSIV4SI:
30083 icode = CODE_FOR_avx2_gathersiv4si;
30084 goto gather_gen;
30085 case IX86_BUILTIN_GATHERSIV8SI:
30086 icode = CODE_FOR_avx2_gathersiv8si;
30087 goto gather_gen;
30088 case IX86_BUILTIN_GATHERDIV4SI:
30089 icode = CODE_FOR_avx2_gatherdiv4si;
30090 goto gather_gen;
30091 case IX86_BUILTIN_GATHERDIV8SI:
30092 icode = CODE_FOR_avx2_gatherdiv8si;
30093 goto gather_gen;
30094 case IX86_BUILTIN_GATHERALTSIV4DF:
30095 icode = CODE_FOR_avx2_gathersiv4df;
30096 goto gather_gen;
30097 case IX86_BUILTIN_GATHERALTDIV8SF:
30098 icode = CODE_FOR_avx2_gatherdiv8sf;
30099 goto gather_gen;
30100 case IX86_BUILTIN_GATHERALTSIV4DI:
30101 icode = CODE_FOR_avx2_gathersiv4di;
30102 goto gather_gen;
30103 case IX86_BUILTIN_GATHERALTDIV8SI:
30104 icode = CODE_FOR_avx2_gatherdiv8si;
30105 goto gather_gen;
30106
30107 gather_gen:
30108 arg0 = CALL_EXPR_ARG (exp, 0);
30109 arg1 = CALL_EXPR_ARG (exp, 1);
30110 arg2 = CALL_EXPR_ARG (exp, 2);
30111 arg3 = CALL_EXPR_ARG (exp, 3);
30112 arg4 = CALL_EXPR_ARG (exp, 4);
30113 op0 = expand_normal (arg0);
30114 op1 = expand_normal (arg1);
30115 op2 = expand_normal (arg2);
30116 op3 = expand_normal (arg3);
30117 op4 = expand_normal (arg4);
30118 /* Note the arg order is different from the operand order. */
30119 mode0 = insn_data[icode].operand[1].mode;
30120 mode2 = insn_data[icode].operand[3].mode;
30121 mode3 = insn_data[icode].operand[4].mode;
30122 mode4 = insn_data[icode].operand[5].mode;
30123
30124 if (target == NULL_RTX
30125 || GET_MODE (target) != insn_data[icode].operand[0].mode)
30126 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
30127 else
30128 subtarget = target;
30129
30130 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
30131 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
30132 {
30133 rtx half = gen_reg_rtx (V4SImode);
30134 if (!nonimmediate_operand (op2, V8SImode))
30135 op2 = copy_to_mode_reg (V8SImode, op2);
30136 emit_insn (gen_vec_extract_lo_v8si (half, op2));
30137 op2 = half;
30138 }
30139 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
30140 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
30141 {
30142 rtx (*gen) (rtx, rtx);
30143 rtx half = gen_reg_rtx (mode0);
30144 if (mode0 == V4SFmode)
30145 gen = gen_vec_extract_lo_v8sf;
30146 else
30147 gen = gen_vec_extract_lo_v8si;
30148 if (!nonimmediate_operand (op0, GET_MODE (op0)))
30149 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
30150 emit_insn (gen (half, op0));
30151 op0 = half;
30152 if (!nonimmediate_operand (op3, GET_MODE (op3)))
30153 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
30154 emit_insn (gen (half, op3));
30155 op3 = half;
30156 }
30157
30158       /* Force the memory operand to use only a base register here.  But we
30159 	 don't want to do this for the memory operands of other builtin
30160 	 functions.  */
30161 if (GET_MODE (op1) != Pmode)
30162 op1 = convert_to_mode (Pmode, op1, 1);
30163 op1 = force_reg (Pmode, op1);
30164
30165 if (!insn_data[icode].operand[1].predicate (op0, mode0))
30166 op0 = copy_to_mode_reg (mode0, op0);
30167 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
30168 op1 = copy_to_mode_reg (Pmode, op1);
30169 if (!insn_data[icode].operand[3].predicate (op2, mode2))
30170 op2 = copy_to_mode_reg (mode2, op2);
30171 if (!insn_data[icode].operand[4].predicate (op3, mode3))
30172 op3 = copy_to_mode_reg (mode3, op3);
30173 if (!insn_data[icode].operand[5].predicate (op4, mode4))
30174 {
30175 error ("last argument must be scale 1, 2, 4, 8");
30176 return const0_rtx;
30177 }
30178
30179 /* Optimize. If mask is known to have all high bits set,
30180 replace op0 with pc_rtx to signal that the instruction
30181 overwrites the whole destination and doesn't use its
30182 previous contents. */
30183 if (optimize)
30184 {
30185 if (TREE_CODE (arg3) == VECTOR_CST)
30186 {
30187 unsigned int negative = 0;
30188 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
30189 {
30190 tree cst = VECTOR_CST_ELT (arg3, i);
30191 if (TREE_CODE (cst) == INTEGER_CST
30192 && tree_int_cst_sign_bit (cst))
30193 negative++;
30194 else if (TREE_CODE (cst) == REAL_CST
30195 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
30196 negative++;
30197 }
30198 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
30199 op0 = pc_rtx;
30200 }
30201 else if (TREE_CODE (arg3) == SSA_NAME)
30202 {
30203 /* Recognize also when mask is like:
30204 __v2df src = _mm_setzero_pd ();
30205 __v2df mask = _mm_cmpeq_pd (src, src);
30206 or
30207 __v8sf src = _mm256_setzero_ps ();
30208 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
30209 as that is a cheaper way to load all ones into
30210 a register than having to load a constant from
30211 memory. */
30212 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
30213 if (is_gimple_call (def_stmt))
30214 {
30215 tree fndecl = gimple_call_fndecl (def_stmt);
30216 if (fndecl
30217 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
30218 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
30219 {
30220 case IX86_BUILTIN_CMPPD:
30221 case IX86_BUILTIN_CMPPS:
30222 case IX86_BUILTIN_CMPPD256:
30223 case IX86_BUILTIN_CMPPS256:
30224 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
30225 break;
30226 /* FALLTHRU */
30227 case IX86_BUILTIN_CMPEQPD:
30228 case IX86_BUILTIN_CMPEQPS:
30229 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
30230 && initializer_zerop (gimple_call_arg (def_stmt,
30231 1)))
30232 op0 = pc_rtx;
30233 break;
30234 default:
30235 break;
30236 }
30237 }
30238 }
30239 }
30240
30241 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
30242 if (! pat)
30243 return const0_rtx;
30244 emit_insn (pat);
30245
30246 if (fcode == IX86_BUILTIN_GATHERDIV8SF
30247 || fcode == IX86_BUILTIN_GATHERDIV8SI)
30248 {
30249 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
30250 ? V4SFmode : V4SImode;
30251 if (target == NULL_RTX)
30252 target = gen_reg_rtx (tmode);
30253 if (tmode == V4SFmode)
30254 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
30255 else
30256 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
30257 }
30258 else
30259 target = subtarget;
30260
30261 return target;
30262
30263 case IX86_BUILTIN_XABORT:
30264 icode = CODE_FOR_xabort;
30265 arg0 = CALL_EXPR_ARG (exp, 0);
30266 op0 = expand_normal (arg0);
30267 mode0 = insn_data[icode].operand[0].mode;
30268 if (!insn_data[icode].operand[0].predicate (op0, mode0))
30269 {
30270 error ("the xabort's argument must be an 8-bit immediate");
30271 return const0_rtx;
30272 }
30273 emit_insn (gen_xabort (op0));
30274 return 0;
30275
30276 default:
30277 break;
30278 }
30279
30280 for (i = 0, d = bdesc_special_args;
30281 i < ARRAY_SIZE (bdesc_special_args);
30282 i++, d++)
30283 if (d->code == fcode)
30284 return ix86_expand_special_args_builtin (d, exp, target);
30285
30286 for (i = 0, d = bdesc_args;
30287 i < ARRAY_SIZE (bdesc_args);
30288 i++, d++)
30289 if (d->code == fcode)
30290 switch (fcode)
30291 {
30292 case IX86_BUILTIN_FABSQ:
30293 case IX86_BUILTIN_COPYSIGNQ:
30294 if (!TARGET_SSE)
30295 /* Emit a normal call if SSE isn't available. */
30296 return expand_call (exp, target, ignore);
30297 default:
30298 return ix86_expand_args_builtin (d, exp, target);
30299 }
30300
30301 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
30302 if (d->code == fcode)
30303 return ix86_expand_sse_comi (d, exp, target);
30304
30305 for (i = 0, d = bdesc_pcmpestr;
30306 i < ARRAY_SIZE (bdesc_pcmpestr);
30307 i++, d++)
30308 if (d->code == fcode)
30309 return ix86_expand_sse_pcmpestr (d, exp, target);
30310
30311 for (i = 0, d = bdesc_pcmpistr;
30312 i < ARRAY_SIZE (bdesc_pcmpistr);
30313 i++, d++)
30314 if (d->code == fcode)
30315 return ix86_expand_sse_pcmpistr (d, exp, target);
30316
30317 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
30318 if (d->code == fcode)
30319 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
30320 (enum ix86_builtin_func_type)
30321 d->flag, d->comparison);
30322
30323 gcc_unreachable ();
30324 }
30325
30326 /* Returns a function decl for a vectorized version of the builtin function
30327 with builtin function code FN and the result vector type TYPE, or NULL_TREE
30328 if it is not available. */
30329
30330 static tree
30331 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
30332 tree type_in)
30333 {
30334 enum machine_mode in_mode, out_mode;
30335 int in_n, out_n;
30336 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
30337
30338 if (TREE_CODE (type_out) != VECTOR_TYPE
30339 || TREE_CODE (type_in) != VECTOR_TYPE
30340 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
30341 return NULL_TREE;
30342
30343 out_mode = TYPE_MODE (TREE_TYPE (type_out));
30344 out_n = TYPE_VECTOR_SUBPARTS (type_out);
30345 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30346 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30347
30348 switch (fn)
30349 {
30350 case BUILT_IN_SQRT:
30351 if (out_mode == DFmode && in_mode == DFmode)
30352 {
30353 if (out_n == 2 && in_n == 2)
30354 return ix86_builtins[IX86_BUILTIN_SQRTPD];
30355 else if (out_n == 4 && in_n == 4)
30356 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
30357 }
30358 break;
30359
30360 case BUILT_IN_SQRTF:
30361 if (out_mode == SFmode && in_mode == SFmode)
30362 {
30363 if (out_n == 4 && in_n == 4)
30364 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
30365 else if (out_n == 8 && in_n == 8)
30366 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
30367 }
30368 break;
30369
30370 case BUILT_IN_IFLOOR:
30371 case BUILT_IN_LFLOOR:
30372 case BUILT_IN_LLFLOOR:
30373 /* The round insn does not trap on denormals. */
30374 if (flag_trapping_math || !TARGET_ROUND)
30375 break;
30376
30377 if (out_mode == SImode && in_mode == DFmode)
30378 {
30379 if (out_n == 4 && in_n == 2)
30380 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
30381 else if (out_n == 8 && in_n == 4)
30382 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
30383 }
30384 break;
30385
30386 case BUILT_IN_IFLOORF:
30387 case BUILT_IN_LFLOORF:
30388 case BUILT_IN_LLFLOORF:
30389 /* The round insn does not trap on denormals. */
30390 if (flag_trapping_math || !TARGET_ROUND)
30391 break;
30392
30393 if (out_mode == SImode && in_mode == SFmode)
30394 {
30395 if (out_n == 4 && in_n == 4)
30396 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
30397 else if (out_n == 8 && in_n == 8)
30398 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
30399 }
30400 break;
30401
30402 case BUILT_IN_ICEIL:
30403 case BUILT_IN_LCEIL:
30404 case BUILT_IN_LLCEIL:
30405 /* The round insn does not trap on denormals. */
30406 if (flag_trapping_math || !TARGET_ROUND)
30407 break;
30408
30409 if (out_mode == SImode && in_mode == DFmode)
30410 {
30411 if (out_n == 4 && in_n == 2)
30412 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
30413 else if (out_n == 8 && in_n == 4)
30414 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
30415 }
30416 break;
30417
30418 case BUILT_IN_ICEILF:
30419 case BUILT_IN_LCEILF:
30420 case BUILT_IN_LLCEILF:
30421 /* The round insn does not trap on denormals. */
30422 if (flag_trapping_math || !TARGET_ROUND)
30423 break;
30424
30425 if (out_mode == SImode && in_mode == SFmode)
30426 {
30427 if (out_n == 4 && in_n == 4)
30428 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
30429 else if (out_n == 8 && in_n == 8)
30430 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
30431 }
30432 break;
30433
30434 case BUILT_IN_IRINT:
30435 case BUILT_IN_LRINT:
30436 case BUILT_IN_LLRINT:
30437 if (out_mode == SImode && in_mode == DFmode)
30438 {
30439 if (out_n == 4 && in_n == 2)
30440 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
30441 else if (out_n == 8 && in_n == 4)
30442 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
30443 }
30444 break;
30445
30446 case BUILT_IN_IRINTF:
30447 case BUILT_IN_LRINTF:
30448 case BUILT_IN_LLRINTF:
30449 if (out_mode == SImode && in_mode == SFmode)
30450 {
30451 if (out_n == 4 && in_n == 4)
30452 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
30453 else if (out_n == 8 && in_n == 8)
30454 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
30455 }
30456 break;
30457
30458 case BUILT_IN_IROUND:
30459 case BUILT_IN_LROUND:
30460 case BUILT_IN_LLROUND:
30461 /* The round insn does not trap on denormals. */
30462 if (flag_trapping_math || !TARGET_ROUND)
30463 break;
30464
30465 if (out_mode == SImode && in_mode == DFmode)
30466 {
30467 if (out_n == 4 && in_n == 2)
30468 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
30469 else if (out_n == 8 && in_n == 4)
30470 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
30471 }
30472 break;
30473
30474 case BUILT_IN_IROUNDF:
30475 case BUILT_IN_LROUNDF:
30476 case BUILT_IN_LLROUNDF:
30477 /* The round insn does not trap on denormals. */
30478 if (flag_trapping_math || !TARGET_ROUND)
30479 break;
30480
30481 if (out_mode == SImode && in_mode == SFmode)
30482 {
30483 if (out_n == 4 && in_n == 4)
30484 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
30485 else if (out_n == 8 && in_n == 8)
30486 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
30487 }
30488 break;
30489
30490 case BUILT_IN_COPYSIGN:
30491 if (out_mode == DFmode && in_mode == DFmode)
30492 {
30493 if (out_n == 2 && in_n == 2)
30494 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
30495 else if (out_n == 4 && in_n == 4)
30496 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
30497 }
30498 break;
30499
30500 case BUILT_IN_COPYSIGNF:
30501 if (out_mode == SFmode && in_mode == SFmode)
30502 {
30503 if (out_n == 4 && in_n == 4)
30504 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
30505 else if (out_n == 8 && in_n == 8)
30506 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
30507 }
30508 break;
30509
30510 case BUILT_IN_FLOOR:
30511 /* The round insn does not trap on denormals. */
30512 if (flag_trapping_math || !TARGET_ROUND)
30513 break;
30514
30515 if (out_mode == DFmode && in_mode == DFmode)
30516 {
30517 if (out_n == 2 && in_n == 2)
30518 return ix86_builtins[IX86_BUILTIN_FLOORPD];
30519 else if (out_n == 4 && in_n == 4)
30520 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
30521 }
30522 break;
30523
30524 case BUILT_IN_FLOORF:
30525 /* The round insn does not trap on denormals. */
30526 if (flag_trapping_math || !TARGET_ROUND)
30527 break;
30528
30529 if (out_mode == SFmode && in_mode == SFmode)
30530 {
30531 if (out_n == 4 && in_n == 4)
30532 return ix86_builtins[IX86_BUILTIN_FLOORPS];
30533 else if (out_n == 8 && in_n == 8)
30534 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
30535 }
30536 break;
30537
30538 case BUILT_IN_CEIL:
30539 /* The round insn does not trap on denormals. */
30540 if (flag_trapping_math || !TARGET_ROUND)
30541 break;
30542
30543 if (out_mode == DFmode && in_mode == DFmode)
30544 {
30545 if (out_n == 2 && in_n == 2)
30546 return ix86_builtins[IX86_BUILTIN_CEILPD];
30547 else if (out_n == 4 && in_n == 4)
30548 return ix86_builtins[IX86_BUILTIN_CEILPD256];
30549 }
30550 break;
30551
30552 case BUILT_IN_CEILF:
30553 /* The round insn does not trap on denormals. */
30554 if (flag_trapping_math || !TARGET_ROUND)
30555 break;
30556
30557 if (out_mode == SFmode && in_mode == SFmode)
30558 {
30559 if (out_n == 4 && in_n == 4)
30560 return ix86_builtins[IX86_BUILTIN_CEILPS];
30561 else if (out_n == 8 && in_n == 8)
30562 return ix86_builtins[IX86_BUILTIN_CEILPS256];
30563 }
30564 break;
30565
30566 case BUILT_IN_TRUNC:
30567 /* The round insn does not trap on denormals. */
30568 if (flag_trapping_math || !TARGET_ROUND)
30569 break;
30570
30571 if (out_mode == DFmode && in_mode == DFmode)
30572 {
30573 if (out_n == 2 && in_n == 2)
30574 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
30575 else if (out_n == 4 && in_n == 4)
30576 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
30577 }
30578 break;
30579
30580 case BUILT_IN_TRUNCF:
30581 /* The round insn does not trap on denormals. */
30582 if (flag_trapping_math || !TARGET_ROUND)
30583 break;
30584
30585 if (out_mode == SFmode && in_mode == SFmode)
30586 {
30587 if (out_n == 4 && in_n == 4)
30588 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
30589 else if (out_n == 8 && in_n == 8)
30590 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
30591 }
30592 break;
30593
30594 case BUILT_IN_RINT:
30595 /* The round insn does not trap on denormals. */
30596 if (flag_trapping_math || !TARGET_ROUND)
30597 break;
30598
30599 if (out_mode == DFmode && in_mode == DFmode)
30600 {
30601 if (out_n == 2 && in_n == 2)
30602 return ix86_builtins[IX86_BUILTIN_RINTPD];
30603 else if (out_n == 4 && in_n == 4)
30604 return ix86_builtins[IX86_BUILTIN_RINTPD256];
30605 }
30606 break;
30607
30608 case BUILT_IN_RINTF:
30609 /* The round insn does not trap on denormals. */
30610 if (flag_trapping_math || !TARGET_ROUND)
30611 break;
30612
30613 if (out_mode == SFmode && in_mode == SFmode)
30614 {
30615 if (out_n == 4 && in_n == 4)
30616 return ix86_builtins[IX86_BUILTIN_RINTPS];
30617 else if (out_n == 8 && in_n == 8)
30618 return ix86_builtins[IX86_BUILTIN_RINTPS256];
30619 }
30620 break;
30621
30622 case BUILT_IN_ROUND:
30623 /* The round insn does not trap on denormals. */
30624 if (flag_trapping_math || !TARGET_ROUND)
30625 break;
30626
30627 if (out_mode == DFmode && in_mode == DFmode)
30628 {
30629 if (out_n == 2 && in_n == 2)
30630 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
30631 else if (out_n == 4 && in_n == 4)
30632 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
30633 }
30634 break;
30635
30636 case BUILT_IN_ROUNDF:
30637 /* The round insn does not trap on denormals. */
30638 if (flag_trapping_math || !TARGET_ROUND)
30639 break;
30640
30641 if (out_mode == SFmode && in_mode == SFmode)
30642 {
30643 if (out_n == 4 && in_n == 4)
30644 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
30645 else if (out_n == 8 && in_n == 8)
30646 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
30647 }
30648 break;
30649
30650 case BUILT_IN_FMA:
30651 if (out_mode == DFmode && in_mode == DFmode)
30652 {
30653 if (out_n == 2 && in_n == 2)
30654 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
30655 if (out_n == 4 && in_n == 4)
30656 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
30657 }
30658 break;
30659
30660 case BUILT_IN_FMAF:
30661 if (out_mode == SFmode && in_mode == SFmode)
30662 {
30663 if (out_n == 4 && in_n == 4)
30664 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
30665 if (out_n == 8 && in_n == 8)
30666 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
30667 }
30668 break;
30669
30670 default:
30671 break;
30672 }
30673
30674 /* Dispatch to a handler for a vectorization library. */
30675 if (ix86_veclib_handler)
30676 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
30677 type_in);
30678
30679 return NULL_TREE;
30680 }
30681
30682 /* Handler for an SVML-style interface to
30683 a library with vectorized intrinsics. */
30684
30685 static tree
30686 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
30687 {
30688 char name[20];
30689 tree fntype, new_fndecl, args;
30690 unsigned arity;
30691 const char *bname;
30692 enum machine_mode el_mode, in_mode;
30693 int n, in_n;
30694
30695 /* The SVML is suitable for unsafe math only. */
30696 if (!flag_unsafe_math_optimizations)
30697 return NULL_TREE;
30698
30699 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30700 n = TYPE_VECTOR_SUBPARTS (type_out);
30701 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30702 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30703 if (el_mode != in_mode
30704 || n != in_n)
30705 return NULL_TREE;
30706
30707 switch (fn)
30708 {
30709 case BUILT_IN_EXP:
30710 case BUILT_IN_LOG:
30711 case BUILT_IN_LOG10:
30712 case BUILT_IN_POW:
30713 case BUILT_IN_TANH:
30714 case BUILT_IN_TAN:
30715 case BUILT_IN_ATAN:
30716 case BUILT_IN_ATAN2:
30717 case BUILT_IN_ATANH:
30718 case BUILT_IN_CBRT:
30719 case BUILT_IN_SINH:
30720 case BUILT_IN_SIN:
30721 case BUILT_IN_ASINH:
30722 case BUILT_IN_ASIN:
30723 case BUILT_IN_COSH:
30724 case BUILT_IN_COS:
30725 case BUILT_IN_ACOSH:
30726 case BUILT_IN_ACOS:
30727 if (el_mode != DFmode || n != 2)
30728 return NULL_TREE;
30729 break;
30730
30731 case BUILT_IN_EXPF:
30732 case BUILT_IN_LOGF:
30733 case BUILT_IN_LOG10F:
30734 case BUILT_IN_POWF:
30735 case BUILT_IN_TANHF:
30736 case BUILT_IN_TANF:
30737 case BUILT_IN_ATANF:
30738 case BUILT_IN_ATAN2F:
30739 case BUILT_IN_ATANHF:
30740 case BUILT_IN_CBRTF:
30741 case BUILT_IN_SINHF:
30742 case BUILT_IN_SINF:
30743 case BUILT_IN_ASINHF:
30744 case BUILT_IN_ASINF:
30745 case BUILT_IN_COSHF:
30746 case BUILT_IN_COSF:
30747 case BUILT_IN_ACOSHF:
30748 case BUILT_IN_ACOSF:
30749 if (el_mode != SFmode || n != 4)
30750 return NULL_TREE;
30751 break;
30752
30753 default:
30754 return NULL_TREE;
30755 }
30756
30757 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30758
30759 if (fn == BUILT_IN_LOGF)
30760 strcpy (name, "vmlsLn4");
30761 else if (fn == BUILT_IN_LOG)
30762 strcpy (name, "vmldLn2");
30763 else if (n == 4)
30764 {
30765 sprintf (name, "vmls%s", bname+10);
30766 name[strlen (name)-1] = '4';
30767 }
30768 else
30769 sprintf (name, "vmld%s2", bname+10);
30770
30771 /* Convert to uppercase. */
30772 name[4] &= ~0x20;
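  /* For example, following the string handling above, BUILT_IN_SINF with
     four SFmode lanes yields the name "vmlsSin4", while BUILT_IN_SIN with
     two DFmode lanes yields "vmldSin2".  */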
30773
30774 arity = 0;
30775 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30776 args;
30777 args = TREE_CHAIN (args))
30778 arity++;
30779
30780 if (arity == 1)
30781 fntype = build_function_type_list (type_out, type_in, NULL);
30782 else
30783 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30784
30785 /* Build a function declaration for the vectorized function. */
30786 new_fndecl = build_decl (BUILTINS_LOCATION,
30787 FUNCTION_DECL, get_identifier (name), fntype);
30788 TREE_PUBLIC (new_fndecl) = 1;
30789 DECL_EXTERNAL (new_fndecl) = 1;
30790 DECL_IS_NOVOPS (new_fndecl) = 1;
30791 TREE_READONLY (new_fndecl) = 1;
30792
30793 return new_fndecl;
30794 }
30795
30796 /* Handler for an ACML-style interface to
30797 a library with vectorized intrinsics. */
30798
30799 static tree
30800 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
30801 {
30802 char name[20] = "__vr.._";
30803 tree fntype, new_fndecl, args;
30804 unsigned arity;
30805 const char *bname;
30806 enum machine_mode el_mode, in_mode;
30807 int n, in_n;
30808
30809   /* The ACML is 64-bit only and suitable for unsafe math only, as
30810      it does not correctly support parts of IEEE (such as denormals)
30811      with the required precision.  */
30812 if (!TARGET_64BIT
30813 || !flag_unsafe_math_optimizations)
30814 return NULL_TREE;
30815
30816 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30817 n = TYPE_VECTOR_SUBPARTS (type_out);
30818 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30819 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30820 if (el_mode != in_mode
30821 || n != in_n)
30822 return NULL_TREE;
30823
30824 switch (fn)
30825 {
30826 case BUILT_IN_SIN:
30827 case BUILT_IN_COS:
30828 case BUILT_IN_EXP:
30829 case BUILT_IN_LOG:
30830 case BUILT_IN_LOG2:
30831 case BUILT_IN_LOG10:
30832 name[4] = 'd';
30833 name[5] = '2';
30834 if (el_mode != DFmode
30835 || n != 2)
30836 return NULL_TREE;
30837 break;
30838
30839 case BUILT_IN_SINF:
30840 case BUILT_IN_COSF:
30841 case BUILT_IN_EXPF:
30842 case BUILT_IN_POWF:
30843 case BUILT_IN_LOGF:
30844 case BUILT_IN_LOG2F:
30845 case BUILT_IN_LOG10F:
30846 name[4] = 's';
30847 name[5] = '4';
30848 if (el_mode != SFmode
30849 || n != 4)
30850 return NULL_TREE;
30851 break;
30852
30853 default:
30854 return NULL_TREE;
30855 }
30856
30857 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30858 sprintf (name + 7, "%s", bname+10);
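  /* For example, BUILT_IN_SIN becomes "__vrd2_sin" and BUILT_IN_SINF
     becomes "__vrs4_sinf".  */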
30859
30860 arity = 0;
30861 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30862 args;
30863 args = TREE_CHAIN (args))
30864 arity++;
30865
30866 if (arity == 1)
30867 fntype = build_function_type_list (type_out, type_in, NULL);
30868 else
30869 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30870
30871 /* Build a function declaration for the vectorized function. */
30872 new_fndecl = build_decl (BUILTINS_LOCATION,
30873 FUNCTION_DECL, get_identifier (name), fntype);
30874 TREE_PUBLIC (new_fndecl) = 1;
30875 DECL_EXTERNAL (new_fndecl) = 1;
30876 DECL_IS_NOVOPS (new_fndecl) = 1;
30877 TREE_READONLY (new_fndecl) = 1;
30878
30879 return new_fndecl;
30880 }
30881
30882 /* Returns a decl of a function that implements gather load with
30883 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
30884 Return NULL_TREE if it is not available. */
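/* For example, a V4DFmode gather with an SImode index type maps to
   IX86_BUILTIN_GATHERALTSIV4DF, while a DImode index type maps to
   IX86_BUILTIN_GATHERDIV4DF (see the switch below).  */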
30885
30886 static tree
30887 ix86_vectorize_builtin_gather (const_tree mem_vectype,
30888 const_tree index_type, int scale)
30889 {
30890 bool si;
30891 enum ix86_builtins code;
30892
30893 if (! TARGET_AVX2)
30894 return NULL_TREE;
30895
30896 if ((TREE_CODE (index_type) != INTEGER_TYPE
30897 && !POINTER_TYPE_P (index_type))
30898 || (TYPE_MODE (index_type) != SImode
30899 && TYPE_MODE (index_type) != DImode))
30900 return NULL_TREE;
30901
30902 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
30903 return NULL_TREE;
30904
30905 /* v*gather* insn sign extends index to pointer mode. */
30906 if (TYPE_PRECISION (index_type) < POINTER_SIZE
30907 && TYPE_UNSIGNED (index_type))
30908 return NULL_TREE;
30909
30910 if (scale <= 0
30911 || scale > 8
30912 || (scale & (scale - 1)) != 0)
30913 return NULL_TREE;
30914
30915 si = TYPE_MODE (index_type) == SImode;
30916 switch (TYPE_MODE (mem_vectype))
30917 {
30918 case V2DFmode:
30919 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
30920 break;
30921 case V4DFmode:
30922 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
30923 break;
30924 case V2DImode:
30925 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
30926 break;
30927 case V4DImode:
30928 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
30929 break;
30930 case V4SFmode:
30931 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
30932 break;
30933 case V8SFmode:
30934 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
30935 break;
30936 case V4SImode:
30937 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
30938 break;
30939 case V8SImode:
30940 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
30941 break;
30942 default:
30943 return NULL_TREE;
30944 }
30945
30946 return ix86_builtins[code];
30947 }
30948
30949 /* Returns a decl of a target-specific builtin that implements the
30950    reciprocal of the function, or NULL_TREE if it is not available.  */
30951
30952 static tree
30953 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
30954 bool sqrt ATTRIBUTE_UNUSED)
30955 {
30956 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
30957 && flag_finite_math_only && !flag_trapping_math
30958 && flag_unsafe_math_optimizations))
30959 return NULL_TREE;
30960
30961 if (md_fn)
30962 /* Machine dependent builtins. */
30963 switch (fn)
30964 {
30965 /* Vectorized version of sqrt to rsqrt conversion. */
30966 case IX86_BUILTIN_SQRTPS_NR:
30967 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
30968
30969 case IX86_BUILTIN_SQRTPS_NR256:
30970 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
30971
30972 default:
30973 return NULL_TREE;
30974 }
30975 else
30976 /* Normal builtins. */
30977 switch (fn)
30978 {
30979 /* Sqrt to rsqrt conversion. */
30980 case BUILT_IN_SQRTF:
30981 return ix86_builtins[IX86_BUILTIN_RSQRTF];
30982
30983 default:
30984 return NULL_TREE;
30985 }
30986 }
30987 \f
30988 /* Helper for avx_vpermilps256_operand et al. This is also used by
30989 the expansion functions to turn the parallel back into a mask.
30990 The return value is 0 for no match and the imm8+1 for a match. */
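/* For example, for V4SFmode a parallel selecting elements (1 0 3 2)
   corresponds to imm8 0xb1, so this function returns 0xb2.  */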
30991
30992 int
30993 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
30994 {
30995 unsigned i, nelt = GET_MODE_NUNITS (mode);
30996 unsigned mask = 0;
30997 unsigned char ipar[8];
30998
30999 if (XVECLEN (par, 0) != (int) nelt)
31000 return 0;
31001
31002 /* Validate that all of the elements are constants, and not totally
31003 out of range. Copy the data into an integral array to make the
31004 subsequent checks easier. */
31005 for (i = 0; i < nelt; ++i)
31006 {
31007 rtx er = XVECEXP (par, 0, i);
31008 unsigned HOST_WIDE_INT ei;
31009
31010 if (!CONST_INT_P (er))
31011 return 0;
31012 ei = INTVAL (er);
31013 if (ei >= nelt)
31014 return 0;
31015 ipar[i] = ei;
31016 }
31017
31018 switch (mode)
31019 {
31020 case V4DFmode:
31021 /* In the 256-bit DFmode case, we can only move elements within
31022 a 128-bit lane. */
31023 for (i = 0; i < 2; ++i)
31024 {
31025 if (ipar[i] >= 2)
31026 return 0;
31027 mask |= ipar[i] << i;
31028 }
31029 for (i = 2; i < 4; ++i)
31030 {
31031 if (ipar[i] < 2)
31032 return 0;
31033 mask |= (ipar[i] - 2) << i;
31034 }
31035 break;
31036
31037 case V8SFmode:
31038 /* In the 256-bit SFmode case, we have full freedom of movement
31039 within the low 128-bit lane, but the high 128-bit lane must
31040 mirror the exact same pattern. */
31041 for (i = 0; i < 4; ++i)
31042 if (ipar[i] + 4 != ipar[i + 4])
31043 return 0;
31044 nelt = 4;
31045 /* FALLTHRU */
31046
31047 case V2DFmode:
31048 case V4SFmode:
31049 /* In the 128-bit case, we've full freedom in the placement of
31050 the elements from the source operand. */
31051 for (i = 0; i < nelt; ++i)
31052 mask |= ipar[i] << (i * (nelt / 2));
31053 break;
31054
31055 default:
31056 gcc_unreachable ();
31057 }
31058
31059 /* Make sure success has a non-zero value by adding one. */
31060 return mask + 1;
31061 }
31062
31063 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
31064 the expansion functions to turn the parallel back into a mask.
31065 The return value is 0 for no match and the imm8+1 for a match. */
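/* For example, for V4DFmode a parallel selecting elements (2 3 4 5)
   corresponds to imm8 0x21, so this function returns 0x22.  */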
31066
31067 int
31068 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
31069 {
31070 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
31071 unsigned mask = 0;
31072 unsigned char ipar[8];
31073
31074 if (XVECLEN (par, 0) != (int) nelt)
31075 return 0;
31076
31077 /* Validate that all of the elements are constants, and not totally
31078 out of range. Copy the data into an integral array to make the
31079 subsequent checks easier. */
31080 for (i = 0; i < nelt; ++i)
31081 {
31082 rtx er = XVECEXP (par, 0, i);
31083 unsigned HOST_WIDE_INT ei;
31084
31085 if (!CONST_INT_P (er))
31086 return 0;
31087 ei = INTVAL (er);
31088 if (ei >= 2 * nelt)
31089 return 0;
31090 ipar[i] = ei;
31091 }
31092
31093   /* Validate that each half of the permute selects consecutive elements.  */
31094 for (i = 0; i < nelt2 - 1; ++i)
31095 if (ipar[i] + 1 != ipar[i + 1])
31096 return 0;
31097 for (i = nelt2; i < nelt - 1; ++i)
31098 if (ipar[i] + 1 != ipar[i + 1])
31099 return 0;
31100
31101 /* Reconstruct the mask. */
31102 for (i = 0; i < 2; ++i)
31103 {
31104 unsigned e = ipar[i * nelt2];
31105 if (e % nelt2)
31106 return 0;
31107 e /= nelt2;
31108 mask |= e << (i * 4);
31109 }
31110
31111 /* Make sure success has a non-zero value by adding one. */
31112 return mask + 1;
31113 }
31114 \f
31115 /* Store OPERAND to memory after reload is completed. This means
31116 that we can't easily use assign_stack_local. */
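/* In outline: if the red zone is available, OPERAND is stored just below
   the stack pointer without adjusting it; otherwise it is pushed (one
   DImode push on 64-bit targets, one or two SImode pushes on 32-bit
   targets) and a MEM at the new stack pointer is returned.  */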
31117 rtx
31118 ix86_force_to_memory (enum machine_mode mode, rtx operand)
31119 {
31120 rtx result;
31121
31122 gcc_assert (reload_completed);
31123 if (ix86_using_red_zone ())
31124 {
31125 result = gen_rtx_MEM (mode,
31126 gen_rtx_PLUS (Pmode,
31127 stack_pointer_rtx,
31128 GEN_INT (-RED_ZONE_SIZE)));
31129 emit_move_insn (result, operand);
31130 }
31131 else if (TARGET_64BIT)
31132 {
31133 switch (mode)
31134 {
31135 case HImode:
31136 case SImode:
31137 operand = gen_lowpart (DImode, operand);
31138 /* FALLTHRU */
31139 case DImode:
31140 emit_insn (
31141 gen_rtx_SET (VOIDmode,
31142 gen_rtx_MEM (DImode,
31143 gen_rtx_PRE_DEC (DImode,
31144 stack_pointer_rtx)),
31145 operand));
31146 break;
31147 default:
31148 gcc_unreachable ();
31149 }
31150 result = gen_rtx_MEM (mode, stack_pointer_rtx);
31151 }
31152 else
31153 {
31154 switch (mode)
31155 {
31156 case DImode:
31157 {
31158 rtx operands[2];
31159 split_double_mode (mode, &operand, 1, operands, operands + 1);
31160 emit_insn (
31161 gen_rtx_SET (VOIDmode,
31162 gen_rtx_MEM (SImode,
31163 gen_rtx_PRE_DEC (Pmode,
31164 stack_pointer_rtx)),
31165 operands[1]));
31166 emit_insn (
31167 gen_rtx_SET (VOIDmode,
31168 gen_rtx_MEM (SImode,
31169 gen_rtx_PRE_DEC (Pmode,
31170 stack_pointer_rtx)),
31171 operands[0]));
31172 }
31173 break;
31174 case HImode:
31175 /* Store HImodes as SImodes. */
31176 operand = gen_lowpart (SImode, operand);
31177 /* FALLTHRU */
31178 case SImode:
31179 emit_insn (
31180 gen_rtx_SET (VOIDmode,
31181 gen_rtx_MEM (GET_MODE (operand),
31182 gen_rtx_PRE_DEC (SImode,
31183 stack_pointer_rtx)),
31184 operand));
31185 break;
31186 default:
31187 gcc_unreachable ();
31188 }
31189 result = gen_rtx_MEM (mode, stack_pointer_rtx);
31190 }
31191 return result;
31192 }
31193
31194 /* Free the operand from memory. */
31195 void
31196 ix86_free_from_memory (enum machine_mode mode)
31197 {
31198 if (!ix86_using_red_zone ())
31199 {
31200 int size;
31201
31202 if (mode == DImode || TARGET_64BIT)
31203 size = 8;
31204 else
31205 size = 4;
31206 /* Use LEA to deallocate stack space. In peephole2 it will be converted
31207 to a pop or add instruction if registers are available. */
31208 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
31209 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
31210 GEN_INT (size))));
31211 }
31212 }
31213
31214 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
31215
31216 Put float CONST_DOUBLE in the constant pool instead of fp regs.
31217 QImode must go into class Q_REGS.
31218 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
31219 movdf to do mem-to-mem moves through integer regs. */
31220
31221 static reg_class_t
31222 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
31223 {
31224 enum machine_mode mode = GET_MODE (x);
31225
31226 /* We're only allowed to return a subclass of CLASS. Many of the
31227 following checks fail for NO_REGS, so eliminate that early. */
31228 if (regclass == NO_REGS)
31229 return NO_REGS;
31230
31231 /* All classes can load zeros. */
31232 if (x == CONST0_RTX (mode))
31233 return regclass;
31234
31235 /* Force constants into memory if we are loading a (nonzero) constant into
31236 an MMX or SSE register. This is because there are no MMX/SSE instructions
31237 to load from a constant. */
31238 if (CONSTANT_P (x)
31239 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
31240 return NO_REGS;
31241
31242 /* Prefer SSE regs only, if we can use them for math. */
31243 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
31244 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
31245
31246 /* Floating-point constants need more complex checks. */
31247 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
31248 {
31249 /* General regs can load everything. */
31250 if (reg_class_subset_p (regclass, GENERAL_REGS))
31251 return regclass;
31252
31253 /* Floats can load 0 and 1 plus some others. Note that we eliminated
31254 zero above. We only want to wind up preferring 80387 registers if
31255 we plan on doing computation with them. */
31256 if (TARGET_80387
31257 && standard_80387_constant_p (x) > 0)
31258 {
31259 /* Limit class to non-sse. */
31260 if (regclass == FLOAT_SSE_REGS)
31261 return FLOAT_REGS;
31262 if (regclass == FP_TOP_SSE_REGS)
31263 return FP_TOP_REG;
31264 if (regclass == FP_SECOND_SSE_REGS)
31265 return FP_SECOND_REG;
31266 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
31267 return regclass;
31268 }
31269
31270 return NO_REGS;
31271 }
31272
31273 /* Generally when we see PLUS here, it's the function invariant
31274 (plus soft-fp const_int), which can only be computed into general
31275 regs. */
31276 if (GET_CODE (x) == PLUS)
31277 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
31278
31279 /* QImode constants are easy to load, but non-constant QImode data
31280 must go into Q_REGS. */
31281 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
31282 {
31283 if (reg_class_subset_p (regclass, Q_REGS))
31284 return regclass;
31285 if (reg_class_subset_p (Q_REGS, regclass))
31286 return Q_REGS;
31287 return NO_REGS;
31288 }
31289
31290 return regclass;
31291 }
31292
31293 /* Discourage putting floating-point values in SSE registers unless
31294 SSE math is being used, and likewise for the 387 registers. */
31295 static reg_class_t
31296 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
31297 {
31298 enum machine_mode mode = GET_MODE (x);
31299
31300 /* Restrict the output reload class to the register bank that we are doing
31301 math on. If we would like not to return a subset of CLASS, reject this
31302 alternative: if reload cannot do this, it will still use its choice. */
31303 mode = GET_MODE (x);
31304 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
31305 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
31306
31307 if (X87_FLOAT_MODE_P (mode))
31308 {
31309 if (regclass == FP_TOP_SSE_REGS)
31310 return FP_TOP_REG;
31311 else if (regclass == FP_SECOND_SSE_REGS)
31312 return FP_SECOND_REG;
31313 else
31314 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
31315 }
31316
31317 return regclass;
31318 }
31319
31320 static reg_class_t
31321 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
31322 enum machine_mode mode, secondary_reload_info *sri)
31323 {
31324 /* Double-word spills from general registers to non-offsettable memory
31325 references (zero-extended addresses) require special handling. */
31326 if (TARGET_64BIT
31327 && MEM_P (x)
31328 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
31329 && rclass == GENERAL_REGS
31330 && !offsettable_memref_p (x))
31331 {
31332 sri->icode = (in_p
31333 ? CODE_FOR_reload_noff_load
31334 : CODE_FOR_reload_noff_store);
31335 /* Add the cost of moving address to a temporary. */
31336 sri->extra_cost = 1;
31337
31338 return NO_REGS;
31339 }
31340
31341 /* QImode spills from non-QI registers require an
31342 intermediate register on 32-bit targets. */
31343 if (!TARGET_64BIT
31344 && !in_p && mode == QImode
31345 && (rclass == GENERAL_REGS
31346 || rclass == LEGACY_REGS
31347 || rclass == INDEX_REGS))
31348 {
31349 int regno;
31350
31351 if (REG_P (x))
31352 regno = REGNO (x);
31353 else
31354 regno = -1;
31355
31356 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
31357 regno = true_regnum (x);
31358
31359 /* Return Q_REGS if the operand is in memory. */
31360 if (regno == -1)
31361 return Q_REGS;
31362 }
31363
31364 /* This condition handles the corner case where an expression involving
31365 pointers gets vectorized. We're trying to use the address of a
31366 stack slot as a vector initializer.
31367
31368 (set (reg:V2DI 74 [ vect_cst_.2 ])
31369 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
31370
31371 Eventually frame gets turned into sp+offset like this:
31372
31373 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31374 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
31375 (const_int 392 [0x188]))))
31376
31377 That later gets turned into:
31378
31379 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31380 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
31381 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
31382
31383 We'll have the following reload recorded:
31384
31385 Reload 0: reload_in (DI) =
31386 (plus:DI (reg/f:DI 7 sp)
31387 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
31388 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31389 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
31390 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
31391 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31392 reload_reg_rtx: (reg:V2DI 22 xmm1)
31393
31394 This isn't going to work since SSE instructions can't handle scalar
31395 additions. Returning GENERAL_REGS forces the addition into an integer
31396 register, and reload can then handle the subsequent reloads without problems. */
31397
31398 if (in_p && GET_CODE (x) == PLUS
31399 && SSE_CLASS_P (rclass)
31400 && SCALAR_INT_MODE_P (mode))
31401 return GENERAL_REGS;
31402
31403 return NO_REGS;
31404 }
31405
31406 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
31407
31408 static bool
31409 ix86_class_likely_spilled_p (reg_class_t rclass)
31410 {
31411 switch (rclass)
31412 {
31413 case AREG:
31414 case DREG:
31415 case CREG:
31416 case BREG:
31417 case AD_REGS:
31418 case SIREG:
31419 case DIREG:
31420 case SSE_FIRST_REG:
31421 case FP_TOP_REG:
31422 case FP_SECOND_REG:
31423 return true;
31424
31425 default:
31426 break;
31427 }
31428
31429 return false;
31430 }
31431
31432 /* If we are copying between general and FP registers, we need a memory
31433 location. The same is true for SSE and MMX registers.
31434
31435 To optimize register_move_cost performance, allow an inline variant.
31436 
31437 The macro can't work reliably when one of the CLASSES is a class containing
31438 registers from multiple units (SSE, MMX, integer). We avoid this by never
31439 combining those units in a single alternative in the machine description.
31440 Ensure that this constraint holds to avoid unexpected surprises.
31441
31442 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
31443 enforce these sanity checks. */
31444
31445 static inline bool
31446 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
31447 enum machine_mode mode, int strict)
31448 {
31449 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
31450 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
31451 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
31452 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
31453 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
31454 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
31455 {
31456 gcc_assert (!strict);
31457 return true;
31458 }
31459
31460 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
31461 return true;
31462
31463 /* ??? This is a lie. We do have moves between mmx/general, and for
31464 mmx/sse2. But by saying we need secondary memory we discourage the
31465 register allocator from using the mmx registers unless needed. */
31466 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
31467 return true;
31468
31469 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31470 {
31471 /* SSE1 doesn't have any direct moves from other classes. */
31472 if (!TARGET_SSE2)
31473 return true;
31474
31475 /* If the target says that inter-unit moves are more expensive
31476 than moving through memory, then don't generate them. */
31477 if (!TARGET_INTER_UNIT_MOVES)
31478 return true;
31479
31480 /* Between SSE and general, we have moves no larger than word size. */
31481 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
31482 return true;
31483 }
31484
31485 return false;
31486 }
31487
31488 bool
31489 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
31490 enum machine_mode mode, int strict)
31491 {
31492 return inline_secondary_memory_needed (class1, class2, mode, strict);
31493 }
31494
31495 /* Implement the TARGET_CLASS_MAX_NREGS hook.
31496
31497 On the 80386, this is the size of MODE in words,
31498 except in the FP regs, where a single reg is always enough. */
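/* E.g. XFmode occupies 3 words in an integer class on a 32-bit target
   (2 words on a 64-bit target), but only a single x87 register.  */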
31499
31500 static unsigned char
31501 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
31502 {
31503 if (MAYBE_INTEGER_CLASS_P (rclass))
31504 {
31505 if (mode == XFmode)
31506 return (TARGET_64BIT ? 2 : 3);
31507 else if (mode == XCmode)
31508 return (TARGET_64BIT ? 4 : 6);
31509 else
31510 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
31511 }
31512 else
31513 {
31514 if (COMPLEX_MODE_P (mode))
31515 return 2;
31516 else
31517 return 1;
31518 }
31519 }
31520
31521 /* Return true if the registers in CLASS cannot represent the change from
31522 modes FROM to TO. */
31523
31524 bool
31525 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
31526 enum reg_class regclass)
31527 {
31528 if (from == to)
31529 return false;
31530
31531 /* x87 registers can't do subreg at all, as all values are reformatted
31532 to extended precision. */
31533 if (MAYBE_FLOAT_CLASS_P (regclass))
31534 return true;
31535
31536 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
31537 {
31538 /* Vector registers do not support QI or HImode loads. If we don't
31539 disallow a change to these modes, reload will assume it's ok to
31540 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
31541 the vec_dupv4hi pattern. */
31542 if (GET_MODE_SIZE (from) < 4)
31543 return true;
31544
31545 /* Vector registers do not support subreg with nonzero offsets, which
31546 are otherwise valid for integer registers. Since we can't see
31547 whether we have a nonzero offset from here, prohibit all
31548 nonparadoxical subregs changing size. */
31549 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
31550 return true;
31551 }
31552
31553 return false;
31554 }
31555
31556 /* Return the cost of moving data of mode M between a
31557 register and memory. A value of 2 is the default; this cost is
31558 relative to those in `REGISTER_MOVE_COST'.
31559
31560 This function is used extensively by register_move_cost, which is used to
31561 build tables at startup; make it inline for that case.
31562 When IN is 2, return the maximum of the in and out move costs.
31563
31564 If moving between registers and memory is more expensive than
31565 between two registers, you should define this macro to express the
31566 relative cost.
31567
31568 Also model the increased cost of moving QImode registers in
31569 non-Q_REGS classes.
31570 */
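/* For example, an SSE-class move of a 16-byte mode uses cost index 2, so
   IN == 2 yields MAX (sse_load[2], sse_store[2]); mode sizes without a
   table entry (e.g. 32-byte modes here) fall back to the conservative
   cost of 100.  */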
31571 static inline int
31572 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
31573 int in)
31574 {
31575 int cost;
31576 if (FLOAT_CLASS_P (regclass))
31577 {
31578 int index;
31579 switch (mode)
31580 {
31581 case SFmode:
31582 index = 0;
31583 break;
31584 case DFmode:
31585 index = 1;
31586 break;
31587 case XFmode:
31588 index = 2;
31589 break;
31590 default:
31591 return 100;
31592 }
31593 if (in == 2)
31594 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
31595 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
31596 }
31597 if (SSE_CLASS_P (regclass))
31598 {
31599 int index;
31600 switch (GET_MODE_SIZE (mode))
31601 {
31602 case 4:
31603 index = 0;
31604 break;
31605 case 8:
31606 index = 1;
31607 break;
31608 case 16:
31609 index = 2;
31610 break;
31611 default:
31612 return 100;
31613 }
31614 if (in == 2)
31615 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
31616 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
31617 }
31618 if (MMX_CLASS_P (regclass))
31619 {
31620 int index;
31621 switch (GET_MODE_SIZE (mode))
31622 {
31623 case 4:
31624 index = 0;
31625 break;
31626 case 8:
31627 index = 1;
31628 break;
31629 default:
31630 return 100;
31631 }
31632 if (in == 2)
31633 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
31634 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
31635 }
31636 switch (GET_MODE_SIZE (mode))
31637 {
31638 case 1:
31639 if (Q_CLASS_P (regclass) || TARGET_64BIT)
31640 {
31641 if (!in)
31642 return ix86_cost->int_store[0];
31643 if (TARGET_PARTIAL_REG_DEPENDENCY
31644 && optimize_function_for_speed_p (cfun))
31645 cost = ix86_cost->movzbl_load;
31646 else
31647 cost = ix86_cost->int_load[0];
31648 if (in == 2)
31649 return MAX (cost, ix86_cost->int_store[0]);
31650 return cost;
31651 }
31652 else
31653 {
31654 if (in == 2)
31655 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
31656 if (in)
31657 return ix86_cost->movzbl_load;
31658 else
31659 return ix86_cost->int_store[0] + 4;
31660 }
31661 break;
31662 case 2:
31663 if (in == 2)
31664 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
31665 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
31666 default:
31667 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
31668 if (mode == TFmode)
31669 mode = XFmode;
31670 if (in == 2)
31671 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
31672 else if (in)
31673 cost = ix86_cost->int_load[2];
31674 else
31675 cost = ix86_cost->int_store[2];
31676 return (cost * (((int) GET_MODE_SIZE (mode)
31677 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
31678 }
31679 }
31680
31681 static int
31682 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
31683 bool in)
31684 {
31685 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
31686 }
31687
31688
31689 /* Return the cost of moving data from a register in class CLASS1 to
31690 one in class CLASS2.
31691
31692 It is not required that the cost always equal 2 when FROM is the same as TO;
31693 on some machines it is expensive to move between registers if they are not
31694 general registers. */
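/* As a rough illustration of the logic below: a DImode copy between
   GENERAL_REGS and SSE_REGS on a 32-bit target needs secondary memory,
   so its cost is 1 plus the (maximum of load and store) memory move cost
   of each class, plus 20 when the source class needs more hard registers
   than the destination (several stores feeding a single wider load).  */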
31695
31696 static int
31697 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
31698 reg_class_t class2_i)
31699 {
31700 enum reg_class class1 = (enum reg_class) class1_i;
31701 enum reg_class class2 = (enum reg_class) class2_i;
31702
31703 /* In case we require secondary memory, compute cost of the store followed
31704 by load. In order to avoid bad register allocation choices, we need
31705 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
31706
31707 if (inline_secondary_memory_needed (class1, class2, mode, 0))
31708 {
31709 int cost = 1;
31710
31711 cost += inline_memory_move_cost (mode, class1, 2);
31712 cost += inline_memory_move_cost (mode, class2, 2);
31713
31714 /* When copying from a general purpose register we may emit multiple
31715 stores followed by a single load, causing a memory size mismatch stall.
31716 Count this as an arbitrarily high cost of 20. */
31717 if (targetm.class_max_nregs (class1, mode)
31718 > targetm.class_max_nregs (class2, mode))
31719 cost += 20;
31720
31721 /* In the case of FP/MMX moves, the registers actually overlap, and we
31722 have to switch modes in order to treat them differently. */
31723 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
31724 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
31725 cost += 20;
31726
31727 return cost;
31728 }
31729
31730 /* Moves between SSE/MMX and integer unit are expensive. */
31731 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
31732 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31733
31734 /* ??? By keeping the returned value relatively high, we limit the number
31735 of moves between integer and MMX/SSE registers for all targets.
31736 Additionally, a high value prevents problems with x86_modes_tieable_p(),
31737 where integer modes in MMX/SSE registers are not tieable
31738 because of missing QImode and HImode moves to, from or between
31739 MMX/SSE registers. */
31740 return MAX (8, ix86_cost->mmxsse_to_integer);
31741
31742 if (MAYBE_FLOAT_CLASS_P (class1))
31743 return ix86_cost->fp_move;
31744 if (MAYBE_SSE_CLASS_P (class1))
31745 return ix86_cost->sse_move;
31746 if (MAYBE_MMX_CLASS_P (class1))
31747 return ix86_cost->mmx_move;
31748 return 2;
31749 }
31750
31751 /* Return TRUE if hard register REGNO can hold a value of machine-mode
31752 MODE. */
31753
31754 bool
31755 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
31756 {
31757 /* Flags and only flags can only hold CCmode values. */
31758 if (CC_REGNO_P (regno))
31759 return GET_MODE_CLASS (mode) == MODE_CC;
31760 if (GET_MODE_CLASS (mode) == MODE_CC
31761 || GET_MODE_CLASS (mode) == MODE_RANDOM
31762 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
31763 return false;
31764 if (FP_REGNO_P (regno))
31765 return VALID_FP_MODE_P (mode);
31766 if (SSE_REGNO_P (regno))
31767 {
31768 /* We implement the move patterns for all vector modes into and
31769 out of SSE registers, even when no operation instructions
31770 are available. OImode move is available only when AVX is
31771 enabled. */
31772 return ((TARGET_AVX && mode == OImode)
31773 || VALID_AVX256_REG_MODE (mode)
31774 || VALID_SSE_REG_MODE (mode)
31775 || VALID_SSE2_REG_MODE (mode)
31776 || VALID_MMX_REG_MODE (mode)
31777 || VALID_MMX_REG_MODE_3DNOW (mode));
31778 }
31779 if (MMX_REGNO_P (regno))
31780 {
31781 /* We implement the move patterns for 3DNOW modes even in MMX mode,
31782 so if the register is available at all, then we can move data of
31783 the given mode into or out of it. */
31784 return (VALID_MMX_REG_MODE (mode)
31785 || VALID_MMX_REG_MODE_3DNOW (mode));
31786 }
31787
31788 if (mode == QImode)
31789 {
31790 /* Take care with QImode values: they can live in non-QI regs,
31791 but then they do cause partial register stalls. */
31792 if (regno <= BX_REG || TARGET_64BIT)
31793 return true;
31794 if (!TARGET_PARTIAL_REG_STALL)
31795 return true;
31796 return !can_create_pseudo_p ();
31797 }
31798 /* We handle both integers and floats in the general purpose registers. */
31799 else if (VALID_INT_MODE_P (mode))
31800 return true;
31801 else if (VALID_FP_MODE_P (mode))
31802 return true;
31803 else if (VALID_DFP_MODE_P (mode))
31804 return true;
31805 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
31806 on to use that value in smaller contexts, this can easily force a
31807 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
31808 supporting DImode, allow it. */
31809 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
31810 return true;
31811
31812 return false;
31813 }
31814
31815 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
31816 tieable integer mode. */
31817
31818 static bool
31819 ix86_tieable_integer_mode_p (enum machine_mode mode)
31820 {
31821 switch (mode)
31822 {
31823 case HImode:
31824 case SImode:
31825 return true;
31826
31827 case QImode:
31828 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
31829
31830 case DImode:
31831 return TARGET_64BIT;
31832
31833 default:
31834 return false;
31835 }
31836 }
31837
31838 /* Return true if MODE1 is accessible in a register that can hold MODE2
31839 without copying. That is, all register classes that can hold MODE2
31840 can also hold MODE1. */
31841
31842 bool
31843 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
31844 {
31845 if (mode1 == mode2)
31846 return true;
31847
31848 if (ix86_tieable_integer_mode_p (mode1)
31849 && ix86_tieable_integer_mode_p (mode2))
31850 return true;
31851
31852 /* MODE2 being XFmode implies fp stack or general regs, which means we
31853 can tie any smaller floating point modes to it. Note that we do not
31854 tie this with TFmode. */
31855 if (mode2 == XFmode)
31856 return mode1 == SFmode || mode1 == DFmode;
31857
31858 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
31859 that we can tie it with SFmode. */
31860 if (mode2 == DFmode)
31861 return mode1 == SFmode;
31862
31863 /* If MODE2 is only appropriate for an SSE register, then tie with
31864 any other mode acceptable to SSE registers. */
31865 if (GET_MODE_SIZE (mode2) == 32
31866 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
31867 return (GET_MODE_SIZE (mode1) == 32
31868 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
31869 if (GET_MODE_SIZE (mode2) == 16
31870 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
31871 return (GET_MODE_SIZE (mode1) == 16
31872 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
31873
31874 /* If MODE2 is appropriate for an MMX register, then tie
31875 with any other mode acceptable to MMX registers. */
31876 if (GET_MODE_SIZE (mode2) == 8
31877 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
31878 return (GET_MODE_SIZE (mode1) == 8
31879 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
31880
31881 return false;
31882 }
31883
31884 /* Return the cost of moving between two registers of mode MODE. */
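/* For instance, on a 32-bit target (UNITS_PER_WORD == 4) a DImode
   register-to-register copy is costed as COSTS_N_INSNS (2), while a
   V8SFmode copy with AVX enabled moves in one 32-byte piece and is
   costed as COSTS_N_INSNS (1).  */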
31885
31886 static int
31887 ix86_set_reg_reg_cost (enum machine_mode mode)
31888 {
31889 unsigned int units = UNITS_PER_WORD;
31890
31891 switch (GET_MODE_CLASS (mode))
31892 {
31893 default:
31894 break;
31895
31896 case MODE_CC:
31897 units = GET_MODE_SIZE (CCmode);
31898 break;
31899
31900 case MODE_FLOAT:
31901 if ((TARGET_SSE2 && mode == TFmode)
31902 || (TARGET_80387 && mode == XFmode)
31903 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
31904 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
31905 units = GET_MODE_SIZE (mode);
31906 break;
31907
31908 case MODE_COMPLEX_FLOAT:
31909 if ((TARGET_SSE2 && mode == TCmode)
31910 || (TARGET_80387 && mode == XCmode)
31911 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
31912 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
31913 units = GET_MODE_SIZE (mode);
31914 break;
31915
31916 case MODE_VECTOR_INT:
31917 case MODE_VECTOR_FLOAT:
31918 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
31919 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
31920 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
31921 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
31922 units = GET_MODE_SIZE (mode);
31923 }
31924
31925 /* Return the cost of moving between two registers of mode MODE,
31926 assuming that the move will be in pieces of at most UNITS bytes. */
31927 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
31928 }
31929
31930 /* Compute a (partial) cost for rtx X. Return true if the complete
31931 cost has been computed, and false if subexpressions should be
31932 scanned. In either case, *TOTAL contains the cost result. */
31933
31934 static bool
31935 ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
31936 bool speed)
31937 {
31938 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
31939 enum machine_mode mode = GET_MODE (x);
31940 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
31941
31942 switch (code)
31943 {
31944 case SET:
31945 if (register_operand (SET_DEST (x), VOIDmode)
31946 && reg_or_0_operand (SET_SRC (x), VOIDmode))
31947 {
31948 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
31949 return true;
31950 }
31951 return false;
31952
31953 case CONST_INT:
31954 case CONST:
31955 case LABEL_REF:
31956 case SYMBOL_REF:
31957 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
31958 *total = 3;
31959 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
31960 *total = 2;
31961 else if (flag_pic && SYMBOLIC_CONST (x)
31962 && (!TARGET_64BIT
31963 || (GET_CODE (x) != LABEL_REF
31964 && (GET_CODE (x) != SYMBOL_REF
31965 || !SYMBOL_REF_LOCAL_P (x)))))
31966 *total = 1;
31967 else
31968 *total = 0;
31969 return true;
31970
31971 case CONST_DOUBLE:
31972 if (mode == VOIDmode)
31973 *total = 0;
31974 else
31975 switch (standard_80387_constant_p (x))
31976 {
31977 case 1: /* 0.0 */
31978 *total = 1;
31979 break;
31980 default: /* Other constants */
31981 *total = 2;
31982 break;
31983 case 0:
31984 case -1:
31985 /* Start with (MEM (SYMBOL_REF)), since that's where
31986 it'll probably end up. Add a penalty for size. */
31987 *total = (COSTS_N_INSNS (1)
31988 + (flag_pic != 0 && !TARGET_64BIT)
31989 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
31990 break;
31991 }
31992 return true;
31993
31994 case ZERO_EXTEND:
31995 /* Zero extension is often completely free on x86_64, so make
31996 it as cheap as possible. */
31997 if (TARGET_64BIT && mode == DImode
31998 && GET_MODE (XEXP (x, 0)) == SImode)
31999 *total = 1;
32000 else if (TARGET_ZERO_EXTEND_WITH_AND)
32001 *total = cost->add;
32002 else
32003 *total = cost->movzx;
32004 return false;
32005
32006 case SIGN_EXTEND:
32007 *total = cost->movsx;
32008 return false;
32009
32010 case ASHIFT:
32011 if (CONST_INT_P (XEXP (x, 1))
32012 && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
32013 {
32014 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
32015 if (value == 1)
32016 {
32017 *total = cost->add;
32018 return false;
32019 }
32020 if ((value == 2 || value == 3)
32021 && cost->lea <= cost->shift_const)
32022 {
32023 *total = cost->lea;
32024 return false;
32025 }
32026 }
32027 /* FALLTHRU */
32028
32029 case ROTATE:
32030 case ASHIFTRT:
32031 case LSHIFTRT:
32032 case ROTATERT:
32033 if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
32034 {
32035 if (CONST_INT_P (XEXP (x, 1)))
32036 {
32037 if (INTVAL (XEXP (x, 1)) > 32)
32038 *total = cost->shift_const + COSTS_N_INSNS (2);
32039 else
32040 *total = cost->shift_const * 2;
32041 }
32042 else
32043 {
32044 if (GET_CODE (XEXP (x, 1)) == AND)
32045 *total = cost->shift_var * 2;
32046 else
32047 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
32048 }
32049 }
32050 else
32051 {
32052 if (CONST_INT_P (XEXP (x, 1)))
32053 *total = cost->shift_const;
32054 else
32055 *total = cost->shift_var;
32056 }
32057 return false;
32058
32059 case FMA:
32060 {
32061 rtx sub;
32062
32063 gcc_assert (FLOAT_MODE_P (mode));
32064 gcc_assert (TARGET_FMA || TARGET_FMA4);
32065
32066 /* ??? SSE scalar/vector cost should be used here. */
32067 /* ??? Bald assumption that fma has the same cost as fmul. */
32068 *total = cost->fmul;
32069 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
32070
32071 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
32072 sub = XEXP (x, 0);
32073 if (GET_CODE (sub) == NEG)
32074 sub = XEXP (sub, 0);
32075 *total += rtx_cost (sub, FMA, 0, speed);
32076
32077 sub = XEXP (x, 2);
32078 if (GET_CODE (sub) == NEG)
32079 sub = XEXP (sub, 0);
32080 *total += rtx_cost (sub, FMA, 2, speed);
32081 return true;
32082 }
32083
32084 case MULT:
32085 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32086 {
32087 /* ??? SSE scalar cost should be used here. */
32088 *total = cost->fmul;
32089 return false;
32090 }
32091 else if (X87_FLOAT_MODE_P (mode))
32092 {
32093 *total = cost->fmul;
32094 return false;
32095 }
32096 else if (FLOAT_MODE_P (mode))
32097 {
32098 /* ??? SSE vector cost should be used here. */
32099 *total = cost->fmul;
32100 return false;
32101 }
32102 else
32103 {
32104 rtx op0 = XEXP (x, 0);
32105 rtx op1 = XEXP (x, 1);
32106 int nbits;
32107 if (CONST_INT_P (XEXP (x, 1)))
32108 {
32109 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
32110 for (nbits = 0; value != 0; value &= value - 1)
32111 nbits++;
32112 }
32113 else
32114 /* This is arbitrary. */
32115 nbits = 7;
32116
32117 /* Compute costs correctly for widening multiplication. */
32118 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
32119 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
32120 == GET_MODE_SIZE (mode))
32121 {
32122 int is_mulwiden = 0;
32123 enum machine_mode inner_mode = GET_MODE (op0);
32124
32125 if (GET_CODE (op0) == GET_CODE (op1))
32126 is_mulwiden = 1, op1 = XEXP (op1, 0);
32127 else if (CONST_INT_P (op1))
32128 {
32129 if (GET_CODE (op0) == SIGN_EXTEND)
32130 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
32131 == INTVAL (op1);
32132 else
32133 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
32134 }
32135
32136 if (is_mulwiden)
32137 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
32138 }
32139
32140 *total = (cost->mult_init[MODE_INDEX (mode)]
32141 + nbits * cost->mult_bit
32142 + rtx_cost (op0, outer_code, opno, speed)
32143 + rtx_cost (op1, outer_code, opno, speed));
32144
32145 return true;
32146 }
32147
32148 case DIV:
32149 case UDIV:
32150 case MOD:
32151 case UMOD:
32152 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32153 /* ??? SSE cost should be used here. */
32154 *total = cost->fdiv;
32155 else if (X87_FLOAT_MODE_P (mode))
32156 *total = cost->fdiv;
32157 else if (FLOAT_MODE_P (mode))
32158 /* ??? SSE vector cost should be used here. */
32159 *total = cost->fdiv;
32160 else
32161 *total = cost->divide[MODE_INDEX (mode)];
32162 return false;
32163
32164 case PLUS:
32165 if (GET_MODE_CLASS (mode) == MODE_INT
32166 && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
32167 {
32168 if (GET_CODE (XEXP (x, 0)) == PLUS
32169 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
32170 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
32171 && CONSTANT_P (XEXP (x, 1)))
32172 {
32173 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
32174 if (val == 2 || val == 4 || val == 8)
32175 {
32176 *total = cost->lea;
32177 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
32178 outer_code, opno, speed);
32179 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
32180 outer_code, opno, speed);
32181 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
32182 return true;
32183 }
32184 }
32185 else if (GET_CODE (XEXP (x, 0)) == MULT
32186 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
32187 {
32188 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
32189 if (val == 2 || val == 4 || val == 8)
32190 {
32191 *total = cost->lea;
32192 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
32193 outer_code, opno, speed);
32194 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
32195 return true;
32196 }
32197 }
32198 else if (GET_CODE (XEXP (x, 0)) == PLUS)
32199 {
32200 *total = cost->lea;
32201 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
32202 outer_code, opno, speed);
32203 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
32204 outer_code, opno, speed);
32205 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
32206 return true;
32207 }
32208 }
32209 /* FALLTHRU */
32210
32211 case MINUS:
32212 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32213 {
32214 /* ??? SSE cost should be used here. */
32215 *total = cost->fadd;
32216 return false;
32217 }
32218 else if (X87_FLOAT_MODE_P (mode))
32219 {
32220 *total = cost->fadd;
32221 return false;
32222 }
32223 else if (FLOAT_MODE_P (mode))
32224 {
32225 /* ??? SSE vector cost should be used here. */
32226 *total = cost->fadd;
32227 return false;
32228 }
32229 /* FALLTHRU */
32230
32231 case AND:
32232 case IOR:
32233 case XOR:
32234 if (!TARGET_64BIT && mode == DImode)
32235 {
32236 *total = (cost->add * 2
32237 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
32238 << (GET_MODE (XEXP (x, 0)) != DImode))
32239 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
32240 << (GET_MODE (XEXP (x, 1)) != DImode)));
32241 return true;
32242 }
32243 /* FALLTHRU */
32244
32245 case NEG:
32246 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32247 {
32248 /* ??? SSE cost should be used here. */
32249 *total = cost->fchs;
32250 return false;
32251 }
32252 else if (X87_FLOAT_MODE_P (mode))
32253 {
32254 *total = cost->fchs;
32255 return false;
32256 }
32257 else if (FLOAT_MODE_P (mode))
32258 {
32259 /* ??? SSE vector cost should be used here. */
32260 *total = cost->fchs;
32261 return false;
32262 }
32263 /* FALLTHRU */
32264
32265 case NOT:
32266 if (!TARGET_64BIT && mode == DImode)
32267 *total = cost->add * 2;
32268 else
32269 *total = cost->add;
32270 return false;
32271
32272 case COMPARE:
32273 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
32274 && XEXP (XEXP (x, 0), 1) == const1_rtx
32275 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
32276 && XEXP (x, 1) == const0_rtx)
32277 {
32278 /* This kind of construct is implemented using test[bwl].
32279 Treat it as if we had an AND. */
32280 *total = (cost->add
32281 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
32282 + rtx_cost (const1_rtx, outer_code, opno, speed));
32283 return true;
32284 }
32285 return false;
32286
32287 case FLOAT_EXTEND:
32288 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
32289 *total = 0;
32290 return false;
32291
32292 case ABS:
32293 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32294 /* ??? SSE cost should be used here. */
32295 *total = cost->fabs;
32296 else if (X87_FLOAT_MODE_P (mode))
32297 *total = cost->fabs;
32298 else if (FLOAT_MODE_P (mode))
32299 /* ??? SSE vector cost should be used here. */
32300 *total = cost->fabs;
32301 return false;
32302
32303 case SQRT:
32304 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32305 /* ??? SSE cost should be used here. */
32306 *total = cost->fsqrt;
32307 else if (X87_FLOAT_MODE_P (mode))
32308 *total = cost->fsqrt;
32309 else if (FLOAT_MODE_P (mode))
32310 /* ??? SSE vector cost should be used here. */
32311 *total = cost->fsqrt;
32312 return false;
32313
32314 case UNSPEC:
32315 if (XINT (x, 1) == UNSPEC_TP)
32316 *total = 0;
32317 return false;
32318
32319 case VEC_SELECT:
32320 case VEC_CONCAT:
32321 case VEC_MERGE:
32322 case VEC_DUPLICATE:
32323 /* ??? Assume all of these vector manipulation patterns are
32324 recognizable, in which case they all pretty much have the
32325 same cost. */
32326 *total = COSTS_N_INSNS (1);
32327 return true;
32328
32329 default:
32330 return false;
32331 }
32332 }
32333
32334 #if TARGET_MACHO
32335
32336 static int current_machopic_label_num;
32337
32338 /* Given a symbol name and its associated stub, write out the
32339 definition of the stub. */
32340
32341 void
32342 machopic_output_stub (FILE *file, const char *symb, const char *stub)
32343 {
32344 unsigned int length;
32345 char *binder_name, *symbol_name, lazy_ptr_name[32];
32346 int label = ++current_machopic_label_num;
32347
32348 /* For 64-bit we shouldn't get here. */
32349 gcc_assert (!TARGET_64BIT);
32350
32351 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
32352 symb = targetm.strip_name_encoding (symb);
32353
32354 length = strlen (stub);
32355 binder_name = XALLOCAVEC (char, length + 32);
32356 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
32357
32358 length = strlen (symb);
32359 symbol_name = XALLOCAVEC (char, length + 32);
32360 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
32361
32362 sprintf (lazy_ptr_name, "L%d$lz", label);
32363
32364 if (MACHOPIC_ATT_STUB)
32365 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
32366 else if (MACHOPIC_PURE)
32367 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
32368 else
32369 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
32370
32371 fprintf (file, "%s:\n", stub);
32372 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
32373
32374 if (MACHOPIC_ATT_STUB)
32375 {
32376 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
32377 }
32378 else if (MACHOPIC_PURE)
32379 {
32380 /* PIC stub. */
32381 /* 25-byte PIC stub using "CALL get_pc_thunk". */
32382 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
32383 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
32384 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
32385 label, lazy_ptr_name, label);
32386 fprintf (file, "\tjmp\t*%%ecx\n");
32387 }
32388 else
32389 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
32390
32391 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
32392 it needs no stub-binding-helper. */
32393 if (MACHOPIC_ATT_STUB)
32394 return;
32395
32396 fprintf (file, "%s:\n", binder_name);
32397
32398 if (MACHOPIC_PURE)
32399 {
32400 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
32401 fprintf (file, "\tpushl\t%%ecx\n");
32402 }
32403 else
32404 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
32405
32406 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
32407
32408 /* N.B. Keep the correspondence of these
32409 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
32410 old-pic/new-pic/non-pic stubs; altering this will break
32411 compatibility with existing dylibs. */
32412 if (MACHOPIC_PURE)
32413 {
32414 /* 25-byte PIC stub using "CALL get_pc_thunk". */
32415 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
32416 }
32417 else
32418 /* 16-byte -mdynamic-no-pic stub. */
32419 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr3_section]);
32420
32421 fprintf (file, "%s:\n", lazy_ptr_name);
32422 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
32423 fprintf (file, ASM_LONG "%s\n", binder_name);
32424 }
32425 #endif /* TARGET_MACHO */
32426
32427 /* Order the registers for register allocator. */
32428
32429 void
32430 x86_order_regs_for_local_alloc (void)
32431 {
32432 int pos = 0;
32433 int i;
32434
32435 /* First allocate the local general purpose registers. */
32436 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
32437 if (GENERAL_REGNO_P (i) && call_used_regs[i])
32438 reg_alloc_order [pos++] = i;
32439
32440 /* Global general purpose registers. */
32441 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
32442 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
32443 reg_alloc_order [pos++] = i;
32444
32445 /* x87 registers come first in case we are doing FP math
32446 using them. */
32447 if (!TARGET_SSE_MATH)
32448 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
32449 reg_alloc_order [pos++] = i;
32450
32451 /* SSE registers. */
32452 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
32453 reg_alloc_order [pos++] = i;
32454 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
32455 reg_alloc_order [pos++] = i;
32456
32457 /* x87 registers. */
32458 if (TARGET_SSE_MATH)
32459 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
32460 reg_alloc_order [pos++] = i;
32461
32462 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
32463 reg_alloc_order [pos++] = i;
32464
32465 /* Initialize the rest of the array, as we do not allocate some registers
32466 at all. */
32467 while (pos < FIRST_PSEUDO_REGISTER)
32468 reg_alloc_order [pos++] = 0;
32469 }
32470
32471 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
32472 in struct attribute_spec handler. */
32473 static tree
32474 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
32475 tree args,
32476 int flags ATTRIBUTE_UNUSED,
32477 bool *no_add_attrs)
32478 {
32479 if (TREE_CODE (*node) != FUNCTION_TYPE
32480 && TREE_CODE (*node) != METHOD_TYPE
32481 && TREE_CODE (*node) != FIELD_DECL
32482 && TREE_CODE (*node) != TYPE_DECL)
32483 {
32484 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32485 name);
32486 *no_add_attrs = true;
32487 return NULL_TREE;
32488 }
32489 if (TARGET_64BIT)
32490 {
32491 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
32492 name);
32493 *no_add_attrs = true;
32494 return NULL_TREE;
32495 }
32496 if (is_attribute_p ("callee_pop_aggregate_return", name))
32497 {
32498 tree cst;
32499
32500 cst = TREE_VALUE (args);
32501 if (TREE_CODE (cst) != INTEGER_CST)
32502 {
32503 warning (OPT_Wattributes,
32504 "%qE attribute requires an integer constant argument",
32505 name);
32506 *no_add_attrs = true;
32507 }
32508 else if (compare_tree_int (cst, 0) != 0
32509 && compare_tree_int (cst, 1) != 0)
32510 {
32511 warning (OPT_Wattributes,
32512 "argument to %qE attribute is neither zero, nor one",
32513 name);
32514 *no_add_attrs = true;
32515 }
32516
32517 return NULL_TREE;
32518 }
32519
32520 return NULL_TREE;
32521 }
32522
32523 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
32524 struct attribute_spec.handler. */
32525 static tree
32526 ix86_handle_abi_attribute (tree *node, tree name,
32527 tree args ATTRIBUTE_UNUSED,
32528 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32529 {
32530 if (TREE_CODE (*node) != FUNCTION_TYPE
32531 && TREE_CODE (*node) != METHOD_TYPE
32532 && TREE_CODE (*node) != FIELD_DECL
32533 && TREE_CODE (*node) != TYPE_DECL)
32534 {
32535 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32536 name);
32537 *no_add_attrs = true;
32538 return NULL_TREE;
32539 }
32540
32541 /* The ms_abi and sysv_abi attributes are mutually exclusive. */
32542 if (is_attribute_p ("ms_abi", name))
32543 {
32544 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
32545 {
32546 error ("ms_abi and sysv_abi attributes are not compatible");
32547 }
32548
32549 return NULL_TREE;
32550 }
32551 else if (is_attribute_p ("sysv_abi", name))
32552 {
32553 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
32554 {
32555 error ("ms_abi and sysv_abi attributes are not compatible");
32556 }
32557
32558 return NULL_TREE;
32559 }
32560
32561 return NULL_TREE;
32562 }
32563
32564 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
32565 struct attribute_spec.handler. */
32566 static tree
32567 ix86_handle_struct_attribute (tree *node, tree name,
32568 tree args ATTRIBUTE_UNUSED,
32569 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32570 {
32571 tree *type = NULL;
32572 if (DECL_P (*node))
32573 {
32574 if (TREE_CODE (*node) == TYPE_DECL)
32575 type = &TREE_TYPE (*node);
32576 }
32577 else
32578 type = node;
32579
32580 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
32581 {
32582 warning (OPT_Wattributes, "%qE attribute ignored",
32583 name);
32584 *no_add_attrs = true;
32585 }
32586
32587 else if ((is_attribute_p ("ms_struct", name)
32588 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
32589 || ((is_attribute_p ("gcc_struct", name)
32590 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
32591 {
32592 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
32593 name);
32594 *no_add_attrs = true;
32595 }
32596
32597 return NULL_TREE;
32598 }
32599
32600 static tree
32601 ix86_handle_fndecl_attribute (tree *node, tree name,
32602 tree args ATTRIBUTE_UNUSED,
32603 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32604 {
32605 if (TREE_CODE (*node) != FUNCTION_DECL)
32606 {
32607 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32608 name);
32609 *no_add_attrs = true;
32610 }
32611 return NULL_TREE;
32612 }
32613
32614 static bool
32615 ix86_ms_bitfield_layout_p (const_tree record_type)
32616 {
32617 return ((TARGET_MS_BITFIELD_LAYOUT
32618 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
32619 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
32620 }
32621
32622 /* Returns an expression indicating where the this parameter is
32623 located on entry to the FUNCTION. */
32624
32625 static rtx
32626 x86_this_parameter (tree function)
32627 {
32628 tree type = TREE_TYPE (function);
32629 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
32630 int nregs;
32631
32632 if (TARGET_64BIT)
32633 {
32634 const int *parm_regs;
32635
32636 if (ix86_function_type_abi (type) == MS_ABI)
32637 parm_regs = x86_64_ms_abi_int_parameter_registers;
32638 else
32639 parm_regs = x86_64_int_parameter_registers;
32640 return gen_rtx_REG (Pmode, parm_regs[aggr]);
32641 }
32642
32643 nregs = ix86_function_regparm (type, function);
32644
32645 if (nregs > 0 && !stdarg_p (type))
32646 {
32647 int regno;
32648 unsigned int ccvt = ix86_get_callcvt (type);
32649
32650 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
32651 regno = aggr ? DX_REG : CX_REG;
32652 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
32653 {
32654 regno = CX_REG;
32655 if (aggr)
32656 return gen_rtx_MEM (SImode,
32657 plus_constant (Pmode, stack_pointer_rtx, 4));
32658 }
32659 else
32660 {
32661 regno = AX_REG;
32662 if (aggr)
32663 {
32664 regno = DX_REG;
32665 if (nregs == 1)
32666 return gen_rtx_MEM (SImode,
32667 plus_constant (Pmode,
32668 stack_pointer_rtx, 4));
32669 }
32670 }
32671 return gen_rtx_REG (SImode, regno);
32672 }
32673
32674 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
32675 aggr ? 8 : 4));
32676 }
32677
32678 /* Determine whether x86_output_mi_thunk can succeed. */
32679
32680 static bool
32681 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
32682 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
32683 HOST_WIDE_INT vcall_offset, const_tree function)
32684 {
32685 /* 64-bit can handle anything. */
32686 if (TARGET_64BIT)
32687 return true;
32688
32689 /* For 32-bit, everything's fine if we have one free register. */
32690 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
32691 return true;
32692
32693 /* Need a free register for vcall_offset. */
32694 if (vcall_offset)
32695 return false;
32696
32697 /* Need a free register for GOT references. */
32698 if (flag_pic && !targetm.binds_local_p (function))
32699 return false;
32700
32701 /* Otherwise ok. */
32702 return true;
32703 }
32704
32705 /* Output the assembler code for a thunk function. THUNK_DECL is the
32706 declaration for the thunk function itself, FUNCTION is the decl for
32707 the target function. DELTA is an immediate constant offset to be
32708 added to THIS. If VCALL_OFFSET is nonzero, the word at
32709 *(*this + vcall_offset) should be added to THIS. */
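/* In pseudo-code the emitted thunk is roughly:
     this += DELTA;
     if (VCALL_OFFSET)
       this += *((*this) + VCALL_OFFSET);
     goto FUNCTION;
   with the adjustments done in whatever register or stack slot holds
   the this pointer for the target ABI.  */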
32710
32711 static void
32712 x86_output_mi_thunk (FILE *file,
32713 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
32714 HOST_WIDE_INT vcall_offset, tree function)
32715 {
32716 rtx this_param = x86_this_parameter (function);
32717 rtx this_reg, tmp, fnaddr;
32718
32719 emit_note (NOTE_INSN_PROLOGUE_END);
32720
32721 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
32722 pull it in now and let DELTA benefit. */
32723 if (REG_P (this_param))
32724 this_reg = this_param;
32725 else if (vcall_offset)
32726 {
32727 /* Put the this parameter into %eax. */
32728 this_reg = gen_rtx_REG (Pmode, AX_REG);
32729 emit_move_insn (this_reg, this_param);
32730 }
32731 else
32732 this_reg = NULL_RTX;
32733
32734 /* Adjust the this parameter by a fixed constant. */
32735 if (delta)
32736 {
32737 rtx delta_rtx = GEN_INT (delta);
32738 rtx delta_dst = this_reg ? this_reg : this_param;
32739
32740 if (TARGET_64BIT)
32741 {
32742 if (!x86_64_general_operand (delta_rtx, Pmode))
32743 {
32744 tmp = gen_rtx_REG (Pmode, R10_REG);
32745 emit_move_insn (tmp, delta_rtx);
32746 delta_rtx = tmp;
32747 }
32748 }
32749
32750 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
32751 }
32752
32753 /* Adjust the this parameter by a value stored in the vtable. */
32754 if (vcall_offset)
32755 {
32756 rtx vcall_addr, vcall_mem, this_mem;
32757 unsigned int tmp_regno;
32758
32759 if (TARGET_64BIT)
32760 tmp_regno = R10_REG;
32761 else
32762 {
32763 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
32764 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
32765 tmp_regno = AX_REG;
32766 else
32767 tmp_regno = CX_REG;
32768 }
32769 tmp = gen_rtx_REG (Pmode, tmp_regno);
32770
32771 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
32772 if (Pmode != ptr_mode)
32773 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
32774 emit_move_insn (tmp, this_mem);
32775
32776 /* Adjust the this parameter. */
32777 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
32778 if (TARGET_64BIT
32779 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
32780 {
32781 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
32782 emit_move_insn (tmp2, GEN_INT (vcall_offset));
32783 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
32784 }
32785
32786 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
32787 if (Pmode != ptr_mode)
32788 emit_insn (gen_addsi_1_zext (this_reg,
32789 gen_rtx_REG (ptr_mode,
32790 REGNO (this_reg)),
32791 vcall_mem));
32792 else
32793 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
32794 }
32795
32796 /* If necessary, drop THIS back to its stack slot. */
32797 if (this_reg && this_reg != this_param)
32798 emit_move_insn (this_param, this_reg);
32799
32800 fnaddr = XEXP (DECL_RTL (function), 0);
32801 if (TARGET_64BIT)
32802 {
32803 if (!flag_pic || targetm.binds_local_p (function)
32804 || cfun->machine->call_abi == MS_ABI)
32805 ;
32806 else
32807 {
32808 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
32809 tmp = gen_rtx_CONST (Pmode, tmp);
32810 fnaddr = gen_rtx_MEM (Pmode, tmp);
32811 }
32812 }
32813 else
32814 {
32815 if (!flag_pic || targetm.binds_local_p (function))
32816 ;
32817 #if TARGET_MACHO
32818 else if (TARGET_MACHO)
32819 {
32820 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
32821 fnaddr = XEXP (fnaddr, 0);
32822 }
32823 #endif /* TARGET_MACHO */
32824 else
32825 {
32826 tmp = gen_rtx_REG (Pmode, CX_REG);
32827 output_set_got (tmp, NULL_RTX);
32828
32829 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
32830 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
32831 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
32832 }
32833 }
32834
32835 /* Our sibling call patterns do not allow memories, because we have no
32836 predicate that can distinguish between frame and non-frame memory.
32837 For our purposes here, we can get away with (ab)using a jump pattern,
32838 because we're going to do no optimization. */
32839 if (MEM_P (fnaddr))
32840 emit_jump_insn (gen_indirect_jump (fnaddr));
32841 else
32842 {
32843 tmp = gen_rtx_MEM (QImode, fnaddr);
32844 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
32845 tmp = emit_call_insn (tmp);
32846 SIBLING_CALL_P (tmp) = 1;
32847 }
32848 emit_barrier ();
32849
32850 /* Emit just enough of rest_of_compilation to get the insns emitted.
32851 Note that use_thunk calls assemble_start_function et al. */
32852 tmp = get_insns ();
32853 insn_locators_alloc ();
32854 shorten_branches (tmp);
32855 final_start_function (tmp, file, 1);
32856 final (tmp, file, 1);
32857 final_end_function ();
32858 }
32859
32860 static void
32861 x86_file_start (void)
32862 {
32863 default_file_start ();
32864 #if TARGET_MACHO
32865 darwin_file_start ();
32866 #endif
32867 if (X86_FILE_START_VERSION_DIRECTIVE)
32868 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
32869 if (X86_FILE_START_FLTUSED)
32870 fputs ("\t.global\t__fltused\n", asm_out_file);
32871 if (ix86_asm_dialect == ASM_INTEL)
32872 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
32873 }
32874
32875 int
32876 x86_field_alignment (tree field, int computed)
32877 {
32878 enum machine_mode mode;
32879 tree type = TREE_TYPE (field);
32880
32881 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
32882 return computed;
32883 mode = TYPE_MODE (strip_array_types (type));
32884 if (mode == DFmode || mode == DCmode
32885 || GET_MODE_CLASS (mode) == MODE_INT
32886 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
32887 return MIN (32, computed);
32888 return computed;
32889 }
32890
32891 /* Output assembler code to FILE to increment profiler label # LABELNO
32892 for profiling a function entry. */
32893 void
32894 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
32895 {
32896 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
32897 : MCOUNT_NAME);
32898
32899 if (TARGET_64BIT)
32900 {
32901 #ifndef NO_PROFILE_COUNTERS
32902 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
32903 #endif
32904
32905 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
32906 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
32907 else
32908 fprintf (file, "\tcall\t%s\n", mcount_name);
32909 }
32910 else if (flag_pic)
32911 {
32912 #ifndef NO_PROFILE_COUNTERS
32913 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
32914 LPREFIX, labelno);
32915 #endif
32916 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
32917 }
32918 else
32919 {
32920 #ifndef NO_PROFILE_COUNTERS
32921 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
32922 LPREFIX, labelno);
32923 #endif
32924 fprintf (file, "\tcall\t%s\n", mcount_name);
32925 }
32926 }
32927
32928 /* We don't have exact information about the insn sizes, but we may safely
32929 assume that we are informed about all 1-byte insns and memory
32930 address sizes. This is enough to eliminate unnecessary padding in
32931 99% of cases. */
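/* For example, a non-sibling call through a symbol is always counted as
   5 bytes below; a few insn types whose length attribute may be
   unreliable (TYPE_OTHER, TYPE_FCMP) are estimated from their address
   length instead, and inline asm statements are ignored.  */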
32932
32933 static int
32934 min_insn_size (rtx insn)
32935 {
32936 int l = 0, len;
32937
32938 if (!INSN_P (insn) || !active_insn_p (insn))
32939 return 0;
32940
32941 /* Discard alignments we've emitted, and jump table data. */
32942 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
32943 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
32944 return 0;
32945 if (JUMP_TABLE_DATA_P (insn))
32946 return 0;
32947
32948 /* Important case: calls are always 5 bytes.
32949 It is common to have many calls in a row. */
32950 if (CALL_P (insn)
32951 && symbolic_reference_mentioned_p (PATTERN (insn))
32952 && !SIBLING_CALL_P (insn))
32953 return 5;
32954 len = get_attr_length (insn);
32955 if (len <= 1)
32956 return 1;
32957
32958 /* For normal instructions we rely on get_attr_length being exact,
32959 with a few exceptions. */
32960 if (!JUMP_P (insn))
32961 {
32962 enum attr_type type = get_attr_type (insn);
32963
32964 switch (type)
32965 {
32966 case TYPE_MULTI:
32967 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
32968 || asm_noperands (PATTERN (insn)) >= 0)
32969 return 0;
32970 break;
32971 case TYPE_OTHER:
32972 case TYPE_FCMP:
32973 break;
32974 default:
32975 /* Otherwise trust get_attr_length. */
32976 return len;
32977 }
32978
32979 l = get_attr_length_address (insn);
32980 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
32981 l = 4;
32982 }
32983 if (l)
32984 return 1+l;
32985 else
32986 return 2;
32987 }
32988
32989 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32990
32991 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
32992 window. */
32993
32994 static void
32995 ix86_avoid_jump_mispredicts (void)
32996 {
32997 rtx insn, start = get_insns ();
32998 int nbytes = 0, njumps = 0;
32999 int isjump = 0;
33000
33001 /* Look for all minimal intervals of instructions containing 4 jumps.
33002 The intervals are bounded by START and INSN.  NBYTES is the total
33003 size of the instructions in the interval, including INSN but not
33004 including START.  When NBYTES is smaller than 16 bytes, it is possible
33005 that the ends of START and INSN fall within the same 16-byte window.
33006
33007 The smallest offset in the window at which INSN can start is when START
33008 ends at offset 0.  The offset of INSN is then NBYTES - sizeof (INSN).
33009 We add a p2align to the 16-byte window with maxskip
33010 15 - NBYTES + sizeof (INSN).  */
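/* A worked example of the formula above (illustrative numbers only):
with NBYTES = 12 and a 2-byte INSN, the padding requested later in
this loop is 15 - 12 + 2 = 5 bytes. */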
33011 for (insn = start; insn; insn = NEXT_INSN (insn))
33012 {
33013 int min_size;
33014
33015 if (LABEL_P (insn))
33016 {
33017 int align = label_to_alignment (insn);
33018 int max_skip = label_to_max_skip (insn);
33019
33020 if (max_skip > 15)
33021 max_skip = 15;
33022 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
33023 already in the current 16 byte page, because otherwise
33024 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
33025 bytes to reach 16 byte boundary. */
33026 if (align <= 0
33027 || (align <= 3 && max_skip != (1 << align) - 1))
33028 max_skip = 0;
33029 if (dump_file)
33030 fprintf (dump_file, "Label %i with max_skip %i\n",
33031 INSN_UID (insn), max_skip);
33032 if (max_skip)
33033 {
33034 while (nbytes + max_skip >= 16)
33035 {
33036 start = NEXT_INSN (start);
33037 if ((JUMP_P (start)
33038 && GET_CODE (PATTERN (start)) != ADDR_VEC
33039 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
33040 || CALL_P (start))
33041 njumps--, isjump = 1;
33042 else
33043 isjump = 0;
33044 nbytes -= min_insn_size (start);
33045 }
33046 }
33047 continue;
33048 }
33049
33050 min_size = min_insn_size (insn);
33051 nbytes += min_size;
33052 if (dump_file)
33053 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
33054 INSN_UID (insn), min_size);
33055 if ((JUMP_P (insn)
33056 && GET_CODE (PATTERN (insn)) != ADDR_VEC
33057 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
33058 || CALL_P (insn))
33059 njumps++;
33060 else
33061 continue;
33062
33063 while (njumps > 3)
33064 {
33065 start = NEXT_INSN (start);
33066 if ((JUMP_P (start)
33067 && GET_CODE (PATTERN (start)) != ADDR_VEC
33068 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
33069 || CALL_P (start))
33070 njumps--, isjump = 1;
33071 else
33072 isjump = 0;
33073 nbytes -= min_insn_size (start);
33074 }
33075 gcc_assert (njumps >= 0);
33076 if (dump_file)
33077 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
33078 INSN_UID (start), INSN_UID (insn), nbytes);
33079
33080 if (njumps == 3 && isjump && nbytes < 16)
33081 {
33082 int padsize = 15 - nbytes + min_insn_size (insn);
33083
33084 if (dump_file)
33085 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
33086 INSN_UID (insn), padsize);
33087 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
33088 }
33089 }
33090 }
33091 #endif
33092
33093 /* AMD Athlon works faster
33094 when RET is not the destination of a conditional jump and is not directly
33095 preceded by another jump instruction.  We avoid the penalty by inserting a
33096 NOP just before such RET instructions.  */
33097 static void
33098 ix86_pad_returns (void)
33099 {
33100 edge e;
33101 edge_iterator ei;
33102
33103 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
33104 {
33105 basic_block bb = e->src;
33106 rtx ret = BB_END (bb);
33107 rtx prev;
33108 bool replace = false;
33109
33110 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
33111 || optimize_bb_for_size_p (bb))
33112 continue;
33113 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
33114 if (active_insn_p (prev) || LABEL_P (prev))
33115 break;
33116 if (prev && LABEL_P (prev))
33117 {
33118 edge e;
33119 edge_iterator ei;
33120
33121 FOR_EACH_EDGE (e, ei, bb->preds)
33122 if (EDGE_FREQUENCY (e) && e->src->index >= 0
33123 && !(e->flags & EDGE_FALLTHRU))
33124 replace = true;
33125 }
33126 if (!replace)
33127 {
33128 prev = prev_active_insn (ret);
33129 if (prev
33130 && ((JUMP_P (prev) && any_condjump_p (prev))
33131 || CALL_P (prev)))
33132 replace = true;
33133 /* Empty functions get a branch mispredict even when
33134 the jump destination is not visible to us. */
33135 if (!prev && !optimize_function_for_size_p (cfun))
33136 replace = true;
33137 }
33138 if (replace)
33139 {
33140 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
33141 delete_insn (ret);
33142 }
33143 }
33144 }
33145
33146 /* Count the minimum number of instructions in BB. Return 4 if the
33147 number of instructions >= 4. */
33148
33149 static int
33150 ix86_count_insn_bb (basic_block bb)
33151 {
33152 rtx insn;
33153 int insn_count = 0;
33154
33155 /* Count number of instructions in this block. Return 4 if the number
33156 of instructions >= 4. */
33157 FOR_BB_INSNS (bb, insn)
33158 {
33159 /* Only happens in exit blocks. */
33160 if (JUMP_P (insn)
33161 && ANY_RETURN_P (PATTERN (insn)))
33162 break;
33163
33164 if (NONDEBUG_INSN_P (insn)
33165 && GET_CODE (PATTERN (insn)) != USE
33166 && GET_CODE (PATTERN (insn)) != CLOBBER)
33167 {
33168 insn_count++;
33169 if (insn_count >= 4)
33170 return insn_count;
33171 }
33172 }
33173
33174 return insn_count;
33175 }
33176
33177
33178 /* Count the minimum number of instructions in a code path through BB.
33179 Return 4 if the number of instructions >= 4. */
33180
33181 static int
33182 ix86_count_insn (basic_block bb)
33183 {
33184 edge e;
33185 edge_iterator ei;
33186 int min_prev_count;
33187
33188 /* Only bother counting instructions along paths with no
33189 more than 2 basic blocks between entry and exit. Given
33190 that BB has an edge to exit, determine if a predecessor
33191 of BB has an edge from entry. If so, compute the number
33192 of instructions in the predecessor block. If there
33193 happen to be multiple such blocks, compute the minimum. */
33194 min_prev_count = 4;
33195 FOR_EACH_EDGE (e, ei, bb->preds)
33196 {
33197 edge prev_e;
33198 edge_iterator prev_ei;
33199
33200 if (e->src == ENTRY_BLOCK_PTR)
33201 {
33202 min_prev_count = 0;
33203 break;
33204 }
33205 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
33206 {
33207 if (prev_e->src == ENTRY_BLOCK_PTR)
33208 {
33209 int count = ix86_count_insn_bb (e->src);
33210 if (count < min_prev_count)
33211 min_prev_count = count;
33212 break;
33213 }
33214 }
33215 }
33216
33217 if (min_prev_count < 4)
33218 min_prev_count += ix86_count_insn_bb (bb);
33219
33220 return min_prev_count;
33221 }
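
/* For example (illustrative), for a path ENTRY -> PRED -> BB -> EXIT this
   returns insns (PRED) + insns (BB), where each per-block count is capped
   at 4 by ix86_count_insn_bb; if BB itself has an edge from ENTRY, only
   insns (BB) is counted.  */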
33222
33223 /* Pad short function to 4 instructions. */
33224
33225 static void
33226 ix86_pad_short_function (void)
33227 {
33228 edge e;
33229 edge_iterator ei;
33230
33231 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
33232 {
33233 rtx ret = BB_END (e->src);
33234 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
33235 {
33236 int insn_count = ix86_count_insn (e->src);
33237
33238 /* Pad short function. */
33239 if (insn_count < 4)
33240 {
33241 rtx insn = ret;
33242
33243 /* Find epilogue. */
33244 while (insn
33245 && (!NOTE_P (insn)
33246 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
33247 insn = PREV_INSN (insn);
33248
33249 if (!insn)
33250 insn = ret;
33251
33252 /* Two NOPs count as one instruction. */
33253 insn_count = 2 * (4 - insn_count);
33254 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
33255 }
33256 }
33257 }
33258 }
33259
33260 /* Implement machine specific optimizations.  We implement padding of returns
33261 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
33262 static void
33263 ix86_reorg (void)
33264 {
33265 /* We are freeing block_for_insn in the toplev to keep compatibility
33266 with old MDEP_REORGS that are not CFG based. Recompute it now. */
33267 compute_bb_for_insn ();
33268
33269 /* Run the vzeroupper optimization if needed. */
33270 if (TARGET_VZEROUPPER)
33271 move_or_delete_vzeroupper ();
33272
33273 if (optimize && optimize_function_for_speed_p (cfun))
33274 {
33275 if (TARGET_PAD_SHORT_FUNCTION)
33276 ix86_pad_short_function ();
33277 else if (TARGET_PAD_RETURNS)
33278 ix86_pad_returns ();
33279 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
33280 if (TARGET_FOUR_JUMP_LIMIT)
33281 ix86_avoid_jump_mispredicts ();
33282 #endif
33283 }
33284 }
33285
33286 /* Return nonzero when a QImode register that must be represented via a REX
33287 prefix is used. */
33288 bool
33289 x86_extended_QIreg_mentioned_p (rtx insn)
33290 {
33291 int i;
33292 extract_insn_cached (insn);
33293 for (i = 0; i < recog_data.n_operands; i++)
33294 if (REG_P (recog_data.operand[i])
33295 && REGNO (recog_data.operand[i]) > BX_REG)
33296 return true;
33297 return false;
33298 }
33299
33300 /* Return nonzero when P points to a register encoded via a REX prefix.
33301 Called via for_each_rtx. */
33302 static int
33303 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
33304 {
33305 unsigned int regno;
33306 if (!REG_P (*p))
33307 return 0;
33308 regno = REGNO (*p);
33309 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
33310 }
33311
33312 /* Return true when INSN mentions a register that must be encoded using a
33313 REX prefix. */
33314 bool
33315 x86_extended_reg_mentioned_p (rtx insn)
33316 {
33317 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
33318 extended_reg_mentioned_1, NULL);
33319 }
33320
33321 /* If profitable, negate (without causing overflow) integer constant
33322 of mode MODE at location LOC. Return true in this case. */
33323 bool
33324 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
33325 {
33326 HOST_WIDE_INT val;
33327
33328 if (!CONST_INT_P (*loc))
33329 return false;
33330
33331 switch (mode)
33332 {
33333 case DImode:
33334 /* DImode x86_64 constants must fit in 32 bits. */
33335 gcc_assert (x86_64_immediate_operand (*loc, mode));
33336
33337 mode = SImode;
33338 break;
33339
33340 case SImode:
33341 case HImode:
33342 case QImode:
33343 break;
33344
33345 default:
33346 gcc_unreachable ();
33347 }
33348
33349 /* Avoid overflows. */
33350 if (mode_signbit_p (mode, *loc))
33351 return false;
33352
33353 val = INTVAL (*loc);
33354
33355 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
33356 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
33357 if ((val < 0 && val != -128)
33358 || val == 128)
33359 {
33360 *loc = GEN_INT (-val);
33361 return true;
33362 }
33363
33364 return false;
33365 }
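
/* Examples of the rule above (illustrative; the negated constant is meant
   to be paired with the opposite add/sub form, as the comment above notes):

     addl $-4,%eax    ->  subl $4,%eax      negative and not -128: negate
     subl $128,%eax   ->  addl $-128,%eax   128 has no imm8 form, -128 does
     addl $-128,%eax  ->  unchanged         -128 already uses the imm8 form  */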
33366
33367 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
33368 optabs would emit if we didn't have TFmode patterns. */
33369
33370 void
33371 x86_emit_floatuns (rtx operands[2])
33372 {
33373 rtx neglab, donelab, i0, i1, f0, in, out;
33374 enum machine_mode mode, inmode;
33375
33376 inmode = GET_MODE (operands[1]);
33377 gcc_assert (inmode == SImode || inmode == DImode);
33378
33379 out = operands[0];
33380 in = force_reg (inmode, operands[1]);
33381 mode = GET_MODE (out);
33382 neglab = gen_label_rtx ();
33383 donelab = gen_label_rtx ();
33384 f0 = gen_reg_rtx (mode);
33385
33386 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
33387
33388 expand_float (out, in, 0);
33389
33390 emit_jump_insn (gen_jump (donelab));
33391 emit_barrier ();
33392
33393 emit_label (neglab);
33394
33395 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
33396 1, OPTAB_DIRECT);
33397 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
33398 1, OPTAB_DIRECT);
33399 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
33400
33401 expand_float (f0, i0, 0);
33402
33403 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
33404
33405 emit_label (donelab);
33406 }
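
/* For reference, a scalar sketch of the sequence emitted above, assuming a
   64-bit unsigned input converted to double (the helper name is purely
   illustrative).  The shift-and-or step keeps the discarded low bit so
   that halving does not change the correctly rounded result:

     double
     emulate_floatunsdidf (unsigned long long x)
     {
       if ((long long) x >= 0)
         return (double) (long long) x;
       else
         {
           unsigned long long half = (x >> 1) | (x & 1);
           double d = (double) (long long) half;
           return d + d;
         }
     }
*/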
33407 \f
33408 /* AVX2 does support 32-byte integer vector operations,
33409 thus the longest vector we are faced with is V32QImode. */
33410 #define MAX_VECT_LEN 32
33411
33412 struct expand_vec_perm_d
33413 {
33414 rtx target, op0, op1;
33415 unsigned char perm[MAX_VECT_LEN];
33416 enum machine_mode vmode;
33417 unsigned char nelt;
33418 bool one_operand_p;
33419 bool testing_p;
33420 };
33421
33422 static bool canonicalize_perm (struct expand_vec_perm_d *d);
33423 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
33424 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
33425
33426 /* Get a vector mode of the same size as the original but with elements
33427 twice as wide. This is only guaranteed to apply to integral vectors. */
33428
33429 static inline enum machine_mode
33430 get_mode_wider_vector (enum machine_mode o)
33431 {
33432 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
33433 enum machine_mode n = GET_MODE_WIDER_MODE (o);
33434 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
33435 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
33436 return n;
33437 }
33438
33439 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33440 with all elements equal to VAR. Return true if successful. */
33441
33442 static bool
33443 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
33444 rtx target, rtx val)
33445 {
33446 bool ok;
33447
33448 switch (mode)
33449 {
33450 case V2SImode:
33451 case V2SFmode:
33452 if (!mmx_ok)
33453 return false;
33454 /* FALLTHRU */
33455
33456 case V4DFmode:
33457 case V4DImode:
33458 case V8SFmode:
33459 case V8SImode:
33460 case V2DFmode:
33461 case V2DImode:
33462 case V4SFmode:
33463 case V4SImode:
33464 {
33465 rtx insn, dup;
33466
33467 /* First attempt to recognize VAL as-is. */
33468 dup = gen_rtx_VEC_DUPLICATE (mode, val);
33469 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
33470 if (recog_memoized (insn) < 0)
33471 {
33472 rtx seq;
33473 /* If that fails, force VAL into a register. */
33474
33475 start_sequence ();
33476 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
33477 seq = get_insns ();
33478 end_sequence ();
33479 if (seq)
33480 emit_insn_before (seq, insn);
33481
33482 ok = recog_memoized (insn) >= 0;
33483 gcc_assert (ok);
33484 }
33485 }
33486 return true;
33487
33488 case V4HImode:
33489 if (!mmx_ok)
33490 return false;
33491 if (TARGET_SSE || TARGET_3DNOW_A)
33492 {
33493 rtx x;
33494
33495 val = gen_lowpart (SImode, val);
33496 x = gen_rtx_TRUNCATE (HImode, val);
33497 x = gen_rtx_VEC_DUPLICATE (mode, x);
33498 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33499 return true;
33500 }
33501 goto widen;
33502
33503 case V8QImode:
33504 if (!mmx_ok)
33505 return false;
33506 goto widen;
33507
33508 case V8HImode:
33509 if (TARGET_SSE2)
33510 {
33511 struct expand_vec_perm_d dperm;
33512 rtx tmp1, tmp2;
33513
33514 permute:
33515 memset (&dperm, 0, sizeof (dperm));
33516 dperm.target = target;
33517 dperm.vmode = mode;
33518 dperm.nelt = GET_MODE_NUNITS (mode);
33519 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
33520 dperm.one_operand_p = true;
33521
33522 /* Extend to SImode using a paradoxical SUBREG. */
33523 tmp1 = gen_reg_rtx (SImode);
33524 emit_move_insn (tmp1, gen_lowpart (SImode, val));
33525
33526 /* Insert the SImode value as low element of a V4SImode vector. */
33527 tmp2 = gen_lowpart (V4SImode, dperm.op0);
33528 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
33529
33530 ok = (expand_vec_perm_1 (&dperm)
33531 || expand_vec_perm_broadcast_1 (&dperm));
33532 gcc_assert (ok);
33533 return ok;
33534 }
33535 goto widen;
33536
33537 case V16QImode:
33538 if (TARGET_SSE2)
33539 goto permute;
33540 goto widen;
33541
33542 widen:
33543 /* Replicate the value once into the next wider mode and recurse. */
33544 {
33545 enum machine_mode smode, wsmode, wvmode;
33546 rtx x;
33547
33548 smode = GET_MODE_INNER (mode);
33549 wvmode = get_mode_wider_vector (mode);
33550 wsmode = GET_MODE_INNER (wvmode);
33551
33552 val = convert_modes (wsmode, smode, val, true);
33553 x = expand_simple_binop (wsmode, ASHIFT, val,
33554 GEN_INT (GET_MODE_BITSIZE (smode)),
33555 NULL_RTX, 1, OPTAB_LIB_WIDEN);
33556 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
33557
33558 x = gen_lowpart (wvmode, target);
33559 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
33560 gcc_assert (ok);
33561 return ok;
33562 }
33563
33564 case V16HImode:
33565 case V32QImode:
33566 {
33567 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
33568 rtx x = gen_reg_rtx (hvmode);
33569
33570 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
33571 gcc_assert (ok);
33572
33573 x = gen_rtx_VEC_CONCAT (mode, x, x);
33574 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33575 }
33576 return true;
33577
33578 default:
33579 return false;
33580 }
33581 }
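
/* A scalar sketch of the "widen" step above (illustrative): to broadcast a
   QImode value B without a byte broadcast instruction, first build the
   HImode value (B << 8) | B and broadcast that in the twice-as-wide vector
   mode instead:

     unsigned short
     widen_byte (unsigned char b)
     {
       return (unsigned short) (((unsigned short) b << 8) | b);
     }
*/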
33582
33583 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33584 whose ONE_VAR element is VAR, and other elements are zero. Return true
33585 if successful. */
33586
33587 static bool
33588 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
33589 rtx target, rtx var, int one_var)
33590 {
33591 enum machine_mode vsimode;
33592 rtx new_target;
33593 rtx x, tmp;
33594 bool use_vector_set = false;
33595
33596 switch (mode)
33597 {
33598 case V2DImode:
33599 /* For SSE4.1, we normally use vector set. But if the second
33600 element is zero and inter-unit moves are OK, we use movq
33601 instead. */
33602 use_vector_set = (TARGET_64BIT
33603 && TARGET_SSE4_1
33604 && !(TARGET_INTER_UNIT_MOVES
33605 && one_var == 0));
33606 break;
33607 case V16QImode:
33608 case V4SImode:
33609 case V4SFmode:
33610 use_vector_set = TARGET_SSE4_1;
33611 break;
33612 case V8HImode:
33613 use_vector_set = TARGET_SSE2;
33614 break;
33615 case V4HImode:
33616 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
33617 break;
33618 case V32QImode:
33619 case V16HImode:
33620 case V8SImode:
33621 case V8SFmode:
33622 case V4DFmode:
33623 use_vector_set = TARGET_AVX;
33624 break;
33625 case V4DImode:
33626 /* Use ix86_expand_vector_set in 64bit mode only. */
33627 use_vector_set = TARGET_AVX && TARGET_64BIT;
33628 break;
33629 default:
33630 break;
33631 }
33632
33633 if (use_vector_set)
33634 {
33635 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
33636 var = force_reg (GET_MODE_INNER (mode), var);
33637 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33638 return true;
33639 }
33640
33641 switch (mode)
33642 {
33643 case V2SFmode:
33644 case V2SImode:
33645 if (!mmx_ok)
33646 return false;
33647 /* FALLTHRU */
33648
33649 case V2DFmode:
33650 case V2DImode:
33651 if (one_var != 0)
33652 return false;
33653 var = force_reg (GET_MODE_INNER (mode), var);
33654 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
33655 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33656 return true;
33657
33658 case V4SFmode:
33659 case V4SImode:
33660 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
33661 new_target = gen_reg_rtx (mode);
33662 else
33663 new_target = target;
33664 var = force_reg (GET_MODE_INNER (mode), var);
33665 x = gen_rtx_VEC_DUPLICATE (mode, var);
33666 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
33667 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
33668 if (one_var != 0)
33669 {
33670 /* We need to shuffle the value to the correct position, so
33671 create a new pseudo to store the intermediate result. */
33672
33673 /* With SSE2, we can use the integer shuffle insns. */
33674 if (mode != V4SFmode && TARGET_SSE2)
33675 {
33676 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
33677 const1_rtx,
33678 GEN_INT (one_var == 1 ? 0 : 1),
33679 GEN_INT (one_var == 2 ? 0 : 1),
33680 GEN_INT (one_var == 3 ? 0 : 1)));
33681 if (target != new_target)
33682 emit_move_insn (target, new_target);
33683 return true;
33684 }
33685
33686 /* Otherwise convert the intermediate result to V4SFmode and
33687 use the SSE1 shuffle instructions. */
33688 if (mode != V4SFmode)
33689 {
33690 tmp = gen_reg_rtx (V4SFmode);
33691 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
33692 }
33693 else
33694 tmp = new_target;
33695
33696 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
33697 const1_rtx,
33698 GEN_INT (one_var == 1 ? 0 : 1),
33699 GEN_INT (one_var == 2 ? 0+4 : 1+4),
33700 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
33701
33702 if (mode != V4SFmode)
33703 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
33704 else if (tmp != target)
33705 emit_move_insn (target, tmp);
33706 }
33707 else if (target != new_target)
33708 emit_move_insn (target, new_target);
33709 return true;
33710
33711 case V8HImode:
33712 case V16QImode:
33713 vsimode = V4SImode;
33714 goto widen;
33715 case V4HImode:
33716 case V8QImode:
33717 if (!mmx_ok)
33718 return false;
33719 vsimode = V2SImode;
33720 goto widen;
33721 widen:
33722 if (one_var != 0)
33723 return false;
33724
33725 /* Zero extend the variable element to SImode and recurse. */
33726 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
33727
33728 x = gen_reg_rtx (vsimode);
33729 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
33730 var, one_var))
33731 gcc_unreachable ();
33732
33733 emit_move_insn (target, gen_lowpart (mode, x));
33734 return true;
33735
33736 default:
33737 return false;
33738 }
33739 }
33740
33741 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33742 consisting of the values in VALS. It is known that all elements
33743 except ONE_VAR are constants. Return true if successful. */
33744
33745 static bool
33746 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
33747 rtx target, rtx vals, int one_var)
33748 {
33749 rtx var = XVECEXP (vals, 0, one_var);
33750 enum machine_mode wmode;
33751 rtx const_vec, x;
33752
33753 const_vec = copy_rtx (vals);
33754 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
33755 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
33756
33757 switch (mode)
33758 {
33759 case V2DFmode:
33760 case V2DImode:
33761 case V2SFmode:
33762 case V2SImode:
33763 /* For the two element vectors, it's just as easy to use
33764 the general case. */
33765 return false;
33766
33767 case V4DImode:
33768 /* Use ix86_expand_vector_set in 64bit mode only. */
33769 if (!TARGET_64BIT)
33770 return false;
33771 case V4DFmode:
33772 case V8SFmode:
33773 case V8SImode:
33774 case V16HImode:
33775 case V32QImode:
33776 case V4SFmode:
33777 case V4SImode:
33778 case V8HImode:
33779 case V4HImode:
33780 break;
33781
33782 case V16QImode:
33783 if (TARGET_SSE4_1)
33784 break;
33785 wmode = V8HImode;
33786 goto widen;
33787 case V8QImode:
33788 wmode = V4HImode;
33789 goto widen;
33790 widen:
33791 /* There's no way to set one QImode entry easily. Combine
33792 the variable value with its adjacent constant value, and
33793 promote to an HImode set. */
33794 x = XVECEXP (vals, 0, one_var ^ 1);
33795 if (one_var & 1)
33796 {
33797 var = convert_modes (HImode, QImode, var, true);
33798 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
33799 NULL_RTX, 1, OPTAB_LIB_WIDEN);
33800 x = GEN_INT (INTVAL (x) & 0xff);
33801 }
33802 else
33803 {
33804 var = convert_modes (HImode, QImode, var, true);
33805 x = gen_int_mode (INTVAL (x) << 8, HImode);
33806 }
33807 if (x != const0_rtx)
33808 var = expand_simple_binop (HImode, IOR, var, x, var,
33809 1, OPTAB_LIB_WIDEN);
33810
33811 x = gen_reg_rtx (wmode);
33812 emit_move_insn (x, gen_lowpart (wmode, const_vec));
33813 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
33814
33815 emit_move_insn (target, gen_lowpart (mode, x));
33816 return true;
33817
33818 default:
33819 return false;
33820 }
33821
33822 emit_move_insn (target, const_vec);
33823 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33824 return true;
33825 }
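
/* Sketch of the QImode "widen" path above (illustrative): the variable byte
   VAR and its constant neighbour C are first merged into a single HImode
   value, which is then inserted with an HImode vector set:

     odd ONE_VAR:   hi = ((unsigned short) var << 8) | (c & 0xff);
     even ONE_VAR:  hi = (unsigned short) var | ((unsigned short) c << 8);
*/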
33826
33827 /* A subroutine of ix86_expand_vector_init_general. Use vector
33828 concatenate to handle the most general case: all values variable,
33829 and none identical. */
33830
33831 static void
33832 ix86_expand_vector_init_concat (enum machine_mode mode,
33833 rtx target, rtx *ops, int n)
33834 {
33835 enum machine_mode cmode, hmode = VOIDmode;
33836 rtx first[8], second[4];
33837 rtvec v;
33838 int i, j;
33839
33840 switch (n)
33841 {
33842 case 2:
33843 switch (mode)
33844 {
33845 case V8SImode:
33846 cmode = V4SImode;
33847 break;
33848 case V8SFmode:
33849 cmode = V4SFmode;
33850 break;
33851 case V4DImode:
33852 cmode = V2DImode;
33853 break;
33854 case V4DFmode:
33855 cmode = V2DFmode;
33856 break;
33857 case V4SImode:
33858 cmode = V2SImode;
33859 break;
33860 case V4SFmode:
33861 cmode = V2SFmode;
33862 break;
33863 case V2DImode:
33864 cmode = DImode;
33865 break;
33866 case V2SImode:
33867 cmode = SImode;
33868 break;
33869 case V2DFmode:
33870 cmode = DFmode;
33871 break;
33872 case V2SFmode:
33873 cmode = SFmode;
33874 break;
33875 default:
33876 gcc_unreachable ();
33877 }
33878
33879 if (!register_operand (ops[1], cmode))
33880 ops[1] = force_reg (cmode, ops[1]);
33881 if (!register_operand (ops[0], cmode))
33882 ops[0] = force_reg (cmode, ops[0]);
33883 emit_insn (gen_rtx_SET (VOIDmode, target,
33884 gen_rtx_VEC_CONCAT (mode, ops[0],
33885 ops[1])));
33886 break;
33887
33888 case 4:
33889 switch (mode)
33890 {
33891 case V4DImode:
33892 cmode = V2DImode;
33893 break;
33894 case V4DFmode:
33895 cmode = V2DFmode;
33896 break;
33897 case V4SImode:
33898 cmode = V2SImode;
33899 break;
33900 case V4SFmode:
33901 cmode = V2SFmode;
33902 break;
33903 default:
33904 gcc_unreachable ();
33905 }
33906 goto half;
33907
33908 case 8:
33909 switch (mode)
33910 {
33911 case V8SImode:
33912 cmode = V2SImode;
33913 hmode = V4SImode;
33914 break;
33915 case V8SFmode:
33916 cmode = V2SFmode;
33917 hmode = V4SFmode;
33918 break;
33919 default:
33920 gcc_unreachable ();
33921 }
33922 goto half;
33923
33924 half:
33925 /* FIXME: We process inputs backward to help RA. PR 36222. */
33926 i = n - 1;
33927 j = (n >> 1) - 1;
33928 for (; i > 0; i -= 2, j--)
33929 {
33930 first[j] = gen_reg_rtx (cmode);
33931 v = gen_rtvec (2, ops[i - 1], ops[i]);
33932 ix86_expand_vector_init (false, first[j],
33933 gen_rtx_PARALLEL (cmode, v));
33934 }
33935
33936 n >>= 1;
33937 if (n > 2)
33938 {
33939 gcc_assert (hmode != VOIDmode);
33940 for (i = j = 0; i < n; i += 2, j++)
33941 {
33942 second[j] = gen_reg_rtx (hmode);
33943 ix86_expand_vector_init_concat (hmode, second [j],
33944 &first [i], 2);
33945 }
33946 n >>= 1;
33947 ix86_expand_vector_init_concat (mode, target, second, n);
33948 }
33949 else
33950 ix86_expand_vector_init_concat (mode, target, first, n);
33951 break;
33952
33953 default:
33954 gcc_unreachable ();
33955 }
33956 }
33957
33958 /* A subroutine of ix86_expand_vector_init_general. Use vector
33959 interleave to handle the most general case: all values variable,
33960 and none identical. */
33961
33962 static void
33963 ix86_expand_vector_init_interleave (enum machine_mode mode,
33964 rtx target, rtx *ops, int n)
33965 {
33966 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
33967 int i, j;
33968 rtx op0, op1;
33969 rtx (*gen_load_even) (rtx, rtx, rtx);
33970 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
33971 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
33972
33973 switch (mode)
33974 {
33975 case V8HImode:
33976 gen_load_even = gen_vec_setv8hi;
33977 gen_interleave_first_low = gen_vec_interleave_lowv4si;
33978 gen_interleave_second_low = gen_vec_interleave_lowv2di;
33979 inner_mode = HImode;
33980 first_imode = V4SImode;
33981 second_imode = V2DImode;
33982 third_imode = VOIDmode;
33983 break;
33984 case V16QImode:
33985 gen_load_even = gen_vec_setv16qi;
33986 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
33987 gen_interleave_second_low = gen_vec_interleave_lowv4si;
33988 inner_mode = QImode;
33989 first_imode = V8HImode;
33990 second_imode = V4SImode;
33991 third_imode = V2DImode;
33992 break;
33993 default:
33994 gcc_unreachable ();
33995 }
33996
33997 for (i = 0; i < n; i++)
33998 {
33999 /* Extend the odd element to SImode using a paradoxical SUBREG. */
34000 op0 = gen_reg_rtx (SImode);
34001 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
34002
34003 /* Insert the SImode value as low element of V4SImode vector. */
34004 op1 = gen_reg_rtx (V4SImode);
34005 op0 = gen_rtx_VEC_MERGE (V4SImode,
34006 gen_rtx_VEC_DUPLICATE (V4SImode,
34007 op0),
34008 CONST0_RTX (V4SImode),
34009 const1_rtx);
34010 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
34011
34012 /* Cast the V4SImode vector back to a vector in the original mode. */
34013 op0 = gen_reg_rtx (mode);
34014 emit_move_insn (op0, gen_lowpart (mode, op1));
34015
34016 /* Load even elements into the second position. */
34017 emit_insn (gen_load_even (op0,
34018 force_reg (inner_mode,
34019 ops [i + i + 1]),
34020 const1_rtx));
34021
34022 /* Cast vector to FIRST_IMODE vector. */
34023 ops[i] = gen_reg_rtx (first_imode);
34024 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
34025 }
34026
34027 /* Interleave low FIRST_IMODE vectors. */
34028 for (i = j = 0; i < n; i += 2, j++)
34029 {
34030 op0 = gen_reg_rtx (first_imode);
34031 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
34032
34033 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
34034 ops[j] = gen_reg_rtx (second_imode);
34035 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
34036 }
34037
34038 /* Interleave low SECOND_IMODE vectors. */
34039 switch (second_imode)
34040 {
34041 case V4SImode:
34042 for (i = j = 0; i < n / 2; i += 2, j++)
34043 {
34044 op0 = gen_reg_rtx (second_imode);
34045 emit_insn (gen_interleave_second_low (op0, ops[i],
34046 ops[i + 1]));
34047
34048 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
34049 vector. */
34050 ops[j] = gen_reg_rtx (third_imode);
34051 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
34052 }
34053 second_imode = V2DImode;
34054 gen_interleave_second_low = gen_vec_interleave_lowv2di;
34055 /* FALLTHRU */
34056
34057 case V2DImode:
34058 op0 = gen_reg_rtx (second_imode);
34059 emit_insn (gen_interleave_second_low (op0, ops[0],
34060 ops[1]));
34061
34062 /* Cast the SECOND_IMODE vector back to a vector in the original
34063 mode. */
34064 emit_insn (gen_rtx_SET (VOIDmode, target,
34065 gen_lowpart (mode, op0)));
34066 break;
34067
34068 default:
34069 gcc_unreachable ();
34070 }
34071 }
34072
34073 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
34074 all values variable, and none identical. */
34075
34076 static void
34077 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
34078 rtx target, rtx vals)
34079 {
34080 rtx ops[32], op0, op1;
34081 enum machine_mode half_mode = VOIDmode;
34082 int n, i;
34083
34084 switch (mode)
34085 {
34086 case V2SFmode:
34087 case V2SImode:
34088 if (!mmx_ok && !TARGET_SSE)
34089 break;
34090 /* FALLTHRU */
34091
34092 case V8SFmode:
34093 case V8SImode:
34094 case V4DFmode:
34095 case V4DImode:
34096 case V4SFmode:
34097 case V4SImode:
34098 case V2DFmode:
34099 case V2DImode:
34100 n = GET_MODE_NUNITS (mode);
34101 for (i = 0; i < n; i++)
34102 ops[i] = XVECEXP (vals, 0, i);
34103 ix86_expand_vector_init_concat (mode, target, ops, n);
34104 return;
34105
34106 case V32QImode:
34107 half_mode = V16QImode;
34108 goto half;
34109
34110 case V16HImode:
34111 half_mode = V8HImode;
34112 goto half;
34113
34114 half:
34115 n = GET_MODE_NUNITS (mode);
34116 for (i = 0; i < n; i++)
34117 ops[i] = XVECEXP (vals, 0, i);
34118 op0 = gen_reg_rtx (half_mode);
34119 op1 = gen_reg_rtx (half_mode);
34120 ix86_expand_vector_init_interleave (half_mode, op0, ops,
34121 n >> 2);
34122 ix86_expand_vector_init_interleave (half_mode, op1,
34123 &ops [n >> 1], n >> 2);
34124 emit_insn (gen_rtx_SET (VOIDmode, target,
34125 gen_rtx_VEC_CONCAT (mode, op0, op1)));
34126 return;
34127
34128 case V16QImode:
34129 if (!TARGET_SSE4_1)
34130 break;
34131 /* FALLTHRU */
34132
34133 case V8HImode:
34134 if (!TARGET_SSE2)
34135 break;
34136
34137 /* Don't use ix86_expand_vector_init_interleave if we can't
34138 move from GPR to SSE register directly. */
34139 if (!TARGET_INTER_UNIT_MOVES)
34140 break;
34141
34142 n = GET_MODE_NUNITS (mode);
34143 for (i = 0; i < n; i++)
34144 ops[i] = XVECEXP (vals, 0, i);
34145 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
34146 return;
34147
34148 case V4HImode:
34149 case V8QImode:
34150 break;
34151
34152 default:
34153 gcc_unreachable ();
34154 }
34155
34156 {
34157 int i, j, n_elts, n_words, n_elt_per_word;
34158 enum machine_mode inner_mode;
34159 rtx words[4], shift;
34160
34161 inner_mode = GET_MODE_INNER (mode);
34162 n_elts = GET_MODE_NUNITS (mode);
34163 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
34164 n_elt_per_word = n_elts / n_words;
34165 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
34166
34167 for (i = 0; i < n_words; ++i)
34168 {
34169 rtx word = NULL_RTX;
34170
34171 for (j = 0; j < n_elt_per_word; ++j)
34172 {
34173 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
34174 elt = convert_modes (word_mode, inner_mode, elt, true);
34175
34176 if (j == 0)
34177 word = elt;
34178 else
34179 {
34180 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
34181 word, 1, OPTAB_LIB_WIDEN);
34182 word = expand_simple_binop (word_mode, IOR, word, elt,
34183 word, 1, OPTAB_LIB_WIDEN);
34184 }
34185 }
34186
34187 words[i] = word;
34188 }
34189
34190 if (n_words == 1)
34191 emit_move_insn (target, gen_lowpart (mode, words[0]));
34192 else if (n_words == 2)
34193 {
34194 rtx tmp = gen_reg_rtx (mode);
34195 emit_clobber (tmp);
34196 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
34197 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
34198 emit_move_insn (target, tmp);
34199 }
34200 else if (n_words == 4)
34201 {
34202 rtx tmp = gen_reg_rtx (V4SImode);
34203 gcc_assert (word_mode == SImode);
34204 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
34205 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
34206 emit_move_insn (target, gen_lowpart (mode, tmp));
34207 }
34208 else
34209 gcc_unreachable ();
34210 }
34211 }
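
/* The word-packing fallback above places element 0 in the least significant
   bits of each word.  For instance (illustrative), a V8QImode value
   {a,b,c,d,e,f,g,h} with a 32-bit word_mode is built as the two SImode words
   (d << 24) | (c << 16) | (b << 8) | a  and
   (h << 24) | (g << 16) | (f << 8) | e.  */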
34212
34213 /* Initialize vector TARGET via VALS. Suppress the use of MMX
34214 instructions unless MMX_OK is true. */
34215
34216 void
34217 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
34218 {
34219 enum machine_mode mode = GET_MODE (target);
34220 enum machine_mode inner_mode = GET_MODE_INNER (mode);
34221 int n_elts = GET_MODE_NUNITS (mode);
34222 int n_var = 0, one_var = -1;
34223 bool all_same = true, all_const_zero = true;
34224 int i;
34225 rtx x;
34226
34227 for (i = 0; i < n_elts; ++i)
34228 {
34229 x = XVECEXP (vals, 0, i);
34230 if (!(CONST_INT_P (x)
34231 || GET_CODE (x) == CONST_DOUBLE
34232 || GET_CODE (x) == CONST_FIXED))
34233 n_var++, one_var = i;
34234 else if (x != CONST0_RTX (inner_mode))
34235 all_const_zero = false;
34236 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
34237 all_same = false;
34238 }
34239
34240 /* Constants are best loaded from the constant pool. */
34241 if (n_var == 0)
34242 {
34243 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
34244 return;
34245 }
34246
34247 /* If all values are identical, broadcast the value. */
34248 if (all_same
34249 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
34250 XVECEXP (vals, 0, 0)))
34251 return;
34252
34253 /* Values where only one field is non-constant are best loaded from
34254 the pool and overwritten via move later. */
34255 if (n_var == 1)
34256 {
34257 if (all_const_zero
34258 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
34259 XVECEXP (vals, 0, one_var),
34260 one_var))
34261 return;
34262
34263 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
34264 return;
34265 }
34266
34267 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
34268 }
34269
34270 void
34271 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
34272 {
34273 enum machine_mode mode = GET_MODE (target);
34274 enum machine_mode inner_mode = GET_MODE_INNER (mode);
34275 enum machine_mode half_mode;
34276 bool use_vec_merge = false;
34277 rtx tmp;
34278 static rtx (*gen_extract[6][2]) (rtx, rtx)
34279 = {
34280 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
34281 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
34282 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
34283 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
34284 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
34285 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
34286 };
34287 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
34288 = {
34289 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
34290 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
34291 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
34292 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
34293 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
34294 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
34295 };
34296 int i, j, n;
34297
34298 switch (mode)
34299 {
34300 case V2SFmode:
34301 case V2SImode:
34302 if (mmx_ok)
34303 {
34304 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
34305 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
34306 if (elt == 0)
34307 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
34308 else
34309 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
34310 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34311 return;
34312 }
34313 break;
34314
34315 case V2DImode:
34316 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
34317 if (use_vec_merge)
34318 break;
34319
34320 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
34321 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
34322 if (elt == 0)
34323 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
34324 else
34325 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
34326 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34327 return;
34328
34329 case V2DFmode:
34330 {
34331 rtx op0, op1;
34332
34333 /* For the two element vectors, we implement a VEC_CONCAT with
34334 the extraction of the other element. */
34335
34336 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
34337 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
34338
34339 if (elt == 0)
34340 op0 = val, op1 = tmp;
34341 else
34342 op0 = tmp, op1 = val;
34343
34344 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
34345 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34346 }
34347 return;
34348
34349 case V4SFmode:
34350 use_vec_merge = TARGET_SSE4_1;
34351 if (use_vec_merge)
34352 break;
34353
34354 switch (elt)
34355 {
34356 case 0:
34357 use_vec_merge = true;
34358 break;
34359
34360 case 1:
34361 /* tmp = target = A B C D */
34362 tmp = copy_to_reg (target);
34363 /* target = A A B B */
34364 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
34365 /* target = X A B B */
34366 ix86_expand_vector_set (false, target, val, 0);
34367 /* target = A X C D */
34368 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
34369 const1_rtx, const0_rtx,
34370 GEN_INT (2+4), GEN_INT (3+4)));
34371 return;
34372
34373 case 2:
34374 /* tmp = target = A B C D */
34375 tmp = copy_to_reg (target);
34376 /* tmp = X B C D */
34377 ix86_expand_vector_set (false, tmp, val, 0);
34378 /* target = A B X D */
34379 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
34380 const0_rtx, const1_rtx,
34381 GEN_INT (0+4), GEN_INT (3+4)));
34382 return;
34383
34384 case 3:
34385 /* tmp = target = A B C D */
34386 tmp = copy_to_reg (target);
34387 /* tmp = X B C D */
34388 ix86_expand_vector_set (false, tmp, val, 0);
34389 /* target = A B C X */
34390 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
34391 const0_rtx, const1_rtx,
34392 GEN_INT (2+4), GEN_INT (0+4)));
34393 return;
34394
34395 default:
34396 gcc_unreachable ();
34397 }
34398 break;
34399
34400 case V4SImode:
34401 use_vec_merge = TARGET_SSE4_1;
34402 if (use_vec_merge)
34403 break;
34404
34405 /* Element 0 handled by vec_merge below. */
34406 if (elt == 0)
34407 {
34408 use_vec_merge = true;
34409 break;
34410 }
34411
34412 if (TARGET_SSE2)
34413 {
34414 /* With SSE2, use integer shuffles to swap element 0 and ELT,
34415 store into element 0, then shuffle them back. */
34416
34417 rtx order[4];
34418
34419 order[0] = GEN_INT (elt);
34420 order[1] = const1_rtx;
34421 order[2] = const2_rtx;
34422 order[3] = GEN_INT (3);
34423 order[elt] = const0_rtx;
34424
34425 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
34426 order[1], order[2], order[3]));
34427
34428 ix86_expand_vector_set (false, target, val, 0);
34429
34430 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
34431 order[1], order[2], order[3]));
34432 }
34433 else
34434 {
34435 /* For SSE1, we have to reuse the V4SF code. */
34436 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
34437 gen_lowpart (SFmode, val), elt);
34438 }
34439 return;
34440
34441 case V8HImode:
34442 use_vec_merge = TARGET_SSE2;
34443 break;
34444 case V4HImode:
34445 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
34446 break;
34447
34448 case V16QImode:
34449 use_vec_merge = TARGET_SSE4_1;
34450 break;
34451
34452 case V8QImode:
34453 break;
34454
34455 case V32QImode:
34456 half_mode = V16QImode;
34457 j = 0;
34458 n = 16;
34459 goto half;
34460
34461 case V16HImode:
34462 half_mode = V8HImode;
34463 j = 1;
34464 n = 8;
34465 goto half;
34466
34467 case V8SImode:
34468 half_mode = V4SImode;
34469 j = 2;
34470 n = 4;
34471 goto half;
34472
34473 case V4DImode:
34474 half_mode = V2DImode;
34475 j = 3;
34476 n = 2;
34477 goto half;
34478
34479 case V8SFmode:
34480 half_mode = V4SFmode;
34481 j = 4;
34482 n = 4;
34483 goto half;
34484
34485 case V4DFmode:
34486 half_mode = V2DFmode;
34487 j = 5;
34488 n = 2;
34489 goto half;
34490
34491 half:
34492 /* Compute offset. */
34493 i = elt / n;
34494 elt %= n;
34495
34496 gcc_assert (i <= 1);
34497
34498 /* Extract the half. */
34499 tmp = gen_reg_rtx (half_mode);
34500 emit_insn (gen_extract[j][i] (tmp, target));
34501
34502 /* Put val in tmp at elt. */
34503 ix86_expand_vector_set (false, tmp, val, elt);
34504
34505 /* Put it back. */
34506 emit_insn (gen_insert[j][i] (target, target, tmp));
34507 return;
34508
34509 default:
34510 break;
34511 }
34512
34513 if (use_vec_merge)
34514 {
34515 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
34516 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
34517 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34518 }
34519 else
34520 {
34521 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
34522
34523 emit_move_insn (mem, target);
34524
34525 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
34526 emit_move_insn (tmp, val);
34527
34528 emit_move_insn (target, mem);
34529 }
34530 }
34531
34532 void
34533 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
34534 {
34535 enum machine_mode mode = GET_MODE (vec);
34536 enum machine_mode inner_mode = GET_MODE_INNER (mode);
34537 bool use_vec_extr = false;
34538 rtx tmp;
34539
34540 switch (mode)
34541 {
34542 case V2SImode:
34543 case V2SFmode:
34544 if (!mmx_ok)
34545 break;
34546 /* FALLTHRU */
34547
34548 case V2DFmode:
34549 case V2DImode:
34550 use_vec_extr = true;
34551 break;
34552
34553 case V4SFmode:
34554 use_vec_extr = TARGET_SSE4_1;
34555 if (use_vec_extr)
34556 break;
34557
34558 switch (elt)
34559 {
34560 case 0:
34561 tmp = vec;
34562 break;
34563
34564 case 1:
34565 case 3:
34566 tmp = gen_reg_rtx (mode);
34567 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
34568 GEN_INT (elt), GEN_INT (elt),
34569 GEN_INT (elt+4), GEN_INT (elt+4)));
34570 break;
34571
34572 case 2:
34573 tmp = gen_reg_rtx (mode);
34574 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
34575 break;
34576
34577 default:
34578 gcc_unreachable ();
34579 }
34580 vec = tmp;
34581 use_vec_extr = true;
34582 elt = 0;
34583 break;
34584
34585 case V4SImode:
34586 use_vec_extr = TARGET_SSE4_1;
34587 if (use_vec_extr)
34588 break;
34589
34590 if (TARGET_SSE2)
34591 {
34592 switch (elt)
34593 {
34594 case 0:
34595 tmp = vec;
34596 break;
34597
34598 case 1:
34599 case 3:
34600 tmp = gen_reg_rtx (mode);
34601 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
34602 GEN_INT (elt), GEN_INT (elt),
34603 GEN_INT (elt), GEN_INT (elt)));
34604 break;
34605
34606 case 2:
34607 tmp = gen_reg_rtx (mode);
34608 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
34609 break;
34610
34611 default:
34612 gcc_unreachable ();
34613 }
34614 vec = tmp;
34615 use_vec_extr = true;
34616 elt = 0;
34617 }
34618 else
34619 {
34620 /* For SSE1, we have to reuse the V4SF code. */
34621 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
34622 gen_lowpart (V4SFmode, vec), elt);
34623 return;
34624 }
34625 break;
34626
34627 case V8HImode:
34628 use_vec_extr = TARGET_SSE2;
34629 break;
34630 case V4HImode:
34631 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
34632 break;
34633
34634 case V16QImode:
34635 use_vec_extr = TARGET_SSE4_1;
34636 break;
34637
34638 case V8SFmode:
34639 if (TARGET_AVX)
34640 {
34641 tmp = gen_reg_rtx (V4SFmode);
34642 if (elt < 4)
34643 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
34644 else
34645 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
34646 ix86_expand_vector_extract (false, target, tmp, elt & 3);
34647 return;
34648 }
34649 break;
34650
34651 case V4DFmode:
34652 if (TARGET_AVX)
34653 {
34654 tmp = gen_reg_rtx (V2DFmode);
34655 if (elt < 2)
34656 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
34657 else
34658 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
34659 ix86_expand_vector_extract (false, target, tmp, elt & 1);
34660 return;
34661 }
34662 break;
34663
34664 case V32QImode:
34665 if (TARGET_AVX)
34666 {
34667 tmp = gen_reg_rtx (V16QImode);
34668 if (elt < 16)
34669 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
34670 else
34671 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
34672 ix86_expand_vector_extract (false, target, tmp, elt & 15);
34673 return;
34674 }
34675 break;
34676
34677 case V16HImode:
34678 if (TARGET_AVX)
34679 {
34680 tmp = gen_reg_rtx (V8HImode);
34681 if (elt < 8)
34682 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
34683 else
34684 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
34685 ix86_expand_vector_extract (false, target, tmp, elt & 7);
34686 return;
34687 }
34688 break;
34689
34690 case V8SImode:
34691 if (TARGET_AVX)
34692 {
34693 tmp = gen_reg_rtx (V4SImode);
34694 if (elt < 4)
34695 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
34696 else
34697 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
34698 ix86_expand_vector_extract (false, target, tmp, elt & 3);
34699 return;
34700 }
34701 break;
34702
34703 case V4DImode:
34704 if (TARGET_AVX)
34705 {
34706 tmp = gen_reg_rtx (V2DImode);
34707 if (elt < 2)
34708 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
34709 else
34710 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
34711 ix86_expand_vector_extract (false, target, tmp, elt & 1);
34712 return;
34713 }
34714 break;
34715
34716 case V8QImode:
34717 /* ??? Could extract the appropriate HImode element and shift. */
34718 default:
34719 break;
34720 }
34721
34722 if (use_vec_extr)
34723 {
34724 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
34725 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
34726
34727 /* Let the rtl optimizers know about the zero extension performed. */
34728 if (inner_mode == QImode || inner_mode == HImode)
34729 {
34730 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
34731 target = gen_lowpart (SImode, target);
34732 }
34733
34734 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34735 }
34736 else
34737 {
34738 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
34739
34740 emit_move_insn (mem, vec);
34741
34742 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
34743 emit_move_insn (target, tmp);
34744 }
34745 }
34746
34747 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
34748 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
34749 The upper bits of DEST are undefined, though they shouldn't cause
34750 exceptions (some bits from src or all zeros are ok). */
34751
34752 static void
34753 emit_reduc_half (rtx dest, rtx src, int i)
34754 {
34755 rtx tem;
34756 switch (GET_MODE (src))
34757 {
34758 case V4SFmode:
34759 if (i == 128)
34760 tem = gen_sse_movhlps (dest, src, src);
34761 else
34762 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
34763 GEN_INT (1 + 4), GEN_INT (1 + 4));
34764 break;
34765 case V2DFmode:
34766 tem = gen_vec_interleave_highv2df (dest, src, src);
34767 break;
34768 case V16QImode:
34769 case V8HImode:
34770 case V4SImode:
34771 case V2DImode:
34772 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
34773 gen_lowpart (V1TImode, src),
34774 GEN_INT (i / 2));
34775 break;
34776 case V8SFmode:
34777 if (i == 256)
34778 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
34779 else
34780 tem = gen_avx_shufps256 (dest, src, src,
34781 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
34782 break;
34783 case V4DFmode:
34784 if (i == 256)
34785 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
34786 else
34787 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
34788 break;
34789 case V32QImode:
34790 case V16HImode:
34791 case V8SImode:
34792 case V4DImode:
34793 if (i == 256)
34794 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
34795 gen_lowpart (V4DImode, src),
34796 gen_lowpart (V4DImode, src),
34797 const1_rtx);
34798 else
34799 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
34800 gen_lowpart (V2TImode, src),
34801 GEN_INT (i / 2));
34802 break;
34803 default:
34804 gcc_unreachable ();
34805 }
34806 emit_insn (tem);
34807 }
34808
34809 /* Expand a vector reduction. FN is the binary pattern to reduce;
34810 DEST is the destination; IN is the input vector. */
34811
34812 void
34813 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
34814 {
34815 rtx half, dst, vec = in;
34816 enum machine_mode mode = GET_MODE (in);
34817 int i;
34818
34819 /* SSE4.1 has a special instruction for V8HImode UMIN reduction. */
34820 if (TARGET_SSE4_1
34821 && mode == V8HImode
34822 && fn == gen_uminv8hi3)
34823 {
34824 emit_insn (gen_sse4_1_phminposuw (dest, in));
34825 return;
34826 }
34827
34828 for (i = GET_MODE_BITSIZE (mode);
34829 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
34830 i >>= 1)
34831 {
34832 half = gen_reg_rtx (mode);
34833 emit_reduc_half (half, vec, i);
34834 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
34835 dst = dest;
34836 else
34837 dst = gen_reg_rtx (mode);
34838 emit_insn (fn (dst, half, vec));
34839 vec = dst;
34840 }
34841 }
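
/* Shape of the reduction above for a 4-element vector V and binary
   operation F (a sketch; lanes marked X are don't-care):

     H = { V[2], V[3], X, X }    emit_reduc_half with i == 128
     V = F (H, V)                lanes 0..1 now hold pairwise results
     H = { V[1], X, X, X }       emit_reduc_half with i == 64
     V = F (H, V)                lane 0 holds the final reduction  */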
34842 \f
34843 /* Target hook for scalar_mode_supported_p. */
34844 static bool
34845 ix86_scalar_mode_supported_p (enum machine_mode mode)
34846 {
34847 if (DECIMAL_FLOAT_MODE_P (mode))
34848 return default_decimal_float_supported_p ();
34849 else if (mode == TFmode)
34850 return true;
34851 else
34852 return default_scalar_mode_supported_p (mode);
34853 }
34854
34855 /* Implements target hook vector_mode_supported_p. */
34856 static bool
34857 ix86_vector_mode_supported_p (enum machine_mode mode)
34858 {
34859 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34860 return true;
34861 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34862 return true;
34863 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34864 return true;
34865 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
34866 return true;
34867 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
34868 return true;
34869 return false;
34870 }
34871
34872 /* Target hook for c_mode_for_suffix. */
34873 static enum machine_mode
34874 ix86_c_mode_for_suffix (char suffix)
34875 {
34876 if (suffix == 'q')
34877 return TFmode;
34878 if (suffix == 'w')
34879 return XFmode;
34880
34881 return VOIDmode;
34882 }
34883
34884 /* Worker function for TARGET_MD_ASM_CLOBBERS.
34885
34886 We do this in the new i386 backend to maintain source compatibility
34887 with the old cc0-based compiler. */
34888
34889 static tree
34890 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
34891 tree inputs ATTRIBUTE_UNUSED,
34892 tree clobbers)
34893 {
34894 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
34895 clobbers);
34896 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
34897 clobbers);
34898 return clobbers;
34899 }
34900
34901 /* Implements the target hook targetm.encode_section_info. */
34902
34903 static void ATTRIBUTE_UNUSED
34904 ix86_encode_section_info (tree decl, rtx rtl, int first)
34905 {
34906 default_encode_section_info (decl, rtl, first);
34907
34908 if (TREE_CODE (decl) == VAR_DECL
34909 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
34910 && ix86_in_large_data_p (decl))
34911 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
34912 }
34913
34914 /* Worker function for REVERSE_CONDITION. */
34915
34916 enum rtx_code
34917 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
34918 {
34919 return (mode != CCFPmode && mode != CCFPUmode
34920 ? reverse_condition (code)
34921 : reverse_condition_maybe_unordered (code));
34922 }
34923
34924 /* Output code to perform an x87 FP register move, from OPERANDS[1]
34925 to OPERANDS[0]. */
34926
34927 const char *
34928 output_387_reg_move (rtx insn, rtx *operands)
34929 {
34930 if (REG_P (operands[0]))
34931 {
34932 if (REG_P (operands[1])
34933 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34934 {
34935 if (REGNO (operands[0]) == FIRST_STACK_REG)
34936 return output_387_ffreep (operands, 0);
34937 return "fstp\t%y0";
34938 }
34939 if (STACK_TOP_P (operands[0]))
34940 return "fld%Z1\t%y1";
34941 return "fst\t%y0";
34942 }
34943 else if (MEM_P (operands[0]))
34944 {
34945 gcc_assert (REG_P (operands[1]));
34946 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34947 return "fstp%Z0\t%y0";
34948 else
34949 {
34950 /* There is no non-popping store to memory for XFmode.
34951 So if we need one, follow the store with a load. */
34952 if (GET_MODE (operands[0]) == XFmode)
34953 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
34954 else
34955 return "fst%Z0\t%y0";
34956 }
34957 }
34958 else
34959 gcc_unreachable();
34960 }
34961
34962 /* Output code to perform a conditional jump to LABEL, if the C2 flag in
34963 the FP status register is set. */
34964
34965 void
34966 ix86_emit_fp_unordered_jump (rtx label)
34967 {
34968 rtx reg = gen_reg_rtx (HImode);
34969 rtx temp;
34970
34971 emit_insn (gen_x86_fnstsw_1 (reg));
34972
34973 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
34974 {
34975 emit_insn (gen_x86_sahf_1 (reg));
34976
34977 temp = gen_rtx_REG (CCmode, FLAGS_REG);
34978 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
34979 }
34980 else
34981 {
34982 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
34983
34984 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
34985 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
34986 }
34987
34988 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
34989 gen_rtx_LABEL_REF (VOIDmode, label),
34990 pc_rtx);
34991 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
34992
34993 emit_jump_insn (temp);
34994 predict_jump (REG_BR_PROB_BASE * 10 / 100);
34995 }
34996
34997 /* Output code to perform a log1p XFmode calculation. */
34998
34999 void ix86_emit_i387_log1p (rtx op0, rtx op1)
35000 {
35001 rtx label1 = gen_label_rtx ();
35002 rtx label2 = gen_label_rtx ();
35003
35004 rtx tmp = gen_reg_rtx (XFmode);
35005 rtx tmp2 = gen_reg_rtx (XFmode);
35006 rtx test;
35007
35008 emit_insn (gen_absxf2 (tmp, op1));
35009 test = gen_rtx_GE (VOIDmode, tmp,
35010 CONST_DOUBLE_FROM_REAL_VALUE (
35011 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
35012 XFmode));
35013 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
35014
35015 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
35016 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
35017 emit_jump (label2);
35018
35019 emit_label (label1);
35020 emit_move_insn (tmp, CONST1_RTX (XFmode));
35021 emit_insn (gen_addxf3 (tmp, op1, tmp));
35022 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
35023 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
35024
35025 emit_label (label2);
35026 }
35027
35028 /* Emit code for round calculation. */
35029 void ix86_emit_i387_round (rtx op0, rtx op1)
35030 {
35031 enum machine_mode inmode = GET_MODE (op1);
35032 enum machine_mode outmode = GET_MODE (op0);
35033 rtx e1, e2, res, tmp, tmp1, half;
35034 rtx scratch = gen_reg_rtx (HImode);
35035 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
35036 rtx jump_label = gen_label_rtx ();
35037 rtx insn;
35038 rtx (*gen_abs) (rtx, rtx);
35039 rtx (*gen_neg) (rtx, rtx);
35040
35041 switch (inmode)
35042 {
35043 case SFmode:
35044 gen_abs = gen_abssf2;
35045 break;
35046 case DFmode:
35047 gen_abs = gen_absdf2;
35048 break;
35049 case XFmode:
35050 gen_abs = gen_absxf2;
35051 break;
35052 default:
35053 gcc_unreachable ();
35054 }
35055
35056 switch (outmode)
35057 {
35058 case SFmode:
35059 gen_neg = gen_negsf2;
35060 break;
35061 case DFmode:
35062 gen_neg = gen_negdf2;
35063 break;
35064 case XFmode:
35065 gen_neg = gen_negxf2;
35066 break;
35067 case HImode:
35068 gen_neg = gen_neghi2;
35069 break;
35070 case SImode:
35071 gen_neg = gen_negsi2;
35072 break;
35073 case DImode:
35074 gen_neg = gen_negdi2;
35075 break;
35076 default:
35077 gcc_unreachable ();
35078 }
35079
35080 e1 = gen_reg_rtx (inmode);
35081 e2 = gen_reg_rtx (inmode);
35082 res = gen_reg_rtx (outmode);
35083
35084 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
35085
35086 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
35087
35088 /* scratch = fxam(op1) */
35089 emit_insn (gen_rtx_SET (VOIDmode, scratch,
35090 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
35091 UNSPEC_FXAM)));
35092 /* e1 = fabs(op1) */
35093 emit_insn (gen_abs (e1, op1));
35094
35095 /* e2 = e1 + 0.5 */
35096 half = force_reg (inmode, half);
35097 emit_insn (gen_rtx_SET (VOIDmode, e2,
35098 gen_rtx_PLUS (inmode, e1, half)));
35099
35100 /* res = floor(e2) */
35101 if (inmode != XFmode)
35102 {
35103 tmp1 = gen_reg_rtx (XFmode);
35104
35105 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
35106 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
35107 }
35108 else
35109 tmp1 = e2;
35110
35111 switch (outmode)
35112 {
35113 case SFmode:
35114 case DFmode:
35115 {
35116 rtx tmp0 = gen_reg_rtx (XFmode);
35117
35118 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
35119
35120 emit_insn (gen_rtx_SET (VOIDmode, res,
35121 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
35122 UNSPEC_TRUNC_NOOP)));
35123 }
35124 break;
35125 case XFmode:
35126 emit_insn (gen_frndintxf2_floor (res, tmp1));
35127 break;
35128 case HImode:
35129 emit_insn (gen_lfloorxfhi2 (res, tmp1));
35130 break;
35131 case SImode:
35132 emit_insn (gen_lfloorxfsi2 (res, tmp1));
35133 break;
35134 case DImode:
35135 emit_insn (gen_lfloorxfdi2 (res, tmp1));
35136 break;
35137 default:
35138 gcc_unreachable ();
35139 }
35140
35141 /* flags = signbit(a) */
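/* fxam stored the sign of op1 in C1, bit 9 of the FP status word, i.e.
   bit 1 (mask 0x02) of the word's upper byte, which is the bit tested
   here.  */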
35142 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
35143
35144 /* if (flags) then res = -res */
35145 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
35146 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
35147 gen_rtx_LABEL_REF (VOIDmode, jump_label),
35148 pc_rtx);
35149 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
35150 predict_jump (REG_BR_PROB_BASE * 50 / 100);
35151 JUMP_LABEL (insn) = jump_label;
35152
35153 emit_insn (gen_neg (res, res));
35154
35155 emit_label (jump_label);
35156 LABEL_NUSES (jump_label) = 1;
35157
35158 emit_move_insn (op0, res);
35159 }
35160
35161 /* Output code to perform a Newton-Raphson approximation of a single precision
35162 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
35163
35164 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
35165 {
35166 rtx x0, x1, e0, e1;
35167
35168 x0 = gen_reg_rtx (mode);
35169 e0 = gen_reg_rtx (mode);
35170 e1 = gen_reg_rtx (mode);
35171 x1 = gen_reg_rtx (mode);
35172
35173 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
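/* This is one Newton-Raphson step for 1/b: starting from x0 = rcp(b),
   the refined estimate is x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0 * x0),
   written above in the form that maps directly onto the mul/add RTL
   emitted below.  */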
35174
35175 b = force_reg (mode, b);
35176
35177 /* x0 = rcp(b) estimate */
35178 emit_insn (gen_rtx_SET (VOIDmode, x0,
35179 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
35180 UNSPEC_RCP)));
35181 /* e0 = x0 * b */
35182 emit_insn (gen_rtx_SET (VOIDmode, e0,
35183 gen_rtx_MULT (mode, x0, b)));
35184
35185 /* e0 = x0 * e0 */
35186 emit_insn (gen_rtx_SET (VOIDmode, e0,
35187 gen_rtx_MULT (mode, x0, e0)));
35188
35189 /* e1 = x0 + x0 */
35190 emit_insn (gen_rtx_SET (VOIDmode, e1,
35191 gen_rtx_PLUS (mode, x0, x0)));
35192
35193 /* x1 = e1 - e0 */
35194 emit_insn (gen_rtx_SET (VOIDmode, x1,
35195 gen_rtx_MINUS (mode, e1, e0)));
35196
35197 /* res = a * x1 */
35198 emit_insn (gen_rtx_SET (VOIDmode, res,
35199 gen_rtx_MULT (mode, a, x1)));
35200 }
35201
35202 /* Output code to perform a Newton-Raphson approximation of a
35203 single precision floating point [reciprocal] square root. */
35204
35205 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
35206 bool recip)
35207 {
35208 rtx x0, e0, e1, e2, e3, mthree, mhalf;
35209 REAL_VALUE_TYPE r;
35210
35211 x0 = gen_reg_rtx (mode);
35212 e0 = gen_reg_rtx (mode);
35213 e1 = gen_reg_rtx (mode);
35214 e2 = gen_reg_rtx (mode);
35215 e3 = gen_reg_rtx (mode);
35216
35217 real_from_integer (&r, VOIDmode, -3, -1, 0);
35218 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
35219
35220 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
35221 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
35222
35223 if (VECTOR_MODE_P (mode))
35224 {
35225 mthree = ix86_build_const_vector (mode, true, mthree);
35226 mhalf = ix86_build_const_vector (mode, true, mhalf);
35227 }
35228
35229 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
35230 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
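/* This is one Newton-Raphson step for 1/sqrt(a): starting from
   x0 = rsqrtss(a), the refined estimate is
   x1 = 0.5 * x0 * (3 - a * x0 * x0) = -0.5 * x0 * (a * x0 * x0 - 3),
   which is the rsqrt form above; multiplying once more by a gives the
   sqrt form.  */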
35231
35232 a = force_reg (mode, a);
35233
35234 /* x0 = rsqrt(a) estimate */
35235 emit_insn (gen_rtx_SET (VOIDmode, x0,
35236 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
35237 UNSPEC_RSQRT)));
35238
35239 /* If a == 0.0, the rsqrt estimate is infinity; mask it out so that sqrt (0.0) yields 0.0 rather than NaN. */
35240 if (!recip)
35241 {
35242 rtx zero, mask;
35243
35244 zero = gen_reg_rtx (mode);
35245 mask = gen_reg_rtx (mode);
35246
35247 zero = force_reg (mode, CONST0_RTX(mode));
35248 emit_insn (gen_rtx_SET (VOIDmode, mask,
35249 gen_rtx_NE (mode, zero, a)));
35250
35251 emit_insn (gen_rtx_SET (VOIDmode, x0,
35252 gen_rtx_AND (mode, x0, mask)));
35253 }
35254
35255 /* e0 = x0 * a */
35256 emit_insn (gen_rtx_SET (VOIDmode, e0,
35257 gen_rtx_MULT (mode, x0, a)));
35258 /* e1 = e0 * x0 */
35259 emit_insn (gen_rtx_SET (VOIDmode, e1,
35260 gen_rtx_MULT (mode, e0, x0)));
35261
35262 /* e2 = e1 - 3. */
35263 mthree = force_reg (mode, mthree);
35264 emit_insn (gen_rtx_SET (VOIDmode, e2,
35265 gen_rtx_PLUS (mode, e1, mthree)));
35266
35267 mhalf = force_reg (mode, mhalf);
35268 if (recip)
35269 /* e3 = -.5 * x0 */
35270 emit_insn (gen_rtx_SET (VOIDmode, e3,
35271 gen_rtx_MULT (mode, x0, mhalf)));
35272 else
35273 /* e3 = -.5 * e0 */
35274 emit_insn (gen_rtx_SET (VOIDmode, e3,
35275 gen_rtx_MULT (mode, e0, mhalf)));
35276 /* ret = e2 * e3 */
35277 emit_insn (gen_rtx_SET (VOIDmode, res,
35278 gen_rtx_MULT (mode, e2, e3)));
35279 }
35280
35281 #ifdef TARGET_SOLARIS
35282 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
35283
35284 static void
35285 i386_solaris_elf_named_section (const char *name, unsigned int flags,
35286 tree decl)
35287 {
35288 /* With Binutils 2.15, the "@unwind" marker must be specified on
35289 every occurrence of the ".eh_frame" section, not just the first
35290 one. */
35291 if (TARGET_64BIT
35292 && strcmp (name, ".eh_frame") == 0)
35293 {
35294 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
35295 flags & SECTION_WRITE ? "aw" : "a");
35296 return;
35297 }
35298
35299 #ifndef USE_GAS
35300 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
35301 {
35302 solaris_elf_asm_comdat_section (name, flags, decl);
35303 return;
35304 }
35305 #endif
35306
35307 default_elf_asm_named_section (name, flags, decl);
35308 }
35309 #endif /* TARGET_SOLARIS */
35310
35311 /* Return the mangling of TYPE if it is an extended fundamental type. */
35312
35313 static const char *
35314 ix86_mangle_type (const_tree type)
35315 {
35316 type = TYPE_MAIN_VARIANT (type);
35317
35318 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
35319 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
35320 return NULL;
35321
35322 switch (TYPE_MODE (type))
35323 {
35324 case TFmode:
35325 /* __float128 is "g". */
35326 return "g";
35327 case XFmode:
35328 /* "long double" or __float80 is "e". */
35329 return "e";
35330 default:
35331 return NULL;
35332 }
35333 }
35334
35335 /* For 32-bit code we can save PIC register setup by using
35336 __stack_chk_fail_local hidden function instead of calling
35337 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
35338 register, so it is better to call __stack_chk_fail directly. */
35339
35340 static tree ATTRIBUTE_UNUSED
35341 ix86_stack_protect_fail (void)
35342 {
35343 return TARGET_64BIT
35344 ? default_external_stack_protect_fail ()
35345 : default_hidden_stack_protect_fail ();
35346 }
35347
35348 /* Select a format to encode pointers in exception handling data. CODE
35349 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
35350 true if the symbol may be affected by dynamic relocations.
35351
35352 ??? All x86 object file formats are capable of representing this.
35353 After all, the relocation needed is the same as for the call insn.
35354 Whether or not a particular assembler allows us to enter such, I
35355 guess we'll have to see. */
35356 int
35357 asm_preferred_eh_data_format (int code, int global)
35358 {
35359 if (flag_pic)
35360 {
35361 int type = DW_EH_PE_sdata8;
35362 if (!TARGET_64BIT
35363 || ix86_cmodel == CM_SMALL_PIC
35364 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
35365 type = DW_EH_PE_sdata4;
35366 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
35367 }
35368 if (ix86_cmodel == CM_SMALL
35369 || (ix86_cmodel == CM_MEDIUM && code))
35370 return DW_EH_PE_udata4;
35371 return DW_EH_PE_absptr;
35372 }
35373 \f
35374 /* Expand copysign from SIGN to the positive value ABS_VALUE
35375 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
35376 the sign-bit. */
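/* The expansion relies on the IEEE bit identity

     result = abs_value | (sign & SIGNBIT_MASK)

   where SIGNBIT_MASK has only the sign bit set (0x8000000000000000 for
   DFmode, 0x80000000 for SFmode).  A rough scalar sketch of the same idea
   (illustration only, assuming <stdint.h> and <string.h>; the function
   below emits the equivalent AND/IOR as vector RTL):

     static double copysign_to_positive (double abs_value, double sign)
     {
       uint64_t a, s;
       memcpy (&a, &abs_value, sizeof a);
       memcpy (&s, &sign, sizeof s);
       a |= s & UINT64_C (0x8000000000000000);
       memcpy (&abs_value, &a, sizeof a);
       return abs_value;
     }
*/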
35377 static void
35378 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
35379 {
35380 enum machine_mode mode = GET_MODE (sign);
35381 rtx sgn = gen_reg_rtx (mode);
35382 if (mask == NULL_RTX)
35383 {
35384 enum machine_mode vmode;
35385
35386 if (mode == SFmode)
35387 vmode = V4SFmode;
35388 else if (mode == DFmode)
35389 vmode = V2DFmode;
35390 else
35391 vmode = mode;
35392
35393 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
35394 if (!VECTOR_MODE_P (mode))
35395 {
35396 /* We need to generate a scalar mode mask in this case. */
35397 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
35398 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
35399 mask = gen_reg_rtx (mode);
35400 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
35401 }
35402 }
35403 else
35404 mask = gen_rtx_NOT (mode, mask);
35405 emit_insn (gen_rtx_SET (VOIDmode, sgn,
35406 gen_rtx_AND (mode, mask, sign)));
35407 emit_insn (gen_rtx_SET (VOIDmode, result,
35408 gen_rtx_IOR (mode, abs_value, sgn)));
35409 }
35410
35411 /* Expand fabs (OP0) and return a new rtx that holds the result. The
35412 mask for masking out the sign-bit is stored in *SMASK, if that is
35413 non-null. */
35414 static rtx
35415 ix86_expand_sse_fabs (rtx op0, rtx *smask)
35416 {
35417 enum machine_mode vmode, mode = GET_MODE (op0);
35418 rtx xa, mask;
35419
35420 xa = gen_reg_rtx (mode);
35421 if (mode == SFmode)
35422 vmode = V4SFmode;
35423 else if (mode == DFmode)
35424 vmode = V2DFmode;
35425 else
35426 vmode = mode;
35427 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
35428 if (!VECTOR_MODE_P (mode))
35429 {
35430 /* We need to generate a scalar mode mask in this case. */
35431 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
35432 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
35433 mask = gen_reg_rtx (mode);
35434 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
35435 }
35436 emit_insn (gen_rtx_SET (VOIDmode, xa,
35437 gen_rtx_AND (mode, op0, mask)));
35438
35439 if (smask)
35440 *smask = mask;
35441
35442 return xa;
35443 }
35444
35445 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
35446 swapping the operands if SWAP_OPERANDS is true. The expanded
35447 code is a forward jump to a newly created label in case the
35448 comparison is true. The generated label rtx is returned. */
35449 static rtx
35450 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
35451 bool swap_operands)
35452 {
35453 rtx label, tmp;
35454
35455 if (swap_operands)
35456 {
35457 tmp = op0;
35458 op0 = op1;
35459 op1 = tmp;
35460 }
35461
35462 label = gen_label_rtx ();
35463 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
35464 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35465 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
35466 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
35467 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
35468 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
35469 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
35470 JUMP_LABEL (tmp) = label;
35471
35472 return label;
35473 }
35474
35475 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
35476 using comparison code CODE. Operands are swapped for the comparison if
35477 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
35478 static rtx
35479 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
35480 bool swap_operands)
35481 {
35482 rtx (*insn)(rtx, rtx, rtx, rtx);
35483 enum machine_mode mode = GET_MODE (op0);
35484 rtx mask = gen_reg_rtx (mode);
35485
35486 if (swap_operands)
35487 {
35488 rtx tmp = op0;
35489 op0 = op1;
35490 op1 = tmp;
35491 }
35492
35493 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
35494
35495 emit_insn (insn (mask, op0, op1,
35496 gen_rtx_fmt_ee (code, mode, op0, op1)));
35497 return mask;
35498 }
35499
35500 /* Generate and return a rtx of mode MODE for 2**n where n is the number
35501 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
35502 static rtx
35503 ix86_gen_TWO52 (enum machine_mode mode)
35504 {
35505 REAL_VALUE_TYPE TWO52r;
35506 rtx TWO52;
35507
35508 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
35509 TWO52 = const_double_from_real_value (TWO52r, mode);
35510 TWO52 = force_reg (mode, TWO52);
35511
35512 return TWO52;
35513 }
35514
35515 /* Expand SSE sequence for computing lround from OP1 storing
35516 into OP0. */
35517 void
35518 ix86_expand_lround (rtx op0, rtx op1)
35519 {
35520 /* C code for the stuff we're doing below:
35521 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
35522 return (long)tmp;
35523 */
35524 enum machine_mode mode = GET_MODE (op1);
35525 const struct real_format *fmt;
35526 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35527 rtx adj;
35528
35529 /* load nextafter (0.5, 0.0) */
35530 fmt = REAL_MODE_FORMAT (mode);
35531 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35532 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
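/* pred_half is 0.5 - 2**(-p-1) where p is the precision of the format,
   i.e. nextafter (0.5, 0.0).  Using the value just below 0.5 keeps the
   intermediate addition op1 + adj from rounding up across an integer
   boundary for inputs whose fractional part is just under one half.  */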
35533
35534 /* adj = copysign (0.5, op1) */
35535 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
35536 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
35537
35538 /* adj = op1 + adj */
35539 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
35540
35541 /* op0 = (imode)adj */
35542 expand_fix (op0, adj, 0);
35543 }
35544
35545 /* Expand SSE2 sequence for computing lfloor or lceil (depending on
35546 DO_FLOOR) from OP1, storing the result into OP0. */
35547 void
35548 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
35549 {
35550 /* C code for the stuff we're doing below (for do_floor):
35551 xi = (long)op1;
35552 xi -= (double)xi > op1 ? 1 : 0;
35553 return xi;
35554 */
35555 enum machine_mode fmode = GET_MODE (op1);
35556 enum machine_mode imode = GET_MODE (op0);
35557 rtx ireg, freg, label, tmp;
35558
35559 /* reg = (long)op1 */
35560 ireg = gen_reg_rtx (imode);
35561 expand_fix (ireg, op1, 0);
35562
35563 /* freg = (double)reg */
35564 freg = gen_reg_rtx (fmode);
35565 expand_float (freg, ireg, 0);
35566
35567 /* ireg = (freg > op1) ? ireg - 1 : ireg */
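/* The comparison is emitted with the inverted condition (UNLE, operands
   swapped for the ceil case), so the branch to LABEL is taken when no
   compensation is needed and the fall-through path applies the +/-1
   adjustment.  */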
35568 label = ix86_expand_sse_compare_and_jump (UNLE,
35569 freg, op1, !do_floor);
35570 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
35571 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
35572 emit_move_insn (ireg, tmp);
35573
35574 emit_label (label);
35575 LABEL_NUSES (label) = 1;
35576
35577 emit_move_insn (op0, ireg);
35578 }
35579
35580 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
35581 result in OPERAND0. */
35582 void
35583 ix86_expand_rint (rtx operand0, rtx operand1)
35584 {
35585 /* C code for the stuff we're doing below:
35586 xa = fabs (operand1);
35587 if (!isless (xa, 2**52))
35588 return operand1;
35589 xa = xa + 2**52 - 2**52;
35590 return copysign (xa, operand1);
35591 */
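/* The 2**52 trick: for |xa| < 2**52 (2**23 for SFmode) the sum
   xa + 2**52 has no representable fraction bits, so the FP rounding of
   the addition snaps xa to a nearby integer in the current rounding
   mode (exactly what rint requires), and subtracting 2**52 again leaves
   that integer.  Larger magnitudes are already integral and are handled
   by the isless test above.  */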
35592 enum machine_mode mode = GET_MODE (operand0);
35593 rtx res, xa, label, TWO52, mask;
35594
35595 res = gen_reg_rtx (mode);
35596 emit_move_insn (res, operand1);
35597
35598 /* xa = abs (operand1) */
35599 xa = ix86_expand_sse_fabs (res, &mask);
35600
35601 /* if (!isless (xa, TWO52)) goto label; */
35602 TWO52 = ix86_gen_TWO52 (mode);
35603 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35604
35605 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35606 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35607
35608 ix86_sse_copysign_to_positive (res, xa, res, mask);
35609
35610 emit_label (label);
35611 LABEL_NUSES (label) = 1;
35612
35613 emit_move_insn (operand0, res);
35614 }
35615
35616 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35617 into OPERAND0. */
35618 void
35619 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
35620 {
35621 /* C code for the stuff we expand below.
35622 double xa = fabs (x), x2;
35623 if (!isless (xa, TWO52))
35624 return x;
35625 xa = xa + TWO52 - TWO52;
35626 x2 = copysign (xa, x);
35627 Compensate. Floor:
35628 if (x2 > x)
35629 x2 -= 1;
35630 Compensate. Ceil:
35631 if (x2 < x)
35632 x2 -= -1;
35633 return x2;
35634 */
35635 enum machine_mode mode = GET_MODE (operand0);
35636 rtx xa, TWO52, tmp, label, one, res, mask;
35637
35638 TWO52 = ix86_gen_TWO52 (mode);
35639
35640 /* Temporary for holding the result, initialized to the input
35641 operand to ease control flow. */
35642 res = gen_reg_rtx (mode);
35643 emit_move_insn (res, operand1);
35644
35645 /* xa = abs (operand1) */
35646 xa = ix86_expand_sse_fabs (res, &mask);
35647
35648 /* if (!isless (xa, TWO52)) goto label; */
35649 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35650
35651 /* xa = xa + TWO52 - TWO52; */
35652 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35653 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35654
35655 /* xa = copysign (xa, operand1) */
35656 ix86_sse_copysign_to_positive (xa, xa, res, mask);
35657
35658 /* generate 1.0 or -1.0 */
35659 one = force_reg (mode,
35660 const_double_from_real_value (do_floor
35661 ? dconst1 : dconstm1, mode));
35662
35663 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35664 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35665 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35666 gen_rtx_AND (mode, one, tmp)));
35667 /* We always need to subtract here to preserve signed zero. */
35668 tmp = expand_simple_binop (mode, MINUS,
35669 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35670 emit_move_insn (res, tmp);
35671
35672 emit_label (label);
35673 LABEL_NUSES (label) = 1;
35674
35675 emit_move_insn (operand0, res);
35676 }
35677
35678 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35679 into OPERAND0. */
35680 void
35681 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
35682 {
35683 /* C code for the stuff we expand below.
35684 double xa = fabs (x), x2;
35685 if (!isless (xa, TWO52))
35686 return x;
35687 x2 = (double)(long)x;
35688 Compensate. Floor:
35689 if (x2 > x)
35690 x2 -= 1;
35691 Compensate. Ceil:
35692 if (x2 < x)
35693 x2 += 1;
35694 if (HONOR_SIGNED_ZEROS (mode))
35695 return copysign (x2, x);
35696 return x2;
35697 */
35698 enum machine_mode mode = GET_MODE (operand0);
35699 rtx xa, xi, TWO52, tmp, label, one, res, mask;
35700
35701 TWO52 = ix86_gen_TWO52 (mode);
35702
35703 /* Temporary for holding the result, initialized to the input
35704 operand to ease control flow. */
35705 res = gen_reg_rtx (mode);
35706 emit_move_insn (res, operand1);
35707
35708 /* xa = abs (operand1) */
35709 xa = ix86_expand_sse_fabs (res, &mask);
35710
35711 /* if (!isless (xa, TWO52)) goto label; */
35712 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35713
35714 /* xa = (double)(long)x */
35715 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35716 expand_fix (xi, res, 0);
35717 expand_float (xa, xi, 0);
35718
35719 /* generate 1.0 */
35720 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35721
35722 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35723 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35724 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35725 gen_rtx_AND (mode, one, tmp)));
35726 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
35727 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35728 emit_move_insn (res, tmp);
35729
35730 if (HONOR_SIGNED_ZEROS (mode))
35731 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35732
35733 emit_label (label);
35734 LABEL_NUSES (label) = 1;
35735
35736 emit_move_insn (operand0, res);
35737 }
35738
35739 /* Expand SSE sequence for computing round from OPERAND1 storing
35740 into OPERAND0. Sequence that works without relying on DImode truncation
35741 via cvttsd2siq that is only available on 64bit targets. */
35742 void
35743 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
35744 {
35745 /* C code for the stuff we expand below.
35746 double xa = fabs (x), xa2, x2;
35747 if (!isless (xa, TWO52))
35748 return x;
35749 Using the absolute value and copying back sign makes
35750 -0.0 -> -0.0 correct.
35751 xa2 = xa + TWO52 - TWO52;
35752 Compensate.
35753 dxa = xa2 - xa;
35754 if (dxa <= -0.5)
35755 xa2 += 1;
35756 else if (dxa > 0.5)
35757 xa2 -= 1;
35758 x2 = copysign (xa2, x);
35759 return x2;
35760 */
35761 enum machine_mode mode = GET_MODE (operand0);
35762 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
35763
35764 TWO52 = ix86_gen_TWO52 (mode);
35765
35766 /* Temporary for holding the result, initialized to the input
35767 operand to ease control flow. */
35768 res = gen_reg_rtx (mode);
35769 emit_move_insn (res, operand1);
35770
35771 /* xa = abs (operand1) */
35772 xa = ix86_expand_sse_fabs (res, &mask);
35773
35774 /* if (!isless (xa, TWO52)) goto label; */
35775 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35776
35777 /* xa2 = xa + TWO52 - TWO52; */
35778 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35779 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
35780
35781 /* dxa = xa2 - xa; */
35782 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
35783
35784 /* generate 0.5, 1.0 and -0.5 */
35785 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
35786 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
35787 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
35788 0, OPTAB_DIRECT);
35789
35790 /* Compensate. */
35791 tmp = gen_reg_rtx (mode);
35792 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
35793 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
35794 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35795 gen_rtx_AND (mode, one, tmp)));
35796 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35797 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
35798 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
35799 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35800 gen_rtx_AND (mode, one, tmp)));
35801 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35802
35803 /* res = copysign (xa2, operand1) */
35804 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
35805
35806 emit_label (label);
35807 LABEL_NUSES (label) = 1;
35808
35809 emit_move_insn (operand0, res);
35810 }
35811
35812 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35813 into OPERAND0. */
35814 void
35815 ix86_expand_trunc (rtx operand0, rtx operand1)
35816 {
35817 /* C code for SSE variant we expand below.
35818 double xa = fabs (x), x2;
35819 if (!isless (xa, TWO52))
35820 return x;
35821 x2 = (double)(long)x;
35822 if (HONOR_SIGNED_ZEROS (mode))
35823 return copysign (x2, x);
35824 return x2;
35825 */
35826 enum machine_mode mode = GET_MODE (operand0);
35827 rtx xa, xi, TWO52, label, res, mask;
35828
35829 TWO52 = ix86_gen_TWO52 (mode);
35830
35831 /* Temporary for holding the result, initialized to the input
35832 operand to ease control flow. */
35833 res = gen_reg_rtx (mode);
35834 emit_move_insn (res, operand1);
35835
35836 /* xa = abs (operand1) */
35837 xa = ix86_expand_sse_fabs (res, &mask);
35838
35839 /* if (!isless (xa, TWO52)) goto label; */
35840 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35841
35842 /* x = (double)(long)x */
35843 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35844 expand_fix (xi, res, 0);
35845 expand_float (res, xi, 0);
35846
35847 if (HONOR_SIGNED_ZEROS (mode))
35848 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35849
35850 emit_label (label);
35851 LABEL_NUSES (label) = 1;
35852
35853 emit_move_insn (operand0, res);
35854 }
35855
35856 /* Expand SSE sequence for computing trunc from OPERAND1 storing into
35857 OPERAND0, without relying on DImode truncation via cvttsd2siq (64-bit only). */
35858 void
35859 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
35860 {
35861 enum machine_mode mode = GET_MODE (operand0);
35862 rtx xa, mask, TWO52, label, one, res, smask, tmp;
35863
35864 /* C code for SSE variant we expand below.
35865 double xa = fabs (x), xa2, x2;
35866 if (!isless (xa, TWO52))
35867 return x;
35868 xa2 = xa + TWO52 - TWO52;
35869 Compensate:
35870 if (xa2 > xa)
35871 xa2 -= 1.0;
35872 x2 = copysign (xa2, x);
35873 return x2;
35874 */
35875
35876 TWO52 = ix86_gen_TWO52 (mode);
35877
35878 /* Temporary for holding the result, initialized to the input
35879 operand to ease control flow. */
35880 res = gen_reg_rtx (mode);
35881 emit_move_insn (res, operand1);
35882
35883 /* xa = abs (operand1) */
35884 xa = ix86_expand_sse_fabs (res, &smask);
35885
35886 /* if (!isless (xa, TWO52)) goto label; */
35887 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35888
35889 /* res = xa + TWO52 - TWO52; */
35890 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35891 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
35892 emit_move_insn (res, tmp);
35893
35894 /* generate 1.0 */
35895 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35896
35897 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
35898 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
35899 emit_insn (gen_rtx_SET (VOIDmode, mask,
35900 gen_rtx_AND (mode, mask, one)));
35901 tmp = expand_simple_binop (mode, MINUS,
35902 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
35903 emit_move_insn (res, tmp);
35904
35905 /* res = copysign (res, operand1) */
35906 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
35907
35908 emit_label (label);
35909 LABEL_NUSES (label) = 1;
35910
35911 emit_move_insn (operand0, res);
35912 }
35913
35914 /* Expand SSE sequence for computing round from OPERAND1 storing
35915 into OPERAND0. */
35916 void
35917 ix86_expand_round (rtx operand0, rtx operand1)
35918 {
35919 /* C code for the stuff we're doing below:
35920 double xa = fabs (x);
35921 if (!isless (xa, TWO52))
35922 return x;
35923 xa = (double)(long)(xa + nextafter (0.5, 0.0));
35924 return copysign (xa, x);
35925 */
35926 enum machine_mode mode = GET_MODE (operand0);
35927 rtx res, TWO52, xa, label, xi, half, mask;
35928 const struct real_format *fmt;
35929 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35930
35931 /* Temporary for holding the result, initialized to the input
35932 operand to ease control flow. */
35933 res = gen_reg_rtx (mode);
35934 emit_move_insn (res, operand1);
35935
35936 TWO52 = ix86_gen_TWO52 (mode);
35937 xa = ix86_expand_sse_fabs (res, &mask);
35938 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35939
35940 /* load nextafter (0.5, 0.0) */
35941 fmt = REAL_MODE_FORMAT (mode);
35942 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35943 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35944
35945 /* xa = xa + 0.5 */
35946 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
35947 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
35948
35949 /* xa = (double)(int64_t)xa */
35950 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35951 expand_fix (xi, xa, 0);
35952 expand_float (xa, xi, 0);
35953
35954 /* res = copysign (xa, operand1) */
35955 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
35956
35957 emit_label (label);
35958 LABEL_NUSES (label) = 1;
35959
35960 emit_move_insn (operand0, res);
35961 }
35962
35963 /* Expand SSE sequence for computing round
35964 from OP1 storing into OP0 using sse4 round insn. */
35965 void
35966 ix86_expand_round_sse4 (rtx op0, rtx op1)
35967 {
35968 enum machine_mode mode = GET_MODE (op0);
35969 rtx e1, e2, res, half;
35970 const struct real_format *fmt;
35971 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35972 rtx (*gen_copysign) (rtx, rtx, rtx);
35973 rtx (*gen_round) (rtx, rtx, rtx);
35974
35975 switch (mode)
35976 {
35977 case SFmode:
35978 gen_copysign = gen_copysignsf3;
35979 gen_round = gen_sse4_1_roundsf2;
35980 break;
35981 case DFmode:
35982 gen_copysign = gen_copysigndf3;
35983 gen_round = gen_sse4_1_rounddf2;
35984 break;
35985 default:
35986 gcc_unreachable ();
35987 }
35988
35989 /* round (a) = trunc (a + copysign (0.5, a)) */
35990
35991 /* load nextafter (0.5, 0.0) */
35992 fmt = REAL_MODE_FORMAT (mode);
35993 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35994 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35995 half = const_double_from_real_value (pred_half, mode);
35996
35997 /* e1 = copysign (0.5, op1) */
35998 e1 = gen_reg_rtx (mode);
35999 emit_insn (gen_copysign (e1, half, op1));
36000
36001 /* e2 = op1 + e1 */
36002 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
36003
36004 /* res = trunc (e2) */
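/* ROUND_TRUNC selects the truncation (round-toward-zero) mode in the
   immediate of the SSE4.1 roundss/roundsd pattern, implementing the
   trunc step of the identity above.  */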
36005 res = gen_reg_rtx (mode);
36006 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
36007
36008 emit_move_insn (op0, res);
36009 }
36010 \f
36011
36012 /* Table of valid machine attributes. */
36013 static const struct attribute_spec ix86_attribute_table[] =
36014 {
36015 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
36016 affects_type_identity } */
36017 /* Stdcall attribute says callee is responsible for popping arguments
36018 if they are not variable. */
36019 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36020 true },
36021 /* Fastcall attribute says callee is responsible for popping arguments
36022 if they are not variable. */
36023 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36024 true },
36025 /* Thiscall attribute says callee is responsible for popping arguments
36026 if they are not variable. */
36027 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36028 true },
36029 /* Cdecl attribute says the callee is a normal C declaration */
36030 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36031 true },
36032 /* Regparm attribute specifies how many integer arguments are to be
36033 passed in registers. */
36034 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
36035 true },
36036 /* Sseregparm attribute says we are using x86_64 calling conventions
36037 for FP arguments. */
36038 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36039 true },
36040 /* The transactional memory builtins are implicitly regparm or fastcall
36041 depending on the ABI. Override the generic do-nothing attribute that
36042 these builtins were declared with. */
36043 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
36044 true },
36045 /* force_align_arg_pointer says this function realigns the stack at entry. */
36046 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
36047 false, true, true, ix86_handle_cconv_attribute, false },
36048 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
36049 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
36050 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
36051 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
36052 false },
36053 #endif
36054 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
36055 false },
36056 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
36057 false },
36058 #ifdef SUBTARGET_ATTRIBUTE_TABLE
36059 SUBTARGET_ATTRIBUTE_TABLE,
36060 #endif
36061 /* ms_abi and sysv_abi calling convention function attributes. */
36062 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
36063 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
36064 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
36065 false },
36066 { "callee_pop_aggregate_return", 1, 1, false, true, true,
36067 ix86_handle_callee_pop_aggregate_return, true },
36068 /* End element. */
36069 { NULL, 0, 0, false, false, false, NULL, false }
36070 };
36071
36072 /* Implement targetm.vectorize.builtin_vectorization_cost. */
36073 static int
36074 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
36075 tree vectype,
36076 int misalign ATTRIBUTE_UNUSED)
36077 {
36078 unsigned elements;
36079
36080 switch (type_of_cost)
36081 {
36082 case scalar_stmt:
36083 return ix86_cost->scalar_stmt_cost;
36084
36085 case scalar_load:
36086 return ix86_cost->scalar_load_cost;
36087
36088 case scalar_store:
36089 return ix86_cost->scalar_store_cost;
36090
36091 case vector_stmt:
36092 return ix86_cost->vec_stmt_cost;
36093
36094 case vector_load:
36095 return ix86_cost->vec_align_load_cost;
36096
36097 case vector_store:
36098 return ix86_cost->vec_store_cost;
36099
36100 case vec_to_scalar:
36101 return ix86_cost->vec_to_scalar_cost;
36102
36103 case scalar_to_vec:
36104 return ix86_cost->scalar_to_vec_cost;
36105
36106 case unaligned_load:
36107 case unaligned_store:
36108 return ix86_cost->vec_unalign_load_cost;
36109
36110 case cond_branch_taken:
36111 return ix86_cost->cond_taken_branch_cost;
36112
36113 case cond_branch_not_taken:
36114 return ix86_cost->cond_not_taken_branch_cost;
36115
36116 case vec_perm:
36117 case vec_promote_demote:
36118 return ix86_cost->vec_stmt_cost;
36119
36120 case vec_construct:
36121 elements = TYPE_VECTOR_SUBPARTS (vectype);
36122 return elements / 2 + 1;
36123
36124 default:
36125 gcc_unreachable ();
36126 }
36127 }
36128
36129 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
36130 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
36131 insn every time. */
36132
36133 static GTY(()) rtx vselect_insn;
36134
36135 /* Initialize vselect_insn. */
36136
36137 static void
36138 init_vselect_insn (void)
36139 {
36140 unsigned i;
36141 rtx x;
36142
36143 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
36144 for (i = 0; i < MAX_VECT_LEN; ++i)
36145 XVECEXP (x, 0, i) = const0_rtx;
36146 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
36147 const0_rtx), x);
36148 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
36149 start_sequence ();
36150 vselect_insn = emit_insn (x);
36151 end_sequence ();
36152 }
36153
36154 /* Construct (set target (vec_select op0 (parallel perm))) and
36155 return true if that's a valid instruction in the active ISA. */
36156
36157 static bool
36158 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
36159 unsigned nelt, bool testing_p)
36160 {
36161 unsigned int i;
36162 rtx x, save_vconcat;
36163 int icode;
36164
36165 if (vselect_insn == NULL_RTX)
36166 init_vselect_insn ();
36167
36168 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
36169 PUT_NUM_ELEM (XVEC (x, 0), nelt);
36170 for (i = 0; i < nelt; ++i)
36171 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
36172 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
36173 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
36174 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
36175 SET_DEST (PATTERN (vselect_insn)) = target;
36176 icode = recog_memoized (vselect_insn);
36177
36178 if (icode >= 0 && !testing_p)
36179 emit_insn (copy_rtx (PATTERN (vselect_insn)));
36180
36181 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
36182 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
36183 INSN_CODE (vselect_insn) = -1;
36184
36185 return icode >= 0;
36186 }
36187
36188 /* Similar, but generate a vec_concat from op0 and op1 as well. */
36189
36190 static bool
36191 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
36192 const unsigned char *perm, unsigned nelt,
36193 bool testing_p)
36194 {
36195 enum machine_mode v2mode;
36196 rtx x;
36197 bool ok;
36198
36199 if (vselect_insn == NULL_RTX)
36200 init_vselect_insn ();
36201
36202 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
36203 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
36204 PUT_MODE (x, v2mode);
36205 XEXP (x, 0) = op0;
36206 XEXP (x, 1) = op1;
36207 ok = expand_vselect (target, x, perm, nelt, testing_p);
36208 XEXP (x, 0) = const0_rtx;
36209 XEXP (x, 1) = const0_rtx;
36210 return ok;
36211 }
36212
36213 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36214 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
36215
36216 static bool
36217 expand_vec_perm_blend (struct expand_vec_perm_d *d)
36218 {
36219 enum machine_mode vmode = d->vmode;
36220 unsigned i, mask, nelt = d->nelt;
36221 rtx target, op0, op1, x;
36222 rtx rperm[32], vperm;
36223
36224 if (d->one_operand_p)
36225 return false;
36226 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
36227 ;
36228 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
36229 ;
36230 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
36231 ;
36232 else
36233 return false;
36234
36235 /* This is a blend, not a permute. Elements must stay in their
36236 respective lanes. */
36237 for (i = 0; i < nelt; ++i)
36238 {
36239 unsigned e = d->perm[i];
36240 if (!(e == i || e == i + nelt))
36241 return false;
36242 }
36243
36244 if (d->testing_p)
36245 return true;
36246
36247 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
36248 decision should be extracted elsewhere, so that we only try that
36249 sequence once all budget==3 options have been tried. */
36250 target = d->target;
36251 op0 = d->op0;
36252 op1 = d->op1;
36253 mask = 0;
36254
36255 switch (vmode)
36256 {
36257 case V4DFmode:
36258 case V8SFmode:
36259 case V2DFmode:
36260 case V4SFmode:
36261 case V8HImode:
36262 case V8SImode:
36263 for (i = 0; i < nelt; ++i)
36264 mask |= (d->perm[i] >= nelt) << i;
36265 break;
36266
36267 case V2DImode:
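/* Each DImode element corresponds to four 16-bit words of the V8HImode
   view used for pblendw, so every element contributes four identical
   mask bits (0xf).  */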
36268 for (i = 0; i < 2; ++i)
36269 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
36270 vmode = V8HImode;
36271 goto do_subreg;
36272
36273 case V4SImode:
36274 for (i = 0; i < 4; ++i)
36275 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
36276 vmode = V8HImode;
36277 goto do_subreg;
36278
36279 case V16QImode:
36280 /* See if bytes move in pairs so we can use pblendw with
36281 an immediate argument, rather than pblendvb with a vector
36282 argument. */
36283 for (i = 0; i < 16; i += 2)
36284 if (d->perm[i] + 1 != d->perm[i + 1])
36285 {
36286 use_pblendvb:
36287 for (i = 0; i < nelt; ++i)
36288 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
36289
36290 finish_pblendvb:
36291 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
36292 vperm = force_reg (vmode, vperm);
36293
36294 if (GET_MODE_SIZE (vmode) == 16)
36295 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
36296 else
36297 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
36298 return true;
36299 }
36300
36301 for (i = 0; i < 8; ++i)
36302 mask |= (d->perm[i * 2] >= 16) << i;
36303 vmode = V8HImode;
36304 /* FALLTHRU */
36305
36306 do_subreg:
36307 target = gen_lowpart (vmode, target);
36308 op0 = gen_lowpart (vmode, op0);
36309 op1 = gen_lowpart (vmode, op1);
36310 break;
36311
36312 case V32QImode:
36313 /* See if bytes move in pairs. If not, vpblendvb must be used. */
36314 for (i = 0; i < 32; i += 2)
36315 if (d->perm[i] + 1 != d->perm[i + 1])
36316 goto use_pblendvb;
36317 /* See if bytes move in quadruplets. If yes, vpblendd
36318 with immediate can be used. */
36319 for (i = 0; i < 32; i += 4)
36320 if (d->perm[i] + 2 != d->perm[i + 2])
36321 break;
36322 if (i < 32)
36323 {
36324 /* See if bytes move the same in both lanes. If yes,
36325 vpblendw with immediate can be used. */
36326 for (i = 0; i < 16; i += 2)
36327 if (d->perm[i] + 16 != d->perm[i + 16])
36328 goto use_pblendvb;
36329
36330 /* Use vpblendw. */
36331 for (i = 0; i < 16; ++i)
36332 mask |= (d->perm[i * 2] >= 32) << i;
36333 vmode = V16HImode;
36334 goto do_subreg;
36335 }
36336
36337 /* Use vpblendd. */
36338 for (i = 0; i < 8; ++i)
36339 mask |= (d->perm[i * 4] >= 32) << i;
36340 vmode = V8SImode;
36341 goto do_subreg;
36342
36343 case V16HImode:
36344 /* See if words move in pairs. If yes, vpblendd can be used. */
36345 for (i = 0; i < 16; i += 2)
36346 if (d->perm[i] + 1 != d->perm[i + 1])
36347 break;
36348 if (i < 16)
36349 {
36350 /* See if words move the same in both lanes. If not,
36351 vpblendvb must be used. */
36352 for (i = 0; i < 8; i++)
36353 if (d->perm[i] + 8 != d->perm[i + 8])
36354 {
36355 /* Use vpblendvb. */
36356 for (i = 0; i < 32; ++i)
36357 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
36358
36359 vmode = V32QImode;
36360 nelt = 32;
36361 target = gen_lowpart (vmode, target);
36362 op0 = gen_lowpart (vmode, op0);
36363 op1 = gen_lowpart (vmode, op1);
36364 goto finish_pblendvb;
36365 }
36366
36367 /* Use vpblendw. */
36368 for (i = 0; i < 16; ++i)
36369 mask |= (d->perm[i] >= 16) << i;
36370 break;
36371 }
36372
36373 /* Use vpblendd. */
36374 for (i = 0; i < 8; ++i)
36375 mask |= (d->perm[i * 2] >= 16) << i;
36376 vmode = V8SImode;
36377 goto do_subreg;
36378
36379 case V4DImode:
36380 /* Use vpblendd. */
36381 for (i = 0; i < 4; ++i)
36382 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
36383 vmode = V8SImode;
36384 goto do_subreg;
36385
36386 default:
36387 gcc_unreachable ();
36388 }
36389
36390 /* This matches five different patterns with the different modes. */
36391 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
36392 x = gen_rtx_SET (VOIDmode, target, x);
36393 emit_insn (x);
36394
36395 return true;
36396 }
36397
36398 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36399 in terms of the variable form of vpermilps.
36400
36401 Note that we will have already failed the immediate input vpermilps,
36402 which requires that the high and low part shuffle be identical; the
36403 variable form doesn't require that. */
36404
36405 static bool
36406 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
36407 {
36408 rtx rperm[8], vperm;
36409 unsigned i;
36410
36411 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
36412 return false;
36413
36414 /* We can only permute within the 128-bit lane. */
36415 for (i = 0; i < 8; ++i)
36416 {
36417 unsigned e = d->perm[i];
36418 if (i < 4 ? e >= 4 : e < 4)
36419 return false;
36420 }
36421
36422 if (d->testing_p)
36423 return true;
36424
36425 for (i = 0; i < 8; ++i)
36426 {
36427 unsigned e = d->perm[i];
36428
36429 /* Within each 128-bit lane, the elements of op0 are numbered
36430 from 0 and the elements of op1 are numbered from 4. */
36431 if (e >= 8 + 4)
36432 e -= 8;
36433 else if (e >= 4)
36434 e -= 4;
36435
36436 rperm[i] = GEN_INT (e);
36437 }
36438
36439 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
36440 vperm = force_reg (V8SImode, vperm);
36441 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
36442
36443 return true;
36444 }
36445
36446 /* Return true if permutation D can be performed as VMODE permutation
36447 instead. */
36448
36449 static bool
36450 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
36451 {
36452 unsigned int i, j, chunk;
36453
36454 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
36455 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
36456 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
36457 return false;
36458
36459 if (GET_MODE_NUNITS (vmode) >= d->nelt)
36460 return true;
36461
36462 chunk = d->nelt / GET_MODE_NUNITS (vmode);
36463 for (i = 0; i < d->nelt; i += chunk)
36464 if (d->perm[i] & (chunk - 1))
36465 return false;
36466 else
36467 for (j = 1; j < chunk; ++j)
36468 if (d->perm[i] + j != d->perm[i + j])
36469 return false;
36470
36471 return true;
36472 }
36473
36474 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36475 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
36476
36477 static bool
36478 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
36479 {
36480 unsigned i, nelt, eltsz, mask;
36481 unsigned char perm[32];
36482 enum machine_mode vmode = V16QImode;
36483 rtx rperm[32], vperm, target, op0, op1;
36484
36485 nelt = d->nelt;
36486
36487 if (!d->one_operand_p)
36488 {
36489 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
36490 {
36491 if (TARGET_AVX2
36492 && valid_perm_using_mode_p (V2TImode, d))
36493 {
36494 if (d->testing_p)
36495 return true;
36496
36497 /* Use vperm2i128 insn. The pattern uses
36498 V4DImode instead of V2TImode. */
36499 target = gen_lowpart (V4DImode, d->target);
36500 op0 = gen_lowpart (V4DImode, d->op0);
36501 op1 = gen_lowpart (V4DImode, d->op1);
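/* The vperm2i128 immediate picks one of the four 128-bit chunks of the
   concatenated {op0,op1} for each output lane: bits 0-1 select the
   source of the low lane and bits 4-5 (hence the * 16) the source of
   the high lane.  d->perm[0] and d->perm[nelt / 2] identify those
   chunks.  */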
36502 rperm[0]
36503 = GEN_INT ((d->perm[0] / (nelt / 2))
36504 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
36505 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
36506 return true;
36507 }
36508 return false;
36509 }
36510 }
36511 else
36512 {
36513 if (GET_MODE_SIZE (d->vmode) == 16)
36514 {
36515 if (!TARGET_SSSE3)
36516 return false;
36517 }
36518 else if (GET_MODE_SIZE (d->vmode) == 32)
36519 {
36520 if (!TARGET_AVX2)
36521 return false;
36522
36523 /* V4DImode should be already handled through
36524 expand_vselect by vpermq instruction. */
36525 gcc_assert (d->vmode != V4DImode);
36526
36527 vmode = V32QImode;
36528 if (d->vmode == V8SImode
36529 || d->vmode == V16HImode
36530 || d->vmode == V32QImode)
36531 {
36532 /* First see if vpermq can be used for
36533 V8SImode/V16HImode/V32QImode. */
36534 if (valid_perm_using_mode_p (V4DImode, d))
36535 {
36536 for (i = 0; i < 4; i++)
36537 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
36538 if (d->testing_p)
36539 return true;
36540 return expand_vselect (gen_lowpart (V4DImode, d->target),
36541 gen_lowpart (V4DImode, d->op0),
36542 perm, 4, false);
36543 }
36544
36545 /* Next see if vpermd can be used. */
36546 if (valid_perm_using_mode_p (V8SImode, d))
36547 vmode = V8SImode;
36548 }
36549 /* Or if vpermps can be used. */
36550 else if (d->vmode == V8SFmode)
36551 vmode = V8SImode;
36552
36553 if (vmode == V32QImode)
36554 {
36555 /* vpshufb only works within 128-bit lanes; it is not
36556 possible to shuffle bytes in between the lanes. */
36557 for (i = 0; i < nelt; ++i)
36558 if ((d->perm[i] ^ i) & (nelt / 2))
36559 return false;
36560 }
36561 }
36562 else
36563 return false;
36564 }
36565
36566 if (d->testing_p)
36567 return true;
36568
36569 if (vmode == V8SImode)
36570 for (i = 0; i < 8; ++i)
36571 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
36572 else
36573 {
36574 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36575 if (!d->one_operand_p)
36576 mask = 2 * nelt - 1;
36577 else if (vmode == V16QImode)
36578 mask = nelt - 1;
36579 else
36580 mask = nelt / 2 - 1;
36581
36582 for (i = 0; i < nelt; ++i)
36583 {
36584 unsigned j, e = d->perm[i] & mask;
36585 for (j = 0; j < eltsz; ++j)
36586 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
36587 }
36588 }
36589
36590 vperm = gen_rtx_CONST_VECTOR (vmode,
36591 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
36592 vperm = force_reg (vmode, vperm);
36593
36594 target = gen_lowpart (vmode, d->target);
36595 op0 = gen_lowpart (vmode, d->op0);
36596 if (d->one_operand_p)
36597 {
36598 if (vmode == V16QImode)
36599 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
36600 else if (vmode == V32QImode)
36601 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
36602 else if (vmode == V8SFmode)
36603 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
36604 else
36605 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
36606 }
36607 else
36608 {
36609 op1 = gen_lowpart (vmode, d->op1);
36610 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
36611 }
36612
36613 return true;
36614 }
36615
36616 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
36617 in a single instruction. */
36618
36619 static bool
36620 expand_vec_perm_1 (struct expand_vec_perm_d *d)
36621 {
36622 unsigned i, nelt = d->nelt;
36623 unsigned char perm2[MAX_VECT_LEN];
36624
36625 /* Check plain VEC_SELECT first, because AVX has instructions that could
36626 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
36627 input where SEL+CONCAT may not. */
36628 if (d->one_operand_p)
36629 {
36630 int mask = nelt - 1;
36631 bool identity_perm = true;
36632 bool broadcast_perm = true;
36633
36634 for (i = 0; i < nelt; i++)
36635 {
36636 perm2[i] = d->perm[i] & mask;
36637 if (perm2[i] != i)
36638 identity_perm = false;
36639 if (perm2[i])
36640 broadcast_perm = false;
36641 }
36642
36643 if (identity_perm)
36644 {
36645 if (!d->testing_p)
36646 emit_move_insn (d->target, d->op0);
36647 return true;
36648 }
36649 else if (broadcast_perm && TARGET_AVX2)
36650 {
36651 /* Use vpbroadcast{b,w,d}. */
36652 rtx (*gen) (rtx, rtx) = NULL;
36653 switch (d->vmode)
36654 {
36655 case V32QImode:
36656 gen = gen_avx2_pbroadcastv32qi_1;
36657 break;
36658 case V16HImode:
36659 gen = gen_avx2_pbroadcastv16hi_1;
36660 break;
36661 case V8SImode:
36662 gen = gen_avx2_pbroadcastv8si_1;
36663 break;
36664 case V16QImode:
36665 gen = gen_avx2_pbroadcastv16qi;
36666 break;
36667 case V8HImode:
36668 gen = gen_avx2_pbroadcastv8hi;
36669 break;
36670 case V8SFmode:
36671 gen = gen_avx2_vec_dupv8sf_1;
36672 break;
36673 /* For other modes prefer other shuffles this function creates. */
36674 default: break;
36675 }
36676 if (gen != NULL)
36677 {
36678 if (!d->testing_p)
36679 emit_insn (gen (d->target, d->op0));
36680 return true;
36681 }
36682 }
36683
36684 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
36685 return true;
36686
36687 /* There are plenty of patterns in sse.md that are written for
36688 SEL+CONCAT and are not replicated for a single op. Perhaps
36689 that should be changed, to avoid the nastiness here. */
36690
36691 /* Recognize interleave style patterns, which means incrementing
36692 every other permutation operand. */
36693 for (i = 0; i < nelt; i += 2)
36694 {
36695 perm2[i] = d->perm[i] & mask;
36696 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
36697 }
36698 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
36699 d->testing_p))
36700 return true;
36701
36702 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
36703 if (nelt >= 4)
36704 {
36705 for (i = 0; i < nelt; i += 4)
36706 {
36707 perm2[i + 0] = d->perm[i + 0] & mask;
36708 perm2[i + 1] = d->perm[i + 1] & mask;
36709 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
36710 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
36711 }
36712
36713 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
36714 d->testing_p))
36715 return true;
36716 }
36717 }
36718
36719 /* Finally, try the fully general two operand permute. */
36720 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
36721 d->testing_p))
36722 return true;
36723
36724 /* Recognize interleave style patterns with reversed operands. */
36725 if (!d->one_operand_p)
36726 {
36727 for (i = 0; i < nelt; ++i)
36728 {
36729 unsigned e = d->perm[i];
36730 if (e >= nelt)
36731 e -= nelt;
36732 else
36733 e += nelt;
36734 perm2[i] = e;
36735 }
36736
36737 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
36738 d->testing_p))
36739 return true;
36740 }
36741
36742 /* Try the SSE4.1 blend variable merge instructions. */
36743 if (expand_vec_perm_blend (d))
36744 return true;
36745
36746 /* Try one of the AVX vpermil variable permutations. */
36747 if (expand_vec_perm_vpermil (d))
36748 return true;
36749
36750 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
36751 vpshufb, vpermd, vpermps or vpermq variable permutation. */
36752 if (expand_vec_perm_pshufb (d))
36753 return true;
36754
36755 return false;
36756 }
36757
36758 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36759 in terms of a pair of pshuflw + pshufhw instructions. */
36760
36761 static bool
36762 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
36763 {
36764 unsigned char perm2[MAX_VECT_LEN];
36765 unsigned i;
36766 bool ok;
36767
36768 if (d->vmode != V8HImode || !d->one_operand_p)
36769 return false;
36770
36771 /* The two permutations only operate in 64-bit lanes. */
36772 for (i = 0; i < 4; ++i)
36773 if (d->perm[i] >= 4)
36774 return false;
36775 for (i = 4; i < 8; ++i)
36776 if (d->perm[i] < 4)
36777 return false;
36778
36779 if (d->testing_p)
36780 return true;
36781
36782 /* Emit the pshuflw. */
36783 memcpy (perm2, d->perm, 4);
36784 for (i = 4; i < 8; ++i)
36785 perm2[i] = i;
36786 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
36787 gcc_assert (ok);
36788
36789 /* Emit the pshufhw. */
36790 memcpy (perm2 + 4, d->perm + 4, 4);
36791 for (i = 0; i < 4; ++i)
36792 perm2[i] = i;
36793 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
36794 gcc_assert (ok);
36795
36796 return true;
36797 }
36798
36799 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36800 the permutation using the SSSE3 palignr instruction. This succeeds
36801 when all of the elements in PERM fit within one vector and we merely
36802 need to shift them down so that a single vector permutation has a
36803 chance to succeed. */
36804
36805 static bool
36806 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
36807 {
36808 unsigned i, nelt = d->nelt;
36809 unsigned min, max;
36810 bool in_order, ok;
36811 rtx shift;
36812
36813 /* Even with AVX, palignr only operates on 128-bit vectors. */
36814 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36815 return false;
36816
36817 min = nelt, max = 0;
36818 for (i = 0; i < nelt; ++i)
36819 {
36820 unsigned e = d->perm[i];
36821 if (e < min)
36822 min = e;
36823 if (e > max)
36824 max = e;
36825 }
36826 if (min == 0 || max - min >= nelt)
36827 return false;
36828
36829 /* Given that we have SSSE3, we know we'll be able to implement the
36830 single operand permutation after the palignr with pshufb. */
36831 if (d->testing_p)
36832 return true;
36833
36834 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
36835 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
36836 gen_lowpart (TImode, d->op1),
36837 gen_lowpart (TImode, d->op0), shift));
36838
36839 d->op0 = d->op1 = d->target;
36840 d->one_operand_p = true;
36841
36842 in_order = true;
36843 for (i = 0; i < nelt; ++i)
36844 {
36845 unsigned e = d->perm[i] - min;
36846 if (e != i)
36847 in_order = false;
36848 d->perm[i] = e;
36849 }
36850
36851 /* Test for the degenerate case where the alignment by itself
36852 produces the desired permutation. */
36853 if (in_order)
36854 return true;
36855
36856 ok = expand_vec_perm_1 (d);
36857 gcc_assert (ok);
36858
36859 return ok;
36860 }
36861
36862 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
36863
36864 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36865 a two vector permutation into a single vector permutation by using
36866 an interleave operation to merge the vectors. */
36867
36868 static bool
36869 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
36870 {
36871 struct expand_vec_perm_d dremap, dfinal;
36872 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
36873 unsigned HOST_WIDE_INT contents;
36874 unsigned char remap[2 * MAX_VECT_LEN];
36875 rtx seq;
36876 bool ok, same_halves = false;
36877
36878 if (GET_MODE_SIZE (d->vmode) == 16)
36879 {
36880 if (d->one_operand_p)
36881 return false;
36882 }
36883 else if (GET_MODE_SIZE (d->vmode) == 32)
36884 {
36885 if (!TARGET_AVX)
36886 return false;
36887 /* For 32-byte modes allow even d->one_operand_p.
36888 The lack of cross-lane shuffling in some instructions
36889 might prevent a single insn shuffle. */
36890 dfinal = *d;
36891 dfinal.testing_p = true;
 36892 /* If expand_vec_perm_interleave3 can expand this into
 36893 a 3 insn sequence, give up and let it be expanded as
 36894 a 3 insn sequence. While that is one insn longer,
 36895 it doesn't need a memory operand, and in the common
 36896 case where the interleave low and high permutations
 36897 with the same operands are adjacent, the pair needs
 36898 only 4 insns after CSE. */
36899 if (expand_vec_perm_interleave3 (&dfinal))
36900 return false;
36901 }
36902 else
36903 return false;
36904
36905 /* Examine from whence the elements come. */
36906 contents = 0;
36907 for (i = 0; i < nelt; ++i)
36908 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
36909
36910 memset (remap, 0xff, sizeof (remap));
36911 dremap = *d;
36912
36913 if (GET_MODE_SIZE (d->vmode) == 16)
36914 {
36915 unsigned HOST_WIDE_INT h1, h2, h3, h4;
36916
36917 /* Split the two input vectors into 4 halves. */
36918 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
36919 h2 = h1 << nelt2;
36920 h3 = h2 << nelt2;
36921 h4 = h3 << nelt2;
36922
 36923 /* If the elements are from the low halves, use interleave low; similarly,
 36924 use interleave high for the high halves. If the elements are from
 36925 mis-matched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
36926 if ((contents & (h1 | h3)) == contents)
36927 {
36928 /* punpckl* */
36929 for (i = 0; i < nelt2; ++i)
36930 {
36931 remap[i] = i * 2;
36932 remap[i + nelt] = i * 2 + 1;
36933 dremap.perm[i * 2] = i;
36934 dremap.perm[i * 2 + 1] = i + nelt;
36935 }
36936 if (!TARGET_SSE2 && d->vmode == V4SImode)
36937 dremap.vmode = V4SFmode;
36938 }
36939 else if ((contents & (h2 | h4)) == contents)
36940 {
36941 /* punpckh* */
36942 for (i = 0; i < nelt2; ++i)
36943 {
36944 remap[i + nelt2] = i * 2;
36945 remap[i + nelt + nelt2] = i * 2 + 1;
36946 dremap.perm[i * 2] = i + nelt2;
36947 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
36948 }
36949 if (!TARGET_SSE2 && d->vmode == V4SImode)
36950 dremap.vmode = V4SFmode;
36951 }
36952 else if ((contents & (h1 | h4)) == contents)
36953 {
36954 /* shufps */
36955 for (i = 0; i < nelt2; ++i)
36956 {
36957 remap[i] = i;
36958 remap[i + nelt + nelt2] = i + nelt2;
36959 dremap.perm[i] = i;
36960 dremap.perm[i + nelt2] = i + nelt + nelt2;
36961 }
36962 if (nelt != 4)
36963 {
36964 /* shufpd */
36965 dremap.vmode = V2DImode;
36966 dremap.nelt = 2;
36967 dremap.perm[0] = 0;
36968 dremap.perm[1] = 3;
36969 }
36970 }
36971 else if ((contents & (h2 | h3)) == contents)
36972 {
36973 /* shufps */
36974 for (i = 0; i < nelt2; ++i)
36975 {
36976 remap[i + nelt2] = i;
36977 remap[i + nelt] = i + nelt2;
36978 dremap.perm[i] = i + nelt2;
36979 dremap.perm[i + nelt2] = i + nelt;
36980 }
36981 if (nelt != 4)
36982 {
36983 /* shufpd */
36984 dremap.vmode = V2DImode;
36985 dremap.nelt = 2;
36986 dremap.perm[0] = 1;
36987 dremap.perm[1] = 2;
36988 }
36989 }
36990 else
36991 return false;
36992 }
36993 else
36994 {
36995 unsigned int nelt4 = nelt / 4, nzcnt = 0;
36996 unsigned HOST_WIDE_INT q[8];
36997 unsigned int nonzero_halves[4];
36998
36999 /* Split the two input vectors into 8 quarters. */
37000 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
37001 for (i = 1; i < 8; ++i)
37002 q[i] = q[0] << (nelt4 * i);
37003 for (i = 0; i < 4; ++i)
37004 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
37005 {
37006 nonzero_halves[nzcnt] = i;
37007 ++nzcnt;
37008 }
37009
37010 if (nzcnt == 1)
37011 {
37012 gcc_assert (d->one_operand_p);
37013 nonzero_halves[1] = nonzero_halves[0];
37014 same_halves = true;
37015 }
37016 else if (d->one_operand_p)
37017 {
37018 gcc_assert (nonzero_halves[0] == 0);
37019 gcc_assert (nonzero_halves[1] == 1);
37020 }
37021
37022 if (nzcnt <= 2)
37023 {
37024 if (d->perm[0] / nelt2 == nonzero_halves[1])
37025 {
37026 /* Attempt to increase the likelihood that dfinal
37027 shuffle will be intra-lane. */
37028 char tmph = nonzero_halves[0];
37029 nonzero_halves[0] = nonzero_halves[1];
37030 nonzero_halves[1] = tmph;
37031 }
37032
37033 /* vperm2f128 or vperm2i128. */
37034 for (i = 0; i < nelt2; ++i)
37035 {
37036 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
37037 remap[i + nonzero_halves[0] * nelt2] = i;
37038 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
37039 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
37040 }
37041
37042 if (d->vmode != V8SFmode
37043 && d->vmode != V4DFmode
37044 && d->vmode != V8SImode)
37045 {
37046 dremap.vmode = V8SImode;
37047 dremap.nelt = 8;
37048 for (i = 0; i < 4; ++i)
37049 {
37050 dremap.perm[i] = i + nonzero_halves[0] * 4;
37051 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
37052 }
37053 }
37054 }
37055 else if (d->one_operand_p)
37056 return false;
37057 else if (TARGET_AVX2
37058 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
37059 {
37060 /* vpunpckl* */
37061 for (i = 0; i < nelt4; ++i)
37062 {
37063 remap[i] = i * 2;
37064 remap[i + nelt] = i * 2 + 1;
37065 remap[i + nelt2] = i * 2 + nelt2;
37066 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
37067 dremap.perm[i * 2] = i;
37068 dremap.perm[i * 2 + 1] = i + nelt;
37069 dremap.perm[i * 2 + nelt2] = i + nelt2;
37070 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
37071 }
37072 }
37073 else if (TARGET_AVX2
37074 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
37075 {
37076 /* vpunpckh* */
37077 for (i = 0; i < nelt4; ++i)
37078 {
37079 remap[i + nelt4] = i * 2;
37080 remap[i + nelt + nelt4] = i * 2 + 1;
37081 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
37082 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
37083 dremap.perm[i * 2] = i + nelt4;
37084 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
37085 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
37086 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
37087 }
37088 }
37089 else
37090 return false;
37091 }
37092
37093 /* Use the remapping array set up above to move the elements from their
37094 swizzled locations into their final destinations. */
37095 dfinal = *d;
37096 for (i = 0; i < nelt; ++i)
37097 {
37098 unsigned e = remap[d->perm[i]];
37099 gcc_assert (e < nelt);
37100 /* If same_halves is true, both halves of the remapped vector are the
37101 same. Avoid cross-lane accesses if possible. */
37102 if (same_halves && i >= nelt2)
37103 {
37104 gcc_assert (e < nelt2);
37105 dfinal.perm[i] = e + nelt2;
37106 }
37107 else
37108 dfinal.perm[i] = e;
37109 }
37110 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
37111 dfinal.op1 = dfinal.op0;
37112 dfinal.one_operand_p = true;
37113 dremap.target = dfinal.op0;
37114
37115 /* Test if the final remap can be done with a single insn. For V4SFmode or
37116 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
37117 start_sequence ();
37118 ok = expand_vec_perm_1 (&dfinal);
37119 seq = get_insns ();
37120 end_sequence ();
37121
37122 if (!ok)
37123 return false;
37124
37125 if (d->testing_p)
37126 return true;
37127
37128 if (dremap.vmode != dfinal.vmode)
37129 {
37130 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
37131 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
37132 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
37133 }
37134
37135 ok = expand_vec_perm_1 (&dremap);
37136 gcc_assert (ok);
37137
37138 emit_insn (seq);
37139 return true;
37140 }
37141
37142 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
37143 a single vector cross-lane permutation into vpermq followed
37144 by any of the single insn permutations. */
37145
37146 static bool
37147 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
37148 {
37149 struct expand_vec_perm_d dremap, dfinal;
37150 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
37151 unsigned contents[2];
37152 bool ok;
37153
37154 if (!(TARGET_AVX2
37155 && (d->vmode == V32QImode || d->vmode == V16HImode)
37156 && d->one_operand_p))
37157 return false;
37158
37159 contents[0] = 0;
37160 contents[1] = 0;
37161 for (i = 0; i < nelt2; ++i)
37162 {
37163 contents[0] |= 1u << (d->perm[i] / nelt4);
37164 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
37165 }
37166
37167 for (i = 0; i < 2; ++i)
37168 {
37169 unsigned int cnt = 0;
37170 for (j = 0; j < 4; ++j)
37171 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
37172 return false;
37173 }
37174
37175 if (d->testing_p)
37176 return true;
37177
37178 dremap = *d;
37179 dremap.vmode = V4DImode;
37180 dremap.nelt = 4;
37181 dremap.target = gen_reg_rtx (V4DImode);
37182 dremap.op0 = gen_lowpart (V4DImode, d->op0);
37183 dremap.op1 = dremap.op0;
37184 dremap.one_operand_p = true;
37185 for (i = 0; i < 2; ++i)
37186 {
37187 unsigned int cnt = 0;
37188 for (j = 0; j < 4; ++j)
37189 if ((contents[i] & (1u << j)) != 0)
37190 dremap.perm[2 * i + cnt++] = j;
37191 for (; cnt < 2; ++cnt)
37192 dremap.perm[2 * i + cnt] = 0;
37193 }
37194
37195 dfinal = *d;
37196 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
37197 dfinal.op1 = dfinal.op0;
37198 dfinal.one_operand_p = true;
37199 for (i = 0, j = 0; i < nelt; ++i)
37200 {
37201 if (i == nelt2)
37202 j = 2;
37203 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
37204 if ((d->perm[i] / nelt4) == dremap.perm[j])
37205 ;
37206 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
37207 dfinal.perm[i] |= nelt4;
37208 else
37209 gcc_unreachable ();
37210 }
37211
37212 ok = expand_vec_perm_1 (&dremap);
37213 gcc_assert (ok);
37214
37215 ok = expand_vec_perm_1 (&dfinal);
37216 gcc_assert (ok);
37217
37218 return true;
37219 }
37220
37221 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
37222 a vector permutation using two instructions, vperm2f128 resp.
37223 vperm2i128 followed by any single in-lane permutation. */
37224
37225 static bool
37226 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
37227 {
37228 struct expand_vec_perm_d dfirst, dsecond;
37229 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
37230 bool ok;
37231
37232 if (!TARGET_AVX
37233 || GET_MODE_SIZE (d->vmode) != 32
37234 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
37235 return false;
37236
37237 dsecond = *d;
37238 dsecond.one_operand_p = false;
37239 dsecond.testing_p = true;
37240
37241 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
37242 immediate. For perm < 16 the second permutation uses
37243 d->op0 as first operand, for perm >= 16 it uses d->op1
37244 as first operand. The second operand is the result of
37245 vperm2[fi]128. */
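 /* E.g. perm == 6 selects lane 2 for the low half and lane 1 for the
 high half, i.e. immediate ((6 << 2) | 6) & 0x33 == 0x12. */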
37246 for (perm = 0; perm < 32; perm++)
37247 {
37248 /* Ignore permutations which do not move anything cross-lane. */
37249 if (perm < 16)
37250 {
37251 /* The second shuffle for e.g. V4DFmode has
37252 0123 and ABCD operands.
37253 Ignore AB23, as 23 is already in the second lane
37254 of the first operand. */
37255 if ((perm & 0xc) == (1 << 2)) continue;
37256 /* And 01CD, as 01 is in the first lane of the first
37257 operand. */
37258 if ((perm & 3) == 0) continue;
37259 /* And 4567, as then the vperm2[fi]128 doesn't change
37260 anything on the original 4567 second operand. */
37261 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
37262 }
37263 else
37264 {
37265 /* The second shuffle for e.g. V4DFmode has
37266 4567 and ABCD operands.
37267 Ignore AB67, as 67 is already in the second lane
37268 of the first operand. */
37269 if ((perm & 0xc) == (3 << 2)) continue;
37270 /* And 45CD, as 45 is in the first lane of the first
37271 operand. */
37272 if ((perm & 3) == 2) continue;
37273 /* And 0123, as then the vperm2[fi]128 doesn't change
37274 anything on the original 0123 first operand. */
37275 if ((perm & 0xf) == (1 << 2)) continue;
37276 }
37277
37278 for (i = 0; i < nelt; i++)
37279 {
37280 j = d->perm[i] / nelt2;
37281 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
37282 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
37283 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
37284 dsecond.perm[i] = d->perm[i] & (nelt - 1);
37285 else
37286 break;
37287 }
37288
37289 if (i == nelt)
37290 {
37291 start_sequence ();
37292 ok = expand_vec_perm_1 (&dsecond);
37293 end_sequence ();
37294 }
37295 else
37296 ok = false;
37297
37298 if (ok)
37299 {
37300 if (d->testing_p)
37301 return true;
37302
37303 /* Found a usable second shuffle. dfirst will be
37304 vperm2f128 on d->op0 and d->op1. */
37305 dsecond.testing_p = false;
37306 dfirst = *d;
37307 dfirst.target = gen_reg_rtx (d->vmode);
37308 for (i = 0; i < nelt; i++)
37309 dfirst.perm[i] = (i & (nelt2 - 1))
37310 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
37311
37312 ok = expand_vec_perm_1 (&dfirst);
37313 gcc_assert (ok);
37314
37315 /* And dsecond is some single insn shuffle, taking
37316 d->op0 and result of vperm2f128 (if perm < 16) or
37317 d->op1 and result of vperm2f128 (otherwise). */
37318 dsecond.op1 = dfirst.target;
37319 if (perm >= 16)
37320 dsecond.op0 = dfirst.op1;
37321
37322 ok = expand_vec_perm_1 (&dsecond);
37323 gcc_assert (ok);
37324
37325 return true;
37326 }
37327
37328 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
37329 if (d->one_operand_p)
37330 return false;
37331 }
37332
37333 return false;
37334 }
37335
37336 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
37337 a two vector permutation using 2 intra-lane interleave insns
37338 and cross-lane shuffle for 32-byte vectors. */
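 /* E.g. for V8SImode the permutation { 0 8 1 9 2 10 3 11 } matches the
 interleave-low pattern below (d->perm[0] == 0) and { 4 12 5 13 6 14 7 15 }
 matches the interleave-high pattern (d->perm[0] == nelt / 2). */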
37339
37340 static bool
37341 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
37342 {
37343 unsigned i, nelt;
37344 rtx (*gen) (rtx, rtx, rtx);
37345
37346 if (d->one_operand_p)
37347 return false;
37348 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
37349 ;
37350 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
37351 ;
37352 else
37353 return false;
37354
37355 nelt = d->nelt;
37356 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
37357 return false;
37358 for (i = 0; i < nelt; i += 2)
37359 if (d->perm[i] != d->perm[0] + i / 2
37360 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
37361 return false;
37362
37363 if (d->testing_p)
37364 return true;
37365
37366 switch (d->vmode)
37367 {
37368 case V32QImode:
37369 if (d->perm[0])
37370 gen = gen_vec_interleave_highv32qi;
37371 else
37372 gen = gen_vec_interleave_lowv32qi;
37373 break;
37374 case V16HImode:
37375 if (d->perm[0])
37376 gen = gen_vec_interleave_highv16hi;
37377 else
37378 gen = gen_vec_interleave_lowv16hi;
37379 break;
37380 case V8SImode:
37381 if (d->perm[0])
37382 gen = gen_vec_interleave_highv8si;
37383 else
37384 gen = gen_vec_interleave_lowv8si;
37385 break;
37386 case V4DImode:
37387 if (d->perm[0])
37388 gen = gen_vec_interleave_highv4di;
37389 else
37390 gen = gen_vec_interleave_lowv4di;
37391 break;
37392 case V8SFmode:
37393 if (d->perm[0])
37394 gen = gen_vec_interleave_highv8sf;
37395 else
37396 gen = gen_vec_interleave_lowv8sf;
37397 break;
37398 case V4DFmode:
37399 if (d->perm[0])
37400 gen = gen_vec_interleave_highv4df;
37401 else
37402 gen = gen_vec_interleave_lowv4df;
37403 break;
37404 default:
37405 gcc_unreachable ();
37406 }
37407
37408 emit_insn (gen (d->target, d->op0, d->op1));
37409 return true;
37410 }
37411
37412 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
37413 a single vector permutation using a single intra-lane vector
37414 permutation, vperm2f128 swapping the lanes and vblend* insn blending
37415 the non-swapped and swapped vectors together. */
37416
37417 static bool
37418 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
37419 {
37420 struct expand_vec_perm_d dfirst, dsecond;
37421 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
37422 rtx seq;
37423 bool ok;
37424 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
37425
37426 if (!TARGET_AVX
37427 || TARGET_AVX2
37428 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
37429 || !d->one_operand_p)
37430 return false;
37431
37432 dfirst = *d;
37433 for (i = 0; i < nelt; i++)
37434 dfirst.perm[i] = 0xff;
37435 for (i = 0, msk = 0; i < nelt; i++)
37436 {
37437 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
37438 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
37439 return false;
37440 dfirst.perm[j] = d->perm[i];
37441 if (j != i)
37442 msk |= (1 << i);
37443 }
37444 for (i = 0; i < nelt; i++)
37445 if (dfirst.perm[i] == 0xff)
37446 dfirst.perm[i] = i;
37447
37448 if (!d->testing_p)
37449 dfirst.target = gen_reg_rtx (dfirst.vmode);
37450
37451 start_sequence ();
37452 ok = expand_vec_perm_1 (&dfirst);
37453 seq = get_insns ();
37454 end_sequence ();
37455
37456 if (!ok)
37457 return false;
37458
37459 if (d->testing_p)
37460 return true;
37461
37462 emit_insn (seq);
37463
37464 dsecond = *d;
37465 dsecond.op0 = dfirst.target;
37466 dsecond.op1 = dfirst.target;
37467 dsecond.one_operand_p = true;
37468 dsecond.target = gen_reg_rtx (dsecond.vmode);
37469 for (i = 0; i < nelt; i++)
37470 dsecond.perm[i] = i ^ nelt2;
37471
37472 ok = expand_vec_perm_1 (&dsecond);
37473 gcc_assert (ok);
37474
37475 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
37476 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
37477 return true;
37478 }
37479
37480 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
37481 permutation using two vperm2f128, followed by a vshufpd insn blending
37482 the two vectors together. */
37483
37484 static bool
37485 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
37486 {
37487 struct expand_vec_perm_d dfirst, dsecond, dthird;
37488 bool ok;
37489
37490 if (!TARGET_AVX || (d->vmode != V4DFmode))
37491 return false;
37492
37493 if (d->testing_p)
37494 return true;
37495
37496 dfirst = *d;
37497 dsecond = *d;
37498 dthird = *d;
37499
37500 dfirst.perm[0] = (d->perm[0] & ~1);
37501 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
37502 dfirst.perm[2] = (d->perm[2] & ~1);
37503 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
37504 dsecond.perm[0] = (d->perm[1] & ~1);
37505 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
37506 dsecond.perm[2] = (d->perm[3] & ~1);
37507 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
37508 dthird.perm[0] = (d->perm[0] % 2);
37509 dthird.perm[1] = (d->perm[1] % 2) + 4;
37510 dthird.perm[2] = (d->perm[2] % 2) + 2;
37511 dthird.perm[3] = (d->perm[3] % 2) + 6;
37512
37513 dfirst.target = gen_reg_rtx (dfirst.vmode);
37514 dsecond.target = gen_reg_rtx (dsecond.vmode);
37515 dthird.op0 = dfirst.target;
37516 dthird.op1 = dsecond.target;
37517 dthird.one_operand_p = false;
37518
37519 canonicalize_perm (&dfirst);
37520 canonicalize_perm (&dsecond);
37521
37522 ok = expand_vec_perm_1 (&dfirst)
37523 && expand_vec_perm_1 (&dsecond)
37524 && expand_vec_perm_1 (&dthird);
37525
37526 gcc_assert (ok);
37527
37528 return true;
37529 }
37530
37531 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
37532 permutation with two pshufb insns and an ior. We should have already
37533 failed all two instruction sequences. */
37534
37535 static bool
37536 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
37537 {
37538 rtx rperm[2][16], vperm, l, h, op, m128;
37539 unsigned int i, nelt, eltsz;
37540
37541 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
37542 return false;
37543 gcc_assert (!d->one_operand_p);
37544
37545 nelt = d->nelt;
37546 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37547
37548 /* Generate two permutation masks. If the required element is within
37549 the given vector it is shuffled into the proper lane. If the required
37550 element is in the other vector, force a zero into the lane by setting
37551 bit 7 in the permutation mask. */
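 /* E.g. for V8HImode with d->perm[0] == 9, result element 0 comes from
 element 1 of op1, so rperm[1][0..1] get byte indexes 2 and 3 while
 rperm[0][0..1] get -128. */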
37552 m128 = GEN_INT (-128);
37553 for (i = 0; i < nelt; ++i)
37554 {
37555 unsigned j, e = d->perm[i];
37556 unsigned which = (e >= nelt);
37557 if (e >= nelt)
37558 e -= nelt;
37559
37560 for (j = 0; j < eltsz; ++j)
37561 {
37562 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
37563 rperm[1-which][i*eltsz + j] = m128;
37564 }
37565 }
37566
37567 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
37568 vperm = force_reg (V16QImode, vperm);
37569
37570 l = gen_reg_rtx (V16QImode);
37571 op = gen_lowpart (V16QImode, d->op0);
37572 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
37573
37574 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
37575 vperm = force_reg (V16QImode, vperm);
37576
37577 h = gen_reg_rtx (V16QImode);
37578 op = gen_lowpart (V16QImode, d->op1);
37579 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
37580
37581 op = gen_lowpart (V16QImode, d->target);
37582 emit_insn (gen_iorv16qi3 (op, l, h));
37583
37584 return true;
37585 }
37586
 37587 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
37588 with two vpshufb insns, vpermq and vpor. We should have already failed
37589 all two or three instruction sequences. */
37590
37591 static bool
37592 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
37593 {
37594 rtx rperm[2][32], vperm, l, h, hp, op, m128;
37595 unsigned int i, nelt, eltsz;
37596
37597 if (!TARGET_AVX2
37598 || !d->one_operand_p
37599 || (d->vmode != V32QImode && d->vmode != V16HImode))
37600 return false;
37601
37602 if (d->testing_p)
37603 return true;
37604
37605 nelt = d->nelt;
37606 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37607
 37608 /* Generate two permutation masks. If the required element is within
 37609 the same lane, it is shuffled in. If the required element is in the
 37610 other lane, force a zero by setting bit 7 in the permutation mask.
 37611 The other mask has a non-negative element whenever the element is
 37612 requested from the other lane, but it is also moved to the other lane,
 37613 so that the result of vpshufb can have its two V2TImode halves
 37614 swapped. */
37615 m128 = GEN_INT (-128);
37616 for (i = 0; i < nelt; ++i)
37617 {
37618 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37619 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
37620
37621 for (j = 0; j < eltsz; ++j)
37622 {
37623 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
37624 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
37625 }
37626 }
37627
37628 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
37629 vperm = force_reg (V32QImode, vperm);
37630
37631 h = gen_reg_rtx (V32QImode);
37632 op = gen_lowpart (V32QImode, d->op0);
37633 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
37634
 37635 /* Swap the 128-bit lanes of h into hp. */
37636 hp = gen_reg_rtx (V4DImode);
37637 op = gen_lowpart (V4DImode, h);
37638 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
37639 const1_rtx));
37640
37641 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
37642 vperm = force_reg (V32QImode, vperm);
37643
37644 l = gen_reg_rtx (V32QImode);
37645 op = gen_lowpart (V32QImode, d->op0);
37646 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
37647
37648 op = gen_lowpart (V32QImode, d->target);
37649 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
37650
37651 return true;
37652 }
37653
37654 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
 37655 and extract-odd permutations of two V32QImode or V16HImode operands
37656 with two vpshufb insns, vpor and vpermq. We should have already
37657 failed all two or three instruction sequences. */
37658
37659 static bool
37660 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
37661 {
37662 rtx rperm[2][32], vperm, l, h, ior, op, m128;
37663 unsigned int i, nelt, eltsz;
37664
37665 if (!TARGET_AVX2
37666 || d->one_operand_p
37667 || (d->vmode != V32QImode && d->vmode != V16HImode))
37668 return false;
37669
37670 for (i = 0; i < d->nelt; ++i)
37671 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
37672 return false;
37673
37674 if (d->testing_p)
37675 return true;
37676
37677 nelt = d->nelt;
37678 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37679
37680 /* Generate two permutation masks. In the first permutation mask
37681 the first quarter will contain indexes for the first half
37682 of the op0, the second quarter will contain bit 7 set, third quarter
37683 will contain indexes for the second half of the op0 and the
37684 last quarter bit 7 set. In the second permutation mask
37685 the first quarter will contain bit 7 set, the second quarter
37686 indexes for the first half of the op1, the third quarter bit 7 set
37687 and last quarter indexes for the second half of the op1.
37688 I.e. the first mask e.g. for V32QImode extract even will be:
37689 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
37690 (all values masked with 0xf except for -128) and second mask
37691 for extract even will be
37692 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
37693 m128 = GEN_INT (-128);
37694 for (i = 0; i < nelt; ++i)
37695 {
37696 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37697 unsigned which = d->perm[i] >= nelt;
37698 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
37699
37700 for (j = 0; j < eltsz; ++j)
37701 {
37702 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
37703 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
37704 }
37705 }
37706
37707 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
37708 vperm = force_reg (V32QImode, vperm);
37709
37710 l = gen_reg_rtx (V32QImode);
37711 op = gen_lowpart (V32QImode, d->op0);
37712 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
37713
37714 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
37715 vperm = force_reg (V32QImode, vperm);
37716
37717 h = gen_reg_rtx (V32QImode);
37718 op = gen_lowpart (V32QImode, d->op1);
37719 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
37720
37721 ior = gen_reg_rtx (V32QImode);
37722 emit_insn (gen_iorv32qi3 (ior, l, h));
37723
37724 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
37725 op = gen_lowpart (V4DImode, d->target);
37726 ior = gen_lowpart (V4DImode, ior);
37727 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
37728 const1_rtx, GEN_INT (3)));
37729
37730 return true;
37731 }
37732
37733 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
37734 and extract-odd permutations. */
37735
37736 static bool
37737 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
37738 {
37739 rtx t1, t2, t3;
37740
37741 switch (d->vmode)
37742 {
37743 case V4DFmode:
37744 t1 = gen_reg_rtx (V4DFmode);
37745 t2 = gen_reg_rtx (V4DFmode);
37746
37747 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
37748 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
37749 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
37750
37751 /* Now an unpck[lh]pd will produce the result required. */
37752 if (odd)
37753 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
37754 else
37755 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
37756 emit_insn (t3);
37757 break;
37758
37759 case V8SFmode:
37760 {
37761 int mask = odd ? 0xdd : 0x88;
37762
37763 t1 = gen_reg_rtx (V8SFmode);
37764 t2 = gen_reg_rtx (V8SFmode);
37765 t3 = gen_reg_rtx (V8SFmode);
37766
37767 /* Shuffle within the 128-bit lanes to produce:
37768 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
37769 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
37770 GEN_INT (mask)));
37771
37772 /* Shuffle the lanes around to produce:
37773 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
37774 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
37775 GEN_INT (0x3)));
37776
37777 /* Shuffle within the 128-bit lanes to produce:
37778 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
37779 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
37780
37781 /* Shuffle within the 128-bit lanes to produce:
37782 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
37783 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
37784
37785 /* Shuffle the lanes around to produce:
37786 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
37787 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
37788 GEN_INT (0x20)));
37789 }
37790 break;
37791
37792 case V2DFmode:
37793 case V4SFmode:
37794 case V2DImode:
37795 case V4SImode:
37796 /* These are always directly implementable by expand_vec_perm_1. */
37797 gcc_unreachable ();
37798
37799 case V8HImode:
37800 if (TARGET_SSSE3)
37801 return expand_vec_perm_pshufb2 (d);
37802 else
37803 {
37804 /* We need 2*log2(N)-1 operations to achieve odd/even
37805 with interleave. */
37806 t1 = gen_reg_rtx (V8HImode);
37807 t2 = gen_reg_rtx (V8HImode);
37808 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
37809 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
37810 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
37811 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
37812 if (odd)
37813 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
37814 else
37815 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
37816 emit_insn (t3);
37817 }
37818 break;
37819
37820 case V16QImode:
37821 if (TARGET_SSSE3)
37822 return expand_vec_perm_pshufb2 (d);
37823 else
37824 {
37825 t1 = gen_reg_rtx (V16QImode);
37826 t2 = gen_reg_rtx (V16QImode);
37827 t3 = gen_reg_rtx (V16QImode);
37828 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
37829 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
37830 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
37831 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
37832 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
37833 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
37834 if (odd)
37835 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
37836 else
37837 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
37838 emit_insn (t3);
37839 }
37840 break;
37841
37842 case V16HImode:
37843 case V32QImode:
37844 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
37845
37846 case V4DImode:
37847 if (!TARGET_AVX2)
37848 {
37849 struct expand_vec_perm_d d_copy = *d;
37850 d_copy.vmode = V4DFmode;
37851 d_copy.target = gen_lowpart (V4DFmode, d->target);
37852 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
37853 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
37854 return expand_vec_perm_even_odd_1 (&d_copy, odd);
37855 }
37856
37857 t1 = gen_reg_rtx (V4DImode);
37858 t2 = gen_reg_rtx (V4DImode);
37859
37860 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
37861 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
37862 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
37863
37864 /* Now an vpunpck[lh]qdq will produce the result required. */
37865 if (odd)
37866 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
37867 else
37868 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
37869 emit_insn (t3);
37870 break;
37871
37872 case V8SImode:
37873 if (!TARGET_AVX2)
37874 {
37875 struct expand_vec_perm_d d_copy = *d;
37876 d_copy.vmode = V8SFmode;
37877 d_copy.target = gen_lowpart (V8SFmode, d->target);
37878 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
37879 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
37880 return expand_vec_perm_even_odd_1 (&d_copy, odd);
37881 }
37882
37883 t1 = gen_reg_rtx (V8SImode);
37884 t2 = gen_reg_rtx (V8SImode);
37885
37886 /* Shuffle the lanes around into
37887 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
37888 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
37889 gen_lowpart (V4DImode, d->op0),
37890 gen_lowpart (V4DImode, d->op1),
37891 GEN_INT (0x20)));
37892 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
37893 gen_lowpart (V4DImode, d->op0),
37894 gen_lowpart (V4DImode, d->op1),
37895 GEN_INT (0x31)));
37896
37897 /* Swap the 2nd and 3rd position in each lane into
37898 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
37899 emit_insn (gen_avx2_pshufdv3 (t1, t1,
37900 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
37901 emit_insn (gen_avx2_pshufdv3 (t2, t2,
37902 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
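 /* The immediate 2*4 + 1*16 + 3*64 == 0xd8 encodes the in-lane element
 selection { 0, 2, 1, 3 }. */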
37903
37904 /* Now an vpunpck[lh]qdq will produce
37905 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
37906 if (odd)
37907 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
37908 gen_lowpart (V4DImode, t1),
37909 gen_lowpart (V4DImode, t2));
37910 else
37911 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
37912 gen_lowpart (V4DImode, t1),
37913 gen_lowpart (V4DImode, t2));
37914 emit_insn (t3);
37915 break;
37916
37917 default:
37918 gcc_unreachable ();
37919 }
37920
37921 return true;
37922 }
37923
37924 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
37925 extract-even and extract-odd permutations. */
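 /* E.g. for V8SFmode, { 0 2 4 6 8 10 12 14 } is extract-even (odd == 0)
 and { 1 3 5 7 9 11 13 15 } is extract-odd (odd == 1). */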
37926
37927 static bool
37928 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
37929 {
37930 unsigned i, odd, nelt = d->nelt;
37931
37932 odd = d->perm[0];
37933 if (odd != 0 && odd != 1)
37934 return false;
37935
37936 for (i = 1; i < nelt; ++i)
37937 if (d->perm[i] != 2 * i + odd)
37938 return false;
37939
37940 return expand_vec_perm_even_odd_1 (d, odd);
37941 }
37942
37943 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
37944 permutations. We assume that expand_vec_perm_1 has already failed. */
37945
37946 static bool
37947 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
37948 {
37949 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
37950 enum machine_mode vmode = d->vmode;
37951 unsigned char perm2[4];
37952 rtx op0 = d->op0;
37953 bool ok;
37954
37955 switch (vmode)
37956 {
37957 case V4DFmode:
37958 case V8SFmode:
37959 /* These are special-cased in sse.md so that we can optionally
37960 use the vbroadcast instruction. They expand to two insns
37961 if the input happens to be in a register. */
37962 gcc_unreachable ();
37963
37964 case V2DFmode:
37965 case V2DImode:
37966 case V4SFmode:
37967 case V4SImode:
37968 /* These are always implementable using standard shuffle patterns. */
37969 gcc_unreachable ();
37970
37971 case V8HImode:
37972 case V16QImode:
37973 /* These can be implemented via interleave. We save one insn by
37974 stopping once we have promoted to V4SImode and then use pshufd. */
37975 do
37976 {
37977 rtx dest;
37978 rtx (*gen) (rtx, rtx, rtx)
37979 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
37980 : gen_vec_interleave_lowv8hi;
37981
37982 if (elt >= nelt2)
37983 {
37984 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
37985 : gen_vec_interleave_highv8hi;
37986 elt -= nelt2;
37987 }
37988 nelt2 /= 2;
37989
37990 dest = gen_reg_rtx (vmode);
37991 emit_insn (gen (dest, op0, op0));
37992 vmode = get_mode_wider_vector (vmode);
37993 op0 = gen_lowpart (vmode, dest);
37994 }
37995 while (vmode != V4SImode);
37996
37997 memset (perm2, elt, 4);
37998 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
37999 d->testing_p);
38000 gcc_assert (ok);
38001 return true;
38002
38003 case V32QImode:
38004 case V16HImode:
38005 case V8SImode:
38006 case V4DImode:
38007 /* For AVX2 broadcasts of the first element vpbroadcast* or
38008 vpermq should be used by expand_vec_perm_1. */
38009 gcc_assert (!TARGET_AVX2 || d->perm[0]);
38010 return false;
38011
38012 default:
38013 gcc_unreachable ();
38014 }
38015 }
38016
38017 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
38018 broadcast permutations. */
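 /* E.g. { 3 3 3 3 3 3 3 3 } for V8HImode broadcasts element 3. */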
38019
38020 static bool
38021 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
38022 {
38023 unsigned i, elt, nelt = d->nelt;
38024
38025 if (!d->one_operand_p)
38026 return false;
38027
38028 elt = d->perm[0];
38029 for (i = 1; i < nelt; ++i)
38030 if (d->perm[i] != elt)
38031 return false;
38032
38033 return expand_vec_perm_broadcast_1 (d);
38034 }
38035
 38036 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
38037 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
38038 all the shorter instruction sequences. */
38039
38040 static bool
38041 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
38042 {
38043 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
38044 unsigned int i, nelt, eltsz;
38045 bool used[4];
38046
38047 if (!TARGET_AVX2
38048 || d->one_operand_p
38049 || (d->vmode != V32QImode && d->vmode != V16HImode))
38050 return false;
38051
38052 if (d->testing_p)
38053 return true;
38054
38055 nelt = d->nelt;
38056 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38057
 38058 /* Generate 4 permutation masks. If the required element is within
 38059 the same lane, it is shuffled in. If the required element is in the
 38060 other lane, force a zero by setting bit 7 in the permutation mask.
 38061 The other mask has a non-negative element whenever the element is
 38062 requested from the other lane, but it is also moved to the other lane,
 38063 so that the result of vpshufb can have its two V2TImode halves
 38064 swapped. */
38065 m128 = GEN_INT (-128);
38066 for (i = 0; i < 32; ++i)
38067 {
38068 rperm[0][i] = m128;
38069 rperm[1][i] = m128;
38070 rperm[2][i] = m128;
38071 rperm[3][i] = m128;
38072 }
38073 used[0] = false;
38074 used[1] = false;
38075 used[2] = false;
38076 used[3] = false;
38077 for (i = 0; i < nelt; ++i)
38078 {
38079 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
38080 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
38081 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
38082
38083 for (j = 0; j < eltsz; ++j)
38084 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
38085 used[which] = true;
38086 }
38087
38088 for (i = 0; i < 2; ++i)
38089 {
38090 if (!used[2 * i + 1])
38091 {
38092 h[i] = NULL_RTX;
38093 continue;
38094 }
38095 vperm = gen_rtx_CONST_VECTOR (V32QImode,
38096 gen_rtvec_v (32, rperm[2 * i + 1]));
38097 vperm = force_reg (V32QImode, vperm);
38098 h[i] = gen_reg_rtx (V32QImode);
38099 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
38100 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
38101 }
38102
 38103 /* Swap the 128-bit lanes of h[X]. */
38104 for (i = 0; i < 2; ++i)
38105 {
38106 if (h[i] == NULL_RTX)
38107 continue;
38108 op = gen_reg_rtx (V4DImode);
38109 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
38110 const2_rtx, GEN_INT (3), const0_rtx,
38111 const1_rtx));
38112 h[i] = gen_lowpart (V32QImode, op);
38113 }
38114
38115 for (i = 0; i < 2; ++i)
38116 {
38117 if (!used[2 * i])
38118 {
38119 l[i] = NULL_RTX;
38120 continue;
38121 }
38122 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
38123 vperm = force_reg (V32QImode, vperm);
38124 l[i] = gen_reg_rtx (V32QImode);
38125 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
38126 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
38127 }
38128
38129 for (i = 0; i < 2; ++i)
38130 {
38131 if (h[i] && l[i])
38132 {
38133 op = gen_reg_rtx (V32QImode);
38134 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
38135 l[i] = op;
38136 }
38137 else if (h[i])
38138 l[i] = h[i];
38139 }
38140
38141 gcc_assert (l[0] && l[1]);
38142 op = gen_lowpart (V32QImode, d->target);
38143 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
38144 return true;
38145 }
38146
38147 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
38148 With all of the interface bits taken care of, perform the expansion
38149 in D and return true on success. */
38150
38151 static bool
38152 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
38153 {
38154 /* Try a single instruction expansion. */
38155 if (expand_vec_perm_1 (d))
38156 return true;
38157
38158 /* Try sequences of two instructions. */
38159
38160 if (expand_vec_perm_pshuflw_pshufhw (d))
38161 return true;
38162
38163 if (expand_vec_perm_palignr (d))
38164 return true;
38165
38166 if (expand_vec_perm_interleave2 (d))
38167 return true;
38168
38169 if (expand_vec_perm_broadcast (d))
38170 return true;
38171
38172 if (expand_vec_perm_vpermq_perm_1 (d))
38173 return true;
38174
38175 if (expand_vec_perm_vperm2f128 (d))
38176 return true;
38177
38178 /* Try sequences of three instructions. */
38179
38180 if (expand_vec_perm_2vperm2f128_vshuf (d))
38181 return true;
38182
38183 if (expand_vec_perm_pshufb2 (d))
38184 return true;
38185
38186 if (expand_vec_perm_interleave3 (d))
38187 return true;
38188
38189 if (expand_vec_perm_vperm2f128_vblend (d))
38190 return true;
38191
38192 /* Try sequences of four instructions. */
38193
38194 if (expand_vec_perm_vpshufb2_vpermq (d))
38195 return true;
38196
38197 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
38198 return true;
38199
38200 /* ??? Look for narrow permutations whose element orderings would
38201 allow the promotion to a wider mode. */
38202
38203 /* ??? Look for sequences of interleave or a wider permute that place
38204 the data into the correct lanes for a half-vector shuffle like
38205 pshuf[lh]w or vpermilps. */
38206
38207 /* ??? Look for sequences of interleave that produce the desired results.
38208 The combinatorics of punpck[lh] get pretty ugly... */
38209
38210 if (expand_vec_perm_even_odd (d))
38211 return true;
38212
38213 /* Even longer sequences. */
38214 if (expand_vec_perm_vpshufb4_vpermq2 (d))
38215 return true;
38216
38217 return false;
38218 }
38219
38220 /* If a permutation only uses one operand, make it clear. Returns true
38221 if the permutation references both operands. */
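 /* E.g. with nelt == 4, the permutation { 4 5 6 7 } references only the
 second operand; it is folded to { 0 1 2 3 } with op0 set to op1, and
 false is returned. */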
38222
38223 static bool
38224 canonicalize_perm (struct expand_vec_perm_d *d)
38225 {
38226 int i, which, nelt = d->nelt;
38227
38228 for (i = which = 0; i < nelt; ++i)
38229 which |= (d->perm[i] < nelt ? 1 : 2);
38230
38231 d->one_operand_p = true;
38232 switch (which)
38233 {
38234 default:
38235 gcc_unreachable();
38236
38237 case 3:
38238 if (!rtx_equal_p (d->op0, d->op1))
38239 {
38240 d->one_operand_p = false;
38241 break;
38242 }
38243 /* The elements of PERM do not suggest that only the first operand
38244 is used, but both operands are identical. Allow easier matching
38245 of the permutation by folding the permutation into the single
38246 input vector. */
38247 /* FALLTHRU */
38248
38249 case 2:
38250 for (i = 0; i < nelt; ++i)
38251 d->perm[i] &= nelt - 1;
38252 d->op0 = d->op1;
38253 break;
38254
38255 case 1:
38256 d->op1 = d->op0;
38257 break;
38258 }
38259
38260 return (which == 3);
38261 }
38262
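 /* Expand a constant vector permutation: operands[0] is the target,
 operands[1] and operands[2] are the source vectors and operands[3]
 is the CONST_VECTOR selector. Return true if the permutation could
 be expanded, false otherwise. */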
38263 bool
38264 ix86_expand_vec_perm_const (rtx operands[4])
38265 {
38266 struct expand_vec_perm_d d;
38267 unsigned char perm[MAX_VECT_LEN];
38268 int i, nelt;
38269 bool two_args;
38270 rtx sel;
38271
38272 d.target = operands[0];
38273 d.op0 = operands[1];
38274 d.op1 = operands[2];
38275 sel = operands[3];
38276
38277 d.vmode = GET_MODE (d.target);
38278 gcc_assert (VECTOR_MODE_P (d.vmode));
38279 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
38280 d.testing_p = false;
38281
38282 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
38283 gcc_assert (XVECLEN (sel, 0) == nelt);
38284 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
38285
38286 for (i = 0; i < nelt; ++i)
38287 {
38288 rtx e = XVECEXP (sel, 0, i);
38289 int ei = INTVAL (e) & (2 * nelt - 1);
38290 d.perm[i] = ei;
38291 perm[i] = ei;
38292 }
38293
38294 two_args = canonicalize_perm (&d);
38295
38296 if (ix86_expand_vec_perm_const_1 (&d))
38297 return true;
38298
38299 /* If the selector says both arguments are needed, but the operands are the
38300 same, the above tried to expand with one_operand_p and flattened selector.
38301 If that didn't work, retry without one_operand_p; we succeeded with that
38302 during testing. */
38303 if (two_args && d.one_operand_p)
38304 {
38305 d.one_operand_p = false;
38306 memcpy (d.perm, perm, sizeof (perm));
38307 return ix86_expand_vec_perm_const_1 (&d);
38308 }
38309
38310 return false;
38311 }
38312
38313 /* Implement targetm.vectorize.vec_perm_const_ok. */
38314
38315 static bool
38316 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
38317 const unsigned char *sel)
38318 {
38319 struct expand_vec_perm_d d;
38320 unsigned int i, nelt, which;
38321 bool ret;
38322
38323 d.vmode = vmode;
38324 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
38325 d.testing_p = true;
38326
38327 /* Given sufficient ISA support we can just return true here
38328 for selected vector modes. */
38329 if (GET_MODE_SIZE (d.vmode) == 16)
38330 {
38331 /* All implementable with a single vpperm insn. */
38332 if (TARGET_XOP)
38333 return true;
38334 /* All implementable with 2 pshufb + 1 ior. */
38335 if (TARGET_SSSE3)
38336 return true;
38337 /* All implementable with shufpd or unpck[lh]pd. */
38338 if (d.nelt == 2)
38339 return true;
38340 }
38341
38342 /* Extract the values from the vector CST into the permutation
38343 array in D. */
38344 memcpy (d.perm, sel, nelt);
38345 for (i = which = 0; i < nelt; ++i)
38346 {
38347 unsigned char e = d.perm[i];
38348 gcc_assert (e < 2 * nelt);
38349 which |= (e < nelt ? 1 : 2);
38350 }
38351
 38352 /* If all elements are from the second vector, fold them into the first. */
38353 if (which == 2)
38354 for (i = 0; i < nelt; ++i)
38355 d.perm[i] -= nelt;
38356
38357 /* Check whether the mask can be applied to the vector type. */
38358 d.one_operand_p = (which != 3);
38359
38360 /* Implementable with shufps or pshufd. */
38361 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
38362 return true;
38363
38364 /* Otherwise we have to go through the motions and see if we can
38365 figure out how to generate the requested permutation. */
38366 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
38367 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
38368 if (!d.one_operand_p)
38369 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
38370
38371 start_sequence ();
38372 ret = ix86_expand_vec_perm_const_1 (&d);
38373 end_sequence ();
38374
38375 return ret;
38376 }
38377
38378 void
38379 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
38380 {
38381 struct expand_vec_perm_d d;
38382 unsigned i, nelt;
38383
38384 d.target = targ;
38385 d.op0 = op0;
38386 d.op1 = op1;
38387 d.vmode = GET_MODE (targ);
38388 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
38389 d.one_operand_p = false;
38390 d.testing_p = false;
38391
38392 for (i = 0; i < nelt; ++i)
38393 d.perm[i] = i * 2 + odd;
38394
38395 /* We'll either be able to implement the permutation directly... */
38396 if (expand_vec_perm_1 (&d))
38397 return;
38398
38399 /* ... or we use the special-case patterns. */
38400 expand_vec_perm_even_odd_1 (&d, odd);
38401 }
38402
38403 /* Expand an insert into a vector register through pinsr insn.
38404 Return true if successful. */
38405
38406 bool
38407 ix86_expand_pinsr (rtx *operands)
38408 {
38409 rtx dst = operands[0];
38410 rtx src = operands[3];
38411
38412 unsigned int size = INTVAL (operands[1]);
38413 unsigned int pos = INTVAL (operands[2]);
38414
38415 if (GET_CODE (dst) == SUBREG)
38416 {
38417 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
38418 dst = SUBREG_REG (dst);
38419 }
38420
38421 if (GET_CODE (src) == SUBREG)
38422 src = SUBREG_REG (src);
38423
38424 switch (GET_MODE (dst))
38425 {
38426 case V16QImode:
38427 case V8HImode:
38428 case V4SImode:
38429 case V2DImode:
38430 {
38431 enum machine_mode srcmode, dstmode;
38432 rtx (*pinsr)(rtx, rtx, rtx, rtx);
38433
38434 srcmode = mode_for_size (size, MODE_INT, 0);
38435
38436 switch (srcmode)
38437 {
38438 case QImode:
38439 if (!TARGET_SSE4_1)
38440 return false;
38441 dstmode = V16QImode;
38442 pinsr = gen_sse4_1_pinsrb;
38443 break;
38444
38445 case HImode:
38446 if (!TARGET_SSE2)
38447 return false;
38448 dstmode = V8HImode;
38449 pinsr = gen_sse2_pinsrw;
38450 break;
38451
38452 case SImode:
38453 if (!TARGET_SSE4_1)
38454 return false;
38455 dstmode = V4SImode;
38456 pinsr = gen_sse4_1_pinsrd;
38457 break;
38458
38459 case DImode:
38460 gcc_assert (TARGET_64BIT);
38461 if (!TARGET_SSE4_1)
38462 return false;
38463 dstmode = V2DImode;
38464 pinsr = gen_sse4_1_pinsrq;
38465 break;
38466
38467 default:
38468 return false;
38469 }
38470
38471 dst = gen_lowpart (dstmode, dst);
38472 src = gen_lowpart (srcmode, src);
38473
38474 pos /= size;
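 /* E.g. a 16-bit insert at bit offset 32 of a V8HImode destination gives
 pos == 2, and the merge mask (1 << 2) selects element 2. */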
38475
38476 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
38477 return true;
38478 }
38479
38480 default:
38481 return false;
38482 }
38483 }
38484 \f
 38485 /* This function returns the calling-ABI-specific va_list type node.
 38486 That is, it returns the va_list type appropriate for FNDECL. */
38487
38488 static tree
38489 ix86_fn_abi_va_list (tree fndecl)
38490 {
38491 if (!TARGET_64BIT)
38492 return va_list_type_node;
38493 gcc_assert (fndecl != NULL_TREE);
38494
38495 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
38496 return ms_va_list_type_node;
38497 else
38498 return sysv_va_list_type_node;
38499 }
38500
38501 /* Returns the canonical va_list type specified by TYPE. If there
 38502 is no valid TYPE provided, it returns NULL_TREE. */
38503
38504 static tree
38505 ix86_canonical_va_list_type (tree type)
38506 {
38507 tree wtype, htype;
38508
38509 /* Resolve references and pointers to va_list type. */
38510 if (TREE_CODE (type) == MEM_REF)
38511 type = TREE_TYPE (type);
38512 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
38513 type = TREE_TYPE (type);
38514 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
38515 type = TREE_TYPE (type);
38516
38517 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
38518 {
38519 wtype = va_list_type_node;
38520 gcc_assert (wtype != NULL_TREE);
38521 htype = type;
38522 if (TREE_CODE (wtype) == ARRAY_TYPE)
38523 {
38524 /* If va_list is an array type, the argument may have decayed
38525 to a pointer type, e.g. by being passed to another function.
38526 In that case, unwrap both types so that we can compare the
38527 underlying records. */
38528 if (TREE_CODE (htype) == ARRAY_TYPE
38529 || POINTER_TYPE_P (htype))
38530 {
38531 wtype = TREE_TYPE (wtype);
38532 htype = TREE_TYPE (htype);
38533 }
38534 }
38535 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
38536 return va_list_type_node;
38537 wtype = sysv_va_list_type_node;
38538 gcc_assert (wtype != NULL_TREE);
38539 htype = type;
38540 if (TREE_CODE (wtype) == ARRAY_TYPE)
38541 {
38542 /* If va_list is an array type, the argument may have decayed
38543 to a pointer type, e.g. by being passed to another function.
38544 In that case, unwrap both types so that we can compare the
38545 underlying records. */
38546 if (TREE_CODE (htype) == ARRAY_TYPE
38547 || POINTER_TYPE_P (htype))
38548 {
38549 wtype = TREE_TYPE (wtype);
38550 htype = TREE_TYPE (htype);
38551 }
38552 }
38553 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
38554 return sysv_va_list_type_node;
38555 wtype = ms_va_list_type_node;
38556 gcc_assert (wtype != NULL_TREE);
38557 htype = type;
38558 if (TREE_CODE (wtype) == ARRAY_TYPE)
38559 {
38560 /* If va_list is an array type, the argument may have decayed
38561 to a pointer type, e.g. by being passed to another function.
38562 In that case, unwrap both types so that we can compare the
38563 underlying records. */
38564 if (TREE_CODE (htype) == ARRAY_TYPE
38565 || POINTER_TYPE_P (htype))
38566 {
38567 wtype = TREE_TYPE (wtype);
38568 htype = TREE_TYPE (htype);
38569 }
38570 }
38571 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
38572 return ms_va_list_type_node;
38573 return NULL_TREE;
38574 }
38575 return std_canonical_va_list_type (type);
38576 }
38577
38578 /* Iterate through the target-specific builtin types for va_list.
38579 IDX denotes the iterator, *PTREE is set to the result type of
38580 the va_list builtin, and *PNAME to its internal type.
38581 Returns zero if there is no element for this index, otherwise
38582 IDX should be increased upon the next call.
38583 Note, do not iterate a base builtin's name like __builtin_va_list.
38584 Used from c_common_nodes_and_builtins. */
38585
38586 static int
38587 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
38588 {
38589 if (TARGET_64BIT)
38590 {
38591 switch (idx)
38592 {
38593 default:
38594 break;
38595
38596 case 0:
38597 *ptree = ms_va_list_type_node;
38598 *pname = "__builtin_ms_va_list";
38599 return 1;
38600
38601 case 1:
38602 *ptree = sysv_va_list_type_node;
38603 *pname = "__builtin_sysv_va_list";
38604 return 1;
38605 }
38606 }
38607
38608 return 0;
38609 }
38610
38611 #undef TARGET_SCHED_DISPATCH
38612 #define TARGET_SCHED_DISPATCH has_dispatch
38613 #undef TARGET_SCHED_DISPATCH_DO
38614 #define TARGET_SCHED_DISPATCH_DO do_dispatch
38615 #undef TARGET_SCHED_REASSOCIATION_WIDTH
38616 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
38617 #undef TARGET_SCHED_REORDER
38618 #define TARGET_SCHED_REORDER ix86_sched_reorder
38619
38620 /* The size of the dispatch window is the total number of bytes of
38621 object code allowed in a window. */
38622 #define DISPATCH_WINDOW_SIZE 16
38623
38624 /* Number of dispatch windows considered for scheduling. */
38625 #define MAX_DISPATCH_WINDOWS 3
38626
38627 /* Maximum number of instructions in a window. */
38628 #define MAX_INSN 4
38629
38630 /* Maximum number of immediate operands in a window. */
38631 #define MAX_IMM 4
38632
38633 /* Maximum number of immediate bits allowed in a window. */
38634 #define MAX_IMM_SIZE 128
38635
38636 /* Maximum number of 32 bit immediates allowed in a window. */
38637 #define MAX_IMM_32 4
38638
38639 /* Maximum number of 64 bit immediates allowed in a window. */
38640 #define MAX_IMM_64 2
38641
38642 /* Maximum total of loads or prefetches allowed in a window. */
38643 #define MAX_LOAD 2
38644
38645 /* Maximum total of stores allowed in a window. */
38646 #define MAX_STORE 1
38647
38648 #undef BIG
38649 #define BIG 100
38650
38651
 38652 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
38653 enum dispatch_group {
38654 disp_no_group = 0,
38655 disp_load,
38656 disp_store,
38657 disp_load_store,
38658 disp_prefetch,
38659 disp_imm,
38660 disp_imm_32,
38661 disp_imm_64,
38662 disp_branch,
38663 disp_cmp,
38664 disp_jcc,
38665 disp_last
38666 };
38667
 38668 /* Number of allowable groups in a dispatch window. It is an array
 38669 indexed by dispatch_group enum. 100 is used as a big number,
 38670 because the number of these kinds of operations does not have any
 38671 effect on the dispatch window, but we still need entries for them
 38672 in the table. */
38673 static unsigned int num_allowable_groups[disp_last] = {
38674 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
38675 };
38676
38677 char group_name[disp_last + 1][16] = {
38678 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
38679 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
38680 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
38681 };
38682
38683 /* Instruction path. */
38684 enum insn_path {
38685 no_path = 0,
38686 path_single, /* Single micro op. */
38687 path_double, /* Double micro op. */
38688 path_multi, /* Instructions with more than 2 micro op.. */
38689 last_path
38690 };
38691
38692 /* sched_insn_info defines a window to the instructions scheduled in
38693 the basic block. It contains a pointer to the insn_info table and
38694 the instruction scheduled.
38695
38696 Windows are allocated for each basic block and are linked
38697 together. */
38698 typedef struct sched_insn_info_s {
38699 rtx insn;
38700 enum dispatch_group group;
38701 enum insn_path path;
38702 int byte_len;
38703 int imm_bytes;
38704 } sched_insn_info;
38705
38706 /* Linked list of dispatch windows. This is a two way list of
38707 dispatch windows of a basic block. It contains information about
38708 the number of uops in the window and the total number of
38709 instructions and of bytes in the object code for this dispatch
38710 window. */
38711 typedef struct dispatch_windows_s {
38712 int num_insn; /* Number of insn in the window. */
38713 int num_uops; /* Number of uops in the window. */
38714 int window_size; /* Number of bytes in the window. */
 38715 int window_num; /* Window number, 0 or 1. */
38716 int num_imm; /* Number of immediates in an insn. */
38717 int num_imm_32; /* Number of 32 bit immediates in an insn. */
38718 int num_imm_64; /* Number of 64 bit immediates in an insn. */
38719 int imm_size; /* Total immediates in the window. */
38720 int num_loads; /* Total memory loads in the window. */
38721 int num_stores; /* Total memory stores in the window. */
38722 int violation; /* Violation exists in window. */
38723 sched_insn_info *window; /* Pointer to the window. */
38724 struct dispatch_windows_s *next;
38725 struct dispatch_windows_s *prev;
38726 } dispatch_windows;
38727
38728 /* Immediate values used in an insn. */
38729 typedef struct imm_info_s
38730 {
38731 int imm;
38732 int imm32;
38733 int imm64;
38734 } imm_info;
38735
38736 static dispatch_windows *dispatch_window_list;
38737 static dispatch_windows *dispatch_window_list1;
38738
38739 /* Get dispatch group of insn. */
38740
38741 static enum dispatch_group
38742 get_mem_group (rtx insn)
38743 {
38744 enum attr_memory memory;
38745
38746 if (INSN_CODE (insn) < 0)
38747 return disp_no_group;
38748 memory = get_attr_memory (insn);
38749 if (memory == MEMORY_STORE)
38750 return disp_store;
38751
38752 if (memory == MEMORY_LOAD)
38753 return disp_load;
38754
38755 if (memory == MEMORY_BOTH)
38756 return disp_load_store;
38757
38758 return disp_no_group;
38759 }
38760
38761 /* Return true if insn is a compare instruction. */
38762
38763 static bool
38764 is_cmp (rtx insn)
38765 {
38766 enum attr_type type;
38767
38768 type = get_attr_type (insn);
38769 return (type == TYPE_TEST
38770 || type == TYPE_ICMP
38771 || type == TYPE_FCMP
38772 || GET_CODE (PATTERN (insn)) == COMPARE);
38773 }
38774
38775 /* Return true if a dispatch violation was encountered. */
38776
38777 static bool
38778 dispatch_violation (void)
38779 {
38780 if (dispatch_window_list->next)
38781 return dispatch_window_list->next->violation;
38782 return dispatch_window_list->violation;
38783 }
38784
38785 /* Return true if insn is a branch instruction. */
38786
38787 static bool
38788 is_branch (rtx insn)
38789 {
38790 return (CALL_P (insn) || JUMP_P (insn));
38791 }
38792
38793 /* Return true if insn is a prefetch instruction. */
38794
38795 static bool
38796 is_prefetch (rtx insn)
38797 {
38798 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
38799 }
38800
38801 /* This function initializes a dispatch window and the list container holding a
38802 pointer to the window. */
38803
38804 static void
38805 init_window (int window_num)
38806 {
38807 int i;
38808 dispatch_windows *new_list;
38809
38810 if (window_num == 0)
38811 new_list = dispatch_window_list;
38812 else
38813 new_list = dispatch_window_list1;
38814
38815 new_list->num_insn = 0;
38816 new_list->num_uops = 0;
38817 new_list->window_size = 0;
38818 new_list->next = NULL;
38819 new_list->prev = NULL;
38820 new_list->window_num = window_num;
38821 new_list->num_imm = 0;
38822 new_list->num_imm_32 = 0;
38823 new_list->num_imm_64 = 0;
38824 new_list->imm_size = 0;
38825 new_list->num_loads = 0;
38826 new_list->num_stores = 0;
38827 new_list->violation = false;
38828
38829 for (i = 0; i < MAX_INSN; i++)
38830 {
38831 new_list->window[i].insn = NULL;
38832 new_list->window[i].group = disp_no_group;
38833 new_list->window[i].path = no_path;
38834 new_list->window[i].byte_len = 0;
38835 new_list->window[i].imm_bytes = 0;
38836 }
38837 return;
38838 }
38839
38840 /* This function allocates and initializes a dispatch window and the
38841 list container holding a pointer to the window. */
38842
38843 static dispatch_windows *
38844 allocate_window (void)
38845 {
38846 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
38847 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
38848
38849 return new_list;
38850 }
38851
38852 /* This routine initializes the dispatch scheduling information. It
38853 initiates building dispatch scheduler tables and constructs the
38854 first dispatch window. */
38855
38856 static void
38857 init_dispatch_sched (void)
38858 {
38859 /* Allocate a dispatch list and a window. */
38860 dispatch_window_list = allocate_window ();
38861 dispatch_window_list1 = allocate_window ();
38862 init_window (0);
38863 init_window (1);
38864 }
38865
38866 /* This function returns true if a branch is detected. End of a basic block
38867 does not have to be a branch, but here we assume only branches end a
38868 window. */
38869
38870 static bool
38871 is_end_basic_block (enum dispatch_group group)
38872 {
38873 return group == disp_branch;
38874 }
38875
38876 /* This function is called when the end of window processing is reached. */
38877
38878 static void
38879 process_end_window (void)
38880 {
38881 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
38882 if (dispatch_window_list->next)
38883 {
38884 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
38885 gcc_assert (dispatch_window_list->window_size
38886 + dispatch_window_list1->window_size <= 48);
38887 init_window (1);
38888 }
38889 init_window (0);
38890 }
38891
38892 /* Allocates a new dispatch window and adds it to the window list.
38893 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
38894 for 48 bytes of instructions. Note that these windows are not dispatch
38895 windows whose sizes are DISPATCH_WINDOW_SIZE. */
38896
38897 static dispatch_windows *
38898 allocate_next_window (int window_num)
38899 {
38900 if (window_num == 0)
38901 {
38902 if (dispatch_window_list->next)
38903 init_window (1);
38904 init_window (0);
38905 return dispatch_window_list;
38906 }
38907
38908 dispatch_window_list->next = dispatch_window_list1;
38909 dispatch_window_list1->prev = dispatch_window_list;
38910
38911 return dispatch_window_list1;
38912 }
38913
38914 /* Count an immediate operand found in *IN_RTX, updating IMM_VALUES; callback for for_each_rtx. */
38915
38916 static int
38917 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
38918 {
38919 if (*in_rtx == 0)
38920 return 0;
38921
38922 switch (GET_CODE (*in_rtx))
38923 {
38924 case CONST:
38925 case SYMBOL_REF:
38926 case CONST_INT:
38927 (imm_values->imm)++;
38928 if (x86_64_immediate_operand (*in_rtx, SImode))
38929 (imm_values->imm32)++;
38930 else
38931 (imm_values->imm64)++;
38932 break;
38933
38934 case CONST_DOUBLE:
38935 (imm_values->imm)++;
38936 (imm_values->imm64)++;
38937 break;
38938
38939 case CODE_LABEL:
38940 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
38941 {
38942 (imm_values->imm)++;
38943 (imm_values->imm32)++;
38944 }
38945 break;
38946
38947 default:
38948 break;
38949 }
38950
38951 return 0;
38952 }
38953
38954 /* Compute number of immediate operands of an instruction. */
38955
38956 static void
38957 find_constant (rtx in_rtx, imm_info *imm_values)
38958 {
38959 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
38960 (rtx_function) find_constant_1, (void *) imm_values);
38961 }
38962
38963 /* Return the total size of the immediate operands of an instruction along
38964 with the number of corresponding immediate operands. It initializes its
38965 parameters to zero before calling FIND_CONSTANT.
38966 INSN is the input instruction. IMM is the total number of immediates.
38967 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
38968 bit immediates. */
38969
38970 static int
38971 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
38972 {
38973 imm_info imm_values = {0, 0, 0};
38974
38975 find_constant (insn, &imm_values);
38976 *imm = imm_values.imm;
38977 *imm32 = imm_values.imm32;
38978 *imm64 = imm_values.imm64;
38979 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
38980 }
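/* Worked example (illustrative, not from the original sources): for a
   hypothetical insn carrying one SImode-representable constant and one
   64-bit constant, the call sets *IMM = 2, *IMM32 = 1, *IMM64 = 1 and
   returns 1 * 4 + 1 * 8 = 12.  */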
38981
38982 /* This function indicates whether INSN has at least one immediate
38983 operand. */
38984
38985 static bool
38986 has_immediate (rtx insn)
38987 {
38988 int num_imm_operand;
38989 int num_imm32_operand;
38990 int num_imm64_operand;
38991
38992 if (insn)
38993 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38994 &num_imm64_operand);
38995 return false;
38996 }
38997
38998 /* Return the decode path (single, double or multi uop) of INSN. */
38999
39000 static enum insn_path
39001 get_insn_path (rtx insn)
39002 {
39003 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
39004
39005 if ((int)path == 0)
39006 return path_single;
39007
39008 if ((int)path == 1)
39009 return path_double;
39010
39011 return path_multi;
39012 }
39013
39014 /* Return insn dispatch group. */
39015
39016 static enum dispatch_group
39017 get_insn_group (rtx insn)
39018 {
39019 enum dispatch_group group = get_mem_group (insn);
39020 if (group)
39021 return group;
39022
39023 if (is_branch (insn))
39024 return disp_branch;
39025
39026 if (is_cmp (insn))
39027 return disp_cmp;
39028
39029 if (has_immediate (insn))
39030 return disp_imm;
39031
39032 if (is_prefetch (insn))
39033 return disp_prefetch;
39034
39035 return disp_no_group;
39036 }
39037
39038 /* Count number of GROUP restricted instructions in a dispatch
39039 window WINDOW_LIST. */
39040
39041 static int
39042 count_num_restricted (rtx insn, dispatch_windows *window_list)
39043 {
39044 enum dispatch_group group = get_insn_group (insn);
39045 int imm_size;
39046 int num_imm_operand;
39047 int num_imm32_operand;
39048 int num_imm64_operand;
39049
39050 if (group == disp_no_group)
39051 return 0;
39052
39053 if (group == disp_imm)
39054 {
39055 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
39056 &num_imm64_operand);
39057 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
39058 || num_imm_operand + window_list->num_imm > MAX_IMM
39059 || (num_imm32_operand > 0
39060 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
39061 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
39062 || (num_imm64_operand > 0
39063 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
39064 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
39065 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
39066 && num_imm64_operand > 0
39067 && ((window_list->num_imm_64 > 0
39068 && window_list->num_insn >= 2)
39069 || window_list->num_insn >= 3)))
39070 return BIG;
39071
39072 return 1;
39073 }
39074
39075 if ((group == disp_load_store
39076 && (window_list->num_loads >= MAX_LOAD
39077 || window_list->num_stores >= MAX_STORE))
39078 || ((group == disp_load
39079 || group == disp_prefetch)
39080 && window_list->num_loads >= MAX_LOAD)
39081 || (group == disp_store
39082 && window_list->num_stores >= MAX_STORE))
39083 return BIG;
39084
39085 return 1;
39086 }
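/* Note (illustrative, not from the original sources): returning BIG here
   guarantees that the caller's comparison against num_allowable_groups[]
   fails, so an insn whose immediates, loads or stores would overflow the
   current window is rejected by fits_dispatch_window.  */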
39087
39088 /* This function returns true if insn satisfies dispatch rules on the
39089 last window scheduled. */
39090
39091 static bool
39092 fits_dispatch_window (rtx insn)
39093 {
39094 dispatch_windows *window_list = dispatch_window_list;
39095 dispatch_windows *window_list_next = dispatch_window_list->next;
39096 unsigned int num_restrict;
39097 enum dispatch_group group = get_insn_group (insn);
39098 enum insn_path path = get_insn_path (insn);
39099 int sum;
39100
39101 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
39102 instructions should be given the lowest priority in the
39103 scheduling process in Haifa scheduler to make sure they will be
39104 scheduled in the same dispatch window as the reference to them. */
39105 if (group == disp_jcc || group == disp_cmp)
39106 return false;
39107
39108 /* Check nonrestricted. */
39109 if (group == disp_no_group || group == disp_branch)
39110 return true;
39111
39112 /* Get last dispatch window. */
39113 if (window_list_next)
39114 window_list = window_list_next;
39115
39116 if (window_list->window_num == 1)
39117 {
39118 sum = window_list->prev->window_size + window_list->window_size;
39119
39120 if (sum == 32
39121 || (min_insn_size (insn) + sum) >= 48)
39122 /* Window 1 is full. Go for next window. */
39123 return true;
39124 }
39125
39126 num_restrict = count_num_restricted (insn, window_list);
39127
39128 if (num_restrict > num_allowable_groups[group])
39129 return false;
39130
39131 /* See if it fits in the first window. */
39132 if (window_list->window_num == 0)
39133 {
39134 /* The first window should have only single and double path
39135 uops. */
39136 if (path == path_double
39137 && (window_list->num_uops + 2) > MAX_INSN)
39138 return false;
39139 else if (path != path_single)
39140 return false;
39141 }
39142 return true;
39143 }
39144
39145 /* Add an instruction INSN with NUM_UOPS micro-operations to the
39146 dispatch window WINDOW_LIST. */
39147
39148 static void
39149 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
39150 {
39151 int byte_len = min_insn_size (insn);
39152 int num_insn = window_list->num_insn;
39153 int imm_size;
39154 sched_insn_info *window = window_list->window;
39155 enum dispatch_group group = get_insn_group (insn);
39156 enum insn_path path = get_insn_path (insn);
39157 int num_imm_operand;
39158 int num_imm32_operand;
39159 int num_imm64_operand;
39160
39161 if (!window_list->violation && group != disp_cmp
39162 && !fits_dispatch_window (insn))
39163 window_list->violation = true;
39164
39165 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
39166 &num_imm64_operand);
39167
39168 /* Initialize window with new instruction. */
39169 window[num_insn].insn = insn;
39170 window[num_insn].byte_len = byte_len;
39171 window[num_insn].group = group;
39172 window[num_insn].path = path;
39173 window[num_insn].imm_bytes = imm_size;
39174
39175 window_list->window_size += byte_len;
39176 window_list->num_insn = num_insn + 1;
39177 window_list->num_uops = window_list->num_uops + num_uops;
39178 window_list->imm_size += imm_size;
39179 window_list->num_imm += num_imm_operand;
39180 window_list->num_imm_32 += num_imm32_operand;
39181 window_list->num_imm_64 += num_imm64_operand;
39182
39183 if (group == disp_store)
39184 window_list->num_stores += 1;
39185 else if (group == disp_load
39186 || group == disp_prefetch)
39187 window_list->num_loads += 1;
39188 else if (group == disp_load_store)
39189 {
39190 window_list->num_stores += 1;
39191 window_list->num_loads += 1;
39192 }
39193 }
39194
39195 /* Adds a scheduled instruction, INSN, to the current dispatch window.
39196 If the total bytes of instructions or the number of instructions in
39197 the window exceeds the allowable maximum, it allocates a new window. */
39198
39199 static void
39200 add_to_dispatch_window (rtx insn)
39201 {
39202 int byte_len;
39203 dispatch_windows *window_list;
39204 dispatch_windows *next_list;
39205 dispatch_windows *window0_list;
39206 enum insn_path path;
39207 enum dispatch_group insn_group;
39208 bool insn_fits;
39209 int num_insn;
39210 int num_uops;
39211 int window_num;
39212 int insn_num_uops;
39213 int sum;
39214
39215 if (INSN_CODE (insn) < 0)
39216 return;
39217
39218 byte_len = min_insn_size (insn);
39219 window_list = dispatch_window_list;
39220 next_list = window_list->next;
39221 path = get_insn_path (insn);
39222 insn_group = get_insn_group (insn);
39223
39224 /* Get the last dispatch window. */
39225 if (next_list)
39226 window_list = dispatch_window_list->next;
39227
39228 if (path == path_single)
39229 insn_num_uops = 1;
39230 else if (path == path_double)
39231 insn_num_uops = 2;
39232 else
39233 insn_num_uops = (int) path;
39234
39235 /* If the current window is full, get a new window.
39236 Window number zero is full if MAX_INSN uops are scheduled in it.
39237 Window number one is full if window zero's bytes plus window
39238 one's bytes total 32, or if adding the bytes of the new instruction
39239 brings the total to 48 or more, or if it already has MAX_INSN
39240 instructions in it. */
39241 num_insn = window_list->num_insn;
39242 num_uops = window_list->num_uops;
39243 window_num = window_list->window_num;
39244 insn_fits = fits_dispatch_window (insn);
39245
39246 if (num_insn >= MAX_INSN
39247 || num_uops + insn_num_uops > MAX_INSN
39248 || !(insn_fits))
39249 {
39250 window_num = ~window_num & 1;
39251 window_list = allocate_next_window (window_num);
39252 }
39253
39254 if (window_num == 0)
39255 {
39256 add_insn_window (insn, window_list, insn_num_uops);
39257 if (window_list->num_insn >= MAX_INSN
39258 && insn_group == disp_branch)
39259 {
39260 process_end_window ();
39261 return;
39262 }
39263 }
39264 else if (window_num == 1)
39265 {
39266 window0_list = window_list->prev;
39267 sum = window0_list->window_size + window_list->window_size;
39268 if (sum == 32
39269 || (byte_len + sum) >= 48)
39270 {
39271 process_end_window ();
39272 window_list = dispatch_window_list;
39273 }
39274
39275 add_insn_window (insn, window_list, insn_num_uops);
39276 }
39277 else
39278 gcc_unreachable ();
39279
39280 if (is_end_basic_block (insn_group))
39281 {
39282 /* End of basic block is reached; do end-of-basic-block processing. */
39283 process_end_window ();
39284 return;
39285 }
39286 }
39287
39288 /* Print the dispatch window, WINDOW_NUM, to FILE. */
39289
39290 DEBUG_FUNCTION static void
39291 debug_dispatch_window_file (FILE *file, int window_num)
39292 {
39293 dispatch_windows *list;
39294 int i;
39295
39296 if (window_num == 0)
39297 list = dispatch_window_list;
39298 else
39299 list = dispatch_window_list1;
39300
39301 fprintf (file, "Window #%d:\n", list->window_num);
39302 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
39303 list->num_insn, list->num_uops, list->window_size);
39304 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
39305 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
39306
39307 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
39308 list->num_stores);
39309 fprintf (file, " insn info:\n");
39310
39311 for (i = 0; i < MAX_INSN; i++)
39312 {
39313 if (!list->window[i].insn)
39314 break;
39315 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
39316 i, group_name[list->window[i].group],
39317 i, (void *)list->window[i].insn,
39318 i, list->window[i].path,
39319 i, list->window[i].byte_len,
39320 i, list->window[i].imm_bytes);
39321 }
39322 }
39323
39324 /* Print to stdout a dispatch window. */
39325
39326 DEBUG_FUNCTION void
39327 debug_dispatch_window (int window_num)
39328 {
39329 debug_dispatch_window_file (stdout, window_num);
39330 }
39331
39332 /* Print INSN dispatch information to FILE. */
39333
39334 DEBUG_FUNCTION static void
39335 debug_insn_dispatch_info_file (FILE *file, rtx insn)
39336 {
39337 int byte_len;
39338 enum insn_path path;
39339 enum dispatch_group group;
39340 int imm_size;
39341 int num_imm_operand;
39342 int num_imm32_operand;
39343 int num_imm64_operand;
39344
39345 if (INSN_CODE (insn) < 0)
39346 return;
39347
39348 byte_len = min_insn_size (insn);
39349 path = get_insn_path (insn);
39350 group = get_insn_group (insn);
39351 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
39352 &num_imm64_operand);
39353
39354 fprintf (file, " insn info:\n");
39355 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
39356 group_name[group], path, byte_len);
39357 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
39358 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
39359 }
39360
39361 /* Print to STDOUT the status of the ready list with respect to
39362 dispatch windows. */
39363
39364 DEBUG_FUNCTION void
39365 debug_ready_dispatch (void)
39366 {
39367 int i;
39368 int no_ready = number_in_ready ();
39369
39370 fprintf (stdout, "Number of ready: %d\n", no_ready);
39371
39372 for (i = 0; i < no_ready; i++)
39373 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
39374 }
39375
39376 /* This routine is the driver of the dispatch scheduler. */
39377
39378 static void
39379 do_dispatch (rtx insn, int mode)
39380 {
39381 if (mode == DISPATCH_INIT)
39382 init_dispatch_sched ();
39383 else if (mode == ADD_TO_DISPATCH_WINDOW)
39384 add_to_dispatch_window (insn);
39385 }
39386
39387 /* Return TRUE if Dispatch Scheduling is supported. */
39388
39389 static bool
39390 has_dispatch (rtx insn, int action)
39391 {
39392 if ((TARGET_BDVER1 || TARGET_BDVER2)
39393 && flag_dispatch_scheduler)
39394 switch (action)
39395 {
39396 default:
39397 return false;
39398
39399 case IS_DISPATCH_ON:
39400 return true;
39401 break;
39402
39403 case IS_CMP:
39404 return is_cmp (insn);
39405
39406 case DISPATCH_VIOLATION:
39407 return dispatch_violation ();
39408
39409 case FITS_DISPATCH_WINDOW:
39410 return fits_dispatch_window (insn);
39411 }
39412
39413 return false;
39414 }
39415
39416 /* Implementation of the reassociation_width target hook used by the
39417 reassoc phase to identify the parallelism level in a reassociated
39418 tree. The statement's tree_code is passed in OPC. The arguments'
39419 type is passed in MODE.
39420
39421 Currently parallel reassociation is enabled for Atom
39422 processors only and we set reassociation width to be 2
39423 because Atom may issue up to 2 instructions per cycle.
39424
39425 Return value should be fixed if parallel reassociation is
39426 enabled for other processors. */
39427
39428 static int
39429 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
39430 enum machine_mode mode)
39431 {
39432 int res = 1;
39433
39434 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
39435 res = 2;
39436 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
39437 res = 2;
39438
39439 return res;
39440 }
39441
39442 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
39443 place emms and femms instructions. */
39444
39445 static enum machine_mode
39446 ix86_preferred_simd_mode (enum machine_mode mode)
39447 {
39448 if (!TARGET_SSE)
39449 return word_mode;
39450
39451 switch (mode)
39452 {
39453 case QImode:
39454 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
39455 case HImode:
39456 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
39457 case SImode:
39458 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
39459 case DImode:
39460 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
39461
39462 case SFmode:
39463 if (TARGET_AVX && !TARGET_PREFER_AVX128)
39464 return V8SFmode;
39465 else
39466 return V4SFmode;
39467
39468 case DFmode:
39469 if (!TARGET_VECTORIZE_DOUBLE)
39470 return word_mode;
39471 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
39472 return V4DFmode;
39473 else if (TARGET_SSE2)
39474 return V2DFmode;
39475 /* FALLTHRU */
39476
39477 default:
39478 return word_mode;
39479 }
39480 }
39481
39482 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
39483 vectors. */
39484
39485 static unsigned int
39486 ix86_autovectorize_vector_sizes (void)
39487 {
39488 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
39489 }
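/* Note (an aside added for illustration, not from the original sources):
   the value returned here is a bitmask of candidate vector sizes in bytes,
   so 32 | 16 asks the vectorizer to try both 32-byte (256-bit) and 16-byte
   (128-bit) vectors, while 0 means only the preferred SIMD mode is used.  */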
39490
39491 /* Validate target specific memory model bits in VAL. */
39492
39493 static unsigned HOST_WIDE_INT
39494 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
39495 {
39496 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
39497 unsigned HOST_WIDE_INT strong;
39498
39499 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
39500 |MEMMODEL_MASK)
39501 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
39502 {
39503 warning (OPT_Winvalid_memory_model,
39504 "Unknown architecture specific memory model");
39505 return MEMMODEL_SEQ_CST;
39506 }
39507 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
39508 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
39509 {
39510 warning (OPT_Winvalid_memory_model,
39511 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
39512 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
39513 }
39514 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
39515 {
39516 warning (OPT_Winvalid_memory_model,
39517 "HLE_RELEASE not used with RELEASE or stronger memory model");
39518 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
39519 }
39520 return val;
39521 }
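/* Illustrative use of the HLE bits validated above (a sketch, assuming the
   __ATOMIC_HLE_ACQUIRE/__ATOMIC_HLE_RELEASE macros the x86 back end exposes
   to users; not part of this file's code):

     while (__atomic_exchange_n (&lock, 1,
                                 __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
       ;
     ... critical section ...
     __atomic_store_n (&lock, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);  */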
39522
39523 /* Initialize the GCC target structure. */
39524 #undef TARGET_RETURN_IN_MEMORY
39525 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
39526
39527 #undef TARGET_LEGITIMIZE_ADDRESS
39528 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
39529
39530 #undef TARGET_ATTRIBUTE_TABLE
39531 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
39532 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
39533 # undef TARGET_MERGE_DECL_ATTRIBUTES
39534 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
39535 #endif
39536
39537 #undef TARGET_COMP_TYPE_ATTRIBUTES
39538 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
39539
39540 #undef TARGET_INIT_BUILTINS
39541 #define TARGET_INIT_BUILTINS ix86_init_builtins
39542 #undef TARGET_BUILTIN_DECL
39543 #define TARGET_BUILTIN_DECL ix86_builtin_decl
39544 #undef TARGET_EXPAND_BUILTIN
39545 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
39546
39547 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
39548 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
39549 ix86_builtin_vectorized_function
39550
39551 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
39552 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
39553
39554 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
39555 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
39556
39557 #undef TARGET_VECTORIZE_BUILTIN_GATHER
39558 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
39559
39560 #undef TARGET_BUILTIN_RECIPROCAL
39561 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
39562
39563 #undef TARGET_ASM_FUNCTION_EPILOGUE
39564 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
39565
39566 #undef TARGET_ENCODE_SECTION_INFO
39567 #ifndef SUBTARGET_ENCODE_SECTION_INFO
39568 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
39569 #else
39570 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
39571 #endif
39572
39573 #undef TARGET_ASM_OPEN_PAREN
39574 #define TARGET_ASM_OPEN_PAREN ""
39575 #undef TARGET_ASM_CLOSE_PAREN
39576 #define TARGET_ASM_CLOSE_PAREN ""
39577
39578 #undef TARGET_ASM_BYTE_OP
39579 #define TARGET_ASM_BYTE_OP ASM_BYTE
39580
39581 #undef TARGET_ASM_ALIGNED_HI_OP
39582 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
39583 #undef TARGET_ASM_ALIGNED_SI_OP
39584 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
39585 #ifdef ASM_QUAD
39586 #undef TARGET_ASM_ALIGNED_DI_OP
39587 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
39588 #endif
39589
39590 #undef TARGET_PROFILE_BEFORE_PROLOGUE
39591 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
39592
39593 #undef TARGET_ASM_UNALIGNED_HI_OP
39594 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
39595 #undef TARGET_ASM_UNALIGNED_SI_OP
39596 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
39597 #undef TARGET_ASM_UNALIGNED_DI_OP
39598 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
39599
39600 #undef TARGET_PRINT_OPERAND
39601 #define TARGET_PRINT_OPERAND ix86_print_operand
39602 #undef TARGET_PRINT_OPERAND_ADDRESS
39603 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
39604 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
39605 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
39606 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
39607 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
39608
39609 #undef TARGET_SCHED_INIT_GLOBAL
39610 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
39611 #undef TARGET_SCHED_ADJUST_COST
39612 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
39613 #undef TARGET_SCHED_ISSUE_RATE
39614 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
39615 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
39616 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
39617 ia32_multipass_dfa_lookahead
39618
39619 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
39620 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
39621
39622 #undef TARGET_MEMMODEL_CHECK
39623 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
39624
39625 #ifdef HAVE_AS_TLS
39626 #undef TARGET_HAVE_TLS
39627 #define TARGET_HAVE_TLS true
39628 #endif
39629 #undef TARGET_CANNOT_FORCE_CONST_MEM
39630 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
39631 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
39632 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
39633
39634 #undef TARGET_DELEGITIMIZE_ADDRESS
39635 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
39636
39637 #undef TARGET_MS_BITFIELD_LAYOUT_P
39638 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
39639
39640 #if TARGET_MACHO
39641 #undef TARGET_BINDS_LOCAL_P
39642 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
39643 #endif
39644 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
39645 #undef TARGET_BINDS_LOCAL_P
39646 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
39647 #endif
39648
39649 #undef TARGET_ASM_OUTPUT_MI_THUNK
39650 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
39651 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
39652 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
39653
39654 #undef TARGET_ASM_FILE_START
39655 #define TARGET_ASM_FILE_START x86_file_start
39656
39657 #undef TARGET_OPTION_OVERRIDE
39658 #define TARGET_OPTION_OVERRIDE ix86_option_override
39659
39660 #undef TARGET_REGISTER_MOVE_COST
39661 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
39662 #undef TARGET_MEMORY_MOVE_COST
39663 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
39664 #undef TARGET_RTX_COSTS
39665 #define TARGET_RTX_COSTS ix86_rtx_costs
39666 #undef TARGET_ADDRESS_COST
39667 #define TARGET_ADDRESS_COST ix86_address_cost
39668
39669 #undef TARGET_FIXED_CONDITION_CODE_REGS
39670 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
39671 #undef TARGET_CC_MODES_COMPATIBLE
39672 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
39673
39674 #undef TARGET_MACHINE_DEPENDENT_REORG
39675 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
39676
39677 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
39678 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
39679
39680 #undef TARGET_BUILD_BUILTIN_VA_LIST
39681 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
39682
39683 #undef TARGET_FOLD_BUILTIN
39684 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
39685
39686 #undef TARGET_ENUM_VA_LIST_P
39687 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
39688
39689 #undef TARGET_FN_ABI_VA_LIST
39690 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
39691
39692 #undef TARGET_CANONICAL_VA_LIST_TYPE
39693 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
39694
39695 #undef TARGET_EXPAND_BUILTIN_VA_START
39696 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
39697
39698 #undef TARGET_MD_ASM_CLOBBERS
39699 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
39700
39701 #undef TARGET_PROMOTE_PROTOTYPES
39702 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
39703 #undef TARGET_STRUCT_VALUE_RTX
39704 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
39705 #undef TARGET_SETUP_INCOMING_VARARGS
39706 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
39707 #undef TARGET_MUST_PASS_IN_STACK
39708 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
39709 #undef TARGET_FUNCTION_ARG_ADVANCE
39710 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
39711 #undef TARGET_FUNCTION_ARG
39712 #define TARGET_FUNCTION_ARG ix86_function_arg
39713 #undef TARGET_FUNCTION_ARG_BOUNDARY
39714 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
39715 #undef TARGET_PASS_BY_REFERENCE
39716 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
39717 #undef TARGET_INTERNAL_ARG_POINTER
39718 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
39719 #undef TARGET_UPDATE_STACK_BOUNDARY
39720 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
39721 #undef TARGET_GET_DRAP_RTX
39722 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
39723 #undef TARGET_STRICT_ARGUMENT_NAMING
39724 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
39725 #undef TARGET_STATIC_CHAIN
39726 #define TARGET_STATIC_CHAIN ix86_static_chain
39727 #undef TARGET_TRAMPOLINE_INIT
39728 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
39729 #undef TARGET_RETURN_POPS_ARGS
39730 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
39731
39732 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
39733 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
39734
39735 #undef TARGET_SCALAR_MODE_SUPPORTED_P
39736 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
39737
39738 #undef TARGET_VECTOR_MODE_SUPPORTED_P
39739 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
39740
39741 #undef TARGET_C_MODE_FOR_SUFFIX
39742 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
39743
39744 #ifdef HAVE_AS_TLS
39745 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
39746 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
39747 #endif
39748
39749 #ifdef SUBTARGET_INSERT_ATTRIBUTES
39750 #undef TARGET_INSERT_ATTRIBUTES
39751 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
39752 #endif
39753
39754 #undef TARGET_MANGLE_TYPE
39755 #define TARGET_MANGLE_TYPE ix86_mangle_type
39756
39757 #if !TARGET_MACHO
39758 #undef TARGET_STACK_PROTECT_FAIL
39759 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
39760 #endif
39761
39762 #undef TARGET_FUNCTION_VALUE
39763 #define TARGET_FUNCTION_VALUE ix86_function_value
39764
39765 #undef TARGET_FUNCTION_VALUE_REGNO_P
39766 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
39767
39768 #undef TARGET_PROMOTE_FUNCTION_MODE
39769 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
39770
39771 #undef TARGET_SECONDARY_RELOAD
39772 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
39773
39774 #undef TARGET_CLASS_MAX_NREGS
39775 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
39776
39777 #undef TARGET_PREFERRED_RELOAD_CLASS
39778 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
39779 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
39780 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
39781 #undef TARGET_CLASS_LIKELY_SPILLED_P
39782 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
39783
39784 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
39785 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
39786 ix86_builtin_vectorization_cost
39787 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
39788 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
39789 ix86_vectorize_vec_perm_const_ok
39790 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
39791 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
39792 ix86_preferred_simd_mode
39793 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
39794 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
39795 ix86_autovectorize_vector_sizes
39796
39797 #undef TARGET_SET_CURRENT_FUNCTION
39798 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
39799
39800 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
39801 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
39802
39803 #undef TARGET_OPTION_SAVE
39804 #define TARGET_OPTION_SAVE ix86_function_specific_save
39805
39806 #undef TARGET_OPTION_RESTORE
39807 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
39808
39809 #undef TARGET_OPTION_PRINT
39810 #define TARGET_OPTION_PRINT ix86_function_specific_print
39811
39812 #undef TARGET_CAN_INLINE_P
39813 #define TARGET_CAN_INLINE_P ix86_can_inline_p
39814
39815 #undef TARGET_EXPAND_TO_RTL_HOOK
39816 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
39817
39818 #undef TARGET_LEGITIMATE_ADDRESS_P
39819 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
39820
39821 #undef TARGET_LEGITIMATE_CONSTANT_P
39822 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
39823
39824 #undef TARGET_FRAME_POINTER_REQUIRED
39825 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
39826
39827 #undef TARGET_CAN_ELIMINATE
39828 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
39829
39830 #undef TARGET_EXTRA_LIVE_ON_ENTRY
39831 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
39832
39833 #undef TARGET_ASM_CODE_END
39834 #define TARGET_ASM_CODE_END ix86_code_end
39835
39836 #undef TARGET_CONDITIONAL_REGISTER_USAGE
39837 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
39838
39839 #if TARGET_MACHO
39840 #undef TARGET_INIT_LIBFUNCS
39841 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
39842 #endif
39843
39844 struct gcc_target targetm = TARGET_INITIALIZER;
39845 \f
39846 #include "gt-i386.h"