1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
4 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "reload.h"
51 #include "cgraph.h"
52 #include "gimple.h"
53 #include "dwarf2.h"
54 #include "df.h"
55 #include "tm-constrs.h"
56 #include "params.h"
57 #include "cselib.h"
58 #include "debug.h"
59 #include "sched-int.h"
60 #include "sbitmap.h"
61 #include "fibheap.h"
62 #include "opts.h"
63 #include "diagnostic.h"
64
65 enum upper_128bits_state
66 {
67 unknown = 0,
68 unused,
69 used
70 };
71
72 typedef struct block_info_def
73 {
74 /* State of the upper 128bits of AVX registers at exit. */
75 enum upper_128bits_state state;
76 /* TRUE if state of the upper 128bits of AVX registers is unchanged
77 in this block. */
78 bool unchanged;
79 /* TRUE if block has been processed. */
80 bool processed;
81 /* TRUE if block has been scanned. */
82 bool scanned;
83 /* Previous state of the upper 128bits of AVX registers at entry. */
84 enum upper_128bits_state prev;
85 } *block_info;
86
87 #define BLOCK_INFO(B) ((block_info) (B)->aux)
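/* The per-block AUX field is assumed to be free here; move_or_delete_vzeroupper
   below allocates it with alloc_aux_for_blocks (sizeof (struct block_info_def))
   and releases it again with free_aux_for_blocks.  */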
88
89 enum call_avx256_state
90 {
91 /* Callee returns 256bit AVX register. */
92 callee_return_avx256 = -1,
93 /* Callee returns and passes 256bit AVX register. */
94 callee_return_pass_avx256,
95 /* Callee passes 256bit AVX register. */
96 callee_pass_avx256,
97 /* Callee neither returns nor passes a 256bit AVX register, or there is
98 no 256bit AVX register in the function return. */
99 call_no_avx256,
100 /* vzeroupper intrinsic. */
101 vzeroupper_intrinsic
102 };
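/* A brief sketch of how these values are consumed (it only restates the
   scan in move_or_delete_vzeroupper_2 below): the vzeroupper pattern
   carries one of them as the operand of its UNSPEC_VOLATILE, so the pass
   recovers it with

     if (GET_CODE (pat) == UNSPEC_VOLATILE
         && XINT (pat, 1) == UNSPECV_VZEROUPPER)
       avx256 = INTVAL (XVECEXP (pat, 0, 0));

   The code that emits the pattern with these operand values is not part
   of this excerpt.  */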
103
104 /* Check if a 256bit AVX register is referenced in stores. */
105
106 static void
107 check_avx256_stores (rtx dest, const_rtx set, void *data)
108 {
109 if ((REG_P (dest)
110 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
111 || (GET_CODE (set) == SET
112 && REG_P (SET_SRC (set))
113 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
114 {
115 enum upper_128bits_state *state
116 = (enum upper_128bits_state *) data;
117 *state = used;
118 }
119 }
120
121 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
122 in basic block BB. Delete it if upper 128bit AVX registers are
123 unused. If it isn't deleted, move it to just before a jump insn.
124
125 STATE is state of the upper 128bits of AVX registers at entry. */
126
127 static void
128 move_or_delete_vzeroupper_2 (basic_block bb,
129 enum upper_128bits_state state)
130 {
131 rtx insn, bb_end;
132 rtx vzeroupper_insn = NULL_RTX;
133 rtx pat;
134 int avx256;
135 bool unchanged;
136
137 if (BLOCK_INFO (bb)->unchanged)
138 {
139 if (dump_file)
140 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
141 bb->index, state);
142
143 BLOCK_INFO (bb)->state = state;
144 return;
145 }
146
147 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
148 {
149 if (dump_file)
150 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
151 bb->index, BLOCK_INFO (bb)->state);
152 return;
153 }
154
155 BLOCK_INFO (bb)->prev = state;
156
157 if (dump_file)
158 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
159 bb->index, state);
160
161 unchanged = true;
162
163 /* BB_END changes when it is deleted. */
164 bb_end = BB_END (bb);
165 insn = BB_HEAD (bb);
166 while (insn != bb_end)
167 {
168 insn = NEXT_INSN (insn);
169
170 if (!NONDEBUG_INSN_P (insn))
171 continue;
172
173 /* Move vzeroupper before jump/call. */
174 if (JUMP_P (insn) || CALL_P (insn))
175 {
176 if (!vzeroupper_insn)
177 continue;
178
179 if (PREV_INSN (insn) != vzeroupper_insn)
180 {
181 if (dump_file)
182 {
183 fprintf (dump_file, "Move vzeroupper after:\n");
184 print_rtl_single (dump_file, PREV_INSN (insn));
185 fprintf (dump_file, "before:\n");
186 print_rtl_single (dump_file, insn);
187 }
188 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
189 PREV_INSN (insn));
190 }
191 vzeroupper_insn = NULL_RTX;
192 continue;
193 }
194
195 pat = PATTERN (insn);
196
197 /* Check insn for vzeroupper intrinsic. */
198 if (GET_CODE (pat) == UNSPEC_VOLATILE
199 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
200 {
201 if (dump_file)
202 {
203 /* Found vzeroupper intrinsic. */
204 fprintf (dump_file, "Found vzeroupper:\n");
205 print_rtl_single (dump_file, insn);
206 }
207 }
208 else
209 {
210 /* Check insn for vzeroall intrinsic. */
211 if (GET_CODE (pat) == PARALLEL
212 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
213 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
214 {
215 state = unused;
216 unchanged = false;
217
218 /* Delete pending vzeroupper insertion. */
219 if (vzeroupper_insn)
220 {
221 delete_insn (vzeroupper_insn);
222 vzeroupper_insn = NULL_RTX;
223 }
224 }
225 else if (state != used)
226 {
227 note_stores (pat, check_avx256_stores, &state);
228 if (state == used)
229 unchanged = false;
230 }
231 continue;
232 }
233
234 /* Process vzeroupper intrinsic. */
235 avx256 = INTVAL (XVECEXP (pat, 0, 0));
236
237 if (state == unused)
238 {
239 /* Since the upper 128bits are cleared, callee must not pass
240 256bit AVX register. We only need to check if callee
241 returns 256bit AVX register. */
242 if (avx256 == callee_return_avx256)
243 {
244 state = used;
245 unchanged = false;
246 }
247
248 /* Remove unnecessary vzeroupper since upper 128bits are
249 cleared. */
250 if (dump_file)
251 {
252 fprintf (dump_file, "Delete redundant vzeroupper:\n");
253 print_rtl_single (dump_file, insn);
254 }
255 delete_insn (insn);
256 }
257 else
258 {
259 /* Set state to UNUSED if callee doesn't return 256bit AVX
260 register. */
261 if (avx256 != callee_return_pass_avx256)
262 state = unused;
263
264 if (avx256 == callee_return_pass_avx256
265 || avx256 == callee_pass_avx256)
266 {
267 /* Must remove vzeroupper since callee passes in 256bit
268 AVX register. */
269 if (dump_file)
270 {
271 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
272 print_rtl_single (dump_file, insn);
273 }
274 delete_insn (insn);
275 }
276 else
277 {
278 vzeroupper_insn = insn;
279 unchanged = false;
280 }
281 }
282 }
283
284 BLOCK_INFO (bb)->state = state;
285 BLOCK_INFO (bb)->unchanged = unchanged;
286 BLOCK_INFO (bb)->scanned = true;
287
288 if (dump_file)
289 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
290 bb->index, unchanged ? "unchanged" : "changed",
291 state);
292 }
293
294 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
295 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
296 as UNUSED if UNKNOWN_IS_UNUSED is true. Return TRUE if the exit
297 state is changed. */
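/* In other words, the entry state is a meet over all predecessors: any
   USED predecessor forces USED, otherwise any UNKNOWN predecessor makes
   the entry state UNKNOWN (unless UNKNOWN_IS_UNUSED), and only when
   every predecessor is UNUSED does the block start as UNUSED.  */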
298
299 static bool
300 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
301 {
302 edge e;
303 edge_iterator ei;
304 enum upper_128bits_state state, old_state, new_state;
305 bool seen_unknown;
306
307 if (dump_file)
308 fprintf (dump_file, " Process [bb %i]: status: %d\n",
309 block->index, BLOCK_INFO (block)->processed);
310
311 if (BLOCK_INFO (block)->processed)
312 return false;
313
314 state = unused;
315
316 /* Check all predecessor edges of this block. */
317 seen_unknown = false;
318 FOR_EACH_EDGE (e, ei, block->preds)
319 {
320 if (e->src == block)
321 continue;
322 switch (BLOCK_INFO (e->src)->state)
323 {
324 case unknown:
325 if (!unknown_is_unused)
326 seen_unknown = true;
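          /* FALLTHRU */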
327 case unused:
328 break;
329 case used:
330 state = used;
331 goto done;
332 }
333 }
334
335 if (seen_unknown)
336 state = unknown;
337
338 done:
339 old_state = BLOCK_INFO (block)->state;
340 move_or_delete_vzeroupper_2 (block, state);
341 new_state = BLOCK_INFO (block)->state;
342
343 if (state != unknown || new_state == used)
344 BLOCK_INFO (block)->processed = true;
345
346 /* Need to rescan if the upper 128bits of AVX registers are changed
347 to USED at exit. */
348 if (new_state != old_state)
349 {
350 if (new_state == used)
351 cfun->machine->rescan_vzeroupper_p = 1;
352 return true;
353 }
354 else
355 return false;
356 }
357
358 /* Go through the instruction stream looking for vzeroupper. Delete
359 it if upper 128bit AVX registers are unused. If it isn't deleted,
360 move it to just before a jump insn. */
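/* A short summary of the driver below, restating its code: the entry
   block's successors are seeded from caller_pass_avx256_p, the remaining
   blocks are iterated to a fixed point in reverse completion order using
   two fibonacci heaps (WORKLIST for the current round, PENDING for the
   next), rescanning as long as some block's exit state flips to USED,
   and a final pass over all blocks resolves any remaining UNKNOWN state
   as UNUSED.  */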
361
362 static void
363 move_or_delete_vzeroupper (void)
364 {
365 edge e;
366 edge_iterator ei;
367 basic_block bb;
368 fibheap_t worklist, pending, fibheap_swap;
369 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
370 int *bb_order;
371 int *rc_order;
372 int i;
373
374 /* Set up block info for each basic block. */
375 alloc_aux_for_blocks (sizeof (struct block_info_def));
376
377 /* Process outgoing edges of entry point. */
378 if (dump_file)
379 fprintf (dump_file, "Process outgoing edges of entry point\n");
380
381 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
382 {
383 move_or_delete_vzeroupper_2 (e->dest,
384 cfun->machine->caller_pass_avx256_p
385 ? used : unused);
386 BLOCK_INFO (e->dest)->processed = true;
387 }
388
389 /* Compute reverse completion order of depth first search of the CFG
390 so that the data-flow runs faster. */
391 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
392 bb_order = XNEWVEC (int, last_basic_block);
393 pre_and_rev_post_order_compute (NULL, rc_order, false);
394 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
395 bb_order[rc_order[i]] = i;
396 free (rc_order);
397
398 worklist = fibheap_new ();
399 pending = fibheap_new ();
400 visited = sbitmap_alloc (last_basic_block);
401 in_worklist = sbitmap_alloc (last_basic_block);
402 in_pending = sbitmap_alloc (last_basic_block);
403 sbitmap_zero (in_worklist);
404
405 /* Don't check outgoing edges of entry point. */
406 sbitmap_ones (in_pending);
407 FOR_EACH_BB (bb)
408 if (BLOCK_INFO (bb)->processed)
409 RESET_BIT (in_pending, bb->index);
410 else
411 {
412 move_or_delete_vzeroupper_1 (bb, false);
413 fibheap_insert (pending, bb_order[bb->index], bb);
414 }
415
416 if (dump_file)
417 fprintf (dump_file, "Check remaining basic blocks\n");
418
419 while (!fibheap_empty (pending))
420 {
421 fibheap_swap = pending;
422 pending = worklist;
423 worklist = fibheap_swap;
424 sbitmap_swap = in_pending;
425 in_pending = in_worklist;
426 in_worklist = sbitmap_swap;
427
428 sbitmap_zero (visited);
429
430 cfun->machine->rescan_vzeroupper_p = 0;
431
432 while (!fibheap_empty (worklist))
433 {
434 bb = (basic_block) fibheap_extract_min (worklist);
435 RESET_BIT (in_worklist, bb->index);
436 gcc_assert (!TEST_BIT (visited, bb->index));
437 if (!TEST_BIT (visited, bb->index))
438 {
439 edge_iterator ei;
440
441 SET_BIT (visited, bb->index);
442
443 if (move_or_delete_vzeroupper_1 (bb, false))
444 FOR_EACH_EDGE (e, ei, bb->succs)
445 {
446 if (e->dest == EXIT_BLOCK_PTR
447 || BLOCK_INFO (e->dest)->processed)
448 continue;
449
450 if (TEST_BIT (visited, e->dest->index))
451 {
452 if (!TEST_BIT (in_pending, e->dest->index))
453 {
454 /* Send E->DEST to next round. */
455 SET_BIT (in_pending, e->dest->index);
456 fibheap_insert (pending,
457 bb_order[e->dest->index],
458 e->dest);
459 }
460 }
461 else if (!TEST_BIT (in_worklist, e->dest->index))
462 {
463 /* Add E->DEST to current round. */
464 SET_BIT (in_worklist, e->dest->index);
465 fibheap_insert (worklist, bb_order[e->dest->index],
466 e->dest);
467 }
468 }
469 }
470 }
471
472 if (!cfun->machine->rescan_vzeroupper_p)
473 break;
474 }
475
476 free (bb_order);
477 fibheap_delete (worklist);
478 fibheap_delete (pending);
479 sbitmap_free (visited);
480 sbitmap_free (in_worklist);
481 sbitmap_free (in_pending);
482
483 if (dump_file)
484 fprintf (dump_file, "Process remaining basic blocks\n");
485
486 FOR_EACH_BB (bb)
487 move_or_delete_vzeroupper_1 (bb, true);
488
489 free_aux_for_blocks ();
490 }
491
492 static rtx legitimize_dllimport_symbol (rtx, bool);
493
494 #ifndef CHECK_STACK_LIMIT
495 #define CHECK_STACK_LIMIT (-1)
496 #endif
497
498 /* Return index of given mode in mult and division cost tables. */
499 #define MODE_INDEX(mode) \
500 ((mode) == QImode ? 0 \
501 : (mode) == HImode ? 1 \
502 : (mode) == SImode ? 2 \
503 : (mode) == DImode ? 3 \
504 : 4)
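/* A hedged usage sketch: the per-mode cost arrays below (QI, HI, SI, DI,
   other) are meant to be indexed as, e.g.,

     cost->mult_init[MODE_INDEX (mode)]
     cost->divide[MODE_INDEX (mode)]

   with index 4 ("other") catching every remaining mode.  The field names
   are taken from struct processor_costs in i386.h and from uses later in
   this file; neither is part of this excerpt.  */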
505
506 /* Processor costs (relative to an add) */
507 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
508 #define COSTS_N_BYTES(N) ((N) * 2)
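/* Worked example of the scale: with COSTS_N_INSNS (N) == (N) * 4 and an
   add being 2 bytes, COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1), so a
   2-byte add is rated like one "average" instruction on either scale.  */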
509
510 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
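/* How to read the stringop descriptors below (a sketch; the struct
   stringop_algs layout lives in i386.h and is not part of this excerpt):
   the leading element is the algorithm used when the size is not known
   at compile time, followed by {max_byte_count, algorithm} pairs where a
   max of -1 covers all larger sizes.  Each cost table provides such
   descriptors first for memcpy and then for memset; DUMMY_STRINGOP_ALGS
   fills the variant slot (assumed to be the 64-bit one) that a
   32-bit-only tuning does not use.  */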
511
512 const
513 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
514 COSTS_N_BYTES (2), /* cost of an add instruction */
515 COSTS_N_BYTES (3), /* cost of a lea instruction */
516 COSTS_N_BYTES (2), /* variable shift costs */
517 COSTS_N_BYTES (3), /* constant shift costs */
518 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
519 COSTS_N_BYTES (3), /* HI */
520 COSTS_N_BYTES (3), /* SI */
521 COSTS_N_BYTES (3), /* DI */
522 COSTS_N_BYTES (5)}, /* other */
523 0, /* cost of multiply per each bit set */
524 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
525 COSTS_N_BYTES (3), /* HI */
526 COSTS_N_BYTES (3), /* SI */
527 COSTS_N_BYTES (3), /* DI */
528 COSTS_N_BYTES (5)}, /* other */
529 COSTS_N_BYTES (3), /* cost of movsx */
530 COSTS_N_BYTES (3), /* cost of movzx */
531 0, /* "large" insn */
532 2, /* MOVE_RATIO */
533 2, /* cost for loading QImode using movzbl */
534 {2, 2, 2}, /* cost of loading integer registers
535 in QImode, HImode and SImode.
536 Relative to reg-reg move (2). */
537 {2, 2, 2}, /* cost of storing integer registers */
538 2, /* cost of reg,reg fld/fst */
539 {2, 2, 2}, /* cost of loading fp registers
540 in SFmode, DFmode and XFmode */
541 {2, 2, 2}, /* cost of storing fp registers
542 in SFmode, DFmode and XFmode */
543 3, /* cost of moving MMX register */
544 {3, 3}, /* cost of loading MMX registers
545 in SImode and DImode */
546 {3, 3}, /* cost of storing MMX registers
547 in SImode and DImode */
548 3, /* cost of moving SSE register */
549 {3, 3, 3}, /* cost of loading SSE registers
550 in SImode, DImode and TImode */
551 {3, 3, 3}, /* cost of storing SSE registers
552 in SImode, DImode and TImode */
553 3, /* MMX or SSE register to integer */
554 0, /* size of l1 cache */
555 0, /* size of l2 cache */
556 0, /* size of prefetch block */
557 0, /* number of parallel prefetches */
558 2, /* Branch cost */
559 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
560 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
561 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
562 COSTS_N_BYTES (2), /* cost of FABS instruction. */
563 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
564 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
565 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
566 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
567 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
568 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
569 1, /* scalar_stmt_cost. */
570 1, /* scalar load_cost. */
571 1, /* scalar_store_cost. */
572 1, /* vec_stmt_cost. */
573 1, /* vec_to_scalar_cost. */
574 1, /* scalar_to_vec_cost. */
575 1, /* vec_align_load_cost. */
576 1, /* vec_unalign_load_cost. */
577 1, /* vec_store_cost. */
578 1, /* cond_taken_branch_cost. */
579 1, /* cond_not_taken_branch_cost. */
580 };
581
582 /* Processor costs (relative to an add) */
583 static const
584 struct processor_costs i386_cost = { /* 386 specific costs */
585 COSTS_N_INSNS (1), /* cost of an add instruction */
586 COSTS_N_INSNS (1), /* cost of a lea instruction */
587 COSTS_N_INSNS (3), /* variable shift costs */
588 COSTS_N_INSNS (2), /* constant shift costs */
589 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
590 COSTS_N_INSNS (6), /* HI */
591 COSTS_N_INSNS (6), /* SI */
592 COSTS_N_INSNS (6), /* DI */
593 COSTS_N_INSNS (6)}, /* other */
594 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
595 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
596 COSTS_N_INSNS (23), /* HI */
597 COSTS_N_INSNS (23), /* SI */
598 COSTS_N_INSNS (23), /* DI */
599 COSTS_N_INSNS (23)}, /* other */
600 COSTS_N_INSNS (3), /* cost of movsx */
601 COSTS_N_INSNS (2), /* cost of movzx */
602 15, /* "large" insn */
603 3, /* MOVE_RATIO */
604 4, /* cost for loading QImode using movzbl */
605 {2, 4, 2}, /* cost of loading integer registers
606 in QImode, HImode and SImode.
607 Relative to reg-reg move (2). */
608 {2, 4, 2}, /* cost of storing integer registers */
609 2, /* cost of reg,reg fld/fst */
610 {8, 8, 8}, /* cost of loading fp registers
611 in SFmode, DFmode and XFmode */
612 {8, 8, 8}, /* cost of storing fp registers
613 in SFmode, DFmode and XFmode */
614 2, /* cost of moving MMX register */
615 {4, 8}, /* cost of loading MMX registers
616 in SImode and DImode */
617 {4, 8}, /* cost of storing MMX registers
618 in SImode and DImode */
619 2, /* cost of moving SSE register */
620 {4, 8, 16}, /* cost of loading SSE registers
621 in SImode, DImode and TImode */
622 {4, 8, 16}, /* cost of storing SSE registers
623 in SImode, DImode and TImode */
624 3, /* MMX or SSE register to integer */
625 0, /* size of l1 cache */
626 0, /* size of l2 cache */
627 0, /* size of prefetch block */
628 0, /* number of parallel prefetches */
629 1, /* Branch cost */
630 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
631 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
632 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
633 COSTS_N_INSNS (22), /* cost of FABS instruction. */
634 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
635 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
636 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
637 DUMMY_STRINGOP_ALGS},
638 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
639 DUMMY_STRINGOP_ALGS},
640 1, /* scalar_stmt_cost. */
641 1, /* scalar load_cost. */
642 1, /* scalar_store_cost. */
643 1, /* vec_stmt_cost. */
644 1, /* vec_to_scalar_cost. */
645 1, /* scalar_to_vec_cost. */
646 1, /* vec_align_load_cost. */
647 2, /* vec_unalign_load_cost. */
648 1, /* vec_store_cost. */
649 3, /* cond_taken_branch_cost. */
650 1, /* cond_not_taken_branch_cost. */
651 };
652
653 static const
654 struct processor_costs i486_cost = { /* 486 specific costs */
655 COSTS_N_INSNS (1), /* cost of an add instruction */
656 COSTS_N_INSNS (1), /* cost of a lea instruction */
657 COSTS_N_INSNS (3), /* variable shift costs */
658 COSTS_N_INSNS (2), /* constant shift costs */
659 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
660 COSTS_N_INSNS (12), /* HI */
661 COSTS_N_INSNS (12), /* SI */
662 COSTS_N_INSNS (12), /* DI */
663 COSTS_N_INSNS (12)}, /* other */
664 1, /* cost of multiply per each bit set */
665 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
666 COSTS_N_INSNS (40), /* HI */
667 COSTS_N_INSNS (40), /* SI */
668 COSTS_N_INSNS (40), /* DI */
669 COSTS_N_INSNS (40)}, /* other */
670 COSTS_N_INSNS (3), /* cost of movsx */
671 COSTS_N_INSNS (2), /* cost of movzx */
672 15, /* "large" insn */
673 3, /* MOVE_RATIO */
674 4, /* cost for loading QImode using movzbl */
675 {2, 4, 2}, /* cost of loading integer registers
676 in QImode, HImode and SImode.
677 Relative to reg-reg move (2). */
678 {2, 4, 2}, /* cost of storing integer registers */
679 2, /* cost of reg,reg fld/fst */
680 {8, 8, 8}, /* cost of loading fp registers
681 in SFmode, DFmode and XFmode */
682 {8, 8, 8}, /* cost of storing fp registers
683 in SFmode, DFmode and XFmode */
684 2, /* cost of moving MMX register */
685 {4, 8}, /* cost of loading MMX registers
686 in SImode and DImode */
687 {4, 8}, /* cost of storing MMX registers
688 in SImode and DImode */
689 2, /* cost of moving SSE register */
690 {4, 8, 16}, /* cost of loading SSE registers
691 in SImode, DImode and TImode */
692 {4, 8, 16}, /* cost of storing SSE registers
693 in SImode, DImode and TImode */
694 3, /* MMX or SSE register to integer */
695 4, /* size of l1 cache. 486 has 8kB cache
696 shared for code and data, so 4kB is
697 not really precise. */
698 4, /* size of l2 cache */
699 0, /* size of prefetch block */
700 0, /* number of parallel prefetches */
701 1, /* Branch cost */
702 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
703 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
704 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
705 COSTS_N_INSNS (3), /* cost of FABS instruction. */
706 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
707 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
708 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
709 DUMMY_STRINGOP_ALGS},
710 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
711 DUMMY_STRINGOP_ALGS},
712 1, /* scalar_stmt_cost. */
713 1, /* scalar load_cost. */
714 1, /* scalar_store_cost. */
715 1, /* vec_stmt_cost. */
716 1, /* vec_to_scalar_cost. */
717 1, /* scalar_to_vec_cost. */
718 1, /* vec_align_load_cost. */
719 2, /* vec_unalign_load_cost. */
720 1, /* vec_store_cost. */
721 3, /* cond_taken_branch_cost. */
722 1, /* cond_not_taken_branch_cost. */
723 };
724
725 static const
726 struct processor_costs pentium_cost = {
727 COSTS_N_INSNS (1), /* cost of an add instruction */
728 COSTS_N_INSNS (1), /* cost of a lea instruction */
729 COSTS_N_INSNS (4), /* variable shift costs */
730 COSTS_N_INSNS (1), /* constant shift costs */
731 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
732 COSTS_N_INSNS (11), /* HI */
733 COSTS_N_INSNS (11), /* SI */
734 COSTS_N_INSNS (11), /* DI */
735 COSTS_N_INSNS (11)}, /* other */
736 0, /* cost of multiply per each bit set */
737 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
738 COSTS_N_INSNS (25), /* HI */
739 COSTS_N_INSNS (25), /* SI */
740 COSTS_N_INSNS (25), /* DI */
741 COSTS_N_INSNS (25)}, /* other */
742 COSTS_N_INSNS (3), /* cost of movsx */
743 COSTS_N_INSNS (2), /* cost of movzx */
744 8, /* "large" insn */
745 6, /* MOVE_RATIO */
746 6, /* cost for loading QImode using movzbl */
747 {2, 4, 2}, /* cost of loading integer registers
748 in QImode, HImode and SImode.
749 Relative to reg-reg move (2). */
750 {2, 4, 2}, /* cost of storing integer registers */
751 2, /* cost of reg,reg fld/fst */
752 {2, 2, 6}, /* cost of loading fp registers
753 in SFmode, DFmode and XFmode */
754 {4, 4, 6}, /* cost of storing fp registers
755 in SFmode, DFmode and XFmode */
756 8, /* cost of moving MMX register */
757 {8, 8}, /* cost of loading MMX registers
758 in SImode and DImode */
759 {8, 8}, /* cost of storing MMX registers
760 in SImode and DImode */
761 2, /* cost of moving SSE register */
762 {4, 8, 16}, /* cost of loading SSE registers
763 in SImode, DImode and TImode */
764 {4, 8, 16}, /* cost of storing SSE registers
765 in SImode, DImode and TImode */
766 3, /* MMX or SSE register to integer */
767 8, /* size of l1 cache. */
768 8, /* size of l2 cache */
769 0, /* size of prefetch block */
770 0, /* number of parallel prefetches */
771 2, /* Branch cost */
772 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
773 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
774 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
775 COSTS_N_INSNS (1), /* cost of FABS instruction. */
776 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
777 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
778 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
779 DUMMY_STRINGOP_ALGS},
780 {{libcall, {{-1, rep_prefix_4_byte}}},
781 DUMMY_STRINGOP_ALGS},
782 1, /* scalar_stmt_cost. */
783 1, /* scalar load_cost. */
784 1, /* scalar_store_cost. */
785 1, /* vec_stmt_cost. */
786 1, /* vec_to_scalar_cost. */
787 1, /* scalar_to_vec_cost. */
788 1, /* vec_align_load_cost. */
789 2, /* vec_unalign_load_cost. */
790 1, /* vec_store_cost. */
791 3, /* cond_taken_branch_cost. */
792 1, /* cond_not_taken_branch_cost. */
793 };
794
795 static const
796 struct processor_costs pentiumpro_cost = {
797 COSTS_N_INSNS (1), /* cost of an add instruction */
798 COSTS_N_INSNS (1), /* cost of a lea instruction */
799 COSTS_N_INSNS (1), /* variable shift costs */
800 COSTS_N_INSNS (1), /* constant shift costs */
801 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
802 COSTS_N_INSNS (4), /* HI */
803 COSTS_N_INSNS (4), /* SI */
804 COSTS_N_INSNS (4), /* DI */
805 COSTS_N_INSNS (4)}, /* other */
806 0, /* cost of multiply per each bit set */
807 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
808 COSTS_N_INSNS (17), /* HI */
809 COSTS_N_INSNS (17), /* SI */
810 COSTS_N_INSNS (17), /* DI */
811 COSTS_N_INSNS (17)}, /* other */
812 COSTS_N_INSNS (1), /* cost of movsx */
813 COSTS_N_INSNS (1), /* cost of movzx */
814 8, /* "large" insn */
815 6, /* MOVE_RATIO */
816 2, /* cost for loading QImode using movzbl */
817 {4, 4, 4}, /* cost of loading integer registers
818 in QImode, HImode and SImode.
819 Relative to reg-reg move (2). */
820 {2, 2, 2}, /* cost of storing integer registers */
821 2, /* cost of reg,reg fld/fst */
822 {2, 2, 6}, /* cost of loading fp registers
823 in SFmode, DFmode and XFmode */
824 {4, 4, 6}, /* cost of storing fp registers
825 in SFmode, DFmode and XFmode */
826 2, /* cost of moving MMX register */
827 {2, 2}, /* cost of loading MMX registers
828 in SImode and DImode */
829 {2, 2}, /* cost of storing MMX registers
830 in SImode and DImode */
831 2, /* cost of moving SSE register */
832 {2, 2, 8}, /* cost of loading SSE registers
833 in SImode, DImode and TImode */
834 {2, 2, 8}, /* cost of storing SSE registers
835 in SImode, DImode and TImode */
836 3, /* MMX or SSE register to integer */
837 8, /* size of l1 cache. */
838 256, /* size of l2 cache */
839 32, /* size of prefetch block */
840 6, /* number of parallel prefetches */
841 2, /* Branch cost */
842 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
843 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
844 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
845 COSTS_N_INSNS (2), /* cost of FABS instruction. */
846 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
847 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
848 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
849 (we ensure the alignment). For small blocks an inline loop is still a
850 noticeable win, for bigger blocks either rep movsl or rep movsb is the
851 way to go. Rep movsb apparently has a more expensive startup time in the
852 CPU, but after 4K the difference is down in the noise. */
853 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
854 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
855 DUMMY_STRINGOP_ALGS},
856 {{rep_prefix_4_byte, {{1024, unrolled_loop},
857 {8192, rep_prefix_4_byte}, {-1, libcall}}},
858 DUMMY_STRINGOP_ALGS},
859 1, /* scalar_stmt_cost. */
860 1, /* scalar load_cost. */
861 1, /* scalar_store_cost. */
862 1, /* vec_stmt_cost. */
863 1, /* vec_to_scalar_cost. */
864 1, /* scalar_to_vec_cost. */
865 1, /* vec_align_load_cost. */
866 2, /* vec_unalign_load_cost. */
867 1, /* vec_store_cost. */
868 3, /* cond_taken_branch_cost. */
869 1, /* cond_not_taken_branch_cost. */
870 };
871
872 static const
873 struct processor_costs geode_cost = {
874 COSTS_N_INSNS (1), /* cost of an add instruction */
875 COSTS_N_INSNS (1), /* cost of a lea instruction */
876 COSTS_N_INSNS (2), /* variable shift costs */
877 COSTS_N_INSNS (1), /* constant shift costs */
878 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
879 COSTS_N_INSNS (4), /* HI */
880 COSTS_N_INSNS (7), /* SI */
881 COSTS_N_INSNS (7), /* DI */
882 COSTS_N_INSNS (7)}, /* other */
883 0, /* cost of multiply per each bit set */
884 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
885 COSTS_N_INSNS (23), /* HI */
886 COSTS_N_INSNS (39), /* SI */
887 COSTS_N_INSNS (39), /* DI */
888 COSTS_N_INSNS (39)}, /* other */
889 COSTS_N_INSNS (1), /* cost of movsx */
890 COSTS_N_INSNS (1), /* cost of movzx */
891 8, /* "large" insn */
892 4, /* MOVE_RATIO */
893 1, /* cost for loading QImode using movzbl */
894 {1, 1, 1}, /* cost of loading integer registers
895 in QImode, HImode and SImode.
896 Relative to reg-reg move (2). */
897 {1, 1, 1}, /* cost of storing integer registers */
898 1, /* cost of reg,reg fld/fst */
899 {1, 1, 1}, /* cost of loading fp registers
900 in SFmode, DFmode and XFmode */
901 {4, 6, 6}, /* cost of storing fp registers
902 in SFmode, DFmode and XFmode */
903
904 1, /* cost of moving MMX register */
905 {1, 1}, /* cost of loading MMX registers
906 in SImode and DImode */
907 {1, 1}, /* cost of storing MMX registers
908 in SImode and DImode */
909 1, /* cost of moving SSE register */
910 {1, 1, 1}, /* cost of loading SSE registers
911 in SImode, DImode and TImode */
912 {1, 1, 1}, /* cost of storing SSE registers
913 in SImode, DImode and TImode */
914 1, /* MMX or SSE register to integer */
915 64, /* size of l1 cache. */
916 128, /* size of l2 cache. */
917 32, /* size of prefetch block */
918 1, /* number of parallel prefetches */
919 1, /* Branch cost */
920 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
921 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
922 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
923 COSTS_N_INSNS (1), /* cost of FABS instruction. */
924 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
925 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
926 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
927 DUMMY_STRINGOP_ALGS},
928 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
929 DUMMY_STRINGOP_ALGS},
930 1, /* scalar_stmt_cost. */
931 1, /* scalar load_cost. */
932 1, /* scalar_store_cost. */
933 1, /* vec_stmt_cost. */
934 1, /* vec_to_scalar_cost. */
935 1, /* scalar_to_vec_cost. */
936 1, /* vec_align_load_cost. */
937 2, /* vec_unalign_load_cost. */
938 1, /* vec_store_cost. */
939 3, /* cond_taken_branch_cost. */
940 1, /* cond_not_taken_branch_cost. */
941 };
942
943 static const
944 struct processor_costs k6_cost = {
945 COSTS_N_INSNS (1), /* cost of an add instruction */
946 COSTS_N_INSNS (2), /* cost of a lea instruction */
947 COSTS_N_INSNS (1), /* variable shift costs */
948 COSTS_N_INSNS (1), /* constant shift costs */
949 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
950 COSTS_N_INSNS (3), /* HI */
951 COSTS_N_INSNS (3), /* SI */
952 COSTS_N_INSNS (3), /* DI */
953 COSTS_N_INSNS (3)}, /* other */
954 0, /* cost of multiply per each bit set */
955 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
956 COSTS_N_INSNS (18), /* HI */
957 COSTS_N_INSNS (18), /* SI */
958 COSTS_N_INSNS (18), /* DI */
959 COSTS_N_INSNS (18)}, /* other */
960 COSTS_N_INSNS (2), /* cost of movsx */
961 COSTS_N_INSNS (2), /* cost of movzx */
962 8, /* "large" insn */
963 4, /* MOVE_RATIO */
964 3, /* cost for loading QImode using movzbl */
965 {4, 5, 4}, /* cost of loading integer registers
966 in QImode, HImode and SImode.
967 Relative to reg-reg move (2). */
968 {2, 3, 2}, /* cost of storing integer registers */
969 4, /* cost of reg,reg fld/fst */
970 {6, 6, 6}, /* cost of loading fp registers
971 in SFmode, DFmode and XFmode */
972 {4, 4, 4}, /* cost of storing fp registers
973 in SFmode, DFmode and XFmode */
974 2, /* cost of moving MMX register */
975 {2, 2}, /* cost of loading MMX registers
976 in SImode and DImode */
977 {2, 2}, /* cost of storing MMX registers
978 in SImode and DImode */
979 2, /* cost of moving SSE register */
980 {2, 2, 8}, /* cost of loading SSE registers
981 in SImode, DImode and TImode */
982 {2, 2, 8}, /* cost of storing SSE registers
983 in SImode, DImode and TImode */
984 6, /* MMX or SSE register to integer */
985 32, /* size of l1 cache. */
986 32, /* size of l2 cache. Some models
987 have integrated l2 cache, but
988 optimizing for k6 is not important
989 enough to worry about that. */
990 32, /* size of prefetch block */
991 1, /* number of parallel prefetches */
992 1, /* Branch cost */
993 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
994 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
995 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
996 COSTS_N_INSNS (2), /* cost of FABS instruction. */
997 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
998 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
999 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1000 DUMMY_STRINGOP_ALGS},
1001 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1002 DUMMY_STRINGOP_ALGS},
1003 1, /* scalar_stmt_cost. */
1004 1, /* scalar load_cost. */
1005 1, /* scalar_store_cost. */
1006 1, /* vec_stmt_cost. */
1007 1, /* vec_to_scalar_cost. */
1008 1, /* scalar_to_vec_cost. */
1009 1, /* vec_align_load_cost. */
1010 2, /* vec_unalign_load_cost. */
1011 1, /* vec_store_cost. */
1012 3, /* cond_taken_branch_cost. */
1013 1, /* cond_not_taken_branch_cost. */
1014 };
1015
1016 static const
1017 struct processor_costs athlon_cost = {
1018 COSTS_N_INSNS (1), /* cost of an add instruction */
1019 COSTS_N_INSNS (2), /* cost of a lea instruction */
1020 COSTS_N_INSNS (1), /* variable shift costs */
1021 COSTS_N_INSNS (1), /* constant shift costs */
1022 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1023 COSTS_N_INSNS (5), /* HI */
1024 COSTS_N_INSNS (5), /* SI */
1025 COSTS_N_INSNS (5), /* DI */
1026 COSTS_N_INSNS (5)}, /* other */
1027 0, /* cost of multiply per each bit set */
1028 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1029 COSTS_N_INSNS (26), /* HI */
1030 COSTS_N_INSNS (42), /* SI */
1031 COSTS_N_INSNS (74), /* DI */
1032 COSTS_N_INSNS (74)}, /* other */
1033 COSTS_N_INSNS (1), /* cost of movsx */
1034 COSTS_N_INSNS (1), /* cost of movzx */
1035 8, /* "large" insn */
1036 9, /* MOVE_RATIO */
1037 4, /* cost for loading QImode using movzbl */
1038 {3, 4, 3}, /* cost of loading integer registers
1039 in QImode, HImode and SImode.
1040 Relative to reg-reg move (2). */
1041 {3, 4, 3}, /* cost of storing integer registers */
1042 4, /* cost of reg,reg fld/fst */
1043 {4, 4, 12}, /* cost of loading fp registers
1044 in SFmode, DFmode and XFmode */
1045 {6, 6, 8}, /* cost of storing fp registers
1046 in SFmode, DFmode and XFmode */
1047 2, /* cost of moving MMX register */
1048 {4, 4}, /* cost of loading MMX registers
1049 in SImode and DImode */
1050 {4, 4}, /* cost of storing MMX registers
1051 in SImode and DImode */
1052 2, /* cost of moving SSE register */
1053 {4, 4, 6}, /* cost of loading SSE registers
1054 in SImode, DImode and TImode */
1055 {4, 4, 5}, /* cost of storing SSE registers
1056 in SImode, DImode and TImode */
1057 5, /* MMX or SSE register to integer */
1058 64, /* size of l1 cache. */
1059 256, /* size of l2 cache. */
1060 64, /* size of prefetch block */
1061 6, /* number of parallel prefetches */
1062 5, /* Branch cost */
1063 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1064 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1065 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1066 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1067 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1068 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1069 /* For some reason, Athlon deals better with REP prefix (relative to loops)
1070 compared to K8. Alignment becomes important after 8 bytes for memcpy and
1071 128 bytes for memset. */
1072 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1073 DUMMY_STRINGOP_ALGS},
1074 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1075 DUMMY_STRINGOP_ALGS},
1076 1, /* scalar_stmt_cost. */
1077 1, /* scalar load_cost. */
1078 1, /* scalar_store_cost. */
1079 1, /* vec_stmt_cost. */
1080 1, /* vec_to_scalar_cost. */
1081 1, /* scalar_to_vec_cost. */
1082 1, /* vec_align_load_cost. */
1083 2, /* vec_unalign_load_cost. */
1084 1, /* vec_store_cost. */
1085 3, /* cond_taken_branch_cost. */
1086 1, /* cond_not_taken_branch_cost. */
1087 };
1088
1089 static const
1090 struct processor_costs k8_cost = {
1091 COSTS_N_INSNS (1), /* cost of an add instruction */
1092 COSTS_N_INSNS (2), /* cost of a lea instruction */
1093 COSTS_N_INSNS (1), /* variable shift costs */
1094 COSTS_N_INSNS (1), /* constant shift costs */
1095 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1096 COSTS_N_INSNS (4), /* HI */
1097 COSTS_N_INSNS (3), /* SI */
1098 COSTS_N_INSNS (4), /* DI */
1099 COSTS_N_INSNS (5)}, /* other */
1100 0, /* cost of multiply per each bit set */
1101 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1102 COSTS_N_INSNS (26), /* HI */
1103 COSTS_N_INSNS (42), /* SI */
1104 COSTS_N_INSNS (74), /* DI */
1105 COSTS_N_INSNS (74)}, /* other */
1106 COSTS_N_INSNS (1), /* cost of movsx */
1107 COSTS_N_INSNS (1), /* cost of movzx */
1108 8, /* "large" insn */
1109 9, /* MOVE_RATIO */
1110 4, /* cost for loading QImode using movzbl */
1111 {3, 4, 3}, /* cost of loading integer registers
1112 in QImode, HImode and SImode.
1113 Relative to reg-reg move (2). */
1114 {3, 4, 3}, /* cost of storing integer registers */
1115 4, /* cost of reg,reg fld/fst */
1116 {4, 4, 12}, /* cost of loading fp registers
1117 in SFmode, DFmode and XFmode */
1118 {6, 6, 8}, /* cost of storing fp registers
1119 in SFmode, DFmode and XFmode */
1120 2, /* cost of moving MMX register */
1121 {3, 3}, /* cost of loading MMX registers
1122 in SImode and DImode */
1123 {4, 4}, /* cost of storing MMX registers
1124 in SImode and DImode */
1125 2, /* cost of moving SSE register */
1126 {4, 3, 6}, /* cost of loading SSE registers
1127 in SImode, DImode and TImode */
1128 {4, 4, 5}, /* cost of storing SSE registers
1129 in SImode, DImode and TImode */
1130 5, /* MMX or SSE register to integer */
1131 64, /* size of l1 cache. */
1132 512, /* size of l2 cache. */
1133 64, /* size of prefetch block */
1134 /* New AMD processors never drop prefetches; if they cannot be performed
1135 immediately, they are queued. We set number of simultaneous prefetches
1136 to a large constant to reflect this (it probably is not a good idea not
1137 to limit number of prefetches at all, as their execution also takes some
1138 time). */
1139 100, /* number of parallel prefetches */
1140 3, /* Branch cost */
1141 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1142 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1143 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1144 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1145 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1146 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1147 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
1148 small blocks it is better to use a loop. For large blocks, libcall can
1149 do nontemporal accesses and beat inline code considerably. */
1150 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1151 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1152 {{libcall, {{8, loop}, {24, unrolled_loop},
1153 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1154 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1155 4, /* scalar_stmt_cost. */
1156 2, /* scalar load_cost. */
1157 2, /* scalar_store_cost. */
1158 5, /* vec_stmt_cost. */
1159 0, /* vec_to_scalar_cost. */
1160 2, /* scalar_to_vec_cost. */
1161 2, /* vec_align_load_cost. */
1162 3, /* vec_unalign_load_cost. */
1163 3, /* vec_store_cost. */
1164 3, /* cond_taken_branch_cost. */
1165 2, /* cond_not_taken_branch_cost. */
1166 };
1167
1168 struct processor_costs amdfam10_cost = {
1169 COSTS_N_INSNS (1), /* cost of an add instruction */
1170 COSTS_N_INSNS (2), /* cost of a lea instruction */
1171 COSTS_N_INSNS (1), /* variable shift costs */
1172 COSTS_N_INSNS (1), /* constant shift costs */
1173 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1174 COSTS_N_INSNS (4), /* HI */
1175 COSTS_N_INSNS (3), /* SI */
1176 COSTS_N_INSNS (4), /* DI */
1177 COSTS_N_INSNS (5)}, /* other */
1178 0, /* cost of multiply per each bit set */
1179 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1180 COSTS_N_INSNS (35), /* HI */
1181 COSTS_N_INSNS (51), /* SI */
1182 COSTS_N_INSNS (83), /* DI */
1183 COSTS_N_INSNS (83)}, /* other */
1184 COSTS_N_INSNS (1), /* cost of movsx */
1185 COSTS_N_INSNS (1), /* cost of movzx */
1186 8, /* "large" insn */
1187 9, /* MOVE_RATIO */
1188 4, /* cost for loading QImode using movzbl */
1189 {3, 4, 3}, /* cost of loading integer registers
1190 in QImode, HImode and SImode.
1191 Relative to reg-reg move (2). */
1192 {3, 4, 3}, /* cost of storing integer registers */
1193 4, /* cost of reg,reg fld/fst */
1194 {4, 4, 12}, /* cost of loading fp registers
1195 in SFmode, DFmode and XFmode */
1196 {6, 6, 8}, /* cost of storing fp registers
1197 in SFmode, DFmode and XFmode */
1198 2, /* cost of moving MMX register */
1199 {3, 3}, /* cost of loading MMX registers
1200 in SImode and DImode */
1201 {4, 4}, /* cost of storing MMX registers
1202 in SImode and DImode */
1203 2, /* cost of moving SSE register */
1204 {4, 4, 3}, /* cost of loading SSE registers
1205 in SImode, DImode and TImode */
1206 {4, 4, 5}, /* cost of storing SSE registers
1207 in SImode, DImode and TImode */
1208 3, /* MMX or SSE register to integer */
1209 /* On K8:
1210 MOVD reg64, xmmreg Double FSTORE 4
1211 MOVD reg32, xmmreg Double FSTORE 4
1212 On AMDFAM10:
1213 MOVD reg64, xmmreg Double FADD 3
1214 1/1 1/1
1215 MOVD reg32, xmmreg Double FADD 3
1216 1/1 1/1 */
1217 64, /* size of l1 cache. */
1218 512, /* size of l2 cache. */
1219 64, /* size of prefetch block */
1220 /* New AMD processors never drop prefetches; if they cannot be performed
1221 immediately, they are queued. We set number of simultaneous prefetches
1222 to a large constant to reflect this (it probably is not a good idea not
1223 to limit number of prefetches at all, as their execution also takes some
1224 time). */
1225 100, /* number of parallel prefetches */
1226 2, /* Branch cost */
1227 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1228 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1229 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1230 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1231 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1232 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1233
1234 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
1235 very small blocks it is better to use a loop. For large blocks, libcall can
1236 do nontemporal accesses and beat inline code considerably. */
1237 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1238 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1239 {{libcall, {{8, loop}, {24, unrolled_loop},
1240 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1241 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1242 4, /* scalar_stmt_cost. */
1243 2, /* scalar load_cost. */
1244 2, /* scalar_store_cost. */
1245 6, /* vec_stmt_cost. */
1246 0, /* vec_to_scalar_cost. */
1247 2, /* scalar_to_vec_cost. */
1248 2, /* vec_align_load_cost. */
1249 2, /* vec_unalign_load_cost. */
1250 2, /* vec_store_cost. */
1251 2, /* cond_taken_branch_cost. */
1252 1, /* cond_not_taken_branch_cost. */
1253 };
1254
1255 struct processor_costs bdver1_cost = {
1256 COSTS_N_INSNS (1), /* cost of an add instruction */
1257 COSTS_N_INSNS (1), /* cost of a lea instruction */
1258 COSTS_N_INSNS (1), /* variable shift costs */
1259 COSTS_N_INSNS (1), /* constant shift costs */
1260 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1261 COSTS_N_INSNS (4), /* HI */
1262 COSTS_N_INSNS (4), /* SI */
1263 COSTS_N_INSNS (6), /* DI */
1264 COSTS_N_INSNS (6)}, /* other */
1265 0, /* cost of multiply per each bit set */
1266 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1267 COSTS_N_INSNS (35), /* HI */
1268 COSTS_N_INSNS (51), /* SI */
1269 COSTS_N_INSNS (83), /* DI */
1270 COSTS_N_INSNS (83)}, /* other */
1271 COSTS_N_INSNS (1), /* cost of movsx */
1272 COSTS_N_INSNS (1), /* cost of movzx */
1273 8, /* "large" insn */
1274 9, /* MOVE_RATIO */
1275 4, /* cost for loading QImode using movzbl */
1276 {5, 5, 4}, /* cost of loading integer registers
1277 in QImode, HImode and SImode.
1278 Relative to reg-reg move (2). */
1279 {4, 4, 4}, /* cost of storing integer registers */
1280 2, /* cost of reg,reg fld/fst */
1281 {5, 5, 12}, /* cost of loading fp registers
1282 in SFmode, DFmode and XFmode */
1283 {4, 4, 8}, /* cost of storing fp registers
1284 in SFmode, DFmode and XFmode */
1285 2, /* cost of moving MMX register */
1286 {4, 4}, /* cost of loading MMX registers
1287 in SImode and DImode */
1288 {4, 4}, /* cost of storing MMX registers
1289 in SImode and DImode */
1290 2, /* cost of moving SSE register */
1291 {4, 4, 4}, /* cost of loading SSE registers
1292 in SImode, DImode and TImode */
1293 {4, 4, 4}, /* cost of storing SSE registers
1294 in SImode, DImode and TImode */
1295 2, /* MMX or SSE register to integer */
1296 /* On K8:
1297 MOVD reg64, xmmreg Double FSTORE 4
1298 MOVD reg32, xmmreg Double FSTORE 4
1299 On AMDFAM10:
1300 MOVD reg64, xmmreg Double FADD 3
1301 1/1 1/1
1302 MOVD reg32, xmmreg Double FADD 3
1303 1/1 1/1 */
1304 16, /* size of l1 cache. */
1305 2048, /* size of l2 cache. */
1306 64, /* size of prefetch block */
1307 /* New AMD processors never drop prefetches; if they cannot be performed
1308 immediately, they are queued. We set number of simultaneous prefetches
1309 to a large constant to reflect this (it probably is not a good idea not
1310 to limit number of prefetches at all, as their execution also takes some
1311 time). */
1312 100, /* number of parallel prefetches */
1313 2, /* Branch cost */
1314 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1315 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1316 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1317 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1318 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1319 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1320
1321 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
1322 very small blocks it is better to use a loop. For large blocks, libcall
1323 can do nontemporal accesses and beat inline code considerably. */
1324 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1325 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1326 {{libcall, {{8, loop}, {24, unrolled_loop},
1327 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1328 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1329 6, /* scalar_stmt_cost. */
1330 4, /* scalar load_cost. */
1331 4, /* scalar_store_cost. */
1332 6, /* vec_stmt_cost. */
1333 0, /* vec_to_scalar_cost. */
1334 2, /* scalar_to_vec_cost. */
1335 4, /* vec_align_load_cost. */
1336 4, /* vec_unalign_load_cost. */
1337 4, /* vec_store_cost. */
1338 2, /* cond_taken_branch_cost. */
1339 1, /* cond_not_taken_branch_cost. */
1340 };
1341
1342 struct processor_costs bdver2_cost = {
1343 COSTS_N_INSNS (1), /* cost of an add instruction */
1344 COSTS_N_INSNS (1), /* cost of a lea instruction */
1345 COSTS_N_INSNS (1), /* variable shift costs */
1346 COSTS_N_INSNS (1), /* constant shift costs */
1347 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1348 COSTS_N_INSNS (4), /* HI */
1349 COSTS_N_INSNS (4), /* SI */
1350 COSTS_N_INSNS (6), /* DI */
1351 COSTS_N_INSNS (6)}, /* other */
1352 0, /* cost of multiply per each bit set */
1353 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1354 COSTS_N_INSNS (35), /* HI */
1355 COSTS_N_INSNS (51), /* SI */
1356 COSTS_N_INSNS (83), /* DI */
1357 COSTS_N_INSNS (83)}, /* other */
1358 COSTS_N_INSNS (1), /* cost of movsx */
1359 COSTS_N_INSNS (1), /* cost of movzx */
1360 8, /* "large" insn */
1361 9, /* MOVE_RATIO */
1362 4, /* cost for loading QImode using movzbl */
1363 {5, 5, 4}, /* cost of loading integer registers
1364 in QImode, HImode and SImode.
1365 Relative to reg-reg move (2). */
1366 {4, 4, 4}, /* cost of storing integer registers */
1367 2, /* cost of reg,reg fld/fst */
1368 {5, 5, 12}, /* cost of loading fp registers
1369 in SFmode, DFmode and XFmode */
1370 {4, 4, 8}, /* cost of storing fp registers
1371 in SFmode, DFmode and XFmode */
1372 2, /* cost of moving MMX register */
1373 {4, 4}, /* cost of loading MMX registers
1374 in SImode and DImode */
1375 {4, 4}, /* cost of storing MMX registers
1376 in SImode and DImode */
1377 2, /* cost of moving SSE register */
1378 {4, 4, 4}, /* cost of loading SSE registers
1379 in SImode, DImode and TImode */
1380 {4, 4, 4}, /* cost of storing SSE registers
1381 in SImode, DImode and TImode */
1382 2, /* MMX or SSE register to integer */
1383 /* On K8:
1384 MOVD reg64, xmmreg Double FSTORE 4
1385 MOVD reg32, xmmreg Double FSTORE 4
1386 On AMDFAM10:
1387 MOVD reg64, xmmreg Double FADD 3
1388 1/1 1/1
1389 MOVD reg32, xmmreg Double FADD 3
1390 1/1 1/1 */
1391 16, /* size of l1 cache. */
1392 2048, /* size of l2 cache. */
1393 64, /* size of prefetch block */
1394 /* New AMD processors never drop prefetches; if they cannot be performed
1395 immediately, they are queued. We set number of simultaneous prefetches
1396 to a large constant to reflect this (it probably is not a good idea not
1397 to limit number of prefetches at all, as their execution also takes some
1398 time). */
1399 100, /* number of parallel prefetches */
1400 2, /* Branch cost */
1401 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1402 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1403 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1404 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1405 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1406 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1407
1408 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1409 very small blocks it is better to use a loop. For large blocks, libcall
1410 can do nontemporal accesses and beat inline code considerably. */
1411 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1412 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1413 {{libcall, {{8, loop}, {24, unrolled_loop},
1414 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1415 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1416 6, /* scalar_stmt_cost. */
1417 4, /* scalar load_cost. */
1418 4, /* scalar_store_cost. */
1419 6, /* vec_stmt_cost. */
1420 0, /* vec_to_scalar_cost. */
1421 2, /* scalar_to_vec_cost. */
1422 4, /* vec_align_load_cost. */
1423 4, /* vec_unalign_load_cost. */
1424 4, /* vec_store_cost. */
1425 2, /* cond_taken_branch_cost. */
1426 1, /* cond_not_taken_branch_cost. */
1427 };
1428
1429 struct processor_costs btver1_cost = {
1430 COSTS_N_INSNS (1), /* cost of an add instruction */
1431 COSTS_N_INSNS (2), /* cost of a lea instruction */
1432 COSTS_N_INSNS (1), /* variable shift costs */
1433 COSTS_N_INSNS (1), /* constant shift costs */
1434 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1435 COSTS_N_INSNS (4), /* HI */
1436 COSTS_N_INSNS (3), /* SI */
1437 COSTS_N_INSNS (4), /* DI */
1438 COSTS_N_INSNS (5)}, /* other */
1439 0, /* cost of multiply per each bit set */
1440 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1441 COSTS_N_INSNS (35), /* HI */
1442 COSTS_N_INSNS (51), /* SI */
1443 COSTS_N_INSNS (83), /* DI */
1444 COSTS_N_INSNS (83)}, /* other */
1445 COSTS_N_INSNS (1), /* cost of movsx */
1446 COSTS_N_INSNS (1), /* cost of movzx */
1447 8, /* "large" insn */
1448 9, /* MOVE_RATIO */
1449 4, /* cost for loading QImode using movzbl */
1450 {3, 4, 3}, /* cost of loading integer registers
1451 in QImode, HImode and SImode.
1452 Relative to reg-reg move (2). */
1453 {3, 4, 3}, /* cost of storing integer registers */
1454 4, /* cost of reg,reg fld/fst */
1455 {4, 4, 12}, /* cost of loading fp registers
1456 in SFmode, DFmode and XFmode */
1457 {6, 6, 8}, /* cost of storing fp registers
1458 in SFmode, DFmode and XFmode */
1459 2, /* cost of moving MMX register */
1460 {3, 3}, /* cost of loading MMX registers
1461 in SImode and DImode */
1462 {4, 4}, /* cost of storing MMX registers
1463 in SImode and DImode */
1464 2, /* cost of moving SSE register */
1465 {4, 4, 3}, /* cost of loading SSE registers
1466 in SImode, DImode and TImode */
1467 {4, 4, 5}, /* cost of storing SSE registers
1468 in SImode, DImode and TImode */
1469 3, /* MMX or SSE register to integer */
1470 /* On K8:
1471 MOVD reg64, xmmreg Double FSTORE 4
1472 MOVD reg32, xmmreg Double FSTORE 4
1473 On AMDFAM10:
1474 MOVD reg64, xmmreg Double FADD 3
1475 1/1 1/1
1476 MOVD reg32, xmmreg Double FADD 3
1477 1/1 1/1 */
1478 32, /* size of l1 cache. */
1479 512, /* size of l2 cache. */
1480 64, /* size of prefetch block */
1481 100, /* number of parallel prefetches */
1482 2, /* Branch cost */
1483 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1484 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1485 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1486 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1487 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1488 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1489
1490 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1491 very small blocks it is better to use a loop. For large blocks, libcall can
1492 do nontemporal accesses and beat inline code considerably. */
1493 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1494 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1495 {{libcall, {{8, loop}, {24, unrolled_loop},
1496 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1497 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1498 4, /* scalar_stmt_cost. */
1499 2, /* scalar load_cost. */
1500 2, /* scalar_store_cost. */
1501 6, /* vec_stmt_cost. */
1502 0, /* vec_to_scalar_cost. */
1503 2, /* scalar_to_vec_cost. */
1504 2, /* vec_align_load_cost. */
1505 2, /* vec_unalign_load_cost. */
1506 2, /* vec_store_cost. */
1507 2, /* cond_taken_branch_cost. */
1508 1, /* cond_not_taken_branch_cost. */
1509 };
1510
1511 static const
1512 struct processor_costs pentium4_cost = {
1513 COSTS_N_INSNS (1), /* cost of an add instruction */
1514 COSTS_N_INSNS (3), /* cost of a lea instruction */
1515 COSTS_N_INSNS (4), /* variable shift costs */
1516 COSTS_N_INSNS (4), /* constant shift costs */
1517 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1518 COSTS_N_INSNS (15), /* HI */
1519 COSTS_N_INSNS (15), /* SI */
1520 COSTS_N_INSNS (15), /* DI */
1521 COSTS_N_INSNS (15)}, /* other */
1522 0, /* cost of multiply per each bit set */
1523 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1524 COSTS_N_INSNS (56), /* HI */
1525 COSTS_N_INSNS (56), /* SI */
1526 COSTS_N_INSNS (56), /* DI */
1527 COSTS_N_INSNS (56)}, /* other */
1528 COSTS_N_INSNS (1), /* cost of movsx */
1529 COSTS_N_INSNS (1), /* cost of movzx */
1530 16, /* "large" insn */
1531 6, /* MOVE_RATIO */
1532 2, /* cost for loading QImode using movzbl */
1533 {4, 5, 4}, /* cost of loading integer registers
1534 in QImode, HImode and SImode.
1535 Relative to reg-reg move (2). */
1536 {2, 3, 2}, /* cost of storing integer registers */
1537 2, /* cost of reg,reg fld/fst */
1538 {2, 2, 6}, /* cost of loading fp registers
1539 in SFmode, DFmode and XFmode */
1540 {4, 4, 6}, /* cost of storing fp registers
1541 in SFmode, DFmode and XFmode */
1542 2, /* cost of moving MMX register */
1543 {2, 2}, /* cost of loading MMX registers
1544 in SImode and DImode */
1545 {2, 2}, /* cost of storing MMX registers
1546 in SImode and DImode */
1547 12, /* cost of moving SSE register */
1548 {12, 12, 12}, /* cost of loading SSE registers
1549 in SImode, DImode and TImode */
1550 {2, 2, 8}, /* cost of storing SSE registers
1551 in SImode, DImode and TImode */
1552 10, /* MMX or SSE register to integer */
1553 8, /* size of l1 cache. */
1554 256, /* size of l2 cache. */
1555 64, /* size of prefetch block */
1556 6, /* number of parallel prefetches */
1557 2, /* Branch cost */
1558 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1559 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1560 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1561 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1562 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1563 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1564 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1565 DUMMY_STRINGOP_ALGS},
1566 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1567 {-1, libcall}}},
1568 DUMMY_STRINGOP_ALGS},
1569 1, /* scalar_stmt_cost. */
1570 1, /* scalar load_cost. */
1571 1, /* scalar_store_cost. */
1572 1, /* vec_stmt_cost. */
1573 1, /* vec_to_scalar_cost. */
1574 1, /* scalar_to_vec_cost. */
1575 1, /* vec_align_load_cost. */
1576 2, /* vec_unalign_load_cost. */
1577 1, /* vec_store_cost. */
1578 3, /* cond_taken_branch_cost. */
1579 1, /* cond_not_taken_branch_cost. */
1580 };
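 /* Reading aid (illustrative note): COSTS_N_INSNS (N) expresses the cost of
    N typical instructions in GCC's internal cost units (rtl.h defines it as
    N * 4), so the COSTS_N_INSNS (43) used for FDIV and FSQRT above costs the
    operation like roughly forty-three simple instructions.  */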
1581
1582 static const
1583 struct processor_costs nocona_cost = {
1584 COSTS_N_INSNS (1), /* cost of an add instruction */
1585 COSTS_N_INSNS (1), /* cost of a lea instruction */
1586 COSTS_N_INSNS (1), /* variable shift costs */
1587 COSTS_N_INSNS (1), /* constant shift costs */
1588 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1589 COSTS_N_INSNS (10), /* HI */
1590 COSTS_N_INSNS (10), /* SI */
1591 COSTS_N_INSNS (10), /* DI */
1592 COSTS_N_INSNS (10)}, /* other */
1593 0, /* cost of multiply per each bit set */
1594 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1595 COSTS_N_INSNS (66), /* HI */
1596 COSTS_N_INSNS (66), /* SI */
1597 COSTS_N_INSNS (66), /* DI */
1598 COSTS_N_INSNS (66)}, /* other */
1599 COSTS_N_INSNS (1), /* cost of movsx */
1600 COSTS_N_INSNS (1), /* cost of movzx */
1601 16, /* "large" insn */
1602 17, /* MOVE_RATIO */
1603 4, /* cost for loading QImode using movzbl */
1604 {4, 4, 4}, /* cost of loading integer registers
1605 in QImode, HImode and SImode.
1606 Relative to reg-reg move (2). */
1607 {4, 4, 4}, /* cost of storing integer registers */
1608 3, /* cost of reg,reg fld/fst */
1609 {12, 12, 12}, /* cost of loading fp registers
1610 in SFmode, DFmode and XFmode */
1611 {4, 4, 4}, /* cost of storing fp registers
1612 in SFmode, DFmode and XFmode */
1613 6, /* cost of moving MMX register */
1614 {12, 12}, /* cost of loading MMX registers
1615 in SImode and DImode */
1616 {12, 12}, /* cost of storing MMX registers
1617 in SImode and DImode */
1618 6, /* cost of moving SSE register */
1619 {12, 12, 12}, /* cost of loading SSE registers
1620 in SImode, DImode and TImode */
1621 {12, 12, 12}, /* cost of storing SSE registers
1622 in SImode, DImode and TImode */
1623 8, /* MMX or SSE register to integer */
1624 8, /* size of l1 cache. */
1625 1024, /* size of l2 cache. */
1626 128, /* size of prefetch block */
1627 8, /* number of parallel prefetches */
1628 1, /* Branch cost */
1629 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1630 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1631 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1632 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1633 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1634 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1635 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1636 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1637 {100000, unrolled_loop}, {-1, libcall}}}},
1638 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1639 {-1, libcall}}},
1640 {libcall, {{24, loop}, {64, unrolled_loop},
1641 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1642 1, /* scalar_stmt_cost. */
1643 1, /* scalar load_cost. */
1644 1, /* scalar_store_cost. */
1645 1, /* vec_stmt_cost. */
1646 1, /* vec_to_scalar_cost. */
1647 1, /* scalar_to_vec_cost. */
1648 1, /* vec_align_load_cost. */
1649 2, /* vec_unalign_load_cost. */
1650 1, /* vec_store_cost. */
1651 3, /* cond_taken_branch_cost. */
1652 1, /* cond_not_taken_branch_cost. */
1653 };
1654
1655 static const
1656 struct processor_costs atom_cost = {
1657 COSTS_N_INSNS (1), /* cost of an add instruction */
1658 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1659 COSTS_N_INSNS (1), /* variable shift costs */
1660 COSTS_N_INSNS (1), /* constant shift costs */
1661 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1662 COSTS_N_INSNS (4), /* HI */
1663 COSTS_N_INSNS (3), /* SI */
1664 COSTS_N_INSNS (4), /* DI */
1665 COSTS_N_INSNS (2)}, /* other */
1666 0, /* cost of multiply per each bit set */
1667 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1668 COSTS_N_INSNS (26), /* HI */
1669 COSTS_N_INSNS (42), /* SI */
1670 COSTS_N_INSNS (74), /* DI */
1671 COSTS_N_INSNS (74)}, /* other */
1672 COSTS_N_INSNS (1), /* cost of movsx */
1673 COSTS_N_INSNS (1), /* cost of movzx */
1674 8, /* "large" insn */
1675 17, /* MOVE_RATIO */
1676 4, /* cost for loading QImode using movzbl */
1677 {4, 4, 4}, /* cost of loading integer registers
1678 in QImode, HImode and SImode.
1679 Relative to reg-reg move (2). */
1680 {4, 4, 4}, /* cost of storing integer registers */
1681 4, /* cost of reg,reg fld/fst */
1682 {12, 12, 12}, /* cost of loading fp registers
1683 in SFmode, DFmode and XFmode */
1684 {6, 6, 8}, /* cost of storing fp registers
1685 in SFmode, DFmode and XFmode */
1686 2, /* cost of moving MMX register */
1687 {8, 8}, /* cost of loading MMX registers
1688 in SImode and DImode */
1689 {8, 8}, /* cost of storing MMX registers
1690 in SImode and DImode */
1691 2, /* cost of moving SSE register */
1692 {8, 8, 8}, /* cost of loading SSE registers
1693 in SImode, DImode and TImode */
1694 {8, 8, 8}, /* cost of storing SSE registers
1695 in SImode, DImode and TImode */
1696 5, /* MMX or SSE register to integer */
1697 32, /* size of l1 cache. */
1698 256, /* size of l2 cache. */
1699 64, /* size of prefetch block */
1700 6, /* number of parallel prefetches */
1701 3, /* Branch cost */
1702 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1703 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1704 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1705 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1706 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1707 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1708 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1709 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1710 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1711 {{libcall, {{8, loop}, {15, unrolled_loop},
1712 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1713 {libcall, {{24, loop}, {32, unrolled_loop},
1714 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1715 1, /* scalar_stmt_cost. */
1716 1, /* scalar load_cost. */
1717 1, /* scalar_store_cost. */
1718 1, /* vec_stmt_cost. */
1719 1, /* vec_to_scalar_cost. */
1720 1, /* scalar_to_vec_cost. */
1721 1, /* vec_align_load_cost. */
1722 2, /* vec_unalign_load_cost. */
1723 1, /* vec_store_cost. */
1724 3, /* cond_taken_branch_cost. */
1725 1, /* cond_not_taken_branch_cost. */
1726 };
1727
1728 /* Generic64 should produce code tuned for Nocona and K8. */
1729 static const
1730 struct processor_costs generic64_cost = {
1731 COSTS_N_INSNS (1), /* cost of an add instruction */
 1732 /* On all chips taken into consideration, lea is 2 cycles or more. With
 1733 this cost, however, our current implementation of synth_mult results in
 1734 the use of unnecessary temporary registers, causing regressions on several
 1735 SPECfp benchmarks. */
1736 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1737 COSTS_N_INSNS (1), /* variable shift costs */
1738 COSTS_N_INSNS (1), /* constant shift costs */
1739 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1740 COSTS_N_INSNS (4), /* HI */
1741 COSTS_N_INSNS (3), /* SI */
1742 COSTS_N_INSNS (4), /* DI */
1743 COSTS_N_INSNS (2)}, /* other */
1744 0, /* cost of multiply per each bit set */
1745 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1746 COSTS_N_INSNS (26), /* HI */
1747 COSTS_N_INSNS (42), /* SI */
1748 COSTS_N_INSNS (74), /* DI */
1749 COSTS_N_INSNS (74)}, /* other */
1750 COSTS_N_INSNS (1), /* cost of movsx */
1751 COSTS_N_INSNS (1), /* cost of movzx */
1752 8, /* "large" insn */
1753 17, /* MOVE_RATIO */
1754 4, /* cost for loading QImode using movzbl */
1755 {4, 4, 4}, /* cost of loading integer registers
1756 in QImode, HImode and SImode.
1757 Relative to reg-reg move (2). */
1758 {4, 4, 4}, /* cost of storing integer registers */
1759 4, /* cost of reg,reg fld/fst */
1760 {12, 12, 12}, /* cost of loading fp registers
1761 in SFmode, DFmode and XFmode */
1762 {6, 6, 8}, /* cost of storing fp registers
1763 in SFmode, DFmode and XFmode */
1764 2, /* cost of moving MMX register */
1765 {8, 8}, /* cost of loading MMX registers
1766 in SImode and DImode */
1767 {8, 8}, /* cost of storing MMX registers
1768 in SImode and DImode */
1769 2, /* cost of moving SSE register */
1770 {8, 8, 8}, /* cost of loading SSE registers
1771 in SImode, DImode and TImode */
1772 {8, 8, 8}, /* cost of storing SSE registers
1773 in SImode, DImode and TImode */
1774 5, /* MMX or SSE register to integer */
1775 32, /* size of l1 cache. */
1776 512, /* size of l2 cache. */
1777 64, /* size of prefetch block */
1778 6, /* number of parallel prefetches */
 1779 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
 1780 value is increased to the perhaps more appropriate value of 5. */
1781 3, /* Branch cost */
1782 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1783 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1784 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1785 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1786 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1787 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1788 {DUMMY_STRINGOP_ALGS,
1789 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1790 {DUMMY_STRINGOP_ALGS,
1791 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1792 1, /* scalar_stmt_cost. */
1793 1, /* scalar load_cost. */
1794 1, /* scalar_store_cost. */
1795 1, /* vec_stmt_cost. */
1796 1, /* vec_to_scalar_cost. */
1797 1, /* scalar_to_vec_cost. */
1798 1, /* vec_align_load_cost. */
1799 2, /* vec_unalign_load_cost. */
1800 1, /* vec_store_cost. */
1801 3, /* cond_taken_branch_cost. */
1802 1, /* cond_not_taken_branch_cost. */
1803 };
1804
1805 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1806 Athlon and K8. */
1807 static const
1808 struct processor_costs generic32_cost = {
1809 COSTS_N_INSNS (1), /* cost of an add instruction */
1810 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1811 COSTS_N_INSNS (1), /* variable shift costs */
1812 COSTS_N_INSNS (1), /* constant shift costs */
1813 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1814 COSTS_N_INSNS (4), /* HI */
1815 COSTS_N_INSNS (3), /* SI */
1816 COSTS_N_INSNS (4), /* DI */
1817 COSTS_N_INSNS (2)}, /* other */
1818 0, /* cost of multiply per each bit set */
1819 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1820 COSTS_N_INSNS (26), /* HI */
1821 COSTS_N_INSNS (42), /* SI */
1822 COSTS_N_INSNS (74), /* DI */
1823 COSTS_N_INSNS (74)}, /* other */
1824 COSTS_N_INSNS (1), /* cost of movsx */
1825 COSTS_N_INSNS (1), /* cost of movzx */
1826 8, /* "large" insn */
1827 17, /* MOVE_RATIO */
1828 4, /* cost for loading QImode using movzbl */
1829 {4, 4, 4}, /* cost of loading integer registers
1830 in QImode, HImode and SImode.
1831 Relative to reg-reg move (2). */
1832 {4, 4, 4}, /* cost of storing integer registers */
1833 4, /* cost of reg,reg fld/fst */
1834 {12, 12, 12}, /* cost of loading fp registers
1835 in SFmode, DFmode and XFmode */
1836 {6, 6, 8}, /* cost of storing fp registers
1837 in SFmode, DFmode and XFmode */
1838 2, /* cost of moving MMX register */
1839 {8, 8}, /* cost of loading MMX registers
1840 in SImode and DImode */
1841 {8, 8}, /* cost of storing MMX registers
1842 in SImode and DImode */
1843 2, /* cost of moving SSE register */
1844 {8, 8, 8}, /* cost of loading SSE registers
1845 in SImode, DImode and TImode */
1846 {8, 8, 8}, /* cost of storing SSE registers
1847 in SImode, DImode and TImode */
1848 5, /* MMX or SSE register to integer */
1849 32, /* size of l1 cache. */
1850 256, /* size of l2 cache. */
1851 64, /* size of prefetch block */
1852 6, /* number of parallel prefetches */
1853 3, /* Branch cost */
1854 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1855 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1856 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1857 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1858 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1859 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1860 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1861 DUMMY_STRINGOP_ALGS},
1862 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1863 DUMMY_STRINGOP_ALGS},
1864 1, /* scalar_stmt_cost. */
1865 1, /* scalar load_cost. */
1866 1, /* scalar_store_cost. */
1867 1, /* vec_stmt_cost. */
1868 1, /* vec_to_scalar_cost. */
1869 1, /* scalar_to_vec_cost. */
1870 1, /* vec_align_load_cost. */
1871 2, /* vec_unalign_load_cost. */
1872 1, /* vec_store_cost. */
1873 3, /* cond_taken_branch_cost. */
1874 1, /* cond_not_taken_branch_cost. */
1875 };
1876
1877 const struct processor_costs *ix86_cost = &pentium_cost;
1878
1879 /* Processor feature/optimization bitmasks. */
1880 #define m_386 (1<<PROCESSOR_I386)
1881 #define m_486 (1<<PROCESSOR_I486)
1882 #define m_PENT (1<<PROCESSOR_PENTIUM)
1883 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1884 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1885 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1886 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1887 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1888 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1889 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1890 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1891 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1892 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1893 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1894 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1895 #define m_ATOM (1<<PROCESSOR_ATOM)
1896
1897 #define m_GEODE (1<<PROCESSOR_GEODE)
1898 #define m_K6 (1<<PROCESSOR_K6)
1899 #define m_K6_GEODE (m_K6 | m_GEODE)
1900 #define m_K8 (1<<PROCESSOR_K8)
1901 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1902 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1903 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1904 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1905 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1906 #define m_BDVER (m_BDVER1 | m_BDVER2)
1907 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1908 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1)
1909
1910 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1911 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1912
 1913 /* Generic instruction choice should be a common subset of supported CPUs
1914 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1915 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1916
1917 /* Feature tests against the various tunings. */
1918 unsigned char ix86_tune_features[X86_TUNE_LAST];
1919
1920 /* Feature tests against the various tunings used to create ix86_tune_features
1921 based on the processor mask. */
1922 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
 1923 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
 1924 negatively, so enabling it for Generic64 seems like a good code size
 1925 tradeoff. We can't enable it for 32bit generic because it does not
 1926 work well with PPro based chips. */
1927 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1928
1929 /* X86_TUNE_PUSH_MEMORY */
1930 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1931
1932 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1933 m_486 | m_PENT,
1934
1935 /* X86_TUNE_UNROLL_STRLEN */
1936 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1937
 1938 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were added to the P4
 1939 based on simulation results. But after the P4 was made, no performance
 1940 benefit was observed with branch hints; they also increase code size.
 1941 As a result, icc never generates branch hints. */
1942 0,
1943
1944 /* X86_TUNE_DOUBLE_WITH_ADD */
1945 ~m_386,
1946
1947 /* X86_TUNE_USE_SAHF */
1948 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC,
1949
1950 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1951 partial dependencies. */
1952 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1953
 1954 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
 1955 register stalls in the Generic32 compilation setting as well. However,
 1956 in the current implementation the partial register stalls are not eliminated
 1957 very well - they can be introduced via subregs synthesized by combine
 1958 and can happen in caller/callee saving sequences. Because this option
 1959 pays back little on PPro based chips and conflicts with the partial reg
 1960 dependencies used by Athlon/P4 based chips, it is better to leave it off
 1961 for generic32 for now. */
1962 m_PPRO,
1963
1964 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1965 m_CORE2I7 | m_GENERIC,
1966
1967 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
 1968 on 16-bit immediate moves into memory on Core2 and Corei7. */
1969 m_CORE2I7 | m_GENERIC,
1970
1971 /* X86_TUNE_USE_HIMODE_FIOP */
1972 m_386 | m_486 | m_K6_GEODE,
1973
1974 /* X86_TUNE_USE_SIMODE_FIOP */
1975 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1976
1977 /* X86_TUNE_USE_MOV0 */
1978 m_K6,
1979
1980 /* X86_TUNE_USE_CLTD */
1981 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1982
1983 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1984 m_PENT4,
1985
1986 /* X86_TUNE_SPLIT_LONG_MOVES */
1987 m_PPRO,
1988
1989 /* X86_TUNE_READ_MODIFY_WRITE */
1990 ~m_PENT,
1991
1992 /* X86_TUNE_READ_MODIFY */
1993 ~(m_PENT | m_PPRO),
1994
1995 /* X86_TUNE_PROMOTE_QIMODE */
1996 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1997
1998 /* X86_TUNE_FAST_PREFIX */
1999 ~(m_386 | m_486 | m_PENT),
2000
2001 /* X86_TUNE_SINGLE_STRINGOP */
2002 m_386 | m_P4_NOCONA,
2003
2004 /* X86_TUNE_QIMODE_MATH */
2005 ~0,
2006
2007 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2008 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
2009 might be considered for Generic32 if our scheme for avoiding partial
 2010 stalls were more effective. */
2011 ~m_PPRO,
2012
2013 /* X86_TUNE_PROMOTE_QI_REGS */
2014 0,
2015
2016 /* X86_TUNE_PROMOTE_HI_REGS */
2017 m_PPRO,
2018
2019 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2020 over esp addition. */
2021 m_386 | m_486 | m_PENT | m_PPRO,
2022
2023 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2024 over esp addition. */
2025 m_PENT,
2026
2027 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2028 over esp subtraction. */
2029 m_386 | m_486 | m_PENT | m_K6_GEODE,
2030
2031 /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
2032 over esp subtraction. */
2033 m_PENT | m_K6_GEODE,
2034
2035 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2036 for DFmode copies */
 2037 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2038
2039 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2040 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2041
 2042 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
 2043 conflict here between PPro/Pentium4 based chips that treat 128bit
 2044 SSE registers as single units and K8 based chips that divide SSE
 2045 registers into two 64bit halves. This knob promotes all store destinations
 2046 to be 128bit to allow register renaming on 128bit SSE units, but usually
 2047 results in one extra microop on 64bit SSE units. Experimental results
 2048 show that disabling this option on P4 brings over 20% SPECfp regression,
 2049 while enabling it on K8 brings roughly 2.4% regression that can be partly
 2050 masked by careful scheduling of moves. */
2051 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2052
2053 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2054 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER1,
2055
2056 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2057 m_COREI7 | m_BDVER,
2058
2059 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
 2060 m_BDVER,
2061
2062 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2063 are resolved on SSE register parts instead of whole registers, so we may
2064 maintain just lower part of scalar values in proper format leaving the
2065 upper part undefined. */
2066 m_ATHLON_K8,
2067
2068 /* X86_TUNE_SSE_TYPELESS_STORES */
2069 m_AMD_MULTIPLE,
2070
2071 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2072 m_PPRO | m_P4_NOCONA,
2073
2074 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2075 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2076
2077 /* X86_TUNE_PROLOGUE_USING_MOVE */
2078 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2079
2080 /* X86_TUNE_EPILOGUE_USING_MOVE */
2081 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2082
2083 /* X86_TUNE_SHIFT1 */
2084 ~m_486,
2085
2086 /* X86_TUNE_USE_FFREEP */
2087 m_AMD_MULTIPLE,
2088
2089 /* X86_TUNE_INTER_UNIT_MOVES */
2090 ~(m_AMD_MULTIPLE | m_GENERIC),
2091
2092 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
 2093 ~(m_AMDFAM10 | m_BDVER),
2094
2095 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2096 than 4 branch instructions in the 16 byte window. */
2097 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2098
2099 /* X86_TUNE_SCHEDULE */
2100 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2101
2102 /* X86_TUNE_USE_BT */
2103 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2104
2105 /* X86_TUNE_USE_INCDEC */
2106 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2107
2108 /* X86_TUNE_PAD_RETURNS */
2109 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2110
 2111 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short function. */
2112 m_ATOM,
2113
2114 /* X86_TUNE_EXT_80387_CONSTANTS */
2115 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2116
2117 /* X86_TUNE_SHORTEN_X87_SSE */
2118 ~m_K8,
2119
2120 /* X86_TUNE_AVOID_VECTOR_DECODE */
2121 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2122
 2123 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2124 and SImode multiply, but 386 and 486 do HImode multiply faster. */
2125 ~(m_386 | m_486),
2126
2127 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2128 vector path on AMD machines. */
2129 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2130
2131 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2132 machines. */
2133 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2134
 2135 /* X86_TUNE_MOVE_M1_VIA_OR: On Pentiums, it is faster to load -1 via OR
2136 than a MOV. */
2137 m_PENT,
2138
2139 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2140 but one byte longer. */
2141 m_PENT,
2142
2143 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
2144 operand that cannot be represented using a modRM byte. The XOR
2145 replacement is long decoded, so this split helps here as well. */
2146 m_K6,
2147
2148 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2149 from FP to FP. */
2150 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2151
2152 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2153 from integer to FP. */
2154 m_AMDFAM10,
2155
2156 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2157 with a subsequent conditional jump instruction into a single
2158 compare-and-branch uop. */
2159 m_BDVER,
2160
2161 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2162 will impact LEA instruction selection. */
2163 m_ATOM,
2164
2165 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2166 instructions. */
2167 ~m_ATOM,
2168
 2169 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2170 at -O3. For the moment, the prefetching seems badly tuned for Intel
2171 chips. */
2172 m_K6_GEODE | m_AMD_MULTIPLE,
2173
2174 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2175 the auto-vectorizer. */
2176 m_BDVER,
2177
2178 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2179 during reassociation of integer computation. */
2180 m_ATOM,
2181
2182 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2183 during reassociation of fp computation. */
2184 m_ATOM
2185 };
2186
2187 /* Feature tests against the various architecture variations. */
2188 unsigned char ix86_arch_features[X86_ARCH_LAST];
2189
2190 /* Feature tests against the various architecture variations, used to create
2191 ix86_arch_features based on the processor mask. */
2192 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2193 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2194 ~(m_386 | m_486 | m_PENT | m_K6),
2195
2196 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2197 ~m_386,
2198
2199 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2200 ~(m_386 | m_486),
2201
2202 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2203 ~m_386,
2204
2205 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2206 ~m_386,
2207 };
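 /* Illustrative sketch only (kept out of the build): the per-processor
    masks in the two tables above are folded into the boolean feature
    arrays once -march=/-mtune= are known, roughly as below.  The real
    code doing this lives in ix86_option_override_internal; the m_* macros
    are simply (1 << PROCESSOR_*).  */
 #if 0
 static void
 sketch_fill_feature_arrays (enum processor_type arch, enum processor_type tune)
 {
   unsigned int arch_mask = 1u << arch;
   unsigned int tune_mask = 1u << tune;
   int i;

   for (i = 0; i < X86_ARCH_LAST; i++)
     ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & arch_mask);
   for (i = 0; i < X86_TUNE_LAST; i++)
     ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & tune_mask);
 }
 #endif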
2208
2209 static const unsigned int x86_accumulate_outgoing_args
2210 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2211
2212 static const unsigned int x86_arch_always_fancy_math_387
2213 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2214
2215 static const unsigned int x86_avx256_split_unaligned_load
2216 = m_COREI7 | m_GENERIC;
2217
2218 static const unsigned int x86_avx256_split_unaligned_store
2219 = m_COREI7 | m_BDVER | m_GENERIC;
2220
 2221 /* If the average insn count for a single function invocation is
2222 lower than this constant, emit fast (but longer) prologue and
2223 epilogue code. */
2224 #define FAST_PROLOGUE_INSN_COUNT 20
2225
 2226 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
2227 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2228 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2229 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2230
2231 /* Array of the smallest class containing reg number REGNO, indexed by
2232 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2233
2234 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2235 {
2236 /* ax, dx, cx, bx */
2237 AREG, DREG, CREG, BREG,
2238 /* si, di, bp, sp */
2239 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2240 /* FP registers */
2241 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2242 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2243 /* arg pointer */
2244 NON_Q_REGS,
2245 /* flags, fpsr, fpcr, frame */
2246 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2247 /* SSE registers */
2248 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2249 SSE_REGS, SSE_REGS,
2250 /* MMX registers */
2251 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2252 MMX_REGS, MMX_REGS,
2253 /* REX registers */
2254 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2255 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2256 /* SSE REX registers */
2257 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2258 SSE_REGS, SSE_REGS,
2259 };
2260
2261 /* The "default" register map used in 32bit mode. */
2262
2263 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2264 {
2265 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2266 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2267 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2268 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2269 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2270 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2271 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2272 };
2273
2274 /* The "default" register map used in 64bit mode. */
2275
2276 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2277 {
2278 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2279 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2280 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2281 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2282 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2283 8,9,10,11,12,13,14,15, /* extended integer registers */
2284 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2285 };
2286
2287 /* Define the register numbers to be used in Dwarf debugging information.
2288 The SVR4 reference port C compiler uses the following register numbers
2289 in its Dwarf output code:
2290 0 for %eax (gcc regno = 0)
2291 1 for %ecx (gcc regno = 2)
2292 2 for %edx (gcc regno = 1)
2293 3 for %ebx (gcc regno = 3)
2294 4 for %esp (gcc regno = 7)
2295 5 for %ebp (gcc regno = 6)
2296 6 for %esi (gcc regno = 4)
2297 7 for %edi (gcc regno = 5)
2298 The following three DWARF register numbers are never generated by
2299 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2300 believes these numbers have these meanings.
2301 8 for %eip (no gcc equivalent)
2302 9 for %eflags (gcc regno = 17)
2303 10 for %trapno (no gcc equivalent)
2304 It is not at all clear how we should number the FP stack registers
2305 for the x86 architecture. If the version of SDB on x86/svr4 were
2306 a bit less brain dead with respect to floating-point then we would
2307 have a precedent to follow with respect to DWARF register numbers
2308 for x86 FP registers, but the SDB on x86/svr4 is so completely
2309 broken with respect to FP registers that it is hardly worth thinking
2310 of it as something to strive for compatibility with.
2311 The version of x86/svr4 SDB I have at the moment does (partially)
2312 seem to believe that DWARF register number 11 is associated with
2313 the x86 register %st(0), but that's about all. Higher DWARF
2314 register numbers don't seem to be associated with anything in
2315 particular, and even for DWARF regno 11, SDB only seems to under-
2316 stand that it should say that a variable lives in %st(0) (when
2317 asked via an `=' command) if we said it was in DWARF regno 11,
2318 but SDB still prints garbage when asked for the value of the
2319 variable in question (via a `/' command).
2320 (Also note that the labels SDB prints for various FP stack regs
2321 when doing an `x' command are all wrong.)
2322 Note that these problems generally don't affect the native SVR4
2323 C compiler because it doesn't allow the use of -O with -g and
2324 because when it is *not* optimizing, it allocates a memory
2325 location for each floating-point variable, and the memory
2326 location is what gets described in the DWARF AT_location
2327 attribute for the variable in question.
2328 Regardless of the severe mental illness of the x86/svr4 SDB, we
2329 do something sensible here and we use the following DWARF
2330 register numbers. Note that these are all stack-top-relative
2331 numbers.
2332 11 for %st(0) (gcc regno = 8)
2333 12 for %st(1) (gcc regno = 9)
2334 13 for %st(2) (gcc regno = 10)
2335 14 for %st(3) (gcc regno = 11)
2336 15 for %st(4) (gcc regno = 12)
2337 16 for %st(5) (gcc regno = 13)
2338 17 for %st(6) (gcc regno = 14)
2339 18 for %st(7) (gcc regno = 15)
2340 */
2341 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2342 {
2343 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2344 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2345 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2346 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2347 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2348 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2349 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2350 };
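 /* Worked example (illustrative only): the maps above are indexed by GCC's
    internal register number and yield the number emitted in debug info, so
    svr4_dbx_register_map[1] is 2 -- GCC regno 1 is %edx, which the SVR4
    numbering described above assigns DWARF register 2.  Entries of -1 mark
    registers that have no debug-info encoding.  */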
2351
2352 /* Define parameter passing and return registers. */
2353
2354 static int const x86_64_int_parameter_registers[6] =
2355 {
2356 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2357 };
2358
2359 static int const x86_64_ms_abi_int_parameter_registers[4] =
2360 {
2361 CX_REG, DX_REG, R8_REG, R9_REG
2362 };
2363
2364 static int const x86_64_int_return_registers[4] =
2365 {
2366 AX_REG, DX_REG, DI_REG, SI_REG
2367 };
2368
2369 /* Define the structure for the machine field in struct function. */
2370
2371 struct GTY(()) stack_local_entry {
2372 unsigned short mode;
2373 unsigned short n;
2374 rtx rtl;
2375 struct stack_local_entry *next;
2376 };
2377
2378 /* Structure describing stack frame layout.
2379 Stack grows downward:
2380
2381 [arguments]
2382 <- ARG_POINTER
2383 saved pc
2384
2385 saved static chain if ix86_static_chain_on_stack
2386
2387 saved frame pointer if frame_pointer_needed
2388 <- HARD_FRAME_POINTER
2389 [saved regs]
2390 <- regs_save_offset
2391 [padding0]
2392
2393 [saved SSE regs]
2394 <- sse_regs_save_offset
2395 [padding1] |
2396 | <- FRAME_POINTER
2397 [va_arg registers] |
2398 |
2399 [frame] |
2400 |
2401 [padding2] | = to_allocate
2402 <- STACK_POINTER
2403 */
2404 struct ix86_frame
2405 {
2406 int nsseregs;
2407 int nregs;
2408 int va_arg_size;
2409 int red_zone_size;
2410 int outgoing_arguments_size;
2411 HOST_WIDE_INT frame;
2412
2413 /* The offsets relative to ARG_POINTER. */
2414 HOST_WIDE_INT frame_pointer_offset;
2415 HOST_WIDE_INT hard_frame_pointer_offset;
2416 HOST_WIDE_INT stack_pointer_offset;
2417 HOST_WIDE_INT hfp_save_offset;
2418 HOST_WIDE_INT reg_save_offset;
2419 HOST_WIDE_INT sse_reg_save_offset;
2420
2421 /* When save_regs_using_mov is set, emit prologue using
2422 move instead of push instructions. */
2423 bool save_regs_using_mov;
2424 };
2425
2426 /* Which cpu are we scheduling for. */
2427 enum attr_cpu ix86_schedule;
2428
2429 /* Which cpu are we optimizing for. */
2430 enum processor_type ix86_tune;
2431
2432 /* Which instruction set architecture to use. */
2433 enum processor_type ix86_arch;
2434
 2435 /* True if the SSE prefetch instruction is not a NOP. */
2436 int x86_prefetch_sse;
2437
2438 /* -mstackrealign option */
2439 static const char ix86_force_align_arg_pointer_string[]
2440 = "force_align_arg_pointer";
2441
2442 static rtx (*ix86_gen_leave) (void);
2443 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2444 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2445 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2446 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2447 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2448 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2449 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2450 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2451 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2452 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2453 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2454
2455 /* Preferred alignment for stack boundary in bits. */
2456 unsigned int ix86_preferred_stack_boundary;
2457
2458 /* Alignment for incoming stack boundary in bits specified at
2459 command line. */
2460 static unsigned int ix86_user_incoming_stack_boundary;
2461
2462 /* Default alignment for incoming stack boundary in bits. */
2463 static unsigned int ix86_default_incoming_stack_boundary;
2464
2465 /* Alignment for incoming stack boundary in bits. */
2466 unsigned int ix86_incoming_stack_boundary;
2467
 2468 /* Calling-ABI-specific va_list type nodes. */
2469 static GTY(()) tree sysv_va_list_type_node;
2470 static GTY(()) tree ms_va_list_type_node;
2471
2472 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2473 char internal_label_prefix[16];
2474 int internal_label_prefix_len;
2475
2476 /* Fence to use after loop using movnt. */
2477 tree x86_mfence;
2478
 2479 /* Register class used for passing a given 64bit part of the argument.
 2480 These represent classes as documented by the psABI, with the exception of
 2481 the SSESF and SSEDF classes, which are basically the SSE class; gcc just
 2482 uses SF or DFmode moves instead of DImode to avoid reformatting penalties.
 2483
 2484 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
 2485 whenever possible (the upper half is then just padding). */
2486 enum x86_64_reg_class
2487 {
2488 X86_64_NO_CLASS,
2489 X86_64_INTEGER_CLASS,
2490 X86_64_INTEGERSI_CLASS,
2491 X86_64_SSE_CLASS,
2492 X86_64_SSESF_CLASS,
2493 X86_64_SSEDF_CLASS,
2494 X86_64_SSEUP_CLASS,
2495 X86_64_X87_CLASS,
2496 X86_64_X87UP_CLASS,
2497 X86_64_COMPLEX_X87_CLASS,
2498 X86_64_MEMORY_CLASS
2499 };
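 /* For example (illustrative, following the psABI classification rules these
    classes model): on x86_64 a 16-byte struct such as

        struct pt { double x; double y; };

    classifies its two eightbytes as SSEDF and is passed in two SSE registers,
    whereas struct { long a; long b; } classifies as two INTEGER eightbytes and
    is passed in two general-purpose registers.  */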
2500
2501 #define MAX_CLASSES 4
2502
2503 /* Table of constants used by fldpi, fldln2, etc.... */
2504 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2505 static bool ext_80387_constants_init = 0;
2506
2507 \f
2508 static struct machine_function * ix86_init_machine_status (void);
2509 static rtx ix86_function_value (const_tree, const_tree, bool);
2510 static bool ix86_function_value_regno_p (const unsigned int);
2511 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2512 const_tree);
2513 static rtx ix86_static_chain (const_tree, bool);
2514 static int ix86_function_regparm (const_tree, const_tree);
2515 static void ix86_compute_frame_layout (struct ix86_frame *);
2516 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2517 rtx, rtx, int);
2518 static void ix86_add_new_builtins (HOST_WIDE_INT);
2519 static tree ix86_canonical_va_list_type (tree);
2520 static void predict_jump (int);
2521 static unsigned int split_stack_prologue_scratch_regno (void);
2522 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2523
2524 enum ix86_function_specific_strings
2525 {
2526 IX86_FUNCTION_SPECIFIC_ARCH,
2527 IX86_FUNCTION_SPECIFIC_TUNE,
2528 IX86_FUNCTION_SPECIFIC_MAX
2529 };
2530
2531 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2532 const char *, enum fpmath_unit, bool);
2533 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2534 static void ix86_function_specific_save (struct cl_target_option *);
2535 static void ix86_function_specific_restore (struct cl_target_option *);
2536 static void ix86_function_specific_print (FILE *, int,
2537 struct cl_target_option *);
2538 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2539 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2540 struct gcc_options *);
2541 static bool ix86_can_inline_p (tree, tree);
2542 static void ix86_set_current_function (tree);
2543 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2544
2545 static enum calling_abi ix86_function_abi (const_tree);
2546
2547 \f
2548 #ifndef SUBTARGET32_DEFAULT_CPU
2549 #define SUBTARGET32_DEFAULT_CPU "i386"
2550 #endif
2551
2552 /* The svr4 ABI for the i386 says that records and unions are returned
2553 in memory. */
2554 #ifndef DEFAULT_PCC_STRUCT_RETURN
2555 #define DEFAULT_PCC_STRUCT_RETURN 1
2556 #endif
2557
2558 /* Whether -mtune= or -march= were specified */
2559 static int ix86_tune_defaulted;
2560 static int ix86_arch_specified;
2561
2562 /* Vectorization library interface and handlers. */
2563 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2564
2565 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2566 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2567
2568 /* Processor target table, indexed by processor number */
2569 struct ptt
2570 {
2571 const struct processor_costs *cost; /* Processor costs */
2572 const int align_loop; /* Default alignments. */
2573 const int align_loop_max_skip;
2574 const int align_jump;
2575 const int align_jump_max_skip;
2576 const int align_func;
2577 };
2578
2579 static const struct ptt processor_target_table[PROCESSOR_max] =
2580 {
2581 {&i386_cost, 4, 3, 4, 3, 4},
2582 {&i486_cost, 16, 15, 16, 15, 16},
2583 {&pentium_cost, 16, 7, 16, 7, 16},
2584 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2585 {&geode_cost, 0, 0, 0, 0, 0},
2586 {&k6_cost, 32, 7, 32, 7, 32},
2587 {&athlon_cost, 16, 7, 16, 7, 16},
2588 {&pentium4_cost, 0, 0, 0, 0, 0},
2589 {&k8_cost, 16, 7, 16, 7, 16},
2590 {&nocona_cost, 0, 0, 0, 0, 0},
2591 /* Core 2 32-bit. */
2592 {&generic32_cost, 16, 10, 16, 10, 16},
2593 /* Core 2 64-bit. */
2594 {&generic64_cost, 16, 10, 16, 10, 16},
2595 /* Core i7 32-bit. */
2596 {&generic32_cost, 16, 10, 16, 10, 16},
2597 /* Core i7 64-bit. */
2598 {&generic64_cost, 16, 10, 16, 10, 16},
2599 {&generic32_cost, 16, 7, 16, 7, 16},
2600 {&generic64_cost, 16, 10, 16, 10, 16},
2601 {&amdfam10_cost, 32, 24, 32, 7, 32},
2602 {&bdver1_cost, 32, 24, 32, 7, 32},
2603 {&bdver2_cost, 32, 24, 32, 7, 32},
2604 {&btver1_cost, 32, 24, 32, 7, 32},
2605 {&atom_cost, 16, 15, 16, 7, 16}
2606 };
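 /* Illustrative sketch only (kept out of the build): how an entry of the
    table above is typically consumed once ix86_tune is known.  The real
    assignments happen in ix86_option_override_internal further below; the
    align_loops/align_jumps/align_functions globals feed the -falign-*
    defaults.  */
 #if 0
 static void
 sketch_apply_target_table (enum processor_type tune)
 {
   ix86_cost = processor_target_table[tune].cost;
   if (align_loops == 0)
     align_loops = processor_target_table[tune].align_loop;
   if (align_jumps == 0)
     align_jumps = processor_target_table[tune].align_jump;
   if (align_functions == 0)
     align_functions = processor_target_table[tune].align_func;
 }
 #endif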
2607
2608 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2609 {
2610 "generic",
2611 "i386",
2612 "i486",
2613 "pentium",
2614 "pentium-mmx",
2615 "pentiumpro",
2616 "pentium2",
2617 "pentium3",
2618 "pentium4",
2619 "pentium-m",
2620 "prescott",
2621 "nocona",
2622 "core2",
2623 "corei7",
2624 "atom",
2625 "geode",
2626 "k6",
2627 "k6-2",
2628 "k6-3",
2629 "athlon",
2630 "athlon-4",
2631 "k8",
2632 "amdfam10",
2633 "bdver1",
2634 "bdver2",
2635 "btver1"
2636 };
2637 \f
2638 /* Return true if a red-zone is in use. */
2639
2640 static inline bool
2641 ix86_using_red_zone (void)
2642 {
2643 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2644 }
2645 \f
2646 /* Return a string that documents the current -m options. The caller is
2647 responsible for freeing the string. */
2648
2649 static char *
2650 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2651 const char *tune, enum fpmath_unit fpmath,
2652 bool add_nl_p)
2653 {
2654 struct ix86_target_opts
2655 {
2656 const char *option; /* option string */
2657 HOST_WIDE_INT mask; /* isa mask options */
2658 };
2659
 2660 /* This table is ordered so that options like -msse4.2 that imply
 2661 other ISA options are matched first. */
2662 static struct ix86_target_opts isa_opts[] =
2663 {
2664 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2665 { "-mfma", OPTION_MASK_ISA_FMA },
2666 { "-mxop", OPTION_MASK_ISA_XOP },
2667 { "-mlwp", OPTION_MASK_ISA_LWP },
2668 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2669 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2670 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2671 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2672 { "-msse3", OPTION_MASK_ISA_SSE3 },
2673 { "-msse2", OPTION_MASK_ISA_SSE2 },
2674 { "-msse", OPTION_MASK_ISA_SSE },
2675 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2676 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2677 { "-mmmx", OPTION_MASK_ISA_MMX },
2678 { "-mabm", OPTION_MASK_ISA_ABM },
2679 { "-mbmi", OPTION_MASK_ISA_BMI },
2680 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2681 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2682 { "-mhle", OPTION_MASK_ISA_HLE },
2683 { "-mtbm", OPTION_MASK_ISA_TBM },
2684 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2685 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2686 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2687 { "-maes", OPTION_MASK_ISA_AES },
2688 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2689 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2690 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2691 { "-mf16c", OPTION_MASK_ISA_F16C },
2692 { "-mrtm", OPTION_MASK_ISA_RTM },
2693 };
2694
2695 /* Flag options. */
2696 static struct ix86_target_opts flag_opts[] =
2697 {
2698 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2699 { "-m80387", MASK_80387 },
2700 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2701 { "-malign-double", MASK_ALIGN_DOUBLE },
2702 { "-mcld", MASK_CLD },
2703 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2704 { "-mieee-fp", MASK_IEEE_FP },
2705 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2706 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2707 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2708 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2709 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2710 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2711 { "-mno-red-zone", MASK_NO_RED_ZONE },
2712 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2713 { "-mrecip", MASK_RECIP },
2714 { "-mrtd", MASK_RTD },
2715 { "-msseregparm", MASK_SSEREGPARM },
2716 { "-mstack-arg-probe", MASK_STACK_PROBE },
2717 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2718 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2719 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2720 { "-mvzeroupper", MASK_VZEROUPPER },
2721 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2722 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2723 { "-mprefer-avx128", MASK_PREFER_AVX128},
2724 };
2725
2726 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2727
2728 char isa_other[40];
2729 char target_other[40];
2730 unsigned num = 0;
2731 unsigned i, j;
2732 char *ret;
2733 char *ptr;
2734 size_t len;
2735 size_t line_len;
2736 size_t sep_len;
2737 const char *abi;
2738
2739 memset (opts, '\0', sizeof (opts));
2740
2741 /* Add -march= option. */
2742 if (arch)
2743 {
2744 opts[num][0] = "-march=";
2745 opts[num++][1] = arch;
2746 }
2747
2748 /* Add -mtune= option. */
2749 if (tune)
2750 {
2751 opts[num][0] = "-mtune=";
2752 opts[num++][1] = tune;
2753 }
2754
2755 /* Add -m32/-m64/-mx32. */
2756 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2757 {
2758 if ((isa & OPTION_MASK_ABI_64) != 0)
2759 abi = "-m64";
2760 else
2761 abi = "-mx32";
2762 isa &= ~ (OPTION_MASK_ISA_64BIT
2763 | OPTION_MASK_ABI_64
2764 | OPTION_MASK_ABI_X32);
2765 }
2766 else
2767 abi = "-m32";
2768 opts[num++][0] = abi;
2769
2770 /* Pick out the options in isa options. */
2771 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2772 {
2773 if ((isa & isa_opts[i].mask) != 0)
2774 {
2775 opts[num++][0] = isa_opts[i].option;
2776 isa &= ~ isa_opts[i].mask;
2777 }
2778 }
2779
2780 if (isa && add_nl_p)
2781 {
2782 opts[num++][0] = isa_other;
2783 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2784 isa);
2785 }
2786
2787 /* Add flag options. */
2788 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2789 {
2790 if ((flags & flag_opts[i].mask) != 0)
2791 {
2792 opts[num++][0] = flag_opts[i].option;
2793 flags &= ~ flag_opts[i].mask;
2794 }
2795 }
2796
2797 if (flags && add_nl_p)
2798 {
2799 opts[num++][0] = target_other;
2800 sprintf (target_other, "(other flags: %#x)", flags);
2801 }
2802
2803 /* Add -fpmath= option. */
2804 if (fpmath)
2805 {
2806 opts[num][0] = "-mfpmath=";
2807 switch ((int) fpmath)
2808 {
2809 case FPMATH_387:
2810 opts[num++][1] = "387";
2811 break;
2812
2813 case FPMATH_SSE:
2814 opts[num++][1] = "sse";
2815 break;
2816
2817 case FPMATH_387 | FPMATH_SSE:
2818 opts[num++][1] = "sse+387";
2819 break;
2820
2821 default:
2822 gcc_unreachable ();
2823 }
2824 }
2825
2826 /* Any options? */
2827 if (num == 0)
2828 return NULL;
2829
2830 gcc_assert (num < ARRAY_SIZE (opts));
2831
2832 /* Size the string. */
2833 len = 0;
2834 sep_len = (add_nl_p) ? 3 : 1;
2835 for (i = 0; i < num; i++)
2836 {
2837 len += sep_len;
2838 for (j = 0; j < 2; j++)
2839 if (opts[i][j])
2840 len += strlen (opts[i][j]);
2841 }
2842
2843 /* Build the string. */
2844 ret = ptr = (char *) xmalloc (len);
2845 line_len = 0;
2846
2847 for (i = 0; i < num; i++)
2848 {
2849 size_t len2[2];
2850
2851 for (j = 0; j < 2; j++)
2852 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2853
2854 if (i != 0)
2855 {
2856 *ptr++ = ' ';
2857 line_len++;
2858
2859 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2860 {
2861 *ptr++ = '\\';
2862 *ptr++ = '\n';
2863 line_len = 0;
2864 }
2865 }
2866
2867 for (j = 0; j < 2; j++)
2868 if (opts[i][j])
2869 {
2870 memcpy (ptr, opts[i][j], len2[j]);
2871 ptr += len2[j];
2872 line_len += len2[j];
2873 }
2874 }
2875
2876 *ptr = '\0';
2877 gcc_assert (ret + len >= ptr);
2878
2879 return ret;
2880 }
2881
 2882 /* Return true if profiling code should be emitted before the
 2883 prologue, and false otherwise.
 2884 Note: for x86 this is unfortunate when "hotfix" (hot-patch) prologues are used. */
2885 static bool
2886 ix86_profile_before_prologue (void)
2887 {
2888 return flag_fentry != 0;
2889 }
2890
2891 /* Function that is callable from the debugger to print the current
2892 options. */
2893 void
2894 ix86_debug_options (void)
2895 {
2896 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2897 ix86_arch_string, ix86_tune_string,
2898 ix86_fpmath, true);
2899
2900 if (opts)
2901 {
2902 fprintf (stderr, "%s\n\n", opts);
2903 free (opts);
2904 }
2905 else
2906 fputs ("<no options>\n\n", stderr);
2907
2908 return;
2909 }
2910 \f
2911 /* Override various settings based on options. If MAIN_ARGS_P, the
2912 options are from the command line, otherwise they are from
2913 attributes. */
2914
2915 static void
2916 ix86_option_override_internal (bool main_args_p)
2917 {
2918 int i;
2919 unsigned int ix86_arch_mask, ix86_tune_mask;
2920 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2921 const char *prefix;
2922 const char *suffix;
2923 const char *sw;
2924
2925 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2926 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2927 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2928 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2929 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2930 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2931 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2932 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2933 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2934 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2935 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2936 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2937 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2938 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2939 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2940 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2941 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2942 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2943 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2944 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2945 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2946 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2947 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2948 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2949 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2950 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2951 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2952 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2953 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2954 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2955 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2956 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2957 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2958 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
 2959 /* If this reaches 64, we need to widen the flags field of struct pta below. */
2960
2961 static struct pta
2962 {
2963 const char *const name; /* processor name or nickname. */
2964 const enum processor_type processor;
2965 const enum attr_cpu schedule;
2966 const unsigned HOST_WIDE_INT flags;
2967 }
2968 const processor_alias_table[] =
2969 {
2970 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2971 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2972 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2973 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2974 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2975 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2976 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2977 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2978 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2979 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2980 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2981 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
2982 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2983 PTA_MMX | PTA_SSE},
2984 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2985 PTA_MMX | PTA_SSE},
2986 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2987 PTA_MMX | PTA_SSE | PTA_SSE2},
2988 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
 2989 PTA_MMX | PTA_SSE | PTA_SSE2},
2990 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2991 PTA_MMX | PTA_SSE | PTA_SSE2},
2992 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2993 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2994 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2995 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2996 | PTA_CX16 | PTA_NO_SAHF},
2997 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
2998 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2999 | PTA_SSSE3 | PTA_CX16},
3000 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
3001 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3002 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16},
3003 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
3004 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3005 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3006 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
3007 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
3008 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3009 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3010 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3011 | PTA_RDRND | PTA_F16C},
3012 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
3013 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3014 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
3015 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3016 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
3017 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE},
3018 {"atom", PROCESSOR_ATOM, CPU_ATOM,
3019 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3020 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
3021 {"geode", PROCESSOR_GEODE, CPU_GEODE,
 3022 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3023 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3024 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3025 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3026 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3027 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3028 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3029 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3030 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3031 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3032 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3033 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3034 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3035 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3036 {"x86-64", PROCESSOR_K8, CPU_K8,
3037 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3038 {"k8", PROCESSOR_K8, CPU_K8,
3039 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3040 | PTA_SSE2 | PTA_NO_SAHF},
3041 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3042 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3043 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3044 {"opteron", PROCESSOR_K8, CPU_K8,
3045 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3046 | PTA_SSE2 | PTA_NO_SAHF},
3047 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3048 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3049 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3050 {"athlon64", PROCESSOR_K8, CPU_K8,
3051 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3052 | PTA_SSE2 | PTA_NO_SAHF},
3053 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3054 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3055 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3056 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3057 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3058 | PTA_SSE2 | PTA_NO_SAHF},
3059 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3060 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3061 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3062 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3063 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3064 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3065 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3066 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3067 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3068 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3069 | PTA_XOP | PTA_LWP},
3070 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3071 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3072 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3073 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3074 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3075 | PTA_FMA},
3076 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3077 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 3078 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16},
3079 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3080 PTA_HLE /* flags are only used for -march switch. */ },
3081 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3082 PTA_64BIT
3083 | PTA_HLE /* flags are only used for -march switch. */ },
3084 };
3085
3086 /* -mrecip options. */
3087 static struct
3088 {
3089 const char *string; /* option name */
3090 unsigned int mask; /* mask bits to set */
3091 }
3092 const recip_options[] =
3093 {
3094 { "all", RECIP_MASK_ALL },
3095 { "none", RECIP_MASK_NONE },
3096 { "div", RECIP_MASK_DIV },
3097 { "sqrt", RECIP_MASK_SQRT },
3098 { "vec-div", RECIP_MASK_VEC_DIV },
3099 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3100 };
3101
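/* Number of entries in processor_alias_table. */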
3102 int const pta_size = ARRAY_SIZE (processor_alias_table);
3103
3104 /* Set up prefix/suffix so the error messages refer to either the command
3105 line argument, or the attribute(target). */
3106 if (main_args_p)
3107 {
3108 prefix = "-m";
3109 suffix = "";
3110 sw = "switch";
3111 }
3112 else
3113 {
3114 prefix = "option(\"";
3115 suffix = "\")";
3116 sw = "attribute";
3117 }
3118
3119 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3120 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3121 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
3122 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3123 #ifdef TARGET_BI_ARCH
3124 else
3125 {
3126 #if TARGET_BI_ARCH == 1
3127 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3128 is on and OPTION_MASK_ABI_X32 is off. We turn off
3129 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3130 -mx32. */
3131 if (TARGET_X32)
3132 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3133 #else
3134 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3135 on and OPTION_MASK_ABI_64 is off. We turn off
3136 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3137 -m64. */
3138 if (TARGET_LP64)
3139 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3140 #endif
3141 }
3142 #endif
3143
3144 if (TARGET_X32)
3145 {
3146 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3147 OPTION_MASK_ABI_64 for TARGET_X32. */
3148 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3149 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3150 }
3151 else if (TARGET_LP64)
3152 {
3153 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3154 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3155 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3156 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3157 }
3158
3159 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3160 SUBTARGET_OVERRIDE_OPTIONS;
3161 #endif
3162
3163 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3164 SUBSUBTARGET_OVERRIDE_OPTIONS;
3165 #endif
3166
3167 /* On Darwin, -fPIC is the default for x86_64. */
3168 if (TARGET_MACHO && TARGET_64BIT)
3169 flag_pic = 2;
3170
3171 /* Need to check -mtune=generic first. */
3172 if (ix86_tune_string)
3173 {
3174 if (!strcmp (ix86_tune_string, "generic")
3175 || !strcmp (ix86_tune_string, "i686")
3176 /* As special support for cross compilers we read -mtune=native
3177 as -mtune=generic. With native compilers we won't see the
3178 -mtune=native, as it was changed by the driver. */
3179 || !strcmp (ix86_tune_string, "native"))
3180 {
3181 if (TARGET_64BIT)
3182 ix86_tune_string = "generic64";
3183 else
3184 ix86_tune_string = "generic32";
3185 }
3186 /* If this call is for setting the option attribute, allow the
3187 generic32/generic64 that was previously set. */
3188 else if (!main_args_p
3189 && (!strcmp (ix86_tune_string, "generic32")
3190 || !strcmp (ix86_tune_string, "generic64")))
3191 ;
3192 else if (!strncmp (ix86_tune_string, "generic", 7))
3193 error ("bad value (%s) for %stune=%s %s",
3194 ix86_tune_string, prefix, suffix, sw);
3195 else if (!strcmp (ix86_tune_string, "x86-64"))
3196 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3197 "%stune=k8%s or %stune=generic%s instead as appropriate",
3198 prefix, suffix, prefix, suffix, prefix, suffix);
3199 }
3200 else
3201 {
3202 if (ix86_arch_string)
3203 ix86_tune_string = ix86_arch_string;
3204 if (!ix86_tune_string)
3205 {
3206 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3207 ix86_tune_defaulted = 1;
3208 }
3209
3210 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3211 need to use a sensible tune option. */
3212 if (!strcmp (ix86_tune_string, "generic")
3213 || !strcmp (ix86_tune_string, "x86-64")
3214 || !strcmp (ix86_tune_string, "i686"))
3215 {
3216 if (TARGET_64BIT)
3217 ix86_tune_string = "generic64";
3218 else
3219 ix86_tune_string = "generic32";
3220 }
3221 }
3222
3223 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3224 {
3225 /* rep; movq isn't available in 32-bit code. */
3226 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3227 ix86_stringop_alg = no_stringop;
3228 }
3229
3230 if (!ix86_arch_string)
3231 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3232 else
3233 ix86_arch_specified = 1;
3234
3235 if (global_options_set.x_ix86_pmode)
3236 {
3237 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3238 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3239 error ("address mode %qs not supported in the %s bit mode",
3240 TARGET_64BIT ? "short" : "long",
3241 TARGET_64BIT ? "64" : "32");
3242 }
3243 else
3244 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3245
3246 if (!global_options_set.x_ix86_abi)
3247 ix86_abi = DEFAULT_ABI;
3248
3249 if (global_options_set.x_ix86_cmodel)
3250 {
3251 switch (ix86_cmodel)
3252 {
3253 case CM_SMALL:
3254 case CM_SMALL_PIC:
3255 if (flag_pic)
3256 ix86_cmodel = CM_SMALL_PIC;
3257 if (!TARGET_64BIT)
3258 error ("code model %qs not supported in the %s bit mode",
3259 "small", "32");
3260 break;
3261
3262 case CM_MEDIUM:
3263 case CM_MEDIUM_PIC:
3264 if (flag_pic)
3265 ix86_cmodel = CM_MEDIUM_PIC;
3266 if (!TARGET_64BIT)
3267 error ("code model %qs not supported in the %s bit mode",
3268 "medium", "32");
3269 else if (TARGET_X32)
3270 error ("code model %qs not supported in x32 mode",
3271 "medium");
3272 break;
3273
3274 case CM_LARGE:
3275 case CM_LARGE_PIC:
3276 if (flag_pic)
3277 ix86_cmodel = CM_LARGE_PIC;
3278 if (!TARGET_64BIT)
3279 error ("code model %qs not supported in the %s bit mode",
3280 "large", "32");
3281 else if (TARGET_X32)
3282 error ("code model %qs not supported in x32 mode",
3283 "medium");
3284 break;
3285
3286 case CM_32:
3287 if (flag_pic)
3288 error ("code model %s does not support PIC mode", "32");
3289 if (TARGET_64BIT)
3290 error ("code model %qs not supported in the %s bit mode",
3291 "32", "64");
3292 break;
3293
3294 case CM_KERNEL:
3295 if (flag_pic)
3296 {
3297 error ("code model %s does not support PIC mode", "kernel");
3298 ix86_cmodel = CM_32;
3299 }
3300 if (!TARGET_64BIT)
3301 error ("code model %qs not supported in the %s bit mode",
3302 "kernel", "32");
3303 break;
3304
3305 default:
3306 gcc_unreachable ();
3307 }
3308 }
3309 else
3310 {
3311 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3312 use of rip-relative addressing. This eliminates fixups that
3313 would otherwise be needed if this object is to be placed in a
3314 DLL, and is essentially just as efficient as direct addressing. */
3315 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3316 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3317 else if (TARGET_64BIT)
3318 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3319 else
3320 ix86_cmodel = CM_32;
3321 }
3322 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3323 {
3324 error ("-masm=intel not supported in this configuration");
3325 ix86_asm_dialect = ASM_ATT;
3326 }
3327 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3328 sorry ("%i-bit mode not compiled in",
3329 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3330
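/* Look up -march= in processor_alias_table and turn on the ISA flags
   implied by the selected architecture, unless the user set them
   explicitly. */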
3331 for (i = 0; i < pta_size; i++)
3332 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3333 {
3334 ix86_schedule = processor_alias_table[i].schedule;
3335 ix86_arch = processor_alias_table[i].processor;
3336 /* Default cpu tuning to the architecture. */
3337 ix86_tune = ix86_arch;
3338
3339 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3340 error ("CPU you selected does not support x86-64 "
3341 "instruction set");
3342
3343 if (processor_alias_table[i].flags & PTA_MMX
3344 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3345 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3346 if (processor_alias_table[i].flags & PTA_3DNOW
3347 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3348 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3349 if (processor_alias_table[i].flags & PTA_3DNOW_A
3350 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3351 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3352 if (processor_alias_table[i].flags & PTA_SSE
3353 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3354 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3355 if (processor_alias_table[i].flags & PTA_SSE2
3356 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3357 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3358 if (processor_alias_table[i].flags & PTA_SSE3
3359 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3360 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3361 if (processor_alias_table[i].flags & PTA_SSSE3
3362 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3363 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3364 if (processor_alias_table[i].flags & PTA_SSE4_1
3365 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3366 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3367 if (processor_alias_table[i].flags & PTA_SSE4_2
3368 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3369 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3370 if (processor_alias_table[i].flags & PTA_AVX
3371 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3372 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3373 if (processor_alias_table[i].flags & PTA_AVX2
3374 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3375 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3376 if (processor_alias_table[i].flags & PTA_FMA
3377 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3378 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3379 if (processor_alias_table[i].flags & PTA_SSE4A
3380 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3381 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3382 if (processor_alias_table[i].flags & PTA_FMA4
3383 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3384 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3385 if (processor_alias_table[i].flags & PTA_XOP
3386 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3387 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3388 if (processor_alias_table[i].flags & PTA_LWP
3389 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3390 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3391 if (processor_alias_table[i].flags & PTA_ABM
3392 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3393 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3394 if (processor_alias_table[i].flags & PTA_BMI
3395 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3396 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3397 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3398 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3399 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3400 if (processor_alias_table[i].flags & PTA_TBM
3401 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3402 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3403 if (processor_alias_table[i].flags & PTA_BMI2
3404 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3405 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3406 if (processor_alias_table[i].flags & PTA_CX16
3407 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3408 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3409 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3410 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3411 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3412 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3413 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3414 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3415 if (processor_alias_table[i].flags & PTA_MOVBE
3416 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3417 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3418 if (processor_alias_table[i].flags & PTA_AES
3419 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3420 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3421 if (processor_alias_table[i].flags & PTA_PCLMUL
3422 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3423 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3424 if (processor_alias_table[i].flags & PTA_FSGSBASE
3425 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3426 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3427 if (processor_alias_table[i].flags & PTA_RDRND
3428 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3429 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3430 if (processor_alias_table[i].flags & PTA_F16C
3431 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3432 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3433 if (processor_alias_table[i].flags & PTA_RTM
3434 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3435 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3436 if (processor_alias_table[i].flags & PTA_HLE
3437 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3438 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3439 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3440 x86_prefetch_sse = true;
3441
3442 break;
3443 }
3444
3445 if (!strcmp (ix86_arch_string, "generic"))
3446 error ("generic CPU can be used only for %stune=%s %s",
3447 prefix, suffix, sw);
3448 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3449 error ("bad value (%s) for %sarch=%s %s",
3450 ix86_arch_string, prefix, suffix, sw);
3451
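/* Precompute the per-architecture feature tests for the selected arch. */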
3452 ix86_arch_mask = 1u << ix86_arch;
3453 for (i = 0; i < X86_ARCH_LAST; ++i)
3454 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3455
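/* Look up -mtune= in processor_alias_table, falling back or adjusting the
   selection when it does not fit the current word size. */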
3456 for (i = 0; i < pta_size; i++)
3457 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3458 {
3459 ix86_schedule = processor_alias_table[i].schedule;
3460 ix86_tune = processor_alias_table[i].processor;
3461 if (TARGET_64BIT)
3462 {
3463 if (!(processor_alias_table[i].flags & PTA_64BIT))
3464 {
3465 if (ix86_tune_defaulted)
3466 {
3467 ix86_tune_string = "x86-64";
3468 for (i = 0; i < pta_size; i++)
3469 if (! strcmp (ix86_tune_string,
3470 processor_alias_table[i].name))
3471 break;
3472 ix86_schedule = processor_alias_table[i].schedule;
3473 ix86_tune = processor_alias_table[i].processor;
3474 }
3475 else
3476 error ("CPU you selected does not support x86-64 "
3477 "instruction set");
3478 }
3479 }
3480 else
3481 {
3482 /* Adjust tuning when compiling for 32-bit ABI. */
3483 switch (ix86_tune)
3484 {
3485 case PROCESSOR_GENERIC64:
3486 ix86_tune = PROCESSOR_GENERIC32;
3487 ix86_schedule = CPU_PENTIUMPRO;
3488 break;
3489
3490 case PROCESSOR_CORE2_64:
3491 ix86_tune = PROCESSOR_CORE2_32;
3492 break;
3493
3494 case PROCESSOR_COREI7_64:
3495 ix86_tune = PROCESSOR_COREI7_32;
3496 break;
3497
3498 default:
3499 break;
3500 }
3501 }
3502 /* Intel CPUs have always interpreted SSE prefetch instructions as
3503 NOPs; so, we can enable SSE prefetch instructions even when
3504 -mtune (rather than -march) points us to a processor that has them.
3505 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3506 higher processors. */
3507 if (TARGET_CMOV
3508 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3509 x86_prefetch_sse = true;
3510 break;
3511 }
3512
3513 if (ix86_tune_specified && i == pta_size)
3514 error ("bad value (%s) for %stune=%s %s",
3515 ix86_tune_string, prefix, suffix, sw);
3516
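/* Precompute the per-tuning feature tests for the selected tuning target. */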
3517 ix86_tune_mask = 1u << ix86_tune;
3518 for (i = 0; i < X86_TUNE_LAST; ++i)
3519 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3520
3521 #ifndef USE_IX86_FRAME_POINTER
3522 #define USE_IX86_FRAME_POINTER 0
3523 #endif
3524
3525 #ifndef USE_X86_64_FRAME_POINTER
3526 #define USE_X86_64_FRAME_POINTER 0
3527 #endif
3528
3529 /* Set the default values for switches whose default depends on TARGET_64BIT
3530 in case they weren't overwritten by command line options. */
3531 if (TARGET_64BIT)
3532 {
3533 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3534 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3535 if (flag_asynchronous_unwind_tables == 2)
3536 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3537 if (flag_pcc_struct_return == 2)
3538 flag_pcc_struct_return = 0;
3539 }
3540 else
3541 {
3542 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3543 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3544 if (flag_asynchronous_unwind_tables == 2)
3545 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3546 if (flag_pcc_struct_return == 2)
3547 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3548 }
3549
3550 if (optimize_size)
3551 ix86_cost = &ix86_size_cost;
3552 else
3553 ix86_cost = processor_target_table[ix86_tune].cost;
3554
3555 /* Arrange to set up i386_stack_locals for all functions. */
3556 init_machine_status = ix86_init_machine_status;
3557
3558 /* Validate -mregparm= value. */
3559 if (global_options_set.x_ix86_regparm)
3560 {
3561 if (TARGET_64BIT)
3562 warning (0, "-mregparm is ignored in 64-bit mode");
3563 if (ix86_regparm > REGPARM_MAX)
3564 {
3565 error ("-mregparm=%d is not between 0 and %d",
3566 ix86_regparm, REGPARM_MAX);
3567 ix86_regparm = 0;
3568 }
3569 }
3570 if (TARGET_64BIT)
3571 ix86_regparm = REGPARM_MAX;
3572
3573 /* Default align_* from the processor table. */
3574 if (align_loops == 0)
3575 {
3576 align_loops = processor_target_table[ix86_tune].align_loop;
3577 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3578 }
3579 if (align_jumps == 0)
3580 {
3581 align_jumps = processor_target_table[ix86_tune].align_jump;
3582 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3583 }
3584 if (align_functions == 0)
3585 {
3586 align_functions = processor_target_table[ix86_tune].align_func;
3587 }
3588
3589 /* Provide default for -mbranch-cost= value. */
3590 if (!global_options_set.x_ix86_branch_cost)
3591 ix86_branch_cost = ix86_cost->branch_cost;
3592
3593 if (TARGET_64BIT)
3594 {
3595 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3596
3597 /* Enable by default the SSE and MMX builtins. Do allow the user to
3598 explicitly disable any of these. In particular, disabling SSE and
3599 MMX for kernel code is extremely useful. */
3600 if (!ix86_arch_specified)
3601 ix86_isa_flags
3602 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3603 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3604
3605 if (TARGET_RTD)
3606 warning (0, "%srtd%s is ignored in 64-bit mode", prefix, suffix);
3607 }
3608 else
3609 {
3610 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3611
3612 if (!ix86_arch_specified)
3613 ix86_isa_flags
3614 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3615
3616 /* The i386 ABI does not specify a red zone. It still makes sense to use
3617 one when the programmer takes care to keep the stack from being destroyed. */
3618 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3619 target_flags |= MASK_NO_RED_ZONE;
3620 }
3621
3622 /* Keep nonleaf frame pointers. */
3623 if (flag_omit_frame_pointer)
3624 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3625 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3626 flag_omit_frame_pointer = 1;
3627
3628 /* If we're doing fast math, we don't care about comparison order
3629 wrt NaNs. This lets us use a shorter comparison sequence. */
3630 if (flag_finite_math_only)
3631 target_flags &= ~MASK_IEEE_FP;
3632
3633 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3634 since the insns won't need emulation. */
3635 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3636 target_flags &= ~MASK_NO_FANCY_MATH_387;
3637
3638 /* Likewise, if the target doesn't have a 387, or we've specified
3639 software floating point, don't use 387 inline intrinsics. */
3640 if (!TARGET_80387)
3641 target_flags |= MASK_NO_FANCY_MATH_387;
3642
3643 /* Turn on MMX builtins for -msse. */
3644 if (TARGET_SSE)
3645 {
3646 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3647 x86_prefetch_sse = true;
3648 }
3649
3650 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3651 if (TARGET_SSE4_2 || TARGET_ABM)
3652 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3653
3654 /* Turn on lzcnt instruction for -mabm. */
3655 if (TARGET_ABM)
3656 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3657
3658 /* Validate -mpreferred-stack-boundary= value or default it to
3659 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3660 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3661 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3662 {
3663 int min = (TARGET_64BIT ? 4 : 2);
3664 int max = (TARGET_SEH ? 4 : 12);
3665
3666 if (ix86_preferred_stack_boundary_arg < min
3667 || ix86_preferred_stack_boundary_arg > max)
3668 {
3669 if (min == max)
3670 error ("-mpreferred-stack-boundary is not supported "
3671 "for this target");
3672 else
3673 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3674 ix86_preferred_stack_boundary_arg, min, max);
3675 }
3676 else
3677 ix86_preferred_stack_boundary
3678 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3679 }
3680
3681 /* Set the default value for -mstackrealign. */
3682 if (ix86_force_align_arg_pointer == -1)
3683 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3684
3685 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3686
3687 /* Validate -mincoming-stack-boundary= value or default it to
3688 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3689 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3690 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3691 {
3692 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3693 || ix86_incoming_stack_boundary_arg > 12)
3694 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3695 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3696 else
3697 {
3698 ix86_user_incoming_stack_boundary
3699 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3700 ix86_incoming_stack_boundary
3701 = ix86_user_incoming_stack_boundary;
3702 }
3703 }
3704
3705 /* Accept -msseregparm only if at least SSE support is enabled. */
3706 if (TARGET_SSEREGPARM
3707 && ! TARGET_SSE)
3708 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3709
3710 if (global_options_set.x_ix86_fpmath)
3711 {
3712 if (ix86_fpmath & FPMATH_SSE)
3713 {
3714 if (!TARGET_SSE)
3715 {
3716 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3717 ix86_fpmath = FPMATH_387;
3718 }
3719 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3720 {
3721 warning (0, "387 instruction set disabled, using SSE arithmetics");
3722 ix86_fpmath = FPMATH_SSE;
3723 }
3724 }
3725 }
3726 else
3727 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3728
3729 /* If the i387 is disabled, then do not return values in it. */
3730 if (!TARGET_80387)
3731 target_flags &= ~MASK_FLOAT_RETURNS;
3732
3733 /* Use external vectorized library in vectorizing intrinsics. */
3734 if (global_options_set.x_ix86_veclibabi_type)
3735 switch (ix86_veclibabi_type)
3736 {
3737 case ix86_veclibabi_type_svml:
3738 ix86_veclib_handler = ix86_veclibabi_svml;
3739 break;
3740
3741 case ix86_veclibabi_type_acml:
3742 ix86_veclib_handler = ix86_veclibabi_acml;
3743 break;
3744
3745 default:
3746 gcc_unreachable ();
3747 }
3748
3749 if ((!USE_IX86_FRAME_POINTER
3750 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3751 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3752 && !optimize_size)
3753 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3754
3755 /* ??? Unwind info is not correct around the CFG unless either a frame
3756 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3757 unwind info generation to be aware of the CFG and propagating states
3758 around edges. */
3759 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3760 || flag_exceptions || flag_non_call_exceptions)
3761 && flag_omit_frame_pointer
3762 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3763 {
3764 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3765 warning (0, "unwind tables currently require either a frame pointer "
3766 "or %saccumulate-outgoing-args%s for correctness",
3767 prefix, suffix);
3768 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3769 }
3770
3771 /* If stack probes are required, the space used for large function
3772 arguments on the stack must also be probed, so enable
3773 -maccumulate-outgoing-args so this happens in the prologue. */
3774 if (TARGET_STACK_PROBE
3775 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3776 {
3777 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3778 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3779 "for correctness", prefix, suffix);
3780 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3781 }
3782
3783 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3784 {
3785 char *p;
3786 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3787 p = strchr (internal_label_prefix, 'X');
3788 internal_label_prefix_len = p - internal_label_prefix;
3789 *p = '\0';
3790 }
3791
3792 /* When scheduling description is not available, disable scheduler pass
3793 so it won't slow down the compilation and make x87 code slower. */
3794 if (!TARGET_SCHEDULE)
3795 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3796
3797 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3798 ix86_cost->simultaneous_prefetches,
3799 global_options.x_param_values,
3800 global_options_set.x_param_values);
3801 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ix86_cost->prefetch_block,
3802 global_options.x_param_values,
3803 global_options_set.x_param_values);
3804 maybe_set_param_value (PARAM_L1_CACHE_SIZE, ix86_cost->l1_cache_size,
3805 global_options.x_param_values,
3806 global_options_set.x_param_values);
3807 maybe_set_param_value (PARAM_L2_CACHE_SIZE, ix86_cost->l2_cache_size,
3808 global_options.x_param_values,
3809 global_options_set.x_param_values);
3810
3811 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3812 if (flag_prefetch_loop_arrays < 0
3813 && HAVE_prefetch
3814 && optimize >= 3
3815 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3816 flag_prefetch_loop_arrays = 1;
3817
3818 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3819 can be optimized to ap = __builtin_next_arg (0). */
3820 if (!TARGET_64BIT && !flag_split_stack)
3821 targetm.expand_builtin_va_start = NULL;
3822
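/* Select the leave/monitor/TLS expanders that match TARGET_64BIT and the
   pointer mode. */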
3823 if (TARGET_64BIT)
3824 {
3825 ix86_gen_leave = gen_leave_rex64;
3826 if (Pmode == DImode)
3827 {
3828 ix86_gen_monitor = gen_sse3_monitor64_di;
3829 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3830 ix86_gen_tls_local_dynamic_base_64
3831 = gen_tls_local_dynamic_base_64_di;
3832 }
3833 else
3834 {
3835 ix86_gen_monitor = gen_sse3_monitor64_si;
3836 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3837 ix86_gen_tls_local_dynamic_base_64
3838 = gen_tls_local_dynamic_base_64_si;
3839 }
3840 }
3841 else
3842 {
3843 ix86_gen_leave = gen_leave;
3844 ix86_gen_monitor = gen_sse3_monitor;
3845 }
3846
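/* Select the remaining RTL generator functions based on the pointer mode
   (DImode vs. SImode). */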
3847 if (Pmode == DImode)
3848 {
3849 ix86_gen_add3 = gen_adddi3;
3850 ix86_gen_sub3 = gen_subdi3;
3851 ix86_gen_sub3_carry = gen_subdi3_carry;
3852 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3853 ix86_gen_andsp = gen_anddi3;
3854 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3855 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3856 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3857 }
3858 else
3859 {
3860 ix86_gen_add3 = gen_addsi3;
3861 ix86_gen_sub3 = gen_subsi3;
3862 ix86_gen_sub3_carry = gen_subsi3_carry;
3863 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3864 ix86_gen_andsp = gen_andsi3;
3865 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3866 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3867 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3868 }
3869
3870 #ifdef USE_IX86_CLD
3871 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3872 if (!TARGET_64BIT)
3873 target_flags |= MASK_CLD & ~target_flags_explicit;
3874 #endif
3875
3876 if (!TARGET_64BIT && flag_pic)
3877 {
3878 if (flag_fentry > 0)
3879 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3880 "with -fpic");
3881 flag_fentry = 0;
3882 }
3883 else if (TARGET_SEH)
3884 {
3885 if (flag_fentry == 0)
3886 sorry ("-mno-fentry isn%'t compatible with SEH");
3887 flag_fentry = 1;
3888 }
3889 else if (flag_fentry < 0)
3890 {
3891 #if defined(PROFILE_BEFORE_PROLOGUE)
3892 flag_fentry = 1;
3893 #else
3894 flag_fentry = 0;
3895 #endif
3896 }
3897
3898 if (TARGET_AVX)
3899 {
3900 /* When not optimizing for size, enable the vzeroupper optimization for
3901 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3902 AVX unaligned loads/stores. */
3903 if (!optimize_size)
3904 {
3905 if (flag_expensive_optimizations
3906 && !(target_flags_explicit & MASK_VZEROUPPER))
3907 target_flags |= MASK_VZEROUPPER;
3908 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3909 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3910 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3911 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3912 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3913 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3914 /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */
3915 if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
3916 target_flags |= MASK_PREFER_AVX128;
3917 }
3918 }
3919 else
3920 {
3921 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3922 target_flags &= ~MASK_VZEROUPPER;
3923 }
3924
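/* Parse the comma-separated -mrecip= list; a leading '!' on an item clears
   the corresponding bits instead of setting them, e.g. -mrecip=all,!sqrt. */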
3925 if (ix86_recip_name)
3926 {
3927 char *p = ASTRDUP (ix86_recip_name);
3928 char *q;
3929 unsigned int mask, i;
3930 bool invert;
3931
3932 while ((q = strtok (p, ",")) != NULL)
3933 {
3934 p = NULL;
3935 if (*q == '!')
3936 {
3937 invert = true;
3938 q++;
3939 }
3940 else
3941 invert = false;
3942
3943 if (!strcmp (q, "default"))
3944 mask = RECIP_MASK_ALL;
3945 else
3946 {
3947 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3948 if (!strcmp (q, recip_options[i].string))
3949 {
3950 mask = recip_options[i].mask;
3951 break;
3952 }
3953
3954 if (i == ARRAY_SIZE (recip_options))
3955 {
3956 error ("unknown option for -mrecip=%s", q);
3957 invert = false;
3958 mask = RECIP_MASK_NONE;
3959 }
3960 }
3961
3962 recip_mask_explicit |= mask;
3963 if (invert)
3964 recip_mask &= ~mask;
3965 else
3966 recip_mask |= mask;
3967 }
3968 }
3969
3970 if (TARGET_RECIP)
3971 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3972 else if (target_flags_explicit & MASK_RECIP)
3973 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
3974
3975 /* Save the initial options in case the user does function specific
3976 options. */
3977 if (main_args_p)
3978 target_option_default_node = target_option_current_node
3979 = build_target_option_node ();
3980 }
3981
3982 /* Return TRUE if VAL is passed in a register in a 256bit AVX mode. */
3983
3984 static bool
3985 function_pass_avx256_p (const_rtx val)
3986 {
3987 if (!val)
3988 return false;
3989
3990 if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
3991 return true;
3992
3993 if (GET_CODE (val) == PARALLEL)
3994 {
3995 int i;
3996 rtx r;
3997
3998 for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
3999 {
4000 r = XVECEXP (val, 0, i);
4001 if (GET_CODE (r) == EXPR_LIST
4002 && XEXP (r, 0)
4003 && REG_P (XEXP (r, 0))
4004 && (GET_MODE (XEXP (r, 0)) == OImode
4005 || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
4006 return true;
4007 }
4008 }
4009
4010 return false;
4011 }
4012
4013 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4014
4015 static void
4016 ix86_option_override (void)
4017 {
4018 ix86_option_override_internal (true);
4019 }
4020
4021 /* Update register usage after having seen the compiler flags. */
4022
4023 static void
4024 ix86_conditional_register_usage (void)
4025 {
4026 int i;
4027 unsigned int j;
4028
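/* Entries of 2 or 3 in fixed_regs/call_used_regs encode a mode-dependent
   default (2 applies only to 32-bit mode, 3 only to 64-bit mode); resolve
   them now that TARGET_64BIT is known. */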
4029 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4030 {
4031 if (fixed_regs[i] > 1)
4032 fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2));
4033 if (call_used_regs[i] > 1)
4034 call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2));
4035 }
4036
4037 /* The PIC register, if it exists, is fixed. */
4038 j = PIC_OFFSET_TABLE_REGNUM;
4039 if (j != INVALID_REGNUM)
4040 fixed_regs[j] = call_used_regs[j] = 1;
4041
4042 /* The 64-bit MS_ABI changes the set of call-used registers. */
4043 if (TARGET_64BIT_MS_ABI)
4044 {
4045 call_used_regs[SI_REG] = 0;
4046 call_used_regs[DI_REG] = 0;
4047 call_used_regs[XMM6_REG] = 0;
4048 call_used_regs[XMM7_REG] = 0;
4049 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4050 call_used_regs[i] = 0;
4051 }
4052
4053 /* The default setting of CLOBBERED_REGS is for 32-bit; add in the
4054 other call-clobbered regs for 64-bit. */
4055 if (TARGET_64BIT)
4056 {
4057 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4058
4059 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4060 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4061 && call_used_regs[i])
4062 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4063 }
4064
4065 /* If MMX is disabled, squash the registers. */
4066 if (! TARGET_MMX)
4067 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4068 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4069 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4070
4071 /* If SSE is disabled, squash the registers. */
4072 if (! TARGET_SSE)
4073 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4074 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4075 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4076
4077 /* If the FPU is disabled, squash the registers. */
4078 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4079 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4080 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4081 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4082
4083 /* If 32-bit, squash the 64-bit registers. */
4084 if (! TARGET_64BIT)
4085 {
4086 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4087 reg_names[i] = "";
4088 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4089 reg_names[i] = "";
4090 }
4091 }
4092
4093 \f
4094 /* Save the current options */
4095
4096 static void
4097 ix86_function_specific_save (struct cl_target_option *ptr)
4098 {
4099 ptr->arch = ix86_arch;
4100 ptr->schedule = ix86_schedule;
4101 ptr->tune = ix86_tune;
4102 ptr->branch_cost = ix86_branch_cost;
4103 ptr->tune_defaulted = ix86_tune_defaulted;
4104 ptr->arch_specified = ix86_arch_specified;
4105 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4106 ptr->ix86_target_flags_explicit = target_flags_explicit;
4107 ptr->x_recip_mask_explicit = recip_mask_explicit;
4108
4109 /* The fields are char but the variables are not; make sure the
4110 values fit in the fields. */
4111 gcc_assert (ptr->arch == ix86_arch);
4112 gcc_assert (ptr->schedule == ix86_schedule);
4113 gcc_assert (ptr->tune == ix86_tune);
4114 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4115 }
4116
4117 /* Restore the current options */
4118
4119 static void
4120 ix86_function_specific_restore (struct cl_target_option *ptr)
4121 {
4122 enum processor_type old_tune = ix86_tune;
4123 enum processor_type old_arch = ix86_arch;
4124 unsigned int ix86_arch_mask, ix86_tune_mask;
4125 int i;
4126
4127 ix86_arch = (enum processor_type) ptr->arch;
4128 ix86_schedule = (enum attr_cpu) ptr->schedule;
4129 ix86_tune = (enum processor_type) ptr->tune;
4130 ix86_branch_cost = ptr->branch_cost;
4131 ix86_tune_defaulted = ptr->tune_defaulted;
4132 ix86_arch_specified = ptr->arch_specified;
4133 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4134 target_flags_explicit = ptr->ix86_target_flags_explicit;
4135 recip_mask_explicit = ptr->x_recip_mask_explicit;
4136
4137 /* Recreate the arch feature tests if the arch changed */
4138 if (old_arch != ix86_arch)
4139 {
4140 ix86_arch_mask = 1u << ix86_arch;
4141 for (i = 0; i < X86_ARCH_LAST; ++i)
4142 ix86_arch_features[i]
4143 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4144 }
4145
4146 /* Recreate the tune optimization tests */
4147 if (old_tune != ix86_tune)
4148 {
4149 ix86_tune_mask = 1u << ix86_tune;
4150 for (i = 0; i < X86_TUNE_LAST; ++i)
4151 ix86_tune_features[i]
4152 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4153 }
4154 }
4155
4156 /* Print the current options */
4157
4158 static void
4159 ix86_function_specific_print (FILE *file, int indent,
4160 struct cl_target_option *ptr)
4161 {
4162 char *target_string
4163 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4164 NULL, NULL, ptr->x_ix86_fpmath, false);
4165
4166 fprintf (file, "%*sarch = %d (%s)\n",
4167 indent, "",
4168 ptr->arch,
4169 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4170 ? cpu_names[ptr->arch]
4171 : "<unknown>"));
4172
4173 fprintf (file, "%*stune = %d (%s)\n",
4174 indent, "",
4175 ptr->tune,
4176 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4177 ? cpu_names[ptr->tune]
4178 : "<unknown>"));
4179
4180 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4181
4182 if (target_string)
4183 {
4184 fprintf (file, "%*s%s\n", indent, "", target_string);
4185 free (target_string);
4186 }
4187 }
4188
4189 \f
4190 /* Inner function to process the attribute((target(...))); take an argument
4191 and set the current options from that argument. If we are given a list,
4192 recursively go over the list. */
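/* For example, attribute((target("no-sse3,arch=k8"))) is handled one
   comma-separated item at a time by the loop below. */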
4193
4194 static bool
4195 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4196 struct gcc_options *enum_opts_set)
4197 {
4198 char *next_optstr;
4199 bool ret = true;
4200
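/* Helper macros for building the table of recognized target attributes. */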
4201 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4202 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4203 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4204 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4205 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4206
4207 enum ix86_opt_type
4208 {
4209 ix86_opt_unknown,
4210 ix86_opt_yes,
4211 ix86_opt_no,
4212 ix86_opt_str,
4213 ix86_opt_enum,
4214 ix86_opt_isa
4215 };
4216
4217 static const struct
4218 {
4219 const char *string;
4220 size_t len;
4221 enum ix86_opt_type type;
4222 int opt;
4223 int mask;
4224 } attrs[] = {
4225 /* isa options */
4226 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4227 IX86_ATTR_ISA ("abm", OPT_mabm),
4228 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4229 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4230 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4231 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4232 IX86_ATTR_ISA ("aes", OPT_maes),
4233 IX86_ATTR_ISA ("avx", OPT_mavx),
4234 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4235 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4236 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4237 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4238 IX86_ATTR_ISA ("sse", OPT_msse),
4239 IX86_ATTR_ISA ("sse2", OPT_msse2),
4240 IX86_ATTR_ISA ("sse3", OPT_msse3),
4241 IX86_ATTR_ISA ("sse4", OPT_msse4),
4242 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4243 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4244 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4245 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4246 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4247 IX86_ATTR_ISA ("fma", OPT_mfma),
4248 IX86_ATTR_ISA ("xop", OPT_mxop),
4249 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4250 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4251 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4252 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4253 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4254 IX86_ATTR_ISA ("hle", OPT_mhle),
4255
4256 /* enum options */
4257 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4258
4259 /* string options */
4260 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4261 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4262
4263 /* flag options */
4264 IX86_ATTR_YES ("cld",
4265 OPT_mcld,
4266 MASK_CLD),
4267
4268 IX86_ATTR_NO ("fancy-math-387",
4269 OPT_mfancy_math_387,
4270 MASK_NO_FANCY_MATH_387),
4271
4272 IX86_ATTR_YES ("ieee-fp",
4273 OPT_mieee_fp,
4274 MASK_IEEE_FP),
4275
4276 IX86_ATTR_YES ("inline-all-stringops",
4277 OPT_minline_all_stringops,
4278 MASK_INLINE_ALL_STRINGOPS),
4279
4280 IX86_ATTR_YES ("inline-stringops-dynamically",
4281 OPT_minline_stringops_dynamically,
4282 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4283
4284 IX86_ATTR_NO ("align-stringops",
4285 OPT_mno_align_stringops,
4286 MASK_NO_ALIGN_STRINGOPS),
4287
4288 IX86_ATTR_YES ("recip",
4289 OPT_mrecip,
4290 MASK_RECIP),
4291
4292 };
4293
4294 /* If this is a list, recurse to get the options. */
4295 if (TREE_CODE (args) == TREE_LIST)
4296 {
4297 bool ret = true;
4298
4299 for (; args; args = TREE_CHAIN (args))
4300 if (TREE_VALUE (args)
4301 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4302 p_strings, enum_opts_set))
4303 ret = false;
4304
4305 return ret;
4306 }
4307
4308 else if (TREE_CODE (args) != STRING_CST)
4309 gcc_unreachable ();
4310
4311 /* Handle multiple arguments separated by commas. */
4312 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4313
4314 while (next_optstr && *next_optstr != '\0')
4315 {
4316 char *p = next_optstr;
4317 char *orig_p = p;
4318 char *comma = strchr (next_optstr, ',');
4319 const char *opt_string;
4320 size_t len, opt_len;
4321 int opt;
4322 bool opt_set_p;
4323 char ch;
4324 unsigned i;
4325 enum ix86_opt_type type = ix86_opt_unknown;
4326 int mask = 0;
4327
4328 if (comma)
4329 {
4330 *comma = '\0';
4331 len = comma - next_optstr;
4332 next_optstr = comma + 1;
4333 }
4334 else
4335 {
4336 len = strlen (p);
4337 next_optstr = NULL;
4338 }
4339
4340 /* Recognize no-xxx. */
4341 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4342 {
4343 opt_set_p = false;
4344 p += 3;
4345 len -= 3;
4346 }
4347 else
4348 opt_set_p = true;
4349
4350 /* Find the option. */
4351 ch = *p;
4352 opt = N_OPTS;
4353 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4354 {
4355 type = attrs[i].type;
4356 opt_len = attrs[i].len;
4357 if (ch == attrs[i].string[0]
4358 && ((type != ix86_opt_str && type != ix86_opt_enum)
4359 ? len == opt_len
4360 : len > opt_len)
4361 && memcmp (p, attrs[i].string, opt_len) == 0)
4362 {
4363 opt = attrs[i].opt;
4364 mask = attrs[i].mask;
4365 opt_string = attrs[i].string;
4366 break;
4367 }
4368 }
4369
4370 /* Process the option. */
4371 if (opt == N_OPTS)
4372 {
4373 error ("attribute(target(\"%s\")) is unknown", orig_p);
4374 ret = false;
4375 }
4376
4377 else if (type == ix86_opt_isa)
4378 {
4379 struct cl_decoded_option decoded;
4380
4381 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4382 ix86_handle_option (&global_options, &global_options_set,
4383 &decoded, input_location);
4384 }
4385
4386 else if (type == ix86_opt_yes || type == ix86_opt_no)
4387 {
4388 if (type == ix86_opt_no)
4389 opt_set_p = !opt_set_p;
4390
4391 if (opt_set_p)
4392 target_flags |= mask;
4393 else
4394 target_flags &= ~mask;
4395 }
4396
4397 else if (type == ix86_opt_str)
4398 {
4399 if (p_strings[opt])
4400 {
4401 error ("option(\"%s\") was already specified", opt_string);
4402 ret = false;
4403 }
4404 else
4405 p_strings[opt] = xstrdup (p + opt_len);
4406 }
4407
4408 else if (type == ix86_opt_enum)
4409 {
4410 bool arg_ok;
4411 int value;
4412
4413 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4414 if (arg_ok)
4415 set_option (&global_options, enum_opts_set, opt, value,
4416 p + opt_len, DK_UNSPECIFIED, input_location,
4417 global_dc);
4418 else
4419 {
4420 error ("attribute(target(\"%s\")) is unknown", orig_p);
4421 ret = false;
4422 }
4423 }
4424
4425 else
4426 gcc_unreachable ();
4427 }
4428
4429 return ret;
4430 }
4431
4432 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4433
4434 tree
4435 ix86_valid_target_attribute_tree (tree args)
4436 {
4437 const char *orig_arch_string = ix86_arch_string;
4438 const char *orig_tune_string = ix86_tune_string;
4439 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4440 int orig_tune_defaulted = ix86_tune_defaulted;
4441 int orig_arch_specified = ix86_arch_specified;
4442 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4443 tree t = NULL_TREE;
4444 int i;
4445 struct cl_target_option *def
4446 = TREE_TARGET_OPTION (target_option_default_node);
4447 struct gcc_options enum_opts_set;
4448
4449 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4450
4451 /* Process each of the options on the chain. */
4452 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4453 &enum_opts_set))
4454 return NULL_TREE;
4455
4456 /* If the changed options are different from the default, rerun
4457 ix86_option_override_internal, and then save the options away.
4458 The string options are attribute options, and will be undone
4459 when we copy the save structure. */
4460 if (ix86_isa_flags != def->x_ix86_isa_flags
4461 || target_flags != def->x_target_flags
4462 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4463 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4464 || enum_opts_set.x_ix86_fpmath)
4465 {
4466 /* If we are using the default tune= or arch=, undo the string assigned,
4467 and use the default. */
4468 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4469 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4470 else if (!orig_arch_specified)
4471 ix86_arch_string = NULL;
4472
4473 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4474 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4475 else if (orig_tune_defaulted)
4476 ix86_tune_string = NULL;
4477
4478 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4479 if (enum_opts_set.x_ix86_fpmath)
4480 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4481 else if (!TARGET_64BIT && TARGET_SSE)
4482 {
4483 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4484 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4485 }
4486
4487 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4488 ix86_option_override_internal (false);
4489
4490 /* Add any builtin functions with the new isa if any. */
4491 ix86_add_new_builtins (ix86_isa_flags);
4492
4493 /* Save the current options unless we are validating options for
4494 #pragma. */
4495 t = build_target_option_node ();
4496
4497 ix86_arch_string = orig_arch_string;
4498 ix86_tune_string = orig_tune_string;
4499 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4500
4501 /* Free up memory allocated to hold the strings */
4502 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4503 free (option_strings[i]);
4504 }
4505
4506 return t;
4507 }
4508
4509 /* Hook to validate attribute((target("string"))). */
4510
4511 static bool
4512 ix86_valid_target_attribute_p (tree fndecl,
4513 tree ARG_UNUSED (name),
4514 tree args,
4515 int ARG_UNUSED (flags))
4516 {
4517 struct cl_target_option cur_target;
4518 bool ret = true;
4519 tree old_optimize = build_optimization_node ();
4520 tree new_target, new_optimize;
4521 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4522
4523 /* If the function changed the optimization levels as well as setting target
4524 options, start with the optimizations specified. */
4525 if (func_optimize && func_optimize != old_optimize)
4526 cl_optimization_restore (&global_options,
4527 TREE_OPTIMIZATION (func_optimize));
4528
4529 /* The target attributes may also change some optimization flags, so update
4530 the optimization options if necessary. */
4531 cl_target_option_save (&cur_target, &global_options);
4532 new_target = ix86_valid_target_attribute_tree (args);
4533 new_optimize = build_optimization_node ();
4534
4535 if (!new_target)
4536 ret = false;
4537
4538 else if (fndecl)
4539 {
4540 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4541
4542 if (old_optimize != new_optimize)
4543 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4544 }
4545
4546 cl_target_option_restore (&global_options, &cur_target);
4547
4548 if (old_optimize != new_optimize)
4549 cl_optimization_restore (&global_options,
4550 TREE_OPTIMIZATION (old_optimize));
4551
4552 return ret;
4553 }
4554
4555 \f
4556 /* Hook to determine if one function can safely inline another. */
4557
4558 static bool
4559 ix86_can_inline_p (tree caller, tree callee)
4560 {
4561 bool ret = false;
4562 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4563 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4564
4565 /* If callee has no option attributes, then it is ok to inline. */
4566 if (!callee_tree)
4567 ret = true;
4568
4569 /* If caller has no option attributes, but callee does, then it is not ok
4570 to inline. */
4571 else if (!caller_tree)
4572 ret = false;
4573
4574 else
4575 {
4576 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4577 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4578
4579 /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
4580 function can inline an SSE2 function but an SSE2 function can't inline
4581 an SSE4 function. */
4582 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4583 != callee_opts->x_ix86_isa_flags)
4584 ret = false;
4585
4586 /* See if we have the same non-isa options. */
4587 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4588 ret = false;
4589
4590 /* See if arch, tune, etc. are the same. */
4591 else if (caller_opts->arch != callee_opts->arch)
4592 ret = false;
4593
4594 else if (caller_opts->tune != callee_opts->tune)
4595 ret = false;
4596
4597 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4598 ret = false;
4599
4600 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4601 ret = false;
4602
4603 else
4604 ret = true;
4605 }
4606
4607 return ret;
4608 }
4609
4610 \f
4611 /* Remember the last target of ix86_set_current_function. */
4612 static GTY(()) tree ix86_previous_fndecl;
4613
4614 /* Establish appropriate back-end context for processing the function
4615 FNDECL. The argument might be NULL to indicate processing at top
4616 level, outside of any function scope. */
4617 static void
4618 ix86_set_current_function (tree fndecl)
4619 {
4620 /* Only change the context if the function changes. This hook is called
4621 several times in the course of compiling a function, and we don't want to
4622 slow things down too much or call target_reinit when it isn't safe. */
4623 if (fndecl && fndecl != ix86_previous_fndecl)
4624 {
4625 tree old_tree = (ix86_previous_fndecl
4626 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4627 : NULL_TREE);
4628
4629 tree new_tree = (fndecl
4630 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4631 : NULL_TREE);
4632
4633 ix86_previous_fndecl = fndecl;
4634 if (old_tree == new_tree)
4635 ;
4636
4637 else if (new_tree)
4638 {
4639 cl_target_option_restore (&global_options,
4640 TREE_TARGET_OPTION (new_tree));
4641 target_reinit ();
4642 }
4643
4644 else if (old_tree)
4645 {
4646 struct cl_target_option *def
4647 = TREE_TARGET_OPTION (target_option_current_node);
4648
4649 cl_target_option_restore (&global_options, def);
4650 target_reinit ();
4651 }
4652 }
4653 }
4654
4655 \f
4656 /* Return true if this goes in large data/bss. */
4657
4658 static bool
4659 ix86_in_large_data_p (tree exp)
4660 {
4661 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4662 return false;
4663
4664 /* Functions are never large data. */
4665 if (TREE_CODE (exp) == FUNCTION_DECL)
4666 return false;
4667
4668 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4669 {
4670 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4671 if (strcmp (section, ".ldata") == 0
4672 || strcmp (section, ".lbss") == 0)
4673 return true;
4674 return false;
4675 }
4676 else
4677 {
4678 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4679
4680 /* If this is an incomplete type with size 0, then we can't put it
4681 in data because it might be too big when completed. */
4682 if (!size || size > ix86_section_threshold)
4683 return true;
4684 }
4685
4686 return false;
4687 }
4688
4689 /* Switch to the appropriate section for output of DECL.
4690 DECL is either a `VAR_DECL' node or a constant of some sort.
4691 RELOC indicates whether forming the initial value of DECL requires
4692 link-time relocations. */
4693
4694 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4695 ATTRIBUTE_UNUSED;
4696
4697 static section *
4698 x86_64_elf_select_section (tree decl, int reloc,
4699 unsigned HOST_WIDE_INT align)
4700 {
4701 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4702 && ix86_in_large_data_p (decl))
4703 {
4704 const char *sname = NULL;
4705 unsigned int flags = SECTION_WRITE;
4706 switch (categorize_decl_for_section (decl, reloc))
4707 {
4708 case SECCAT_DATA:
4709 sname = ".ldata";
4710 break;
4711 case SECCAT_DATA_REL:
4712 sname = ".ldata.rel";
4713 break;
4714 case SECCAT_DATA_REL_LOCAL:
4715 sname = ".ldata.rel.local";
4716 break;
4717 case SECCAT_DATA_REL_RO:
4718 sname = ".ldata.rel.ro";
4719 break;
4720 case SECCAT_DATA_REL_RO_LOCAL:
4721 sname = ".ldata.rel.ro.local";
4722 break;
4723 case SECCAT_BSS:
4724 sname = ".lbss";
4725 flags |= SECTION_BSS;
4726 break;
4727 case SECCAT_RODATA:
4728 case SECCAT_RODATA_MERGE_STR:
4729 case SECCAT_RODATA_MERGE_STR_INIT:
4730 case SECCAT_RODATA_MERGE_CONST:
4731 sname = ".lrodata";
4732 flags = 0;
4733 break;
4734 case SECCAT_SRODATA:
4735 case SECCAT_SDATA:
4736 case SECCAT_SBSS:
4737 gcc_unreachable ();
4738 case SECCAT_TEXT:
4739 case SECCAT_TDATA:
4740 case SECCAT_TBSS:
4741 /* We don't split these for the medium model. Place them into
4742 the default sections and hope for the best. */
4743 break;
4744 }
4745 if (sname)
4746 {
4747 /* We might get called with string constants, but get_named_section
4748 doesn't like them as they are not DECLs. Also, we need to set
4749 flags in that case. */
4750 if (!DECL_P (decl))
4751 return get_section (sname, flags, NULL);
4752 return get_named_section (decl, sname, reloc);
4753 }
4754 }
4755 return default_elf_select_section (decl, reloc, align);
4756 }
4757
4758 /* Build up a unique section name, expressed as a
4759 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4760 RELOC indicates whether the initial value of EXP requires
4761 link-time relocations. */
4762
4763 static void ATTRIBUTE_UNUSED
4764 x86_64_elf_unique_section (tree decl, int reloc)
4765 {
4766 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4767 && ix86_in_large_data_p (decl))
4768 {
4769 const char *prefix = NULL;
4770 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4771 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4772
4773 switch (categorize_decl_for_section (decl, reloc))
4774 {
4775 case SECCAT_DATA:
4776 case SECCAT_DATA_REL:
4777 case SECCAT_DATA_REL_LOCAL:
4778 case SECCAT_DATA_REL_RO:
4779 case SECCAT_DATA_REL_RO_LOCAL:
4780 prefix = one_only ? ".ld" : ".ldata";
4781 break;
4782 case SECCAT_BSS:
4783 prefix = one_only ? ".lb" : ".lbss";
4784 break;
4785 case SECCAT_RODATA:
4786 case SECCAT_RODATA_MERGE_STR:
4787 case SECCAT_RODATA_MERGE_STR_INIT:
4788 case SECCAT_RODATA_MERGE_CONST:
4789 prefix = one_only ? ".lr" : ".lrodata";
4790 break;
4791 case SECCAT_SRODATA:
4792 case SECCAT_SDATA:
4793 case SECCAT_SBSS:
4794 gcc_unreachable ();
4795 case SECCAT_TEXT:
4796 case SECCAT_TDATA:
4797 case SECCAT_TBSS:
4798 /* We don't split these for the medium model. Place them into
4799 the default sections and hope for the best. */
4800 break;
4801 }
4802 if (prefix)
4803 {
4804 const char *name, *linkonce;
4805 char *string;
4806
4807 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4808 name = targetm.strip_name_encoding (name);
4809
4810 /* If we're using one_only, then there needs to be a .gnu.linkonce
4811 prefix to the section name. */
4812 linkonce = one_only ? ".gnu.linkonce" : "";
4813
4814 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4815
4816 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4817 return;
4818 }
4819 }
4820 default_unique_section (decl, reloc);
4821 }
4822
4823 #ifdef COMMON_ASM_OP
4824 /* This says how to output assembler code to declare an
4825 uninitialized external linkage data object.
4826
4827 For medium model x86-64 we need to use the .largecomm directive for
4828 large objects. */
4829 void
4830 x86_elf_aligned_common (FILE *file,
4831 const char *name, unsigned HOST_WIDE_INT size,
4832 int align)
4833 {
4834 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4835 && size > (unsigned int)ix86_section_threshold)
4836 fputs (".largecomm\t", file);
4837 else
4838 fputs (COMMON_ASM_OP, file);
4839 assemble_name (file, name);
4840 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4841 size, align / BITS_PER_UNIT);
4842 }
4843 #endif
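
/* Hedged example of the output above (hypothetical object name and
   alignment, not from the original sources): with -mcmodel=medium and a
   1048576-byte object "big_buf" above ix86_section_threshold, aligned to
   256 bits (32 bytes), the routine emits

     .largecomm	big_buf,1048576,32

   while smaller objects fall back to the usual COMMON_ASM_OP output.  */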
4844
4845 /* Utility function for targets to use in implementing
4846 ASM_OUTPUT_ALIGNED_BSS. */
4847
4848 void
4849 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4850 const char *name, unsigned HOST_WIDE_INT size,
4851 int align)
4852 {
4853 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4854 && size > (unsigned int)ix86_section_threshold)
4855 switch_to_section (get_named_section (decl, ".lbss", 0));
4856 else
4857 switch_to_section (bss_section);
4858 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4859 #ifdef ASM_DECLARE_OBJECT_NAME
4860 last_assemble_variable_decl = decl;
4861 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4862 #else
4863 /* Standard thing is just output label for the object. */
4864 ASM_OUTPUT_LABEL (file, name);
4865 #endif /* ASM_DECLARE_OBJECT_NAME */
4866 ASM_OUTPUT_SKIP (file, size ? size : 1);
4867 }
4868 \f
4869 /* Decide whether we must probe the stack before any space allocation
4870 on this target. It's essentially TARGET_STACK_PROBE except when
4871 -fstack-check causes the stack to be already probed differently. */
4872
4873 bool
4874 ix86_target_stack_probe (void)
4875 {
4876 /* Do not probe the stack twice if static stack checking is enabled. */
4877 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4878 return false;
4879
4880 return TARGET_STACK_PROBE;
4881 }
4882 \f
4883 /* Decide whether we can make a sibling call to a function. DECL is the
4884 declaration of the function being targeted by the call and EXP is the
4885 CALL_EXPR representing the call. */
4886
4887 static bool
4888 ix86_function_ok_for_sibcall (tree decl, tree exp)
4889 {
4890 tree type, decl_or_type;
4891 rtx a, b;
4892
4893 /* If we are generating position-independent code, we cannot sibcall
4894 optimize any indirect call, or a direct call to a global function,
4895 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4896 if (!TARGET_MACHO
4897 && !TARGET_64BIT
4898 && flag_pic
4899 && (!decl || !targetm.binds_local_p (decl)))
4900 return false;
4901
4902 /* If we need to align the outgoing stack, then sibcalling would
4903 unalign the stack, which may break the called function. */
4904 if (ix86_minimum_incoming_stack_boundary (true)
4905 < PREFERRED_STACK_BOUNDARY)
4906 return false;
4907
4908 if (decl)
4909 {
4910 decl_or_type = decl;
4911 type = TREE_TYPE (decl);
4912 }
4913 else
4914 {
4915 /* We're looking at the CALL_EXPR; we need the type of the function. */
4916 type = CALL_EXPR_FN (exp); /* pointer expression */
4917 type = TREE_TYPE (type); /* pointer type */
4918 type = TREE_TYPE (type); /* function type */
4919 decl_or_type = type;
4920 }
4921
4922 /* Check that the return value locations are the same.  For example,
4923 if we are returning floats on the 80387 register stack, we cannot
4924 make a sibcall from a function that doesn't return a float to a
4925 function that does or, conversely, from a function that does return
4926 a float to a function that doesn't; the necessary stack adjustment
4927 would not be executed. This is also the place we notice
4928 differences in the return value ABI. Note that it is ok for one
4929 of the functions to have void return type as long as the return
4930 value of the other is passed in a register. */
4931 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4932 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4933 cfun->decl, false);
4934 if (STACK_REG_P (a) || STACK_REG_P (b))
4935 {
4936 if (!rtx_equal_p (a, b))
4937 return false;
4938 }
4939 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4940 {
4941 /* Disable sibcall if we need to generate vzeroupper after
4942 callee returns. */
4943 if (TARGET_VZEROUPPER
4944 && cfun->machine->callee_return_avx256_p
4945 && !cfun->machine->caller_return_avx256_p)
4946 return false;
4947 }
4948 else if (!rtx_equal_p (a, b))
4949 return false;
4950
4951 if (TARGET_64BIT)
4952 {
4953 /* The SYSV ABI has more call-clobbered registers;
4954 disallow sibcalls from MS to SYSV. */
4955 if (cfun->machine->call_abi == MS_ABI
4956 && ix86_function_type_abi (type) == SYSV_ABI)
4957 return false;
4958 }
4959 else
4960 {
4961 /* If this call is indirect, we'll need to be able to use a
4962 call-clobbered register for the address of the target function.
4963 Make sure that all such registers are not used for passing
4964 parameters. Note that DLLIMPORT functions are indirect. */
4965 if (!decl
4966 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4967 {
4968 if (ix86_function_regparm (type, NULL) >= 3)
4969 {
4970 /* ??? Need to count the actual number of registers to be used,
4971 not the possible number of registers. Fix later. */
4972 return false;
4973 }
4974 }
4975 }
4976
4977 /* Otherwise okay. That also includes certain types of indirect calls. */
4978 return true;
4979 }
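
/* A hedged illustration of the return-value check above (hypothetical
   declarations, not from the original sources): on 32-bit x86, a caller
   declared as

     int f (void);

   must not sibcall a callee declared as

     double g (void);

   because g returns on the x87 register stack while f returns in %eax;
   the needed x87 stack adjustment would never be executed, so the
   rtx_equal_p test on the two return locations rejects the sibcall.  */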
4980
4981 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4982 and "sseregparm" calling convention attributes;
4983 arguments as in struct attribute_spec.handler. */
4984
4985 static tree
4986 ix86_handle_cconv_attribute (tree *node, tree name,
4987 tree args,
4988 int flags ATTRIBUTE_UNUSED,
4989 bool *no_add_attrs)
4990 {
4991 if (TREE_CODE (*node) != FUNCTION_TYPE
4992 && TREE_CODE (*node) != METHOD_TYPE
4993 && TREE_CODE (*node) != FIELD_DECL
4994 && TREE_CODE (*node) != TYPE_DECL)
4995 {
4996 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4997 name);
4998 *no_add_attrs = true;
4999 return NULL_TREE;
5000 }
5001
5002 /* Can combine regparm with all attributes but fastcall and thiscall. */
5003 if (is_attribute_p ("regparm", name))
5004 {
5005 tree cst;
5006
5007 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5008 {
5009 error ("fastcall and regparm attributes are not compatible");
5010 }
5011
5012 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5013 {
5014 error ("regparam and thiscall attributes are not compatible");
5015 }
5016
5017 cst = TREE_VALUE (args);
5018 if (TREE_CODE (cst) != INTEGER_CST)
5019 {
5020 warning (OPT_Wattributes,
5021 "%qE attribute requires an integer constant argument",
5022 name);
5023 *no_add_attrs = true;
5024 }
5025 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
5026 {
5027 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
5028 name, REGPARM_MAX);
5029 *no_add_attrs = true;
5030 }
5031
5032 return NULL_TREE;
5033 }
5034
5035 if (TARGET_64BIT)
5036 {
5037 /* Do not warn when emulating the MS ABI. */
5038 if ((TREE_CODE (*node) != FUNCTION_TYPE
5039 && TREE_CODE (*node) != METHOD_TYPE)
5040 || ix86_function_type_abi (*node) != MS_ABI)
5041 warning (OPT_Wattributes, "%qE attribute ignored",
5042 name);
5043 *no_add_attrs = true;
5044 return NULL_TREE;
5045 }
5046
5047 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
5048 if (is_attribute_p ("fastcall", name))
5049 {
5050 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5051 {
5052 error ("fastcall and cdecl attributes are not compatible");
5053 }
5054 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5055 {
5056 error ("fastcall and stdcall attributes are not compatible");
5057 }
5058 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5059 {
5060 error ("fastcall and regparm attributes are not compatible");
5061 }
5062 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5063 {
5064 error ("fastcall and thiscall attributes are not compatible");
5065 }
5066 }
5067
5068 /* Can combine stdcall with fastcall (redundant), regparm and
5069 sseregparm. */
5070 else if (is_attribute_p ("stdcall", name))
5071 {
5072 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5073 {
5074 error ("stdcall and cdecl attributes are not compatible");
5075 }
5076 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5077 {
5078 error ("stdcall and fastcall attributes are not compatible");
5079 }
5080 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5081 {
5082 error ("stdcall and thiscall attributes are not compatible");
5083 }
5084 }
5085
5086 /* Can combine cdecl with regparm and sseregparm. */
5087 else if (is_attribute_p ("cdecl", name))
5088 {
5089 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5090 {
5091 error ("stdcall and cdecl attributes are not compatible");
5092 }
5093 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5094 {
5095 error ("fastcall and cdecl attributes are not compatible");
5096 }
5097 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5098 {
5099 error ("cdecl and thiscall attributes are not compatible");
5100 }
5101 }
5102 else if (is_attribute_p ("thiscall", name))
5103 {
5104 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5105 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5106 name);
5107 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5108 {
5109 error ("stdcall and thiscall attributes are not compatible");
5110 }
5111 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5112 {
5113 error ("fastcall and thiscall attributes are not compatible");
5114 }
5115 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5116 {
5117 error ("cdecl and thiscall attributes are not compatible");
5118 }
5119 }
5120
5121 /* Can combine sseregparm with all attributes. */
5122
5123 return NULL_TREE;
5124 }
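
/* Hypothetical usage sketch (not from the original sources): the handler
   above diagnoses conflicting calling-convention attributes, e.g.

     void __attribute__ ((fastcall, stdcall)) f (int);

   draws an error that the two attributes are not compatible, while on a
   64-bit (non-MS-ABI) target an attribute such as

     void __attribute__ ((stdcall)) g (int);

   is merely warned about as ignored by the TARGET_64BIT check above.  */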
5125
5126 /* The transactional memory builtins are implicitly regparm or fastcall
5127 depending on the ABI. Override the generic do-nothing attribute that
5128 these builtins were declared with, and replace it with one of the two
5129 attributes that we expect elsewhere. */
5130
5131 static tree
5132 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5133 tree args ATTRIBUTE_UNUSED,
5134 int flags ATTRIBUTE_UNUSED,
5135 bool *no_add_attrs)
5136 {
5137 tree alt;
5138
5139 /* In no case do we want to add the placeholder attribute. */
5140 *no_add_attrs = true;
5141
5142 /* The 64-bit ABI is unchanged for transactional memory. */
5143 if (TARGET_64BIT)
5144 return NULL_TREE;
5145
5146 /* ??? Is there a better way to validate 32-bit Windows?  We have
5147 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5148 if (CHECK_STACK_LIMIT > 0)
5149 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5150 else
5151 {
5152 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5153 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5154 }
5155 decl_attributes (node, alt, flags);
5156
5157 return NULL_TREE;
5158 }
5159
5160 /* This function determines the calling convention from TYPE. */
5161
5162 unsigned int
5163 ix86_get_callcvt (const_tree type)
5164 {
5165 unsigned int ret = 0;
5166 bool is_stdarg;
5167 tree attrs;
5168
5169 if (TARGET_64BIT)
5170 return IX86_CALLCVT_CDECL;
5171
5172 attrs = TYPE_ATTRIBUTES (type);
5173 if (attrs != NULL_TREE)
5174 {
5175 if (lookup_attribute ("cdecl", attrs))
5176 ret |= IX86_CALLCVT_CDECL;
5177 else if (lookup_attribute ("stdcall", attrs))
5178 ret |= IX86_CALLCVT_STDCALL;
5179 else if (lookup_attribute ("fastcall", attrs))
5180 ret |= IX86_CALLCVT_FASTCALL;
5181 else if (lookup_attribute ("thiscall", attrs))
5182 ret |= IX86_CALLCVT_THISCALL;
5183
5184 /* Regparm isn't allowed for thiscall and fastcall. */
5185 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5186 {
5187 if (lookup_attribute ("regparm", attrs))
5188 ret |= IX86_CALLCVT_REGPARM;
5189 if (lookup_attribute ("sseregparm", attrs))
5190 ret |= IX86_CALLCVT_SSEREGPARM;
5191 }
5192
5193 if (IX86_BASE_CALLCVT(ret) != 0)
5194 return ret;
5195 }
5196
5197 is_stdarg = stdarg_p (type);
5198 if (TARGET_RTD && !is_stdarg)
5199 return IX86_CALLCVT_STDCALL | ret;
5200
5201 if (ret != 0
5202 || is_stdarg
5203 || TREE_CODE (type) != METHOD_TYPE
5204 || ix86_function_type_abi (type) != MS_ABI)
5205 return IX86_CALLCVT_CDECL | ret;
5206
5207 return IX86_CALLCVT_THISCALL;
5208 }
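
/* Worked illustration (hypothetical prototype, not from the original
   sources): for a 32-bit declaration such as

     int __attribute__ ((stdcall)) f (int, int);

   the routine above returns IX86_CALLCVT_STDCALL; with no attribute at all
   it falls through to IX86_CALLCVT_CDECL, unless -mrtd is in effect and the
   function is not stdarg, in which case IX86_CALLCVT_STDCALL is returned.  */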
5209
5210 /* Return 0 if the attributes for two types are incompatible, 1 if they
5211 are compatible, and 2 if they are nearly compatible (which causes a
5212 warning to be generated). */
5213
5214 static int
5215 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5216 {
5217 unsigned int ccvt1, ccvt2;
5218
5219 if (TREE_CODE (type1) != FUNCTION_TYPE
5220 && TREE_CODE (type1) != METHOD_TYPE)
5221 return 1;
5222
5223 ccvt1 = ix86_get_callcvt (type1);
5224 ccvt2 = ix86_get_callcvt (type2);
5225 if (ccvt1 != ccvt2)
5226 return 0;
5227 if (ix86_function_regparm (type1, NULL)
5228 != ix86_function_regparm (type2, NULL))
5229 return 0;
5230
5231 return 1;
5232 }
5233 \f
5234 /* Return the regparm value for a function with the indicated TYPE and DECL.
5235 DECL may be NULL when calling function indirectly
5236 or considering a libcall. */
5237
5238 static int
5239 ix86_function_regparm (const_tree type, const_tree decl)
5240 {
5241 tree attr;
5242 int regparm;
5243 unsigned int ccvt;
5244
5245 if (TARGET_64BIT)
5246 return (ix86_function_type_abi (type) == SYSV_ABI
5247 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5248 ccvt = ix86_get_callcvt (type);
5249 regparm = ix86_regparm;
5250
5251 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5252 {
5253 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5254 if (attr)
5255 {
5256 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5257 return regparm;
5258 }
5259 }
5260 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5261 return 2;
5262 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5263 return 1;
5264
5265 /* Use register calling convention for local functions when possible. */
5266 if (decl
5267 && TREE_CODE (decl) == FUNCTION_DECL
5268 && optimize
5269 && !(profile_flag && !flag_fentry))
5270 {
5271 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5272 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5273 if (i && i->local && i->can_change_signature)
5274 {
5275 int local_regparm, globals = 0, regno;
5276
5277 /* Make sure no regparm register is taken by a
5278 fixed register variable. */
5279 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5280 if (fixed_regs[local_regparm])
5281 break;
5282
5283 /* We don't want to use regparm(3) for nested functions as
5284 these use a static chain pointer in the third argument. */
5285 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5286 local_regparm = 2;
5287
5288 /* In 32-bit mode save a register for the split stack. */
5289 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5290 local_regparm = 2;
5291
5292 /* Each fixed register usage increases register pressure,
5293 so fewer registers should be used for argument passing.
5294 This functionality can be overridden by an explicit
5295 regparm value. */
5296 for (regno = 0; regno <= DI_REG; regno++)
5297 if (fixed_regs[regno])
5298 globals++;
5299
5300 local_regparm
5301 = globals < local_regparm ? local_regparm - globals : 0;
5302
5303 if (local_regparm > regparm)
5304 regparm = local_regparm;
5305 }
5306 }
5307
5308 return regparm;
5309 }
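
/* Hedged example (hypothetical declaration, not from the original sources):
   on 32-bit x86,

     int __attribute__ ((regparm (3))) add3 (int a, int b, int c);

   makes the routine above return 3, so A, B and C travel in %eax, %edx and
   %ecx; a fastcall function yields 2 and a thiscall function yields 1, as
   the explicit ccvt checks show.  */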
5310
5311 /* Return 1 or 2 if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5312 DFmode (2) arguments in SSE registers for a function with the
5313 indicated TYPE and DECL. DECL may be NULL when calling function
5314 indirectly or considering a libcall. Otherwise return 0. */
5315
5316 static int
5317 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5318 {
5319 gcc_assert (!TARGET_64BIT);
5320
5321 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5322 by the sseregparm attribute. */
5323 if (TARGET_SSEREGPARM
5324 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5325 {
5326 if (!TARGET_SSE)
5327 {
5328 if (warn)
5329 {
5330 if (decl)
5331 error ("calling %qD with attribute sseregparm without "
5332 "SSE/SSE2 enabled", decl);
5333 else
5334 error ("calling %qT with attribute sseregparm without "
5335 "SSE/SSE2 enabled", type);
5336 }
5337 return 0;
5338 }
5339
5340 return 2;
5341 }
5342
5343 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5344 (and DFmode for SSE2) arguments in SSE registers. */
5345 if (decl && TARGET_SSE_MATH && optimize
5346 && !(profile_flag && !flag_fentry))
5347 {
5348 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5349 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5350 if (i && i->local && i->can_change_signature)
5351 return TARGET_SSE2 ? 2 : 1;
5352 }
5353
5354 return 0;
5355 }
5356
5357 /* Return true if EAX is live at the start of the function. Used by
5358 ix86_expand_prologue to determine if we need special help before
5359 calling allocate_stack_worker. */
5360
5361 static bool
5362 ix86_eax_live_at_start_p (void)
5363 {
5364 /* Cheat. Don't bother working forward from ix86_function_regparm
5365 to the function type to whether an actual argument is located in
5366 eax. Instead just look at cfg info, which is still close enough
5367 to correct at this point. This gives false positives for broken
5368 functions that might use uninitialized data that happens to be
5369 allocated in eax, but who cares? */
5370 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5371 }
5372
5373 static bool
5374 ix86_keep_aggregate_return_pointer (tree fntype)
5375 {
5376 tree attr;
5377
5378 if (!TARGET_64BIT)
5379 {
5380 attr = lookup_attribute ("callee_pop_aggregate_return",
5381 TYPE_ATTRIBUTES (fntype));
5382 if (attr)
5383 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5384
5385 /* For 32-bit MS-ABI the default is to keep aggregate
5386 return pointer. */
5387 if (ix86_function_type_abi (fntype) == MS_ABI)
5388 return true;
5389 }
5390 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5391 }
5392
5393 /* Value is the number of bytes of arguments automatically
5394 popped when returning from a subroutine call.
5395 FUNDECL is the declaration node of the function (as a tree),
5396 FUNTYPE is the data type of the function (as a tree),
5397 or for a library call it is an identifier node for the subroutine name.
5398 SIZE is the number of bytes of arguments passed on the stack.
5399
5400 On the 80386, the RTD insn may be used to pop them if the number
5401 of args is fixed, but if the number is variable then the caller
5402 must pop them all. RTD can't be used for library calls now
5403 because the library is compiled with the Unix compiler.
5404 Use of RTD is a selectable option, since it is incompatible with
5405 standard Unix calling sequences. If the option is not selected,
5406 the caller must always pop the args.
5407
5408 The attribute stdcall is equivalent to RTD on a per module basis. */
5409
5410 static int
5411 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5412 {
5413 unsigned int ccvt;
5414
5415 /* None of the 64-bit ABIs pop arguments. */
5416 if (TARGET_64BIT)
5417 return 0;
5418
5419 ccvt = ix86_get_callcvt (funtype);
5420
5421 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5422 | IX86_CALLCVT_THISCALL)) != 0
5423 && ! stdarg_p (funtype))
5424 return size;
5425
5426 /* Lose any fake structure return argument if it is passed on the stack. */
5427 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5428 && !ix86_keep_aggregate_return_pointer (funtype))
5429 {
5430 int nregs = ix86_function_regparm (funtype, fundecl);
5431 if (nregs == 0)
5432 return GET_MODE_SIZE (Pmode);
5433 }
5434
5435 return 0;
5436 }
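
/* Illustrative sketch (hypothetical function, not from the original
   sources): for a 32-bit callee

     void __attribute__ ((stdcall)) f (int a, int b, int c);

   SIZE is 12, so the routine above returns 12 and the callee pops its own
   arguments with "ret $12"; a plain cdecl function returns 0 and the caller
   does the popping.  */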
5437 \f
5438 /* Argument support functions. */
5439
5440 /* Return true when register may be used to pass function parameters. */
5441 bool
5442 ix86_function_arg_regno_p (int regno)
5443 {
5444 int i;
5445 const int *parm_regs;
5446
5447 if (!TARGET_64BIT)
5448 {
5449 if (TARGET_MACHO)
5450 return (regno < REGPARM_MAX
5451 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5452 else
5453 return (regno < REGPARM_MAX
5454 || (TARGET_MMX && MMX_REGNO_P (regno)
5455 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5456 || (TARGET_SSE && SSE_REGNO_P (regno)
5457 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5458 }
5459
5460 if (TARGET_MACHO)
5461 {
5462 if (SSE_REGNO_P (regno) && TARGET_SSE)
5463 return true;
5464 }
5465 else
5466 {
5467 if (TARGET_SSE && SSE_REGNO_P (regno)
5468 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5469 return true;
5470 }
5471
5472 /* TODO: The function should depend on current function ABI but
5473 builtins.c would need updating then. Therefore we use the
5474 default ABI. */
5475
5476 /* RAX is used as hidden argument to va_arg functions. */
5477 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5478 return true;
5479
5480 if (ix86_abi == MS_ABI)
5481 parm_regs = x86_64_ms_abi_int_parameter_registers;
5482 else
5483 parm_regs = x86_64_int_parameter_registers;
5484 for (i = 0; i < (ix86_abi == MS_ABI
5485 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5486 if (regno == parm_regs[i])
5487 return true;
5488 return false;
5489 }
5490
5491 /* Return true if we do not know how to pass TYPE solely in registers. */
5492
5493 static bool
5494 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5495 {
5496 if (must_pass_in_stack_var_size_or_pad (mode, type))
5497 return true;
5498
5499 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5500 The layout_type routine is crafty and tries to trick us into passing
5501 currently unsupported vector types on the stack by using TImode. */
5502 return (!TARGET_64BIT && mode == TImode
5503 && type && TREE_CODE (type) != VECTOR_TYPE);
5504 }
5505
5506 /* Return the size, in bytes, of the area reserved for arguments passed
5507 in registers for the function represented by FNDECL, depending on the
5508 ABI format used. */
5509 int
5510 ix86_reg_parm_stack_space (const_tree fndecl)
5511 {
5512 enum calling_abi call_abi = SYSV_ABI;
5513 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5514 call_abi = ix86_function_abi (fndecl);
5515 else
5516 call_abi = ix86_function_type_abi (fndecl);
5517 if (TARGET_64BIT && call_abi == MS_ABI)
5518 return 32;
5519 return 0;
5520 }
5521
5522 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5523 call ABI used. */
5524 enum calling_abi
5525 ix86_function_type_abi (const_tree fntype)
5526 {
5527 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5528 {
5529 enum calling_abi abi = ix86_abi;
5530 if (abi == SYSV_ABI)
5531 {
5532 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5533 abi = MS_ABI;
5534 }
5535 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5536 abi = SYSV_ABI;
5537 return abi;
5538 }
5539 return ix86_abi;
5540 }
5541
5542 static bool
5543 ix86_function_ms_hook_prologue (const_tree fn)
5544 {
5545 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5546 {
5547 if (decl_function_context (fn) != NULL_TREE)
5548 error_at (DECL_SOURCE_LOCATION (fn),
5549 "ms_hook_prologue is not compatible with nested function");
5550 else
5551 return true;
5552 }
5553 return false;
5554 }
5555
5556 static enum calling_abi
5557 ix86_function_abi (const_tree fndecl)
5558 {
5559 if (! fndecl)
5560 return ix86_abi;
5561 return ix86_function_type_abi (TREE_TYPE (fndecl));
5562 }
5563
5564 /* Return SYSV_ABI or MS_ABI, depending on cfun, specifying the
5565 call ABI used. */
5566 enum calling_abi
5567 ix86_cfun_abi (void)
5568 {
5569 if (! cfun)
5570 return ix86_abi;
5571 return cfun->machine->call_abi;
5572 }
5573
5574 /* Write the extra assembler code needed to declare a function properly. */
5575
5576 void
5577 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5578 tree decl)
5579 {
5580 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5581
5582 if (is_ms_hook)
5583 {
5584 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5585 unsigned int filler_cc = 0xcccccccc;
5586
5587 for (i = 0; i < filler_count; i += 4)
5588 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5589 }
5590
5591 #ifdef SUBTARGET_ASM_UNWIND_INIT
5592 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5593 #endif
5594
5595 ASM_OUTPUT_LABEL (asm_out_file, fname);
5596
5597 /* Output magic byte marker, if hot-patch attribute is set. */
5598 if (is_ms_hook)
5599 {
5600 if (TARGET_64BIT)
5601 {
5602 /* leaq [%rsp + 0], %rsp */
5603 asm_fprintf (asm_out_file, ASM_BYTE
5604 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5605 }
5606 else
5607 {
5608 /* movl.s %edi, %edi
5609 push %ebp
5610 movl.s %esp, %ebp */
5611 asm_fprintf (asm_out_file, ASM_BYTE
5612 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5613 }
5614 }
5615 }
5616
5617 /* regclass.c */
5618 extern void init_regs (void);
5619
5620 /* Implementation of the call ABI switching target hook.  The call
5621 register sets specific to FNDECL are set up.  See also
5622 ix86_conditional_register_usage for more details. */
5623 void
5624 ix86_call_abi_override (const_tree fndecl)
5625 {
5626 if (fndecl == NULL_TREE)
5627 cfun->machine->call_abi = ix86_abi;
5628 else
5629 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5630 }
5631
5632 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5633 Avoid the expensive re-initialization of init_regs each time we switch
5634 function context, since this is needed only during RTL expansion. */
5635 static void
5636 ix86_maybe_switch_abi (void)
5637 {
5638 if (TARGET_64BIT &&
5639 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5640 reinit_regs ();
5641 }
5642
5643 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5644 for a call to a function whose data type is FNTYPE.
5645 For a library call, FNTYPE is 0. */
5646
5647 void
5648 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5649 tree fntype, /* tree ptr for function decl */
5650 rtx libname, /* SYMBOL_REF of library name or 0 */
5651 tree fndecl,
5652 int caller)
5653 {
5654 struct cgraph_local_info *i;
5655 tree fnret_type;
5656
5657 memset (cum, 0, sizeof (*cum));
5658
5659 /* Initialize for the current callee. */
5660 if (caller)
5661 {
5662 cfun->machine->callee_pass_avx256_p = false;
5663 cfun->machine->callee_return_avx256_p = false;
5664 }
5665
5666 if (fndecl)
5667 {
5668 i = cgraph_local_info (fndecl);
5669 cum->call_abi = ix86_function_abi (fndecl);
5670 fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5671 }
5672 else
5673 {
5674 i = NULL;
5675 cum->call_abi = ix86_function_type_abi (fntype);
5676 if (fntype)
5677 fnret_type = TREE_TYPE (fntype);
5678 else
5679 fnret_type = NULL;
5680 }
5681
5682 if (TARGET_VZEROUPPER && fnret_type)
5683 {
5684 rtx fnret_value = ix86_function_value (fnret_type, fntype,
5685 false);
5686 if (function_pass_avx256_p (fnret_value))
5687 {
5688 /* The return value of this function uses 256bit AVX modes. */
5689 if (caller)
5690 cfun->machine->callee_return_avx256_p = true;
5691 else
5692 cfun->machine->caller_return_avx256_p = true;
5693 }
5694 }
5695
5696 cum->caller = caller;
5697
5698 /* Set up the number of registers to use for passing arguments. */
5699
5700 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5701 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5702 "or subtarget optimization implying it");
5703 cum->nregs = ix86_regparm;
5704 if (TARGET_64BIT)
5705 {
5706 cum->nregs = (cum->call_abi == SYSV_ABI
5707 ? X86_64_REGPARM_MAX
5708 : X86_64_MS_REGPARM_MAX);
5709 }
5710 if (TARGET_SSE)
5711 {
5712 cum->sse_nregs = SSE_REGPARM_MAX;
5713 if (TARGET_64BIT)
5714 {
5715 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5716 ? X86_64_SSE_REGPARM_MAX
5717 : X86_64_MS_SSE_REGPARM_MAX);
5718 }
5719 }
5720 if (TARGET_MMX)
5721 cum->mmx_nregs = MMX_REGPARM_MAX;
5722 cum->warn_avx = true;
5723 cum->warn_sse = true;
5724 cum->warn_mmx = true;
5725
5726 /* Because the type might mismatch between caller and callee, we need to
5727 use the actual type of the function for local calls.
5728 FIXME: cgraph_analyze can be told to actually record whether a function
5729 uses va_start, so for local functions maybe_vaarg can be made more
5730 aggressive, helping K&R code.
5731 FIXME: once the type system is fixed, we won't need this code anymore. */
5732 if (i && i->local && i->can_change_signature)
5733 fntype = TREE_TYPE (fndecl);
5734 cum->maybe_vaarg = (fntype
5735 ? (!prototype_p (fntype) || stdarg_p (fntype))
5736 : !libname);
5737
5738 if (!TARGET_64BIT)
5739 {
5740 /* If there are variable arguments, then we won't pass anything
5741 in registers in 32-bit mode. */
5742 if (stdarg_p (fntype))
5743 {
5744 cum->nregs = 0;
5745 cum->sse_nregs = 0;
5746 cum->mmx_nregs = 0;
5747 cum->warn_avx = 0;
5748 cum->warn_sse = 0;
5749 cum->warn_mmx = 0;
5750 return;
5751 }
5752
5753 /* Use ecx and edx registers if function has fastcall attribute,
5754 else look for regparm information. */
5755 if (fntype)
5756 {
5757 unsigned int ccvt = ix86_get_callcvt (fntype);
5758 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5759 {
5760 cum->nregs = 1;
5761 cum->fastcall = 1; /* Same first register as in fastcall. */
5762 }
5763 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5764 {
5765 cum->nregs = 2;
5766 cum->fastcall = 1;
5767 }
5768 else
5769 cum->nregs = ix86_function_regparm (fntype, fndecl);
5770 }
5771
5772 /* Set up the number of SSE registers used for passing SFmode
5773 and DFmode arguments. Warn for mismatching ABI. */
5774 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5775 }
5776 }
5777
5778 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5779 But in the case of vector types, it is some vector mode.
5780
5781 When we have only some of our vector isa extensions enabled, then there
5782 are some modes for which vector_mode_supported_p is false. For these
5783 modes, the generic vector support in gcc will choose some non-vector mode
5784 in order to implement the type. By computing the natural mode, we'll
5785 select the proper ABI location for the operand and not depend on whatever
5786 the middle-end decides to do with these vector types.
5787
5788 The middle-end can't deal with vector types > 16 bytes.  In this
5789 case, we return the original mode and warn about the ABI change if CUM
5790 NULL. */
5791
5792 static enum machine_mode
5793 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5794 {
5795 enum machine_mode mode = TYPE_MODE (type);
5796
5797 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5798 {
5799 HOST_WIDE_INT size = int_size_in_bytes (type);
5800 if ((size == 8 || size == 16 || size == 32)
5801 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5802 && TYPE_VECTOR_SUBPARTS (type) > 1)
5803 {
5804 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5805
5806 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5807 mode = MIN_MODE_VECTOR_FLOAT;
5808 else
5809 mode = MIN_MODE_VECTOR_INT;
5810
5811 /* Get the mode which has this inner mode and number of units. */
5812 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5813 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5814 && GET_MODE_INNER (mode) == innermode)
5815 {
5816 if (size == 32 && !TARGET_AVX)
5817 {
5818 static bool warnedavx;
5819
5820 if (cum
5821 && !warnedavx
5822 && cum->warn_avx)
5823 {
5824 warnedavx = true;
5825 warning (0, "AVX vector argument without AVX "
5826 "enabled changes the ABI");
5827 }
5828 return TYPE_MODE (type);
5829 }
5830 else
5831 return mode;
5832 }
5833
5834 gcc_unreachable ();
5835 }
5836 }
5837
5838 return mode;
5839 }
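
/* A hedged example of the warning path above (hypothetical typedef, not
   from the original sources): with

     typedef float v8sf __attribute__ ((vector_size (32)));

   compiled without -mavx, the 32-byte case above issues "AVX vector
   argument without AVX enabled changes the ABI" once (when CUM requests
   warnings) and falls back to TYPE_MODE of the type instead of the
   natural 256-bit vector mode.  */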
5840
5841 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5842 this may not agree with the mode that the type system has chosen for the
5843 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5844 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5845
5846 static rtx
5847 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5848 unsigned int regno)
5849 {
5850 rtx tmp;
5851
5852 if (orig_mode != BLKmode)
5853 tmp = gen_rtx_REG (orig_mode, regno);
5854 else
5855 {
5856 tmp = gen_rtx_REG (mode, regno);
5857 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5858 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5859 }
5860
5861 return tmp;
5862 }
5863
5864 /* x86-64 register passing implementation.  See the x86-64 ABI for details.
5865 The goal of this code is to classify each 8 bytes of an incoming argument
5866 by register class and assign registers accordingly. */
5867
5868 /* Return the union class of CLASS1 and CLASS2.
5869 See the x86-64 PS ABI for details. */
5870
5871 static enum x86_64_reg_class
5872 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5873 {
5874 /* Rule #1: If both classes are equal, this is the resulting class. */
5875 if (class1 == class2)
5876 return class1;
5877
5878 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5879 the other class. */
5880 if (class1 == X86_64_NO_CLASS)
5881 return class2;
5882 if (class2 == X86_64_NO_CLASS)
5883 return class1;
5884
5885 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5886 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5887 return X86_64_MEMORY_CLASS;
5888
5889 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5890 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5891 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5892 return X86_64_INTEGERSI_CLASS;
5893 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5894 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5895 return X86_64_INTEGER_CLASS;
5896
5897 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5898 MEMORY is used. */
5899 if (class1 == X86_64_X87_CLASS
5900 || class1 == X86_64_X87UP_CLASS
5901 || class1 == X86_64_COMPLEX_X87_CLASS
5902 || class2 == X86_64_X87_CLASS
5903 || class2 == X86_64_X87UP_CLASS
5904 || class2 == X86_64_COMPLEX_X87_CLASS)
5905 return X86_64_MEMORY_CLASS;
5906
5907 /* Rule #6: Otherwise class SSE is used. */
5908 return X86_64_SSE_CLASS;
5909 }
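
/* Worked example of the merge rules (hypothetical type, not from the
   original sources): for

     union u { int i; float f; };

   the single 8-byte chunk merges X86_64_INTEGERSI_CLASS with
   X86_64_SSESF_CLASS; rule #4 above yields X86_64_INTEGERSI_CLASS, so the
   union is passed in a general-purpose register.  */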
5910
5911 /* Classify the argument of type TYPE and mode MODE.
5912 CLASSES will be filled by the register class used to pass each word
5913 of the operand. The number of words is returned. In case the parameter
5914 should be passed in memory, 0 is returned. As a special case for zero
5915 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5916
5917 BIT_OFFSET is used internally for handling records and specifies the
5918 offset in bits, modulo 256, to avoid overflow cases.
5919
5920 See the x86-64 PS ABI for details.
5921 */
5922
5923 static int
5924 classify_argument (enum machine_mode mode, const_tree type,
5925 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5926 {
5927 HOST_WIDE_INT bytes =
5928 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5929 int words
5930 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5931
5932 /* Variable sized entities are always passed/returned in memory. */
5933 if (bytes < 0)
5934 return 0;
5935
5936 if (mode != VOIDmode
5937 && targetm.calls.must_pass_in_stack (mode, type))
5938 return 0;
5939
5940 if (type && AGGREGATE_TYPE_P (type))
5941 {
5942 int i;
5943 tree field;
5944 enum x86_64_reg_class subclasses[MAX_CLASSES];
5945
5946 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5947 if (bytes > 32)
5948 return 0;
5949
5950 for (i = 0; i < words; i++)
5951 classes[i] = X86_64_NO_CLASS;
5952
5953 /* Zero-sized arrays or structures are NO_CLASS.  We return 0 to
5954 signal the memory class, so handle it as a special case. */
5955 if (!words)
5956 {
5957 classes[0] = X86_64_NO_CLASS;
5958 return 1;
5959 }
5960
5961 /* Classify each field of record and merge classes. */
5962 switch (TREE_CODE (type))
5963 {
5964 case RECORD_TYPE:
5965 /* And now merge the fields of structure. */
5966 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5967 {
5968 if (TREE_CODE (field) == FIELD_DECL)
5969 {
5970 int num;
5971
5972 if (TREE_TYPE (field) == error_mark_node)
5973 continue;
5974
5975 /* Bitfields are always classified as integer. Handle them
5976 early, since later code would consider them to be
5977 misaligned integers. */
5978 if (DECL_BIT_FIELD (field))
5979 {
5980 for (i = (int_bit_position (field)
5981 + (bit_offset % 64)) / 8 / 8;
5982 i < ((int_bit_position (field) + (bit_offset % 64))
5983 + tree_low_cst (DECL_SIZE (field), 0)
5984 + 63) / 8 / 8; i++)
5985 classes[i] =
5986 merge_classes (X86_64_INTEGER_CLASS,
5987 classes[i]);
5988 }
5989 else
5990 {
5991 int pos;
5992
5993 type = TREE_TYPE (field);
5994
5995 /* Flexible array member is ignored. */
5996 if (TYPE_MODE (type) == BLKmode
5997 && TREE_CODE (type) == ARRAY_TYPE
5998 && TYPE_SIZE (type) == NULL_TREE
5999 && TYPE_DOMAIN (type) != NULL_TREE
6000 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6001 == NULL_TREE))
6002 {
6003 static bool warned;
6004
6005 if (!warned && warn_psabi)
6006 {
6007 warned = true;
6008 inform (input_location,
6009 "the ABI of passing struct with"
6010 " a flexible array member has"
6011 " changed in GCC 4.4");
6012 }
6013 continue;
6014 }
6015 num = classify_argument (TYPE_MODE (type), type,
6016 subclasses,
6017 (int_bit_position (field)
6018 + bit_offset) % 256);
6019 if (!num)
6020 return 0;
6021 pos = (int_bit_position (field)
6022 + (bit_offset % 64)) / 8 / 8;
6023 for (i = 0; i < num && (i + pos) < words; i++)
6024 classes[i + pos] =
6025 merge_classes (subclasses[i], classes[i + pos]);
6026 }
6027 }
6028 }
6029 break;
6030
6031 case ARRAY_TYPE:
6032 /* Arrays are handled as small records. */
6033 {
6034 int num;
6035 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6036 TREE_TYPE (type), subclasses, bit_offset);
6037 if (!num)
6038 return 0;
6039
6040 /* The partial classes are now full classes. */
6041 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6042 subclasses[0] = X86_64_SSE_CLASS;
6043 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6044 && !((bit_offset % 64) == 0 && bytes == 4))
6045 subclasses[0] = X86_64_INTEGER_CLASS;
6046
6047 for (i = 0; i < words; i++)
6048 classes[i] = subclasses[i % num];
6049
6050 break;
6051 }
6052 case UNION_TYPE:
6053 case QUAL_UNION_TYPE:
6054 /* Unions are similar to RECORD_TYPE, but the offset is always
6055 zero.  */
6056 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6057 {
6058 if (TREE_CODE (field) == FIELD_DECL)
6059 {
6060 int num;
6061
6062 if (TREE_TYPE (field) == error_mark_node)
6063 continue;
6064
6065 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6066 TREE_TYPE (field), subclasses,
6067 bit_offset);
6068 if (!num)
6069 return 0;
6070 for (i = 0; i < num; i++)
6071 classes[i] = merge_classes (subclasses[i], classes[i]);
6072 }
6073 }
6074 break;
6075
6076 default:
6077 gcc_unreachable ();
6078 }
6079
6080 if (words > 2)
6081 {
6082 /* When size > 16 bytes, if the first one isn't
6083 X86_64_SSE_CLASS or any other ones aren't
6084 X86_64_SSEUP_CLASS, everything should be passed in
6085 memory. */
6086 if (classes[0] != X86_64_SSE_CLASS)
6087 return 0;
6088
6089 for (i = 1; i < words; i++)
6090 if (classes[i] != X86_64_SSEUP_CLASS)
6091 return 0;
6092 }
6093
6094 /* Final merger cleanup. */
6095 for (i = 0; i < words; i++)
6096 {
6097 /* If one class is MEMORY, everything should be passed in
6098 memory. */
6099 if (classes[i] == X86_64_MEMORY_CLASS)
6100 return 0;
6101
6102 /* The X86_64_SSEUP_CLASS should be always preceded by
6103 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6104 if (classes[i] == X86_64_SSEUP_CLASS
6105 && classes[i - 1] != X86_64_SSE_CLASS
6106 && classes[i - 1] != X86_64_SSEUP_CLASS)
6107 {
6108 /* The first one should never be X86_64_SSEUP_CLASS. */
6109 gcc_assert (i != 0);
6110 classes[i] = X86_64_SSE_CLASS;
6111 }
6112
6113 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6114 everything should be passed in memory. */
6115 if (classes[i] == X86_64_X87UP_CLASS
6116 && (classes[i - 1] != X86_64_X87_CLASS))
6117 {
6118 static bool warned;
6119
6120 /* The first one should never be X86_64_X87UP_CLASS. */
6121 gcc_assert (i != 0);
6122 if (!warned && warn_psabi)
6123 {
6124 warned = true;
6125 inform (input_location,
6126 "the ABI of passing union with long double"
6127 " has changed in GCC 4.4");
6128 }
6129 return 0;
6130 }
6131 }
6132 return words;
6133 }
6134
6135 /* Compute the alignment needed.  We align all types to natural boundaries
6136 with the exception of XFmode, which is aligned to 64 bits. */
6137 if (mode != VOIDmode && mode != BLKmode)
6138 {
6139 int mode_alignment = GET_MODE_BITSIZE (mode);
6140
6141 if (mode == XFmode)
6142 mode_alignment = 128;
6143 else if (mode == XCmode)
6144 mode_alignment = 256;
6145 if (COMPLEX_MODE_P (mode))
6146 mode_alignment /= 2;
6147 /* Misaligned fields are always returned in memory. */
6148 if (bit_offset % mode_alignment)
6149 return 0;
6150 }
6151
6152 /* for V1xx modes, just use the base mode */
6153 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6154 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6155 mode = GET_MODE_INNER (mode);
6156
6157 /* Classification of atomic types. */
6158 switch (mode)
6159 {
6160 case SDmode:
6161 case DDmode:
6162 classes[0] = X86_64_SSE_CLASS;
6163 return 1;
6164 case TDmode:
6165 classes[0] = X86_64_SSE_CLASS;
6166 classes[1] = X86_64_SSEUP_CLASS;
6167 return 2;
6168 case DImode:
6169 case SImode:
6170 case HImode:
6171 case QImode:
6172 case CSImode:
6173 case CHImode:
6174 case CQImode:
6175 {
6176 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6177
6178 if (size <= 32)
6179 {
6180 classes[0] = X86_64_INTEGERSI_CLASS;
6181 return 1;
6182 }
6183 else if (size <= 64)
6184 {
6185 classes[0] = X86_64_INTEGER_CLASS;
6186 return 1;
6187 }
6188 else if (size <= 64+32)
6189 {
6190 classes[0] = X86_64_INTEGER_CLASS;
6191 classes[1] = X86_64_INTEGERSI_CLASS;
6192 return 2;
6193 }
6194 else if (size <= 64+64)
6195 {
6196 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6197 return 2;
6198 }
6199 else
6200 gcc_unreachable ();
6201 }
6202 case CDImode:
6203 case TImode:
6204 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6205 return 2;
6206 case COImode:
6207 case OImode:
6208 /* OImode shouldn't be used directly. */
6209 gcc_unreachable ();
6210 case CTImode:
6211 return 0;
6212 case SFmode:
6213 if (!(bit_offset % 64))
6214 classes[0] = X86_64_SSESF_CLASS;
6215 else
6216 classes[0] = X86_64_SSE_CLASS;
6217 return 1;
6218 case DFmode:
6219 classes[0] = X86_64_SSEDF_CLASS;
6220 return 1;
6221 case XFmode:
6222 classes[0] = X86_64_X87_CLASS;
6223 classes[1] = X86_64_X87UP_CLASS;
6224 return 2;
6225 case TFmode:
6226 classes[0] = X86_64_SSE_CLASS;
6227 classes[1] = X86_64_SSEUP_CLASS;
6228 return 2;
6229 case SCmode:
6230 classes[0] = X86_64_SSE_CLASS;
6231 if (!(bit_offset % 64))
6232 return 1;
6233 else
6234 {
6235 static bool warned;
6236
6237 if (!warned && warn_psabi)
6238 {
6239 warned = true;
6240 inform (input_location,
6241 "the ABI of passing structure with complex float"
6242 " member has changed in GCC 4.4");
6243 }
6244 classes[1] = X86_64_SSESF_CLASS;
6245 return 2;
6246 }
6247 case DCmode:
6248 classes[0] = X86_64_SSEDF_CLASS;
6249 classes[1] = X86_64_SSEDF_CLASS;
6250 return 2;
6251 case XCmode:
6252 classes[0] = X86_64_COMPLEX_X87_CLASS;
6253 return 1;
6254 case TCmode:
6255 /* This mode is larger than 16 bytes. */
6256 return 0;
6257 case V8SFmode:
6258 case V8SImode:
6259 case V32QImode:
6260 case V16HImode:
6261 case V4DFmode:
6262 case V4DImode:
6263 classes[0] = X86_64_SSE_CLASS;
6264 classes[1] = X86_64_SSEUP_CLASS;
6265 classes[2] = X86_64_SSEUP_CLASS;
6266 classes[3] = X86_64_SSEUP_CLASS;
6267 return 4;
6268 case V4SFmode:
6269 case V4SImode:
6270 case V16QImode:
6271 case V8HImode:
6272 case V2DFmode:
6273 case V2DImode:
6274 classes[0] = X86_64_SSE_CLASS;
6275 classes[1] = X86_64_SSEUP_CLASS;
6276 return 2;
6277 case V1TImode:
6278 case V1DImode:
6279 case V2SFmode:
6280 case V2SImode:
6281 case V4HImode:
6282 case V8QImode:
6283 classes[0] = X86_64_SSE_CLASS;
6284 return 1;
6285 case BLKmode:
6286 case VOIDmode:
6287 return 0;
6288 default:
6289 gcc_assert (VECTOR_MODE_P (mode));
6290
6291 if (bytes > 16)
6292 return 0;
6293
6294 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6295
6296 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6297 classes[0] = X86_64_INTEGERSI_CLASS;
6298 else
6299 classes[0] = X86_64_INTEGER_CLASS;
6300 classes[1] = X86_64_INTEGER_CLASS;
6301 return 1 + (bytes > 8);
6302 }
6303 }
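
/* Hedged worked example (hypothetical struct, not from the original
   sources): for

     struct s { double d; long l; };

   the routine above classifies the first eightbyte as X86_64_SSEDF_CLASS
   (from the double) and the second as X86_64_INTEGER_CLASS (from the long),
   returning 2; the struct therefore needs one SSE register and one integer
   register when passed or returned.  */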
6304
6305 /* Examine the argument and set the number of registers required in each
6306 class.  Return 0 iff the parameter should be passed in memory. */
6307 static int
6308 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6309 int *int_nregs, int *sse_nregs)
6310 {
6311 enum x86_64_reg_class regclass[MAX_CLASSES];
6312 int n = classify_argument (mode, type, regclass, 0);
6313
6314 *int_nregs = 0;
6315 *sse_nregs = 0;
6316 if (!n)
6317 return 0;
6318 for (n--; n >= 0; n--)
6319 switch (regclass[n])
6320 {
6321 case X86_64_INTEGER_CLASS:
6322 case X86_64_INTEGERSI_CLASS:
6323 (*int_nregs)++;
6324 break;
6325 case X86_64_SSE_CLASS:
6326 case X86_64_SSESF_CLASS:
6327 case X86_64_SSEDF_CLASS:
6328 (*sse_nregs)++;
6329 break;
6330 case X86_64_NO_CLASS:
6331 case X86_64_SSEUP_CLASS:
6332 break;
6333 case X86_64_X87_CLASS:
6334 case X86_64_X87UP_CLASS:
6335 if (!in_return)
6336 return 0;
6337 break;
6338 case X86_64_COMPLEX_X87_CLASS:
6339 return in_return ? 2 : 0;
6340 case X86_64_MEMORY_CLASS:
6341 gcc_unreachable ();
6342 }
6343 return 1;
6344 }
6345
6346 /* Construct container for the argument used by GCC interface. See
6347 FUNCTION_ARG for the detailed description. */
6348
6349 static rtx
6350 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6351 const_tree type, int in_return, int nintregs, int nsseregs,
6352 const int *intreg, int sse_regno)
6353 {
6354 /* The following variables hold the static issued_error state. */
6355 static bool issued_sse_arg_error;
6356 static bool issued_sse_ret_error;
6357 static bool issued_x87_ret_error;
6358
6359 enum machine_mode tmpmode;
6360 int bytes =
6361 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6362 enum x86_64_reg_class regclass[MAX_CLASSES];
6363 int n;
6364 int i;
6365 int nexps = 0;
6366 int needed_sseregs, needed_intregs;
6367 rtx exp[MAX_CLASSES];
6368 rtx ret;
6369
6370 n = classify_argument (mode, type, regclass, 0);
6371 if (!n)
6372 return NULL;
6373 if (!examine_argument (mode, type, in_return, &needed_intregs,
6374 &needed_sseregs))
6375 return NULL;
6376 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6377 return NULL;
6378
6379 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6380 some less clueful developer tries to use floating-point anyway. */
6381 if (needed_sseregs && !TARGET_SSE)
6382 {
6383 if (in_return)
6384 {
6385 if (!issued_sse_ret_error)
6386 {
6387 error ("SSE register return with SSE disabled");
6388 issued_sse_ret_error = true;
6389 }
6390 }
6391 else if (!issued_sse_arg_error)
6392 {
6393 error ("SSE register argument with SSE disabled");
6394 issued_sse_arg_error = true;
6395 }
6396 return NULL;
6397 }
6398
6399 /* Likewise, error if the ABI requires us to return values in the
6400 x87 registers and the user specified -mno-80387. */
6401 if (!TARGET_80387 && in_return)
6402 for (i = 0; i < n; i++)
6403 if (regclass[i] == X86_64_X87_CLASS
6404 || regclass[i] == X86_64_X87UP_CLASS
6405 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6406 {
6407 if (!issued_x87_ret_error)
6408 {
6409 error ("x87 register return with x87 disabled");
6410 issued_x87_ret_error = true;
6411 }
6412 return NULL;
6413 }
6414
6415 /* First construct simple cases. Avoid SCmode, since we want to use
6416 a single register to pass this type. */
6417 if (n == 1 && mode != SCmode)
6418 switch (regclass[0])
6419 {
6420 case X86_64_INTEGER_CLASS:
6421 case X86_64_INTEGERSI_CLASS:
6422 return gen_rtx_REG (mode, intreg[0]);
6423 case X86_64_SSE_CLASS:
6424 case X86_64_SSESF_CLASS:
6425 case X86_64_SSEDF_CLASS:
6426 if (mode != BLKmode)
6427 return gen_reg_or_parallel (mode, orig_mode,
6428 SSE_REGNO (sse_regno));
6429 break;
6430 case X86_64_X87_CLASS:
6431 case X86_64_COMPLEX_X87_CLASS:
6432 return gen_rtx_REG (mode, FIRST_STACK_REG);
6433 case X86_64_NO_CLASS:
6434 /* Zero sized array, struct or class. */
6435 return NULL;
6436 default:
6437 gcc_unreachable ();
6438 }
6439 if (n == 2
6440 && regclass[0] == X86_64_SSE_CLASS
6441 && regclass[1] == X86_64_SSEUP_CLASS
6442 && mode != BLKmode)
6443 return gen_reg_or_parallel (mode, orig_mode,
6444 SSE_REGNO (sse_regno));
6445 if (n == 4
6446 && regclass[0] == X86_64_SSE_CLASS
6447 && regclass[1] == X86_64_SSEUP_CLASS
6448 && regclass[2] == X86_64_SSEUP_CLASS
6449 && regclass[3] == X86_64_SSEUP_CLASS
6450 && mode != BLKmode)
6451 return gen_reg_or_parallel (mode, orig_mode,
6452 SSE_REGNO (sse_regno));
6453 if (n == 2
6454 && regclass[0] == X86_64_X87_CLASS
6455 && regclass[1] == X86_64_X87UP_CLASS)
6456 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6457
6458 if (n == 2
6459 && regclass[0] == X86_64_INTEGER_CLASS
6460 && regclass[1] == X86_64_INTEGER_CLASS
6461 && (mode == CDImode || mode == TImode || mode == TFmode)
6462 && intreg[0] + 1 == intreg[1])
6463 return gen_rtx_REG (mode, intreg[0]);
6464
6465 /* Otherwise figure out the entries of the PARALLEL. */
6466 for (i = 0; i < n; i++)
6467 {
6468 int pos;
6469
6470 switch (regclass[i])
6471 {
6472 case X86_64_NO_CLASS:
6473 break;
6474 case X86_64_INTEGER_CLASS:
6475 case X86_64_INTEGERSI_CLASS:
6476 /* Merge TImodes on aligned occasions here too. */
6477 if (i * 8 + 8 > bytes)
6478 tmpmode
6479 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6480 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6481 tmpmode = SImode;
6482 else
6483 tmpmode = DImode;
6484 /* We've requested 24 bytes for which we
6485 don't have a mode.  Use DImode. */
6486 if (tmpmode == BLKmode)
6487 tmpmode = DImode;
6488 exp [nexps++]
6489 = gen_rtx_EXPR_LIST (VOIDmode,
6490 gen_rtx_REG (tmpmode, *intreg),
6491 GEN_INT (i*8));
6492 intreg++;
6493 break;
6494 case X86_64_SSESF_CLASS:
6495 exp [nexps++]
6496 = gen_rtx_EXPR_LIST (VOIDmode,
6497 gen_rtx_REG (SFmode,
6498 SSE_REGNO (sse_regno)),
6499 GEN_INT (i*8));
6500 sse_regno++;
6501 break;
6502 case X86_64_SSEDF_CLASS:
6503 exp [nexps++]
6504 = gen_rtx_EXPR_LIST (VOIDmode,
6505 gen_rtx_REG (DFmode,
6506 SSE_REGNO (sse_regno)),
6507 GEN_INT (i*8));
6508 sse_regno++;
6509 break;
6510 case X86_64_SSE_CLASS:
6511 pos = i;
6512 switch (n)
6513 {
6514 case 1:
6515 tmpmode = DImode;
6516 break;
6517 case 2:
6518 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6519 {
6520 tmpmode = TImode;
6521 i++;
6522 }
6523 else
6524 tmpmode = DImode;
6525 break;
6526 case 4:
6527 gcc_assert (i == 0
6528 && regclass[1] == X86_64_SSEUP_CLASS
6529 && regclass[2] == X86_64_SSEUP_CLASS
6530 && regclass[3] == X86_64_SSEUP_CLASS);
6531 tmpmode = OImode;
6532 i += 3;
6533 break;
6534 default:
6535 gcc_unreachable ();
6536 }
6537 exp [nexps++]
6538 = gen_rtx_EXPR_LIST (VOIDmode,
6539 gen_rtx_REG (tmpmode,
6540 SSE_REGNO (sse_regno)),
6541 GEN_INT (pos*8));
6542 sse_regno++;
6543 break;
6544 default:
6545 gcc_unreachable ();
6546 }
6547 }
6548
6549 /* Empty aligned struct, union or class. */
6550 if (nexps == 0)
6551 return NULL;
6552
6553 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6554 for (i = 0; i < nexps; i++)
6555 XVECEXP (ret, 0, i) = exp [i];
6556 return ret;
6557 }
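
/* Hedged sketch of the result shape (hypothetical struct; the exact
   register numbers depend on the INTREG and SSE_REGNO arguments; not from
   the original sources): for the struct { double d; long l; } example
   classified above, the routine builds a PARALLEL of two EXPR_LIST
   entries, roughly

     (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                (expr_list (reg:DI di)   (const_int 8))])

   i.e. the double travels in an SSE register at offset 0 and the long in
   a general-purpose register at offset 8.  */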
6558
6559 /* Update the data in CUM to advance over an argument of mode MODE
6560 and data type TYPE. (TYPE is null for libcalls where that information
6561 may not be available.) */
6562
6563 static void
6564 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6565 const_tree type, HOST_WIDE_INT bytes,
6566 HOST_WIDE_INT words)
6567 {
6568 switch (mode)
6569 {
6570 default:
6571 break;
6572
6573 case BLKmode:
6574 if (bytes < 0)
6575 break;
6576 /* FALLTHRU */
6577
6578 case DImode:
6579 case SImode:
6580 case HImode:
6581 case QImode:
6582 cum->words += words;
6583 cum->nregs -= words;
6584 cum->regno += words;
6585
6586 if (cum->nregs <= 0)
6587 {
6588 cum->nregs = 0;
6589 cum->regno = 0;
6590 }
6591 break;
6592
6593 case OImode:
6594 /* OImode shouldn't be used directly. */
6595 gcc_unreachable ();
6596
6597 case DFmode:
6598 if (cum->float_in_sse < 2)
6599 break;
6600 case SFmode:
6601 if (cum->float_in_sse < 1)
6602 break;
6603 /* FALLTHRU */
6604
6605 case V8SFmode:
6606 case V8SImode:
6607 case V32QImode:
6608 case V16HImode:
6609 case V4DFmode:
6610 case V4DImode:
6611 case TImode:
6612 case V16QImode:
6613 case V8HImode:
6614 case V4SImode:
6615 case V2DImode:
6616 case V4SFmode:
6617 case V2DFmode:
6618 if (!type || !AGGREGATE_TYPE_P (type))
6619 {
6620 cum->sse_words += words;
6621 cum->sse_nregs -= 1;
6622 cum->sse_regno += 1;
6623 if (cum->sse_nregs <= 0)
6624 {
6625 cum->sse_nregs = 0;
6626 cum->sse_regno = 0;
6627 }
6628 }
6629 break;
6630
6631 case V8QImode:
6632 case V4HImode:
6633 case V2SImode:
6634 case V2SFmode:
6635 case V1TImode:
6636 case V1DImode:
6637 if (!type || !AGGREGATE_TYPE_P (type))
6638 {
6639 cum->mmx_words += words;
6640 cum->mmx_nregs -= 1;
6641 cum->mmx_regno += 1;
6642 if (cum->mmx_nregs <= 0)
6643 {
6644 cum->mmx_nregs = 0;
6645 cum->mmx_regno = 0;
6646 }
6647 }
6648 break;
6649 }
6650 }
6651
6652 static void
6653 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6654 const_tree type, HOST_WIDE_INT words, bool named)
6655 {
6656 int int_nregs, sse_nregs;
6657
6658 /* Unnamed 256bit vector mode parameters are passed on the stack. */
6659 if (!named && VALID_AVX256_REG_MODE (mode))
6660 return;
6661
6662 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6663 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6664 {
6665 cum->nregs -= int_nregs;
6666 cum->sse_nregs -= sse_nregs;
6667 cum->regno += int_nregs;
6668 cum->sse_regno += sse_nregs;
6669 }
6670 else
6671 {
6672 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6673 cum->words = (cum->words + align - 1) & ~(align - 1);
6674 cum->words += words;
6675 }
6676 }
6677
6678 static void
6679 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6680 HOST_WIDE_INT words)
6681 {
6682 /* Otherwise, this should be passed indirectly. */
6683 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6684
6685 cum->words += words;
6686 if (cum->nregs > 0)
6687 {
6688 cum->nregs -= 1;
6689 cum->regno += 1;
6690 }
6691 }
6692
6693 /* Update the data in CUM to advance over an argument of mode MODE and
6694 data type TYPE. (TYPE is null for libcalls where that information
6695 may not be available.) */
6696
6697 static void
6698 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6699 const_tree type, bool named)
6700 {
6701 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6702 HOST_WIDE_INT bytes, words;
6703
6704 if (mode == BLKmode)
6705 bytes = int_size_in_bytes (type);
6706 else
6707 bytes = GET_MODE_SIZE (mode);
6708 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6709
6710 if (type)
6711 mode = type_natural_mode (type, NULL);
6712
6713 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6714 function_arg_advance_ms_64 (cum, bytes, words);
6715 else if (TARGET_64BIT)
6716 function_arg_advance_64 (cum, mode, type, words, named);
6717 else
6718 function_arg_advance_32 (cum, mode, type, bytes, words);
6719 }
6720
6721 /* Define where to put the arguments to a function.
6722 Value is zero to push the argument on the stack,
6723 or a hard register in which to store the argument.
6724
6725 MODE is the argument's machine mode.
6726 TYPE is the data type of the argument (as a tree).
6727 This is null for libcalls where that information may
6728 not be available.
6729 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6730 the preceding args and about the function being called.
6731 NAMED is nonzero if this argument is a named parameter
6732 (otherwise it is an extra parameter matching an ellipsis). */
6733
6734 static rtx
6735 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6736 enum machine_mode orig_mode, const_tree type,
6737 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6738 {
6739 static bool warnedsse, warnedmmx;
6740
6741 /* Avoid the AL settings for the Unix64 ABI. */
6742 if (mode == VOIDmode)
6743 return constm1_rtx;
6744
6745 switch (mode)
6746 {
6747 default:
6748 break;
6749
6750 case BLKmode:
6751 if (bytes < 0)
6752 break;
6753 /* FALLTHRU */
6754 case DImode:
6755 case SImode:
6756 case HImode:
6757 case QImode:
6758 if (words <= cum->nregs)
6759 {
6760 int regno = cum->regno;
6761
6762 /* Fastcall allocates the first two DWORD (SImode) or
6763 smaller arguments to ECX and EDX if it isn't an
6764 aggregate type. */
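	    /* A minimal illustrative sketch, not part of the original source:
	       given the fastcall rule just described, a 32-bit declaration
	       such as

		   int __attribute__((fastcall)) f (int a, int b, int c);

	       receives A in %ecx and B in %edx, while C (like any aggregate
	       or DImode argument) falls through to the stack path below.  */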
6765 if (cum->fastcall)
6766 {
6767 if (mode == BLKmode
6768 || mode == DImode
6769 || (type && AGGREGATE_TYPE_P (type)))
6770 break;
6771
6772 /* ECX not EAX is the first allocated register. */
6773 if (regno == AX_REG)
6774 regno = CX_REG;
6775 }
6776 return gen_rtx_REG (mode, regno);
6777 }
6778 break;
6779
6780 case DFmode:
6781 if (cum->float_in_sse < 2)
6782 break;
6783 case SFmode:
6784 if (cum->float_in_sse < 1)
6785 break;
6786 /* FALLTHRU */
6787 case TImode:
6788 /* In 32bit, we pass TImode in xmm registers. */
6789 case V16QImode:
6790 case V8HImode:
6791 case V4SImode:
6792 case V2DImode:
6793 case V4SFmode:
6794 case V2DFmode:
6795 if (!type || !AGGREGATE_TYPE_P (type))
6796 {
6797 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6798 {
6799 warnedsse = true;
6800 warning (0, "SSE vector argument without SSE enabled "
6801 "changes the ABI");
6802 }
6803 if (cum->sse_nregs)
6804 return gen_reg_or_parallel (mode, orig_mode,
6805 cum->sse_regno + FIRST_SSE_REG);
6806 }
6807 break;
6808
6809 case OImode:
6810 /* OImode shouldn't be used directly. */
6811 gcc_unreachable ();
6812
6813 case V8SFmode:
6814 case V8SImode:
6815 case V32QImode:
6816 case V16HImode:
6817 case V4DFmode:
6818 case V4DImode:
6819 if (!type || !AGGREGATE_TYPE_P (type))
6820 {
6821 if (cum->sse_nregs)
6822 return gen_reg_or_parallel (mode, orig_mode,
6823 cum->sse_regno + FIRST_SSE_REG);
6824 }
6825 break;
6826
6827 case V8QImode:
6828 case V4HImode:
6829 case V2SImode:
6830 case V2SFmode:
6831 case V1TImode:
6832 case V1DImode:
6833 if (!type || !AGGREGATE_TYPE_P (type))
6834 {
6835 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6836 {
6837 warnedmmx = true;
6838 warning (0, "MMX vector argument without MMX enabled "
6839 "changes the ABI");
6840 }
6841 if (cum->mmx_nregs)
6842 return gen_reg_or_parallel (mode, orig_mode,
6843 cum->mmx_regno + FIRST_MMX_REG);
6844 }
6845 break;
6846 }
6847
6848 return NULL_RTX;
6849 }
6850
6851 static rtx
6852 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6853 enum machine_mode orig_mode, const_tree type, bool named)
6854 {
6855 /* Handle a hidden AL argument containing number of registers
6856 for varargs x86-64 functions. */
6857 if (mode == VOIDmode)
6858 return GEN_INT (cum->maybe_vaarg
6859 ? (cum->sse_nregs < 0
6860 ? X86_64_SSE_REGPARM_MAX
6861 : cum->sse_regno)
6862 : -1);
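  /* Illustrative note, not part of the original source: the constant
     returned above for VOIDmode ends up in %al at the call site, giving a
     varargs callee an upper bound on how many SSE registers carry
     arguments.  A call such as printf ("%f\n", 3.14) therefore sets %al
     to 1, and setup_incoming_varargs_64 below only tests that value
     against zero before dumping the SSE registers into the save area.  */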
6863
6864 switch (mode)
6865 {
6866 default:
6867 break;
6868
6869 case V8SFmode:
6870 case V8SImode:
6871 case V32QImode:
6872 case V16HImode:
6873 case V4DFmode:
6874 case V4DImode:
6875 /* Unnamed 256bit vector mode parameters are passed on stack. */
6876 if (!named)
6877 return NULL;
6878 break;
6879 }
6880
6881 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6882 cum->sse_nregs,
6883 &x86_64_int_parameter_registers [cum->regno],
6884 cum->sse_regno);
6885 }
6886
6887 static rtx
6888 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6889 enum machine_mode orig_mode, bool named,
6890 HOST_WIDE_INT bytes)
6891 {
6892 unsigned int regno;
6893
6894 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
6895 We use a value of -2 to specify that the current function call is MSABI. */
6896 if (mode == VOIDmode)
6897 return GEN_INT (-2);
6898
6899 /* If we've run out of registers, it goes on the stack. */
6900 if (cum->nregs == 0)
6901 return NULL_RTX;
6902
6903 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6904
6905 /* Only floating point modes are passed in anything but integer regs. */
6906 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6907 {
6908 if (named)
6909 regno = cum->regno + FIRST_SSE_REG;
6910 else
6911 {
6912 rtx t1, t2;
6913
6914 /* Unnamed floating parameters are passed in both the
6915 SSE and integer registers. */
6916 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6917 t2 = gen_rtx_REG (mode, regno);
6918 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6919 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6920 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6921 }
6922 }
6923 /* Handle aggregated types passed in register. */
6924 if (orig_mode == BLKmode)
6925 {
6926 if (bytes > 0 && bytes <= 8)
6927 mode = (bytes > 4 ? DImode : SImode);
6928 if (mode == BLKmode)
6929 mode = DImode;
6930 }
6931
6932 return gen_reg_or_parallel (mode, orig_mode, regno);
6933 }
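/* Illustrative note, not part of the original source: the PARALLEL built
   above reflects the MS x64 convention that an unnamed floating-point
   argument must be available in both register files.  For example, in
   printf ("%f\n", x) the value X is the second argument, so it is passed
   in %xmm1 and mirrored in %rdx.  */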
6934
6935 /* Return where to put the arguments to a function.
6936 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6937
6938 MODE is the argument's machine mode. TYPE is the data type of the
6939 argument. It is null for libcalls where that information may not be
6940 available. CUM gives information about the preceding args and about
6941 the function being called. NAMED is nonzero if this argument is a
6942 named parameter (otherwise it is an extra parameter matching an
6943 ellipsis). */
6944
6945 static rtx
6946 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6947 const_tree type, bool named)
6948 {
6949 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6950 enum machine_mode mode = omode;
6951 HOST_WIDE_INT bytes, words;
6952 rtx arg;
6953
6954 if (mode == BLKmode)
6955 bytes = int_size_in_bytes (type);
6956 else
6957 bytes = GET_MODE_SIZE (mode);
6958 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6959
6960 /* To simplify the code below, represent vector types with a vector mode
6961 even if MMX/SSE are not active. */
6962 if (type && TREE_CODE (type) == VECTOR_TYPE)
6963 mode = type_natural_mode (type, cum);
6964
6965 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6966 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6967 else if (TARGET_64BIT)
6968 arg = function_arg_64 (cum, mode, omode, type, named);
6969 else
6970 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6971
6972 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
6973 {
6974 /* This argument uses 256bit AVX modes. */
6975 if (cum->caller)
6976 cfun->machine->callee_pass_avx256_p = true;
6977 else
6978 cfun->machine->caller_pass_avx256_p = true;
6979 }
6980
6981 return arg;
6982 }
6983
6984 /* A C expression that indicates when an argument must be passed by
6985 reference. If nonzero for an argument, a copy of that argument is
6986 made in memory and a pointer to the argument is passed instead of
6987 the argument itself. The pointer is passed in whatever way is
6988 appropriate for passing a pointer to that type. */
6989
6990 static bool
6991 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6992 enum machine_mode mode ATTRIBUTE_UNUSED,
6993 const_tree type, bool named ATTRIBUTE_UNUSED)
6994 {
6995 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6996
6997 /* See Windows x64 Software Convention. */
6998 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6999 {
7000 int msize = (int) GET_MODE_SIZE (mode);
7001 if (type)
7002 {
7003 /* Arrays are passed by reference. */
7004 if (TREE_CODE (type) == ARRAY_TYPE)
7005 return true;
7006
7007 if (AGGREGATE_TYPE_P (type))
7008 {
7009 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7010 are passed by reference. */
7011 msize = int_size_in_bytes (type);
7012 }
7013 }
7014
7015 /* __m128 is passed by reference. */
7016 switch (msize) {
7017 case 1: case 2: case 4: case 8:
7018 break;
7019 default:
7020 return true;
7021 }
7022 }
7023 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7024 return 1;
7025
7026 return 0;
7027 }
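/* Illustrative note, not part of the original source: under the MS ABI
   rule above, an 8-byte struct is passed by value in a register, while a
   12-byte struct, a 16-byte __m128 value, or any array argument is passed
   by reference, i.e. the callee receives only a pointer to a copy made by
   the caller.  */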
7028
7029 /* Return true when TYPE should be 128bit aligned for 32bit argument
7030 passing ABI. XXX: This function is obsolete and is only used for
7031 checking psABI compatibility with previous versions of GCC. */
7032
7033 static bool
7034 ix86_compat_aligned_value_p (const_tree type)
7035 {
7036 enum machine_mode mode = TYPE_MODE (type);
7037 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7038 || mode == TDmode
7039 || mode == TFmode
7040 || mode == TCmode)
7041 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7042 return true;
7043 if (TYPE_ALIGN (type) < 128)
7044 return false;
7045
7046 if (AGGREGATE_TYPE_P (type))
7047 {
7048 /* Walk the aggregates recursively. */
7049 switch (TREE_CODE (type))
7050 {
7051 case RECORD_TYPE:
7052 case UNION_TYPE:
7053 case QUAL_UNION_TYPE:
7054 {
7055 tree field;
7056
7057 /* Walk all the structure fields. */
7058 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7059 {
7060 if (TREE_CODE (field) == FIELD_DECL
7061 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7062 return true;
7063 }
7064 break;
7065 }
7066
7067 case ARRAY_TYPE:
7068 /* Just in case some languages pass arrays by value. */
7069 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7070 return true;
7071 break;
7072
7073 default:
7074 gcc_unreachable ();
7075 }
7076 }
7077 return false;
7078 }
7079
7080 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7081 XXX: This function is obsolete and is only used for checking psABI
7082 compatibility with previous versions of GCC. */
7083
7084 static unsigned int
7085 ix86_compat_function_arg_boundary (enum machine_mode mode,
7086 const_tree type, unsigned int align)
7087 {
7088 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7089 natural boundaries. */
7090 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7091 {
7092 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7093 make an exception for SSE modes since these require 128bit
7094 alignment.
7095
7096 The handling here differs from field_alignment. ICC aligns MMX
7097 arguments to 4 byte boundaries, while structure fields are aligned
7098 to 8 byte boundaries. */
7099 if (!type)
7100 {
7101 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7102 align = PARM_BOUNDARY;
7103 }
7104 else
7105 {
7106 if (!ix86_compat_aligned_value_p (type))
7107 align = PARM_BOUNDARY;
7108 }
7109 }
7110 if (align > BIGGEST_ALIGNMENT)
7111 align = BIGGEST_ALIGNMENT;
7112 return align;
7113 }
7114
7115 /* Return true when TYPE should be 128bit aligned for 32bit argument
7116 passing ABI. */
7117
7118 static bool
7119 ix86_contains_aligned_value_p (const_tree type)
7120 {
7121 enum machine_mode mode = TYPE_MODE (type);
7122
7123 if (mode == XFmode || mode == XCmode)
7124 return false;
7125
7126 if (TYPE_ALIGN (type) < 128)
7127 return false;
7128
7129 if (AGGREGATE_TYPE_P (type))
7130 {
7131 /* Walk the aggregates recursively. */
7132 switch (TREE_CODE (type))
7133 {
7134 case RECORD_TYPE:
7135 case UNION_TYPE:
7136 case QUAL_UNION_TYPE:
7137 {
7138 tree field;
7139
7140 /* Walk all the structure fields. */
7141 for (field = TYPE_FIELDS (type);
7142 field;
7143 field = DECL_CHAIN (field))
7144 {
7145 if (TREE_CODE (field) == FIELD_DECL
7146 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7147 return true;
7148 }
7149 break;
7150 }
7151
7152 case ARRAY_TYPE:
7153 /* Just in case some languages pass arrays by value. */
7154 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7155 return true;
7156 break;
7157
7158 default:
7159 gcc_unreachable ();
7160 }
7161 }
7162 else
7163 return TYPE_ALIGN (type) >= 128;
7164
7165 return false;
7166 }
7167
7168 /* Gives the alignment boundary, in bits, of an argument with the
7169 specified mode and type. */
7170
7171 static unsigned int
7172 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7173 {
7174 unsigned int align;
7175 if (type)
7176 {
7177 /* Since the main variant type is used for the call, convert TYPE to
7178 its main variant here as well. */
7179 type = TYPE_MAIN_VARIANT (type);
7180 align = TYPE_ALIGN (type);
7181 }
7182 else
7183 align = GET_MODE_ALIGNMENT (mode);
7184 if (align < PARM_BOUNDARY)
7185 align = PARM_BOUNDARY;
7186 else
7187 {
7188 static bool warned;
7189 unsigned int saved_align = align;
7190
7191 if (!TARGET_64BIT)
7192 {
7193 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7194 if (!type)
7195 {
7196 if (mode == XFmode || mode == XCmode)
7197 align = PARM_BOUNDARY;
7198 }
7199 else if (!ix86_contains_aligned_value_p (type))
7200 align = PARM_BOUNDARY;
7201
7202 if (align < 128)
7203 align = PARM_BOUNDARY;
7204 }
7205
7206 if (warn_psabi
7207 && !warned
7208 && align != ix86_compat_function_arg_boundary (mode, type,
7209 saved_align))
7210 {
7211 warned = true;
7212 inform (input_location,
7213 "The ABI for passing parameters with %d-byte"
7214 " alignment has changed in GCC 4.6",
7215 align / BITS_PER_UNIT);
7216 }
7217 }
7218
7219 return align;
7220 }
7221
7222 /* Return true if N is a possible register number of function value. */
7223
7224 static bool
7225 ix86_function_value_regno_p (const unsigned int regno)
7226 {
7227 switch (regno)
7228 {
7229 case AX_REG:
7230 return true;
7231
7232 case FIRST_FLOAT_REG:
7233 /* TODO: The function should depend on current function ABI but
7234 builtins.c would need updating then. Therefore we use the
7235 default ABI. */
7236 if (TARGET_64BIT && ix86_abi == MS_ABI)
7237 return false;
7238 return TARGET_FLOAT_RETURNS_IN_80387;
7239
7240 case FIRST_SSE_REG:
7241 return TARGET_SSE;
7242
7243 case FIRST_MMX_REG:
7244 if (TARGET_MACHO || TARGET_64BIT)
7245 return false;
7246 return TARGET_MMX;
7247 }
7248
7249 return false;
7250 }
7251
7252 /* Define how to find the value returned by a function.
7253 VALTYPE is the data type of the value (as a tree).
7254 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7255 otherwise, FUNC is 0. */
7256
7257 static rtx
7258 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7259 const_tree fntype, const_tree fn)
7260 {
7261 unsigned int regno;
7262
7263 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7264 we normally prevent this case when mmx is not available. However
7265 some ABIs may require the result to be returned like DImode. */
7266 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7267 regno = FIRST_MMX_REG;
7268
7269 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7270 we prevent this case when sse is not available. However some ABIs
7271 may require the result to be returned like integer TImode. */
7272 else if (mode == TImode
7273 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7274 regno = FIRST_SSE_REG;
7275
7276 /* 32-byte vector modes in %ymm0. */
7277 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7278 regno = FIRST_SSE_REG;
7279
7280 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7281 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7282 regno = FIRST_FLOAT_REG;
7283 else
7284 /* Most things go in %eax. */
7285 regno = AX_REG;
7286
7287 /* Override FP return register with %xmm0 for local functions when
7288 SSE math is enabled or for functions with sseregparm attribute. */
7289 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7290 {
7291 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7292 if ((sse_level >= 1 && mode == SFmode)
7293 || (sse_level == 2 && mode == DFmode))
7294 regno = FIRST_SSE_REG;
7295 }
7296
7297 /* OImode shouldn't be used directly. */
7298 gcc_assert (mode != OImode);
7299
7300 return gen_rtx_REG (orig_mode, regno);
7301 }
7302
7303 static rtx
7304 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7305 const_tree valtype)
7306 {
7307 rtx ret;
7308
7309 /* Handle libcalls, which don't provide a type node. */
7310 if (valtype == NULL)
7311 {
7312 unsigned int regno;
7313
7314 switch (mode)
7315 {
7316 case SFmode:
7317 case SCmode:
7318 case DFmode:
7319 case DCmode:
7320 case TFmode:
7321 case SDmode:
7322 case DDmode:
7323 case TDmode:
7324 regno = FIRST_SSE_REG;
7325 break;
7326 case XFmode:
7327 case XCmode:
7328 regno = FIRST_FLOAT_REG;
7329 break;
7330 case TCmode:
7331 return NULL;
7332 default:
7333 regno = AX_REG;
7334 }
7335
7336 return gen_rtx_REG (mode, regno);
7337 }
7338 else if (POINTER_TYPE_P (valtype))
7339 {
7340 /* Pointers are always returned in word_mode. */
7341 mode = word_mode;
7342 }
7343
7344 ret = construct_container (mode, orig_mode, valtype, 1,
7345 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7346 x86_64_int_return_registers, 0);
7347
7348 /* For zero-sized structures, construct_container returns NULL, but we
7349 need to keep the rest of the compiler happy by returning a meaningful value. */
7350 if (!ret)
7351 ret = gen_rtx_REG (orig_mode, AX_REG);
7352
7353 return ret;
7354 }
7355
7356 static rtx
7357 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7358 {
7359 unsigned int regno = AX_REG;
7360
7361 if (TARGET_SSE)
7362 {
7363 switch (GET_MODE_SIZE (mode))
7364 {
7365 case 16:
7366 if((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7367 && !COMPLEX_MODE_P (mode))
7368 regno = FIRST_SSE_REG;
7369 break;
7370 case 8:
7371 case 4:
7372 if (mode == SFmode || mode == DFmode)
7373 regno = FIRST_SSE_REG;
7374 break;
7375 default:
7376 break;
7377 }
7378 }
7379 return gen_rtx_REG (orig_mode, regno);
7380 }
7381
7382 static rtx
7383 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7384 enum machine_mode orig_mode, enum machine_mode mode)
7385 {
7386 const_tree fn, fntype;
7387
7388 fn = NULL_TREE;
7389 if (fntype_or_decl && DECL_P (fntype_or_decl))
7390 fn = fntype_or_decl;
7391 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7392
7393 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7394 return function_value_ms_64 (orig_mode, mode);
7395 else if (TARGET_64BIT)
7396 return function_value_64 (orig_mode, mode, valtype);
7397 else
7398 return function_value_32 (orig_mode, mode, fntype, fn);
7399 }
7400
7401 static rtx
7402 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7403 bool outgoing ATTRIBUTE_UNUSED)
7404 {
7405 enum machine_mode mode, orig_mode;
7406
7407 orig_mode = TYPE_MODE (valtype);
7408 mode = type_natural_mode (valtype, NULL);
7409 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7410 }
7411
7412 /* Pointer function arguments and return values are promoted to
7413 word_mode. */
7414
7415 static enum machine_mode
7416 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7417 int *punsignedp, const_tree fntype,
7418 int for_return)
7419 {
7420 if (type != NULL_TREE && POINTER_TYPE_P (type))
7421 {
7422 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7423 return word_mode;
7424 }
7425 return default_promote_function_mode (type, mode, punsignedp, fntype,
7426 for_return);
7427 }
7428
7429 rtx
7430 ix86_libcall_value (enum machine_mode mode)
7431 {
7432 return ix86_function_value_1 (NULL, NULL, mode, mode);
7433 }
7434
7435 /* Return true iff type is returned in memory. */
7436
7437 static bool ATTRIBUTE_UNUSED
7438 return_in_memory_32 (const_tree type, enum machine_mode mode)
7439 {
7440 HOST_WIDE_INT size;
7441
7442 if (mode == BLKmode)
7443 return true;
7444
7445 size = int_size_in_bytes (type);
7446
7447 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7448 return false;
7449
7450 if (VECTOR_MODE_P (mode) || mode == TImode)
7451 {
7452 /* User-created vectors small enough to fit in EAX. */
7453 if (size < 8)
7454 return false;
7455
7456 /* MMX/3dNow values are returned in MM0,
7457 except when it doesn't exist or the ABI prescribes otherwise. */
7458 if (size == 8)
7459 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7460
7461 /* SSE values are returned in XMM0, except when it doesn't exist. */
7462 if (size == 16)
7463 return !TARGET_SSE;
7464
7465 /* AVX values are returned in YMM0, except when it doesn't exist. */
7466 if (size == 32)
7467 return !TARGET_AVX;
7468 }
7469
7470 if (mode == XFmode)
7471 return false;
7472
7473 if (size > 12)
7474 return true;
7475
7476 /* OImode shouldn't be used directly. */
7477 gcc_assert (mode != OImode);
7478
7479 return false;
7480 }
7481
7482 static bool ATTRIBUTE_UNUSED
7483 return_in_memory_64 (const_tree type, enum machine_mode mode)
7484 {
7485 int needed_intregs, needed_sseregs;
7486 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7487 }
7488
7489 static bool ATTRIBUTE_UNUSED
7490 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7491 {
7492 HOST_WIDE_INT size = int_size_in_bytes (type);
7493
7494 /* __m128 is returned in xmm0. */
7495 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7496 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7497 return false;
7498
7499 /* Otherwise, the size must be exactly in [1248]. */
7500 return size != 1 && size != 2 && size != 4 && size != 8;
7501 }
7502
7503 static bool
7504 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7505 {
7506 #ifdef SUBTARGET_RETURN_IN_MEMORY
7507 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7508 #else
7509 const enum machine_mode mode = type_natural_mode (type, NULL);
7510
7511 if (TARGET_64BIT)
7512 {
7513 if (ix86_function_type_abi (fntype) == MS_ABI)
7514 return return_in_memory_ms_64 (type, mode);
7515 else
7516 return return_in_memory_64 (type, mode);
7517 }
7518 else
7519 return return_in_memory_32 (type, mode);
7520 #endif
7521 }
7522
7523 /* When returning SSE vector types, we have a choice of either
7524 (1) being abi incompatible with a -march switch, or
7525 (2) generating an error.
7526 Given no good solution, I think the safest thing is one warning.
7527 The user won't be able to use -Werror, but....
7528
7529 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7530 called in response to actually generating a caller or callee that
7531 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7532 via aggregate_value_p for general type probing from tree-ssa. */
7533
7534 static rtx
7535 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7536 {
7537 static bool warnedsse, warnedmmx;
7538
7539 if (!TARGET_64BIT && type)
7540 {
7541 /* Look at the return type of the function, not the function type. */
7542 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7543
7544 if (!TARGET_SSE && !warnedsse)
7545 {
7546 if (mode == TImode
7547 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7548 {
7549 warnedsse = true;
7550 warning (0, "SSE vector return without SSE enabled "
7551 "changes the ABI");
7552 }
7553 }
7554
7555 if (!TARGET_MMX && !warnedmmx)
7556 {
7557 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7558 {
7559 warnedmmx = true;
7560 warning (0, "MMX vector return without MMX enabled "
7561 "changes the ABI");
7562 }
7563 }
7564 }
7565
7566 return NULL;
7567 }
7568
7569 \f
7570 /* Create the va_list data type. */
7571
7572 /* Returns the calling convention specific va_list data type.
7573 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7574
7575 static tree
7576 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7577 {
7578 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7579
7580 /* For i386 we use plain pointer to argument area. */
7581 if (!TARGET_64BIT || abi == MS_ABI)
7582 return build_pointer_type (char_type_node);
7583
7584 record = lang_hooks.types.make_type (RECORD_TYPE);
7585 type_decl = build_decl (BUILTINS_LOCATION,
7586 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7587
7588 f_gpr = build_decl (BUILTINS_LOCATION,
7589 FIELD_DECL, get_identifier ("gp_offset"),
7590 unsigned_type_node);
7591 f_fpr = build_decl (BUILTINS_LOCATION,
7592 FIELD_DECL, get_identifier ("fp_offset"),
7593 unsigned_type_node);
7594 f_ovf = build_decl (BUILTINS_LOCATION,
7595 FIELD_DECL, get_identifier ("overflow_arg_area"),
7596 ptr_type_node);
7597 f_sav = build_decl (BUILTINS_LOCATION,
7598 FIELD_DECL, get_identifier ("reg_save_area"),
7599 ptr_type_node);
7600
7601 va_list_gpr_counter_field = f_gpr;
7602 va_list_fpr_counter_field = f_fpr;
7603
7604 DECL_FIELD_CONTEXT (f_gpr) = record;
7605 DECL_FIELD_CONTEXT (f_fpr) = record;
7606 DECL_FIELD_CONTEXT (f_ovf) = record;
7607 DECL_FIELD_CONTEXT (f_sav) = record;
7608
7609 TYPE_STUB_DECL (record) = type_decl;
7610 TYPE_NAME (record) = type_decl;
7611 TYPE_FIELDS (record) = f_gpr;
7612 DECL_CHAIN (f_gpr) = f_fpr;
7613 DECL_CHAIN (f_fpr) = f_ovf;
7614 DECL_CHAIN (f_ovf) = f_sav;
7615
7616 layout_type (record);
7617
7618 /* The correct type is an array type of one element. */
7619 return build_array_type (record, build_index_type (size_zero_node));
7620 }
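/* A minimal sketch, not part of the original source, of what the record
   built above corresponds to for the 64-bit SysV ABI:

       typedef struct __va_list_tag {
	 unsigned int gp_offset;
	 unsigned int fp_offset;
	 void *overflow_arg_area;
	 void *reg_save_area;
       } __builtin_va_list[1];

   whereas the 32-bit and MS ABIs (handled first) use a plain char
   pointer.  */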
7621
7622 /* Setup the builtin va_list data type and for 64-bit the additional
7623 calling convention specific va_list data types. */
7624
7625 static tree
7626 ix86_build_builtin_va_list (void)
7627 {
7628 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7629
7630 /* Initialize abi specific va_list builtin types. */
7631 if (TARGET_64BIT)
7632 {
7633 tree t;
7634 if (ix86_abi == MS_ABI)
7635 {
7636 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7637 if (TREE_CODE (t) != RECORD_TYPE)
7638 t = build_variant_type_copy (t);
7639 sysv_va_list_type_node = t;
7640 }
7641 else
7642 {
7643 t = ret;
7644 if (TREE_CODE (t) != RECORD_TYPE)
7645 t = build_variant_type_copy (t);
7646 sysv_va_list_type_node = t;
7647 }
7648 if (ix86_abi != MS_ABI)
7649 {
7650 t = ix86_build_builtin_va_list_abi (MS_ABI);
7651 if (TREE_CODE (t) != RECORD_TYPE)
7652 t = build_variant_type_copy (t);
7653 ms_va_list_type_node = t;
7654 }
7655 else
7656 {
7657 t = ret;
7658 if (TREE_CODE (t) != RECORD_TYPE)
7659 t = build_variant_type_copy (t);
7660 ms_va_list_type_node = t;
7661 }
7662 }
7663
7664 return ret;
7665 }
7666
7667 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7668
7669 static void
7670 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7671 {
7672 rtx save_area, mem;
7673 alias_set_type set;
7674 int i, max;
7675
7676 /* GPR size of varargs save area. */
7677 if (cfun->va_list_gpr_size)
7678 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7679 else
7680 ix86_varargs_gpr_size = 0;
7681
7682 /* FPR size of varargs save area. We don't need it if we don't pass
7683 anything in SSE registers. */
7684 if (TARGET_SSE && cfun->va_list_fpr_size)
7685 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7686 else
7687 ix86_varargs_fpr_size = 0;
7688
7689 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7690 return;
7691
7692 save_area = frame_pointer_rtx;
7693 set = get_varargs_alias_set ();
7694
7695 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7696 if (max > X86_64_REGPARM_MAX)
7697 max = X86_64_REGPARM_MAX;
7698
7699 for (i = cum->regno; i < max; i++)
7700 {
7701 mem = gen_rtx_MEM (word_mode,
7702 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7703 MEM_NOTRAP_P (mem) = 1;
7704 set_mem_alias_set (mem, set);
7705 emit_move_insn (mem,
7706 gen_rtx_REG (word_mode,
7707 x86_64_int_parameter_registers[i]));
7708 }
7709
7710 if (ix86_varargs_fpr_size)
7711 {
7712 enum machine_mode smode;
7713 rtx label, test;
7714
7715 /* Now emit code to save SSE registers. The AX parameter contains number
7716 of SSE parameter registers used to call this function, though all we
7717 actually check here is the zero/non-zero status. */
7718
7719 label = gen_label_rtx ();
7720 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7721 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7722 label));
7723
7724 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7725 we used movdqa (i.e. TImode) instead? Perhaps even better would
7726 be if we could determine the real mode of the data, via a hook
7727 into pass_stdarg. Ignore all that for now. */
7728 smode = V4SFmode;
7729 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7730 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7731
7732 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7733 if (max > X86_64_SSE_REGPARM_MAX)
7734 max = X86_64_SSE_REGPARM_MAX;
7735
7736 for (i = cum->sse_regno; i < max; ++i)
7737 {
7738 mem = plus_constant (Pmode, save_area,
7739 i * 16 + ix86_varargs_gpr_size);
7740 mem = gen_rtx_MEM (smode, mem);
7741 MEM_NOTRAP_P (mem) = 1;
7742 set_mem_alias_set (mem, set);
7743 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7744
7745 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7746 }
7747
7748 emit_label (label);
7749 }
7750 }
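/* Illustrative note, not part of the original source: the save area laid
   out above starts with the integer registers, X86_64_REGPARM_MAX words
   of them (48 bytes when that limit is 6), immediately followed by one
   16-byte slot per SSE parameter register; the SSE stores are skipped at
   run time when %al is zero on entry.  */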
7751
7752 static void
7753 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7754 {
7755 alias_set_type set = get_varargs_alias_set ();
7756 int i;
7757
7758 /* Reset to zero, as there might be a sysv vaarg used
7759 before. */
7760 ix86_varargs_gpr_size = 0;
7761 ix86_varargs_fpr_size = 0;
7762
7763 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7764 {
7765 rtx reg, mem;
7766
7767 mem = gen_rtx_MEM (Pmode,
7768 plus_constant (Pmode, virtual_incoming_args_rtx,
7769 i * UNITS_PER_WORD));
7770 MEM_NOTRAP_P (mem) = 1;
7771 set_mem_alias_set (mem, set);
7772
7773 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7774 emit_move_insn (mem, reg);
7775 }
7776 }
7777
7778 static void
7779 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7780 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7781 int no_rtl)
7782 {
7783 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7784 CUMULATIVE_ARGS next_cum;
7785 tree fntype;
7786
7787 /* This argument doesn't appear to be used anymore, which is good,
7788 because the old code here didn't suppress rtl generation. */
7789 gcc_assert (!no_rtl);
7790
7791 if (!TARGET_64BIT)
7792 return;
7793
7794 fntype = TREE_TYPE (current_function_decl);
7795
7796 /* For varargs, we do not want to skip the dummy va_dcl argument.
7797 For stdargs, we do want to skip the last named argument. */
7798 next_cum = *cum;
7799 if (stdarg_p (fntype))
7800 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7801 true);
7802
7803 if (cum->call_abi == MS_ABI)
7804 setup_incoming_varargs_ms_64 (&next_cum);
7805 else
7806 setup_incoming_varargs_64 (&next_cum);
7807 }
7808
7809 /* Checks if TYPE is of kind va_list char *. */
7810
7811 static bool
7812 is_va_list_char_pointer (tree type)
7813 {
7814 tree canonic;
7815
7816 /* For 32-bit it is always true. */
7817 if (!TARGET_64BIT)
7818 return true;
7819 canonic = ix86_canonical_va_list_type (type);
7820 return (canonic == ms_va_list_type_node
7821 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7822 }
7823
7824 /* Implement va_start. */
7825
7826 static void
7827 ix86_va_start (tree valist, rtx nextarg)
7828 {
7829 HOST_WIDE_INT words, n_gpr, n_fpr;
7830 tree f_gpr, f_fpr, f_ovf, f_sav;
7831 tree gpr, fpr, ovf, sav, t;
7832 tree type;
7833 rtx ovf_rtx;
7834
7835 if (flag_split_stack
7836 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7837 {
7838 unsigned int scratch_regno;
7839
7840 /* When we are splitting the stack, we can't refer to the stack
7841 arguments using internal_arg_pointer, because they may be on
7842 the old stack. The split stack prologue will arrange to
7843 leave a pointer to the old stack arguments in a scratch
7844 register, which we here copy to a pseudo-register. The split
7845 stack prologue can't set the pseudo-register directly because
7846 it (the prologue) runs before any registers have been saved. */
7847
7848 scratch_regno = split_stack_prologue_scratch_regno ();
7849 if (scratch_regno != INVALID_REGNUM)
7850 {
7851 rtx reg, seq;
7852
7853 reg = gen_reg_rtx (Pmode);
7854 cfun->machine->split_stack_varargs_pointer = reg;
7855
7856 start_sequence ();
7857 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7858 seq = get_insns ();
7859 end_sequence ();
7860
7861 push_topmost_sequence ();
7862 emit_insn_after (seq, entry_of_function ());
7863 pop_topmost_sequence ();
7864 }
7865 }
7866
7867 /* Only 64bit target needs something special. */
7868 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7869 {
7870 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7871 std_expand_builtin_va_start (valist, nextarg);
7872 else
7873 {
7874 rtx va_r, next;
7875
7876 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7877 next = expand_binop (ptr_mode, add_optab,
7878 cfun->machine->split_stack_varargs_pointer,
7879 crtl->args.arg_offset_rtx,
7880 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7881 convert_move (va_r, next, 0);
7882 }
7883 return;
7884 }
7885
7886 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7887 f_fpr = DECL_CHAIN (f_gpr);
7888 f_ovf = DECL_CHAIN (f_fpr);
7889 f_sav = DECL_CHAIN (f_ovf);
7890
7891 valist = build_simple_mem_ref (valist);
7892 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7893 /* The following should be folded into the MEM_REF offset. */
7894 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7895 f_gpr, NULL_TREE);
7896 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7897 f_fpr, NULL_TREE);
7898 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7899 f_ovf, NULL_TREE);
7900 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7901 f_sav, NULL_TREE);
7902
7903 /* Count number of gp and fp argument registers used. */
7904 words = crtl->args.info.words;
7905 n_gpr = crtl->args.info.regno;
7906 n_fpr = crtl->args.info.sse_regno;
7907
7908 if (cfun->va_list_gpr_size)
7909 {
7910 type = TREE_TYPE (gpr);
7911 t = build2 (MODIFY_EXPR, type,
7912 gpr, build_int_cst (type, n_gpr * 8));
7913 TREE_SIDE_EFFECTS (t) = 1;
7914 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7915 }
7916
7917 if (TARGET_SSE && cfun->va_list_fpr_size)
7918 {
7919 type = TREE_TYPE (fpr);
7920 t = build2 (MODIFY_EXPR, type, fpr,
7921 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7922 TREE_SIDE_EFFECTS (t) = 1;
7923 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7924 }
7925
7926 /* Find the overflow area. */
7927 type = TREE_TYPE (ovf);
7928 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7929 ovf_rtx = crtl->args.internal_arg_pointer;
7930 else
7931 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7932 t = make_tree (type, ovf_rtx);
7933 if (words != 0)
7934 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7935 t = build2 (MODIFY_EXPR, type, ovf, t);
7936 TREE_SIDE_EFFECTS (t) = 1;
7937 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7938
7939 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7940 {
7941 /* Find the register save area.
7942 The function prologue saves it right above the stack frame. */
7943 type = TREE_TYPE (sav);
7944 t = make_tree (type, frame_pointer_rtx);
7945 if (!ix86_varargs_gpr_size)
7946 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7947 t = build2 (MODIFY_EXPR, type, sav, t);
7948 TREE_SIDE_EFFECTS (t) = 1;
7949 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7950 }
7951 }
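/* Illustrative example, not part of the original source: for
   int f (int a, double b, ...) the named arguments use one integer and
   one SSE register, so the code above leaves gp_offset at 8 and
   fp_offset at 16 + 8 * X86_64_REGPARM_MAX (64 when that limit is 6),
   i.e. just past the slots of reg_save_area already consumed by A
   and B.  */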
7952
7953 /* Implement va_arg. */
7954
7955 static tree
7956 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7957 gimple_seq *post_p)
7958 {
7959 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7960 tree f_gpr, f_fpr, f_ovf, f_sav;
7961 tree gpr, fpr, ovf, sav, t;
7962 int size, rsize;
7963 tree lab_false, lab_over = NULL_TREE;
7964 tree addr, t2;
7965 rtx container;
7966 int indirect_p = 0;
7967 tree ptrtype;
7968 enum machine_mode nat_mode;
7969 unsigned int arg_boundary;
7970
7971 /* Only 64bit target needs something special. */
7972 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7973 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7974
7975 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7976 f_fpr = DECL_CHAIN (f_gpr);
7977 f_ovf = DECL_CHAIN (f_fpr);
7978 f_sav = DECL_CHAIN (f_ovf);
7979
7980 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7981 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7982 valist = build_va_arg_indirect_ref (valist);
7983 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7984 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7985 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7986
7987 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7988 if (indirect_p)
7989 type = build_pointer_type (type);
7990 size = int_size_in_bytes (type);
7991 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7992
7993 nat_mode = type_natural_mode (type, NULL);
7994 switch (nat_mode)
7995 {
7996 case V8SFmode:
7997 case V8SImode:
7998 case V32QImode:
7999 case V16HImode:
8000 case V4DFmode:
8001 case V4DImode:
8002 /* Unnamed 256bit vector mode parameters are passed on stack. */
8003 if (!TARGET_64BIT_MS_ABI)
8004 {
8005 container = NULL;
8006 break;
8007 }
8008
8009 default:
8010 container = construct_container (nat_mode, TYPE_MODE (type),
8011 type, 0, X86_64_REGPARM_MAX,
8012 X86_64_SSE_REGPARM_MAX, intreg,
8013 0);
8014 break;
8015 }
8016
8017 /* Pull the value out of the saved registers. */
8018
8019 addr = create_tmp_var (ptr_type_node, "addr");
8020
8021 if (container)
8022 {
8023 int needed_intregs, needed_sseregs;
8024 bool need_temp;
8025 tree int_addr, sse_addr;
8026
8027 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8028 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8029
8030 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8031
8032 need_temp = (!REG_P (container)
8033 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8034 || TYPE_ALIGN (type) > 128));
8035
8036 /* In case we are passing a structure, verify that it is a consecutive
8037 block in the register save area. If not, we need to do moves. */
8038 if (!need_temp && !REG_P (container))
8039 {
8040 /* Verify that all registers are strictly consecutive */
8041 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8042 {
8043 int i;
8044
8045 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8046 {
8047 rtx slot = XVECEXP (container, 0, i);
8048 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8049 || INTVAL (XEXP (slot, 1)) != i * 16)
8050 need_temp = 1;
8051 }
8052 }
8053 else
8054 {
8055 int i;
8056
8057 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8058 {
8059 rtx slot = XVECEXP (container, 0, i);
8060 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8061 || INTVAL (XEXP (slot, 1)) != i * 8)
8062 need_temp = 1;
8063 }
8064 }
8065 }
8066 if (!need_temp)
8067 {
8068 int_addr = addr;
8069 sse_addr = addr;
8070 }
8071 else
8072 {
8073 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8074 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8075 }
8076
8077 /* First ensure that we fit completely in registers. */
8078 if (needed_intregs)
8079 {
8080 t = build_int_cst (TREE_TYPE (gpr),
8081 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8082 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8083 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8084 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8085 gimplify_and_add (t, pre_p);
8086 }
8087 if (needed_sseregs)
8088 {
8089 t = build_int_cst (TREE_TYPE (fpr),
8090 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8091 + X86_64_REGPARM_MAX * 8);
8092 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8093 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8094 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8095 gimplify_and_add (t, pre_p);
8096 }
8097
8098 /* Compute index to start of area used for integer regs. */
8099 if (needed_intregs)
8100 {
8101 /* int_addr = gpr + sav; */
8102 t = fold_build_pointer_plus (sav, gpr);
8103 gimplify_assign (int_addr, t, pre_p);
8104 }
8105 if (needed_sseregs)
8106 {
8107 /* sse_addr = fpr + sav; */
8108 t = fold_build_pointer_plus (sav, fpr);
8109 gimplify_assign (sse_addr, t, pre_p);
8110 }
8111 if (need_temp)
8112 {
8113 int i, prev_size = 0;
8114 tree temp = create_tmp_var (type, "va_arg_tmp");
8115
8116 /* addr = &temp; */
8117 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8118 gimplify_assign (addr, t, pre_p);
8119
8120 for (i = 0; i < XVECLEN (container, 0); i++)
8121 {
8122 rtx slot = XVECEXP (container, 0, i);
8123 rtx reg = XEXP (slot, 0);
8124 enum machine_mode mode = GET_MODE (reg);
8125 tree piece_type;
8126 tree addr_type;
8127 tree daddr_type;
8128 tree src_addr, src;
8129 int src_offset;
8130 tree dest_addr, dest;
8131 int cur_size = GET_MODE_SIZE (mode);
8132
8133 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8134 prev_size = INTVAL (XEXP (slot, 1));
8135 if (prev_size + cur_size > size)
8136 {
8137 cur_size = size - prev_size;
8138 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8139 if (mode == BLKmode)
8140 mode = QImode;
8141 }
8142 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8143 if (mode == GET_MODE (reg))
8144 addr_type = build_pointer_type (piece_type);
8145 else
8146 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8147 true);
8148 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8149 true);
8150
8151 if (SSE_REGNO_P (REGNO (reg)))
8152 {
8153 src_addr = sse_addr;
8154 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8155 }
8156 else
8157 {
8158 src_addr = int_addr;
8159 src_offset = REGNO (reg) * 8;
8160 }
8161 src_addr = fold_convert (addr_type, src_addr);
8162 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8163
8164 dest_addr = fold_convert (daddr_type, addr);
8165 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8166 if (cur_size == GET_MODE_SIZE (mode))
8167 {
8168 src = build_va_arg_indirect_ref (src_addr);
8169 dest = build_va_arg_indirect_ref (dest_addr);
8170
8171 gimplify_assign (dest, src, pre_p);
8172 }
8173 else
8174 {
8175 tree copy
8176 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8177 3, dest_addr, src_addr,
8178 size_int (cur_size));
8179 gimplify_and_add (copy, pre_p);
8180 }
8181 prev_size += cur_size;
8182 }
8183 }
8184
8185 if (needed_intregs)
8186 {
8187 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8188 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8189 gimplify_assign (gpr, t, pre_p);
8190 }
8191
8192 if (needed_sseregs)
8193 {
8194 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8195 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8196 gimplify_assign (fpr, t, pre_p);
8197 }
8198
8199 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8200
8201 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8202 }
8203
8204 /* ... otherwise out of the overflow area. */
8205
8206 /* When the caller aligns a parameter on the stack, any alignment
8207 beyond MAX_SUPPORTED_STACK_ALIGNMENT is clamped to
8208 MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here with the
8209 caller. */
8210 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8211 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8212 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8213
8214 /* Care for on-stack alignment if needed. */
8215 if (arg_boundary <= 64 || size == 0)
8216 t = ovf;
8217 else
8218 {
8219 HOST_WIDE_INT align = arg_boundary / 8;
8220 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8221 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8222 build_int_cst (TREE_TYPE (t), -align));
8223 }
8224
8225 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8226 gimplify_assign (addr, t, pre_p);
8227
8228 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8229 gimplify_assign (unshare_expr (ovf), t, pre_p);
8230
8231 if (container)
8232 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8233
8234 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8235 addr = fold_convert (ptrtype, addr);
8236
8237 if (indirect_p)
8238 addr = build_va_arg_indirect_ref (addr);
8239 return build_va_arg_indirect_ref (addr);
8240 }
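/* Illustrative example, not part of the original source: for
   va_arg (ap, int) a single integer register is needed, so the code
   above branches to the overflow (stack) path once gp_offset has reached
   8 * X86_64_REGPARM_MAX (48 when that limit is 6); otherwise it reads
   the value from reg_save_area + gp_offset and bumps gp_offset by 8.  */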
8241 \f
8242 /* Return true if OPNUM's MEM should be matched
8243 in movabs* patterns. */
8244
8245 bool
8246 ix86_check_movabs (rtx insn, int opnum)
8247 {
8248 rtx set, mem;
8249
8250 set = PATTERN (insn);
8251 if (GET_CODE (set) == PARALLEL)
8252 set = XVECEXP (set, 0, 0);
8253 gcc_assert (GET_CODE (set) == SET);
8254 mem = XEXP (set, opnum);
8255 while (GET_CODE (mem) == SUBREG)
8256 mem = SUBREG_REG (mem);
8257 gcc_assert (MEM_P (mem));
8258 return volatile_ok || !MEM_VOLATILE_P (mem);
8259 }
8260 \f
8261 /* Initialize the table of extra 80387 mathematical constants. */
8262
8263 static void
8264 init_ext_80387_constants (void)
8265 {
8266 static const char * cst[5] =
8267 {
8268 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8269 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8270 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8271 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8272 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8273 };
8274 int i;
8275
8276 for (i = 0; i < 5; i++)
8277 {
8278 real_from_string (&ext_80387_constants_table[i], cst[i]);
8279 /* Ensure each constant is rounded to XFmode precision. */
8280 real_convert (&ext_80387_constants_table[i],
8281 XFmode, &ext_80387_constants_table[i]);
8282 }
8283
8284 ext_80387_constants_init = 1;
8285 }
8286
8287 /* Return non-zero if the constant is something that
8288 can be loaded with a special instruction. */
8289
8290 int
8291 standard_80387_constant_p (rtx x)
8292 {
8293 enum machine_mode mode = GET_MODE (x);
8294
8295 REAL_VALUE_TYPE r;
8296
8297 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8298 return -1;
8299
8300 if (x == CONST0_RTX (mode))
8301 return 1;
8302 if (x == CONST1_RTX (mode))
8303 return 2;
8304
8305 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8306
8307 /* For XFmode constants, try to find a special 80387 instruction when
8308 optimizing for size or on those CPUs that benefit from them. */
8309 if (mode == XFmode
8310 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8311 {
8312 int i;
8313
8314 if (! ext_80387_constants_init)
8315 init_ext_80387_constants ();
8316
8317 for (i = 0; i < 5; i++)
8318 if (real_identical (&r, &ext_80387_constants_table[i]))
8319 return i + 3;
8320 }
8321
8322 /* Load of the constant -0.0 or -1.0 will be split as
8323 fldz;fchs or fld1;fchs sequence. */
8324 if (real_isnegzero (&r))
8325 return 8;
8326 if (real_identical (&r, &dconstm1))
8327 return 9;
8328
8329 return 0;
8330 }
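/* Summary of the return values above, added for reference and not part of
   the original source: -1 means X is not an 80387 constant at all, 0 no
   special instruction, 1 is +0.0 (fldz), 2 is +1.0 (fld1), 3..7 select
   fldlg2, fldln2, fldl2e, fldl2t and fldpi from the table, and 8/9 stand
   for -0.0 and -1.0, which are split into fldz;fchs and fld1;fchs.  */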
8331
8332 /* Return the opcode of the special instruction to be used to load
8333 the constant X. */
8334
8335 const char *
8336 standard_80387_constant_opcode (rtx x)
8337 {
8338 switch (standard_80387_constant_p (x))
8339 {
8340 case 1:
8341 return "fldz";
8342 case 2:
8343 return "fld1";
8344 case 3:
8345 return "fldlg2";
8346 case 4:
8347 return "fldln2";
8348 case 5:
8349 return "fldl2e";
8350 case 6:
8351 return "fldl2t";
8352 case 7:
8353 return "fldpi";
8354 case 8:
8355 case 9:
8356 return "#";
8357 default:
8358 gcc_unreachable ();
8359 }
8360 }
8361
8362 /* Return the CONST_DOUBLE representing the 80387 constant that is
8363 loaded by the specified special instruction. The argument IDX
8364 matches the return value from standard_80387_constant_p. */
8365
8366 rtx
8367 standard_80387_constant_rtx (int idx)
8368 {
8369 int i;
8370
8371 if (! ext_80387_constants_init)
8372 init_ext_80387_constants ();
8373
8374 switch (idx)
8375 {
8376 case 3:
8377 case 4:
8378 case 5:
8379 case 6:
8380 case 7:
8381 i = idx - 3;
8382 break;
8383
8384 default:
8385 gcc_unreachable ();
8386 }
8387
8388 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8389 XFmode);
8390 }
8391
8392 /* Return 1 if X is all 0s and 2 if X is all 1s
8393 in supported SSE/AVX vector mode. */
8394
8395 int
8396 standard_sse_constant_p (rtx x)
8397 {
8398 enum machine_mode mode = GET_MODE (x);
8399
8400 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8401 return 1;
8402 if (vector_all_ones_operand (x, mode))
8403 switch (mode)
8404 {
8405 case V16QImode:
8406 case V8HImode:
8407 case V4SImode:
8408 case V2DImode:
8409 if (TARGET_SSE2)
8410 return 2;
8411 case V32QImode:
8412 case V16HImode:
8413 case V8SImode:
8414 case V4DImode:
8415 if (TARGET_AVX2)
8416 return 2;
8417 default:
8418 break;
8419 }
8420
8421 return 0;
8422 }
8423
8424 /* Return the opcode of the special instruction to be used to load
8425 the constant X. */
8426
8427 const char *
8428 standard_sse_constant_opcode (rtx insn, rtx x)
8429 {
8430 switch (standard_sse_constant_p (x))
8431 {
8432 case 1:
8433 switch (get_attr_mode (insn))
8434 {
8435 case MODE_TI:
8436 return "%vpxor\t%0, %d0";
8437 case MODE_V2DF:
8438 return "%vxorpd\t%0, %d0";
8439 case MODE_V4SF:
8440 return "%vxorps\t%0, %d0";
8441
8442 case MODE_OI:
8443 return "vpxor\t%x0, %x0, %x0";
8444 case MODE_V4DF:
8445 return "vxorpd\t%x0, %x0, %x0";
8446 case MODE_V8SF:
8447 return "vxorps\t%x0, %x0, %x0";
8448
8449 default:
8450 break;
8451 }
8452
8453 case 2:
8454 if (TARGET_AVX)
8455 return "vpcmpeqd\t%0, %0, %0";
8456 else
8457 return "pcmpeqd\t%0, %0";
8458
8459 default:
8460 break;
8461 }
8462 gcc_unreachable ();
8463 }
8464
8465 /* Returns true if OP contains a symbol reference */
8466
8467 bool
8468 symbolic_reference_mentioned_p (rtx op)
8469 {
8470 const char *fmt;
8471 int i;
8472
8473 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8474 return true;
8475
8476 fmt = GET_RTX_FORMAT (GET_CODE (op));
8477 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8478 {
8479 if (fmt[i] == 'E')
8480 {
8481 int j;
8482
8483 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8484 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8485 return true;
8486 }
8487
8488 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8489 return true;
8490 }
8491
8492 return false;
8493 }
8494
8495 /* Return true if it is appropriate to emit `ret' instructions in the
8496 body of a function. Do this only if the epilogue is simple, needing a
8497 couple of insns. Prior to reloading, we can't tell how many registers
8498 must be saved, so return false then. Return false if there is no frame
8499 marker to de-allocate. */
8500
8501 bool
8502 ix86_can_use_return_insn_p (void)
8503 {
8504 struct ix86_frame frame;
8505
8506 if (! reload_completed || frame_pointer_needed)
8507 return 0;
8508
8509 /* Don't allow more than 32k pop, since that's all we can do
8510 with one instruction. */
8511 if (crtl->args.pops_args && crtl->args.size >= 32768)
8512 return 0;
8513
8514 ix86_compute_frame_layout (&frame);
8515 return (frame.stack_pointer_offset == UNITS_PER_WORD
8516 && (frame.nregs + frame.nsseregs) == 0);
8517 }
8518 \f
8519 /* Value should be nonzero if functions must have frame pointers.
8520 Zero means the frame pointer need not be set up (and parms may
8521 be accessed via the stack pointer) in functions that seem suitable. */
8522
8523 static bool
8524 ix86_frame_pointer_required (void)
8525 {
8526 /* If we accessed previous frames, then the generated code expects
8527 to be able to access the saved ebp value in our frame. */
8528 if (cfun->machine->accesses_prev_frame)
8529 return true;
8530
8531 /* Several x86 os'es need a frame pointer for other reasons,
8532 usually pertaining to setjmp. */
8533 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8534 return true;
8535
8536 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8537 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8538 return true;
8539
8540 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8541 turns off the frame pointer by default. Turn it back on now if
8542 we've not got a leaf function. */
8543 if (TARGET_OMIT_LEAF_FRAME_POINTER
8544 && (!current_function_is_leaf
8545 || ix86_current_function_calls_tls_descriptor))
8546 return true;
8547
8548 if (crtl->profile && !flag_fentry)
8549 return true;
8550
8551 return false;
8552 }
8553
8554 /* Record that the current function accesses previous call frames. */
8555
8556 void
8557 ix86_setup_frame_addresses (void)
8558 {
8559 cfun->machine->accesses_prev_frame = 1;
8560 }
8561 \f
8562 #ifndef USE_HIDDEN_LINKONCE
8563 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8564 # define USE_HIDDEN_LINKONCE 1
8565 # else
8566 # define USE_HIDDEN_LINKONCE 0
8567 # endif
8568 #endif
8569
8570 static int pic_labels_used;
8571
8572 /* Fills in the label name that should be used for a pc thunk for
8573 the given register. */
8574
8575 static void
8576 get_pc_thunk_name (char name[32], unsigned int regno)
8577 {
8578 gcc_assert (!TARGET_64BIT);
8579
8580 if (USE_HIDDEN_LINKONCE)
8581 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8582 else
8583 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8584 }
8585
8586
8587 /* This function generates the pc thunks used for -fpic; each one loads
8588 its register with the return address of the caller and then returns. */
8589
8590 static void
8591 ix86_code_end (void)
8592 {
8593 rtx xops[2];
8594 int regno;
8595
8596 for (regno = AX_REG; regno <= SP_REG; regno++)
8597 {
8598 char name[32];
8599 tree decl;
8600
8601 if (!(pic_labels_used & (1 << regno)))
8602 continue;
8603
8604 get_pc_thunk_name (name, regno);
8605
8606 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8607 get_identifier (name),
8608 build_function_type_list (void_type_node, NULL_TREE));
8609 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8610 NULL_TREE, void_type_node);
8611 TREE_PUBLIC (decl) = 1;
8612 TREE_STATIC (decl) = 1;
8613 DECL_IGNORED_P (decl) = 1;
8614
8615 #if TARGET_MACHO
8616 if (TARGET_MACHO)
8617 {
8618 switch_to_section (darwin_sections[text_coal_section]);
8619 fputs ("\t.weak_definition\t", asm_out_file);
8620 assemble_name (asm_out_file, name);
8621 fputs ("\n\t.private_extern\t", asm_out_file);
8622 assemble_name (asm_out_file, name);
8623 putc ('\n', asm_out_file);
8624 ASM_OUTPUT_LABEL (asm_out_file, name);
8625 DECL_WEAK (decl) = 1;
8626 }
8627 else
8628 #endif
8629 if (USE_HIDDEN_LINKONCE)
8630 {
8631 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8632
8633 targetm.asm_out.unique_section (decl, 0);
8634 switch_to_section (get_named_section (decl, NULL, 0));
8635
8636 targetm.asm_out.globalize_label (asm_out_file, name);
8637 fputs ("\t.hidden\t", asm_out_file);
8638 assemble_name (asm_out_file, name);
8639 putc ('\n', asm_out_file);
8640 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8641 }
8642 else
8643 {
8644 switch_to_section (text_section);
8645 ASM_OUTPUT_LABEL (asm_out_file, name);
8646 }
8647
8648 DECL_INITIAL (decl) = make_node (BLOCK);
8649 current_function_decl = decl;
8650 init_function_start (decl);
8651 first_function_block_is_cold = false;
8652 /* Make sure unwind info is emitted for the thunk if needed. */
8653 final_start_function (emit_barrier (), asm_out_file, 1);
8654
8655 /* Pad stack IP move with 4 instructions (two NOPs count
8656 as one instruction). */
8657 if (TARGET_PAD_SHORT_FUNCTION)
8658 {
8659 int i = 8;
8660
8661 while (i--)
8662 fputs ("\tnop\n", asm_out_file);
8663 }
8664
8665 xops[0] = gen_rtx_REG (Pmode, regno);
8666 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8667 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8668 fputs ("\tret\n", asm_out_file);
8669 final_end_function ();
8670 init_insn_lengths ();
8671 free_after_compilation (cfun);
8672 set_cfun (NULL);
8673 current_function_decl = NULL;
8674 }
8675
8676 if (flag_split_stack)
8677 file_end_indicate_split_stack ();
8678 }
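/* Illustrative sketch, not part of the original source: for %ebx the loop
   above emits a thunk roughly equivalent to

       __x86.get_pc_thunk.bx:
	       movl	(%esp), %ebx
	       ret

   i.e. it copies its own return address (the caller's program counter)
   into the requested register.  */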
8679
8680 /* Emit code for the SET_GOT patterns. */
8681
8682 const char *
8683 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8684 {
8685 rtx xops[3];
8686
8687 xops[0] = dest;
8688
8689 if (TARGET_VXWORKS_RTP && flag_pic)
8690 {
8691 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8692 xops[2] = gen_rtx_MEM (Pmode,
8693 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8694 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8695
8696 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8697 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8698 an unadorned address. */
8699 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8700 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8701 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8702 return "";
8703 }
8704
8705 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8706
8707 if (!flag_pic)
8708 {
8709 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8710
8711 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8712
8713 #if TARGET_MACHO
8714 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8715 is what will be referenced by the Mach-O PIC subsystem. */
8716 if (!label)
8717 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8718 #endif
8719
8720 targetm.asm_out.internal_label (asm_out_file, "L",
8721 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8722 }
8723 else
8724 {
8725 char name[32];
8726 get_pc_thunk_name (name, REGNO (dest));
8727 pic_labels_used |= 1 << REGNO (dest);
8728
8729 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8730 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8731 output_asm_insn ("call\t%X2", xops);
8732 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8733 is what will be referenced by the Mach-O PIC subsystem. */
8734 #if TARGET_MACHO
8735 if (!label)
8736 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8737 else
8738 targetm.asm_out.internal_label (asm_out_file, "L",
8739 CODE_LABEL_NUMBER (label));
8740 #endif
8741 }
8742
8743 if (!TARGET_MACHO)
8744 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8745
8746 return "";
8747 }
8748
8749 /* Generate a "push" pattern for input ARG. */
8750
8751 static rtx
8752 gen_push (rtx arg)
8753 {
8754 struct machine_function *m = cfun->machine;
8755
8756 if (m->fs.cfa_reg == stack_pointer_rtx)
8757 m->fs.cfa_offset += UNITS_PER_WORD;
8758 m->fs.sp_offset += UNITS_PER_WORD;
8759
8760 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8761 arg = gen_rtx_REG (word_mode, REGNO (arg));
8762
8763 return gen_rtx_SET (VOIDmode,
8764 gen_rtx_MEM (word_mode,
8765 gen_rtx_PRE_DEC (Pmode,
8766 stack_pointer_rtx)),
8767 arg);
8768 }
8769
8770 /* Generate a "pop" pattern for input ARG. */
8771
8772 static rtx
8773 gen_pop (rtx arg)
8774 {
8775 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8776 arg = gen_rtx_REG (word_mode, REGNO (arg));
8777
8778 return gen_rtx_SET (VOIDmode,
8779 arg,
8780 gen_rtx_MEM (word_mode,
8781 gen_rtx_POST_INC (Pmode,
8782 stack_pointer_rtx)));
8783 }
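
 /* Illustrative sketch of the RTL produced by the two helpers above on a
 64-bit target (word_mode == Pmode == DImode), e.g. for %rbx:

 gen_push: (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI bx))
 gen_pop: (set (reg:DI bx) (mem:DI (post_inc:DI (reg:DI sp)))) */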
8784
8785 /* Return the number of an unused call-clobbered register available
8786 for the entire function, or INVALID_REGNUM if there is none. */
8787
8788 static unsigned int
8789 ix86_select_alt_pic_regnum (void)
8790 {
8791 if (current_function_is_leaf
8792 && !crtl->profile
8793 && !ix86_current_function_calls_tls_descriptor)
8794 {
8795 int i, drap;
8796 /* Can't use the same register for both PIC and DRAP. */
8797 if (crtl->drap_reg)
8798 drap = REGNO (crtl->drap_reg);
8799 else
8800 drap = -1;
8801 for (i = 2; i >= 0; --i)
8802 if (i != drap && !df_regs_ever_live_p (i))
8803 return i;
8804 }
8805
8806 return INVALID_REGNUM;
8807 }
8808
8809 /* Return TRUE if we need to save REGNO. */
8810
8811 static bool
8812 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8813 {
8814 if (pic_offset_table_rtx
8815 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8816 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8817 || crtl->profile
8818 || crtl->calls_eh_return
8819 || crtl->uses_const_pool))
8820 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8821
8822 if (crtl->calls_eh_return && maybe_eh_return)
8823 {
8824 unsigned i;
8825 for (i = 0; ; i++)
8826 {
8827 unsigned test = EH_RETURN_DATA_REGNO (i);
8828 if (test == INVALID_REGNUM)
8829 break;
8830 if (test == regno)
8831 return true;
8832 }
8833 }
8834
8835 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8836 return true;
8837
8838 return (df_regs_ever_live_p (regno)
8839 && !call_used_regs[regno]
8840 && !fixed_regs[regno]
8841 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8842 }
8843
8844 /* Return the number of saved general purpose registers. */
8845
8846 static int
8847 ix86_nsaved_regs (void)
8848 {
8849 int nregs = 0;
8850 int regno;
8851
8852 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8853 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8854 nregs ++;
8855 return nregs;
8856 }
8857
8858 /* Return the number of saved SSE registers. */
8859
8860 static int
8861 ix86_nsaved_sseregs (void)
8862 {
8863 int nregs = 0;
8864 int regno;
8865
8866 if (!TARGET_64BIT_MS_ABI)
8867 return 0;
8868 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8869 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8870 nregs ++;
8871 return nregs;
8872 }
8873
8874 /* Given FROM and TO register numbers, say whether this elimination is
8875 allowed. If stack alignment is needed, we can only replace argument
8876 pointer with hard frame pointer, or replace frame pointer with stack
8877 pointer. Otherwise, frame pointer elimination is automatically
8878 handled and all other eliminations are valid. */
8879
8880 static bool
8881 ix86_can_eliminate (const int from, const int to)
8882 {
8883 if (stack_realign_fp)
8884 return ((from == ARG_POINTER_REGNUM
8885 && to == HARD_FRAME_POINTER_REGNUM)
8886 || (from == FRAME_POINTER_REGNUM
8887 && to == STACK_POINTER_REGNUM));
8888 else
8889 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8890 }
8891
8892 /* Return the offset between two registers, one to be eliminated, and the other
8893 its replacement, at the start of a routine. */
8894
8895 HOST_WIDE_INT
8896 ix86_initial_elimination_offset (int from, int to)
8897 {
8898 struct ix86_frame frame;
8899 ix86_compute_frame_layout (&frame);
8900
8901 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8902 return frame.hard_frame_pointer_offset;
8903 else if (from == FRAME_POINTER_REGNUM
8904 && to == HARD_FRAME_POINTER_REGNUM)
8905 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8906 else
8907 {
8908 gcc_assert (to == STACK_POINTER_REGNUM);
8909
8910 if (from == ARG_POINTER_REGNUM)
8911 return frame.stack_pointer_offset;
8912
8913 gcc_assert (from == FRAME_POINTER_REGNUM);
8914 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8915 }
8916 }
8917
8918 /* In a dynamically-aligned function, we can't know the offset from
8919 stack pointer to frame pointer, so we must ensure that setjmp
8920 eliminates fp against the hard fp (%ebp) rather than trying to
8921 index from %esp up to the top of the frame across a gap that is
8922 of unknown (at compile-time) size. */
8923 static rtx
8924 ix86_builtin_setjmp_frame_value (void)
8925 {
8926 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8927 }
8928
8929 /* When using -fsplit-stack, the allocation routines set a field in
8930 the TCB to the bottom of the stack plus this much space, measured
8931 in bytes. */
8932
8933 #define SPLIT_STACK_AVAILABLE 256
8934
8935 /* Fill structure ix86_frame describing the frame of the currently compiled function. */
8936
8937 static void
8938 ix86_compute_frame_layout (struct ix86_frame *frame)
8939 {
8940 unsigned int stack_alignment_needed;
8941 HOST_WIDE_INT offset;
8942 unsigned int preferred_alignment;
8943 HOST_WIDE_INT size = get_frame_size ();
8944 HOST_WIDE_INT to_allocate;
8945
8946 frame->nregs = ix86_nsaved_regs ();
8947 frame->nsseregs = ix86_nsaved_sseregs ();
8948
8949 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8950 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8951
8952 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
8953 except in function prologues and in leaf functions. */
8954 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8955 && (!current_function_is_leaf || cfun->calls_alloca != 0
8956 || ix86_current_function_calls_tls_descriptor))
8957 {
8958 preferred_alignment = 16;
8959 stack_alignment_needed = 16;
8960 crtl->preferred_stack_boundary = 128;
8961 crtl->stack_alignment_needed = 128;
8962 }
8963
8964 gcc_assert (!size || stack_alignment_needed);
8965 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8966 gcc_assert (preferred_alignment <= stack_alignment_needed);
8967
8968 /* For SEH we have to limit the amount of code movement into the prologue.
8969 At present we do this via a BLOCKAGE, at which point there's very little
8970 scheduling that can be done, which means that there's very little point
8971 in doing anything except PUSHs. */
8972 if (TARGET_SEH)
8973 cfun->machine->use_fast_prologue_epilogue = false;
8974
8975 /* During reload iteration the amount of registers saved can change.
8976 Recompute the value as needed. Do not recompute when amount of registers
8977 didn't change as reload does multiple calls to the function and does not
8978 expect the decision to change within single iteration. */
8979 else if (!optimize_function_for_size_p (cfun)
8980 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8981 {
8982 int count = frame->nregs;
8983 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8984
8985 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8986
8987 /* The fast prologue uses move instead of push to save registers. This
8988 is significantly longer, but also executes faster as modern hardware
8989 can execute the moves in parallel, but can't do that for push/pop.
8990
8991 Be careful about choosing what prologue to emit: When function takes
8992 many instructions to execute we may use slow version as well as in
8993 case function is known to be outside hot spot (this is known with
8994 feedback only). Weight the size of function by number of registers
8995 to save as it is cheap to use one or two push instructions but very
8996 slow to use many of them. */
8997 if (count)
8998 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8999 if (node->frequency < NODE_FREQUENCY_NORMAL
9000 || (flag_branch_probabilities
9001 && node->frequency < NODE_FREQUENCY_HOT))
9002 cfun->machine->use_fast_prologue_epilogue = false;
9003 else
9004 cfun->machine->use_fast_prologue_epilogue
9005 = !expensive_function_p (count);
9006 }
9007
9008 frame->save_regs_using_mov
9009 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9010 /* If static stack checking is enabled and done with probes,
9011 the registers need to be saved before allocating the frame. */
9012 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9013
9014 /* Skip return address. */
9015 offset = UNITS_PER_WORD;
9016
9017 /* Skip pushed static chain. */
9018 if (ix86_static_chain_on_stack)
9019 offset += UNITS_PER_WORD;
9020
9021 /* Skip saved base pointer. */
9022 if (frame_pointer_needed)
9023 offset += UNITS_PER_WORD;
9024 frame->hfp_save_offset = offset;
9025
9026 /* The traditional frame pointer location is at the top of the frame. */
9027 frame->hard_frame_pointer_offset = offset;
9028
9029 /* Register save area */
9030 offset += frame->nregs * UNITS_PER_WORD;
9031 frame->reg_save_offset = offset;
9032
9033 /* Align and set SSE register save area. */
9034 if (frame->nsseregs)
9035 {
9036 /* The only ABI that has saved SSE registers (Win64) also has a
9037 16-byte aligned default stack, and thus we don't need to be
9038 within the re-aligned local stack frame to save them. */
9039 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9040 offset = (offset + 16 - 1) & -16;
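 /* E.g. an offset of 40 rounds up to 48; an offset that is already a
 multiple of 16 is left unchanged. */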
9041 offset += frame->nsseregs * 16;
9042 }
9043 frame->sse_reg_save_offset = offset;
9044
9045 /* The re-aligned stack starts here. Values before this point are not
9046 directly comparable with values below this point. In order to make
9047 sure that no value happens to be the same before and after, force
9048 the alignment computation below to add a non-zero value. */
9049 if (stack_realign_fp)
9050 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9051
9052 /* Va-arg area */
9053 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9054 offset += frame->va_arg_size;
9055
9056 /* Align start of frame for local function. */
9057 if (stack_realign_fp
9058 || offset != frame->sse_reg_save_offset
9059 || size != 0
9060 || !current_function_is_leaf
9061 || cfun->calls_alloca
9062 || ix86_current_function_calls_tls_descriptor)
9063 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9064
9065 /* Frame pointer points here. */
9066 frame->frame_pointer_offset = offset;
9067
9068 offset += size;
9069
9070 /* Add outgoing arguments area. Can be skipped if we eliminated
9071 all the function calls as dead code.
9072 Skipping is however impossible when function calls alloca. Alloca
9073 expander assumes that last crtl->outgoing_args_size
9074 of stack frame are unused. */
9075 if (ACCUMULATE_OUTGOING_ARGS
9076 && (!current_function_is_leaf || cfun->calls_alloca
9077 || ix86_current_function_calls_tls_descriptor))
9078 {
9079 offset += crtl->outgoing_args_size;
9080 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9081 }
9082 else
9083 frame->outgoing_arguments_size = 0;
9084
9085 /* Align stack boundary. Only needed if we're calling another function
9086 or using alloca. */
9087 if (!current_function_is_leaf || cfun->calls_alloca
9088 || ix86_current_function_calls_tls_descriptor)
9089 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9090
9091 /* We've reached end of stack frame. */
9092 frame->stack_pointer_offset = offset;
9093
9094 /* Size prologue needs to allocate. */
9095 to_allocate = offset - frame->sse_reg_save_offset;
9096
9097 if ((!to_allocate && frame->nregs <= 1)
9098 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9099 frame->save_regs_using_mov = false;
9100
9101 if (ix86_using_red_zone ()
9102 && current_function_sp_is_unchanging
9103 && current_function_is_leaf
9104 && !ix86_current_function_calls_tls_descriptor)
9105 {
9106 frame->red_zone_size = to_allocate;
9107 if (frame->save_regs_using_mov)
9108 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9109 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9110 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9111 }
9112 else
9113 frame->red_zone_size = 0;
9114 frame->stack_pointer_offset -= frame->red_zone_size;
9115
9116 /* The SEH frame pointer location is near the bottom of the frame.
9117 This is enforced by the fact that the difference between the
9118 stack pointer and the frame pointer is limited to 240 bytes in
9119 the unwind data structure. */
9120 if (TARGET_SEH)
9121 {
9122 HOST_WIDE_INT diff;
9123
9124 /* If we can leave the frame pointer where it is, do so. */
9125 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9126 if (diff > 240 || (diff & 15) != 0)
9127 {
9128 /* Ideally we'd determine what portion of the local stack frame
9129 (within the constraint of the lowest 240) is most heavily used.
9130 But without that complication, simply bias the frame pointer
9131 by 128 bytes so as to maximize the amount of the local stack
9132 frame that is addressable with 8-bit offsets. */
9133 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9134 }
9135 }
9136 }
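
 /* Illustrative sketch of the layout computed above (offsets grow away
 from the CFA / incoming stack pointer; bracketed items are optional):

 return address <- CFA
 [pushed static chain]
 [saved frame pointer] <- hard_frame_pointer_offset
 saved general registers <- reg_save_offset
 [saved SSE registers] <- sse_reg_save_offset
 [va_arg register save area]
 local variables <- frame_pointer_offset
 [outgoing arguments]
 <- stack_pointer_offset */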
9137
9138 /* This is semi-inlined memory_address_length, but simplified
9139 since we know that we're always dealing with reg+offset, and
9140 to avoid having to create and discard all that rtl. */
9141
9142 static inline int
9143 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9144 {
9145 int len = 4;
9146
9147 if (offset == 0)
9148 {
9149 /* EBP and R13 cannot be encoded without an offset. */
9150 len = (regno == BP_REG || regno == R13_REG);
9151 }
9152 else if (IN_RANGE (offset, -128, 127))
9153 len = 1;
9154
9155 /* ESP and R12 must be encoded with a SIB byte. */
9156 if (regno == SP_REG || regno == R12_REG)
9157 len++;
9158
9159 return len;
9160 }
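
 /* A few illustrative values for the helper above: 0(%eax) needs no
 displacement bytes (0), 0(%ebp) still needs a disp8 (1), 16(%esp)
 needs a disp8 plus a SIB byte (2), and 1024(%eax) needs a disp32 (4). */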
9161
9162 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9163 The valid base registers are taken from CFUN->MACHINE->FS. */
9164
9165 static rtx
9166 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9167 {
9168 const struct machine_function *m = cfun->machine;
9169 rtx base_reg = NULL;
9170 HOST_WIDE_INT base_offset = 0;
9171
9172 if (m->use_fast_prologue_epilogue)
9173 {
9174 /* Choose the base register most likely to allow the most scheduling
9175 opportunities. Generally FP is valid throughout the function,
9176 while DRAP must be reloaded within the epilogue. But choose either
9177 over the SP due to increased encoding size. */
9178
9179 if (m->fs.fp_valid)
9180 {
9181 base_reg = hard_frame_pointer_rtx;
9182 base_offset = m->fs.fp_offset - cfa_offset;
9183 }
9184 else if (m->fs.drap_valid)
9185 {
9186 base_reg = crtl->drap_reg;
9187 base_offset = 0 - cfa_offset;
9188 }
9189 else if (m->fs.sp_valid)
9190 {
9191 base_reg = stack_pointer_rtx;
9192 base_offset = m->fs.sp_offset - cfa_offset;
9193 }
9194 }
9195 else
9196 {
9197 HOST_WIDE_INT toffset;
9198 int len = 16, tlen;
9199
9200 /* Choose the base register with the smallest address encoding.
9201 With a tie, choose FP > DRAP > SP. */
9202 if (m->fs.sp_valid)
9203 {
9204 base_reg = stack_pointer_rtx;
9205 base_offset = m->fs.sp_offset - cfa_offset;
9206 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9207 }
9208 if (m->fs.drap_valid)
9209 {
9210 toffset = 0 - cfa_offset;
9211 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9212 if (tlen <= len)
9213 {
9214 base_reg = crtl->drap_reg;
9215 base_offset = toffset;
9216 len = tlen;
9217 }
9218 }
9219 if (m->fs.fp_valid)
9220 {
9221 toffset = m->fs.fp_offset - cfa_offset;
9222 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9223 if (tlen <= len)
9224 {
9225 base_reg = hard_frame_pointer_rtx;
9226 base_offset = toffset;
9227 len = tlen;
9228 }
9229 }
9230 }
9231 gcc_assert (base_reg != NULL);
9232
9233 return plus_constant (Pmode, base_reg, base_offset);
9234 }
9235
9236 /* Emit code to save registers in the prologue. */
9237
9238 static void
9239 ix86_emit_save_regs (void)
9240 {
9241 unsigned int regno;
9242 rtx insn;
9243
9244 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9245 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9246 {
9247 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9248 RTX_FRAME_RELATED_P (insn) = 1;
9249 }
9250 }
9251
9252 /* Emit a single register save at CFA - CFA_OFFSET. */
9253
9254 static void
9255 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9256 HOST_WIDE_INT cfa_offset)
9257 {
9258 struct machine_function *m = cfun->machine;
9259 rtx reg = gen_rtx_REG (mode, regno);
9260 rtx mem, addr, base, insn;
9261
9262 addr = choose_baseaddr (cfa_offset);
9263 mem = gen_frame_mem (mode, addr);
9264
9265 /* For SSE saves, we need to indicate the 128-bit alignment. */
9266 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9267
9268 insn = emit_move_insn (mem, reg);
9269 RTX_FRAME_RELATED_P (insn) = 1;
9270
9271 base = addr;
9272 if (GET_CODE (base) == PLUS)
9273 base = XEXP (base, 0);
9274 gcc_checking_assert (REG_P (base));
9275
9276 /* When saving registers into a re-aligned local stack frame, avoid
9277 any tricky guessing by dwarf2out. */
9278 if (m->fs.realigned)
9279 {
9280 gcc_checking_assert (stack_realign_drap);
9281
9282 if (regno == REGNO (crtl->drap_reg))
9283 {
9284 /* A bit of a hack. We force the DRAP register to be saved in
9285 the re-aligned stack frame, which provides us with a copy
9286 of the CFA that will last past the prologue. Install it. */
9287 gcc_checking_assert (cfun->machine->fs.fp_valid);
9288 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9289 cfun->machine->fs.fp_offset - cfa_offset);
9290 mem = gen_rtx_MEM (mode, addr);
9291 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9292 }
9293 else
9294 {
9295 /* The frame pointer is a stable reference within the
9296 aligned frame. Use it. */
9297 gcc_checking_assert (cfun->machine->fs.fp_valid);
9298 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9299 cfun->machine->fs.fp_offset - cfa_offset);
9300 mem = gen_rtx_MEM (mode, addr);
9301 add_reg_note (insn, REG_CFA_EXPRESSION,
9302 gen_rtx_SET (VOIDmode, mem, reg));
9303 }
9304 }
9305
9306 /* The memory may not be relative to the current CFA register,
9307 which means that we may need to generate a new pattern for
9308 use by the unwind info. */
9309 else if (base != m->fs.cfa_reg)
9310 {
9311 addr = plus_constant (Pmode, m->fs.cfa_reg,
9312 m->fs.cfa_offset - cfa_offset);
9313 mem = gen_rtx_MEM (mode, addr);
9314 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9315 }
9316 }
9317
9318 /* Emit code to save registers using MOV insns.
9319 First register is stored at CFA - CFA_OFFSET. */
9320 static void
9321 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9322 {
9323 unsigned int regno;
9324
9325 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9326 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9327 {
9328 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9329 cfa_offset -= UNITS_PER_WORD;
9330 }
9331 }
9332
9333 /* Emit code to save SSE registers using MOV insns.
9334 First register is stored at CFA - CFA_OFFSET. */
9335 static void
9336 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9337 {
9338 unsigned int regno;
9339
9340 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9341 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9342 {
9343 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9344 cfa_offset -= 16;
9345 }
9346 }
9347
9348 static GTY(()) rtx queued_cfa_restores;
9349
9350 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
9351 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9352 Don't add the note if the previously saved value will be left untouched
9353 within stack red-zone till return, as unwinders can find the same value
9354 in the register and on the stack. */
9355
9356 static void
9357 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9358 {
9359 if (!crtl->shrink_wrapped
9360 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9361 return;
9362
9363 if (insn)
9364 {
9365 add_reg_note (insn, REG_CFA_RESTORE, reg);
9366 RTX_FRAME_RELATED_P (insn) = 1;
9367 }
9368 else
9369 queued_cfa_restores
9370 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9371 }
9372
9373 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9374
9375 static void
9376 ix86_add_queued_cfa_restore_notes (rtx insn)
9377 {
9378 rtx last;
9379 if (!queued_cfa_restores)
9380 return;
9381 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9382 ;
9383 XEXP (last, 1) = REG_NOTES (insn);
9384 REG_NOTES (insn) = queued_cfa_restores;
9385 queued_cfa_restores = NULL_RTX;
9386 RTX_FRAME_RELATED_P (insn) = 1;
9387 }
9388
9389 /* Expand prologue or epilogue stack adjustment.
9390 The pattern exists to put a dependency on all ebp-based memory accesses.
9391 STYLE should be negative if instructions should be marked as frame related,
9392 zero if %r11 register is live and cannot be freely used and positive
9393 otherwise. */
9394
9395 static void
9396 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9397 int style, bool set_cfa)
9398 {
9399 struct machine_function *m = cfun->machine;
9400 rtx insn;
9401 bool add_frame_related_expr = false;
9402
9403 if (Pmode == SImode)
9404 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9405 else if (x86_64_immediate_operand (offset, DImode))
9406 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9407 else
9408 {
9409 rtx tmp;
9410 /* r11 is used by indirect sibcall return as well; it is set before the
9411 epilogue and used after the epilogue. */
9412 if (style)
9413 tmp = gen_rtx_REG (DImode, R11_REG);
9414 else
9415 {
9416 gcc_assert (src != hard_frame_pointer_rtx
9417 && dest != hard_frame_pointer_rtx);
9418 tmp = hard_frame_pointer_rtx;
9419 }
9420 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9421 if (style < 0)
9422 add_frame_related_expr = true;
9423
9424 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9425 }
9426
9427 insn = emit_insn (insn);
9428 if (style >= 0)
9429 ix86_add_queued_cfa_restore_notes (insn);
9430
9431 if (set_cfa)
9432 {
9433 rtx r;
9434
9435 gcc_assert (m->fs.cfa_reg == src);
9436 m->fs.cfa_offset += INTVAL (offset);
9437 m->fs.cfa_reg = dest;
9438
9439 r = gen_rtx_PLUS (Pmode, src, offset);
9440 r = gen_rtx_SET (VOIDmode, dest, r);
9441 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9442 RTX_FRAME_RELATED_P (insn) = 1;
9443 }
9444 else if (style < 0)
9445 {
9446 RTX_FRAME_RELATED_P (insn) = 1;
9447 if (add_frame_related_expr)
9448 {
9449 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9450 r = gen_rtx_SET (VOIDmode, dest, r);
9451 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9452 }
9453 }
9454
9455 if (dest == stack_pointer_rtx)
9456 {
9457 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9458 bool valid = m->fs.sp_valid;
9459
9460 if (src == hard_frame_pointer_rtx)
9461 {
9462 valid = m->fs.fp_valid;
9463 ooffset = m->fs.fp_offset;
9464 }
9465 else if (src == crtl->drap_reg)
9466 {
9467 valid = m->fs.drap_valid;
9468 ooffset = 0;
9469 }
9470 else
9471 {
9472 /* Else there are two possibilities: SP itself, which we set
9473 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
9474 taken care of by hand along the eh_return path. */
9475 gcc_checking_assert (src == stack_pointer_rtx
9476 || offset == const0_rtx);
9477 }
9478
9479 m->fs.sp_offset = ooffset - INTVAL (offset);
9480 m->fs.sp_valid = valid;
9481 }
9482 }
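
 /* A typical prologue use of the helper above (a sketch mirroring the
 calls made from ix86_expand_prologue) is:

 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
 GEN_INT (-allocate), -1,
 m->fs.cfa_reg == stack_pointer_rtx);

 which emits "sp = sp - allocate", marks it frame related, and keeps
 m->fs.sp_offset and the CFA tracking in sync. */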
9483
9484 /* Find an available register to be used as dynamic realign argument
9485 pointer register. Such a register will be written in the prologue and
9486 used at the beginning of the function body, so it must not be
9487 1. parameter passing register.
9488 2. GOT pointer.
9489 We reuse static-chain register if it is available. Otherwise, we
9490 use DI for i386 and R13 for x86-64. We chose R13 since it has
9491 shorter encoding.
9492
9493 Return: the regno of chosen register. */
9494
9495 static unsigned int
9496 find_drap_reg (void)
9497 {
9498 tree decl = cfun->decl;
9499
9500 if (TARGET_64BIT)
9501 {
9502 /* Use R13 for a nested function or a function that needs a static chain.
9503 Since a function with a tail call may use any caller-saved
9504 register in the epilogue, DRAP must not use a caller-saved
9505 register in that case. */
9506 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9507 return R13_REG;
9508
9509 return R10_REG;
9510 }
9511 else
9512 {
9513 /* Use DI for a nested function or a function that needs a static chain.
9514 Since a function with a tail call may use any caller-saved
9515 register in the epilogue, DRAP must not use a caller-saved
9516 register in that case. */
9517 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9518 return DI_REG;
9519
9520 /* Reuse static chain register if it isn't used for parameter
9521 passing. */
9522 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9523 {
9524 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9525 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9526 return CX_REG;
9527 }
9528 return DI_REG;
9529 }
9530 }
9531
9532 /* Return minimum incoming stack alignment. */
9533
9534 static unsigned int
9535 ix86_minimum_incoming_stack_boundary (bool sibcall)
9536 {
9537 unsigned int incoming_stack_boundary;
9538
9539 /* Prefer the one specified at command line. */
9540 if (ix86_user_incoming_stack_boundary)
9541 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9542 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9543 when -mstackrealign is used, this is not a sibcall check, and the
9544 estimated stack alignment is 128 bits. */
9545 else if (!sibcall
9546 && !TARGET_64BIT
9547 && ix86_force_align_arg_pointer
9548 && crtl->stack_alignment_estimated == 128)
9549 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9550 else
9551 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9552
9553 /* Incoming stack alignment can be changed on individual functions
9554 via force_align_arg_pointer attribute. We use the smallest
9555 incoming stack boundary. */
9556 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9557 && lookup_attribute (ix86_force_align_arg_pointer_string,
9558 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9559 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9560
9561 /* The incoming stack frame has to be aligned at least at
9562 parm_stack_boundary. */
9563 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9564 incoming_stack_boundary = crtl->parm_stack_boundary;
9565
9566 /* Stack at entrance of main is aligned by runtime. We use the
9567 smallest incoming stack boundary. */
9568 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9569 && DECL_NAME (current_function_decl)
9570 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9571 && DECL_FILE_SCOPE_P (current_function_decl))
9572 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9573
9574 return incoming_stack_boundary;
9575 }
9576
9577 /* Update incoming stack boundary and estimated stack alignment. */
9578
9579 static void
9580 ix86_update_stack_boundary (void)
9581 {
9582 ix86_incoming_stack_boundary
9583 = ix86_minimum_incoming_stack_boundary (false);
9584
9585 /* x86_64 vararg needs 16byte stack alignment for register save
9586 area. */
9587 if (TARGET_64BIT
9588 && cfun->stdarg
9589 && crtl->stack_alignment_estimated < 128)
9590 crtl->stack_alignment_estimated = 128;
9591 }
9592
9593 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9594 needed or an rtx for DRAP otherwise. */
9595
9596 static rtx
9597 ix86_get_drap_rtx (void)
9598 {
9599 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9600 crtl->need_drap = true;
9601
9602 if (stack_realign_drap)
9603 {
9604 /* Assign DRAP to vDRAP and return vDRAP. */
9605 unsigned int regno = find_drap_reg ();
9606 rtx drap_vreg;
9607 rtx arg_ptr;
9608 rtx seq, insn;
9609
9610 arg_ptr = gen_rtx_REG (Pmode, regno);
9611 crtl->drap_reg = arg_ptr;
9612
9613 start_sequence ();
9614 drap_vreg = copy_to_reg (arg_ptr);
9615 seq = get_insns ();
9616 end_sequence ();
9617
9618 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9619 if (!optimize)
9620 {
9621 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9622 RTX_FRAME_RELATED_P (insn) = 1;
9623 }
9624 return drap_vreg;
9625 }
9626 else
9627 return NULL;
9628 }
9629
9630 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9631
9632 static rtx
9633 ix86_internal_arg_pointer (void)
9634 {
9635 return virtual_incoming_args_rtx;
9636 }
9637
9638 struct scratch_reg {
9639 rtx reg;
9640 bool saved;
9641 };
9642
9643 /* Return a short-lived scratch register for use on function entry.
9644 In 32-bit mode, it is valid only after the registers are saved
9645 in the prologue. This register must be released by means of
9646 release_scratch_register_on_entry once it is dead. */
9647
9648 static void
9649 get_scratch_register_on_entry (struct scratch_reg *sr)
9650 {
9651 int regno;
9652
9653 sr->saved = false;
9654
9655 if (TARGET_64BIT)
9656 {
9657 /* We always use R11 in 64-bit mode. */
9658 regno = R11_REG;
9659 }
9660 else
9661 {
9662 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9663 bool fastcall_p
9664 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9665 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9666 int regparm = ix86_function_regparm (fntype, decl);
9667 int drap_regno
9668 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9669
9670 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9671 for the static chain register. */
9672 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9673 && drap_regno != AX_REG)
9674 regno = AX_REG;
9675 else if (regparm < 2 && drap_regno != DX_REG)
9676 regno = DX_REG;
9677 /* ecx is the static chain register. */
9678 else if (regparm < 3 && !fastcall_p && !static_chain_p
9679 && drap_regno != CX_REG)
9680 regno = CX_REG;
9681 else if (ix86_save_reg (BX_REG, true))
9682 regno = BX_REG;
9683 /* esi is the static chain register. */
9684 else if (!(regparm == 3 && static_chain_p)
9685 && ix86_save_reg (SI_REG, true))
9686 regno = SI_REG;
9687 else if (ix86_save_reg (DI_REG, true))
9688 regno = DI_REG;
9689 else
9690 {
9691 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9692 sr->saved = true;
9693 }
9694 }
9695
9696 sr->reg = gen_rtx_REG (Pmode, regno);
9697 if (sr->saved)
9698 {
9699 rtx insn = emit_insn (gen_push (sr->reg));
9700 RTX_FRAME_RELATED_P (insn) = 1;
9701 }
9702 }
9703
9704 /* Release a scratch register obtained from the preceding function. */
9705
9706 static void
9707 release_scratch_register_on_entry (struct scratch_reg *sr)
9708 {
9709 if (sr->saved)
9710 {
9711 rtx x, insn = emit_insn (gen_pop (sr->reg));
9712
9713 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9714 RTX_FRAME_RELATED_P (insn) = 1;
9715 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9716 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9717 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9718 }
9719 }
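
 /* Illustrative usage pattern of the two helpers above, as in the stack
 probing routines below:

 struct scratch_reg sr;

 get_scratch_register_on_entry (&sr);
 ... use sr.reg as a temporary ...
 release_scratch_register_on_entry (&sr); */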
9720
9721 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
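
 /* With the default STACK_CHECK_PROBE_INTERVAL_EXP (12, unless a target or
 the user overrides it) this works out to a 4096-byte probing interval. */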
9722
9723 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9724
9725 static void
9726 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9727 {
9728 /* We skip the probe for the first interval + a small dope of 4 words and
9729 probe that many bytes past the specified size to maintain a protection
9730 area at the bottom of the stack. */
9731 const int dope = 4 * UNITS_PER_WORD;
9732 rtx size_rtx = GEN_INT (size), last;
9733
9734 /* See if we have a constant small number of probes to generate. If so,
9735 that's the easy case. The run-time loop is made up of 11 insns in the
9736 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9737 for n # of intervals. */
9738 if (size <= 5 * PROBE_INTERVAL)
9739 {
9740 HOST_WIDE_INT i, adjust;
9741 bool first_probe = true;
9742
9743 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9744 values of N from 1 until it exceeds SIZE. If only one probe is
9745 needed, this will not generate any code. Then adjust and probe
9746 to PROBE_INTERVAL + SIZE. */
9747 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9748 {
9749 if (first_probe)
9750 {
9751 adjust = 2 * PROBE_INTERVAL + dope;
9752 first_probe = false;
9753 }
9754 else
9755 adjust = PROBE_INTERVAL;
9756
9757 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9758 plus_constant (Pmode, stack_pointer_rtx,
9759 -adjust)));
9760 emit_stack_probe (stack_pointer_rtx);
9761 }
9762
9763 if (first_probe)
9764 adjust = size + PROBE_INTERVAL + dope;
9765 else
9766 adjust = size + PROBE_INTERVAL - i;
9767
9768 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9769 plus_constant (Pmode, stack_pointer_rtx,
9770 -adjust)));
9771 emit_stack_probe (stack_pointer_rtx);
9772
9773 /* Adjust back to account for the additional first interval. */
9774 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9775 plus_constant (Pmode, stack_pointer_rtx,
9776 PROBE_INTERVAL + dope)));
9777 }
9778
9779 /* Otherwise, do the same as above, but in a loop. Note that we must be
9780 extra careful with variables wrapping around because we might be at
9781 the very top (or the very bottom) of the address space and we have
9782 to be able to handle this case properly; in particular, we use an
9783 equality test for the loop condition. */
9784 else
9785 {
9786 HOST_WIDE_INT rounded_size;
9787 struct scratch_reg sr;
9788
9789 get_scratch_register_on_entry (&sr);
9790
9791
9792 /* Step 1: round SIZE to the previous multiple of the interval. */
9793
9794 rounded_size = size & -PROBE_INTERVAL;
9795
9796
9797 /* Step 2: compute initial and final value of the loop counter. */
9798
9799 /* SP = SP_0 + PROBE_INTERVAL. */
9800 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9801 plus_constant (Pmode, stack_pointer_rtx,
9802 - (PROBE_INTERVAL + dope))));
9803
9804 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9805 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9806 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9807 gen_rtx_PLUS (Pmode, sr.reg,
9808 stack_pointer_rtx)));
9809
9810
9811 /* Step 3: the loop
9812
9813 while (SP != LAST_ADDR)
9814 {
9815 SP = SP + PROBE_INTERVAL
9816 probe at SP
9817 }
9818
9819 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9820 values of N from 1 until it is equal to ROUNDED_SIZE. */
9821
9822 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9823
9824
9825 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9826 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9827
9828 if (size != rounded_size)
9829 {
9830 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9831 plus_constant (Pmode, stack_pointer_rtx,
9832 rounded_size - size)));
9833 emit_stack_probe (stack_pointer_rtx);
9834 }
9835
9836 /* Adjust back to account for the additional first interval. */
9837 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9838 plus_constant (Pmode, stack_pointer_rtx,
9839 PROBE_INTERVAL + dope)));
9840
9841 release_scratch_register_on_entry (&sr);
9842 }
9843
9844 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9845
9846 /* Even if the stack pointer isn't the CFA register, we need to correctly
9847 describe the adjustments made to it, in particular differentiate the
9848 frame-related ones from the frame-unrelated ones. */
9849 if (size > 0)
9850 {
9851 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9852 XVECEXP (expr, 0, 0)
9853 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9854 plus_constant (Pmode, stack_pointer_rtx, -size));
9855 XVECEXP (expr, 0, 1)
9856 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9857 plus_constant (Pmode, stack_pointer_rtx,
9858 PROBE_INTERVAL + dope + size));
9859 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9860 RTX_FRAME_RELATED_P (last) = 1;
9861
9862 cfun->machine->fs.sp_offset += size;
9863 }
9864
9865 /* Make sure nothing is scheduled before we are done. */
9866 emit_insn (gen_blockage ());
9867 }
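
 /* A worked example for the constant branch above (assuming a 64-bit
 target, so dope == 32, and a 4096-byte PROBE_INTERVAL) with size == 8192:

 sp -= 8224; probe (sp); -- 2 * PROBE_INTERVAL + dope
 sp -= 4096; probe (sp); -- remaining interval
 sp += 4128; -- give back PROBE_INTERVAL + dope

 for a net adjustment of exactly -8192 bytes. */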
9868
9869 /* Adjust the stack pointer up to REG while probing it. */
9870
9871 const char *
9872 output_adjust_stack_and_probe (rtx reg)
9873 {
9874 static int labelno = 0;
9875 char loop_lab[32], end_lab[32];
9876 rtx xops[2];
9877
9878 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9879 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9880
9881 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9882
9883 /* Jump to END_LAB if SP == LAST_ADDR. */
9884 xops[0] = stack_pointer_rtx;
9885 xops[1] = reg;
9886 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9887 fputs ("\tje\t", asm_out_file);
9888 assemble_name_raw (asm_out_file, end_lab);
9889 fputc ('\n', asm_out_file);
9890
9891 /* SP = SP + PROBE_INTERVAL. */
9892 xops[1] = GEN_INT (PROBE_INTERVAL);
9893 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9894
9895 /* Probe at SP. */
9896 xops[1] = const0_rtx;
9897 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9898
9899 fprintf (asm_out_file, "\tjmp\t");
9900 assemble_name_raw (asm_out_file, loop_lab);
9901 fputc ('\n', asm_out_file);
9902
9903 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9904
9905 return "";
9906 }
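
 /* The loop emitted by the function above looks roughly like this
 (64-bit AT&T syntax, illustrative only; R11 is the 64-bit scratch
 register chosen by get_scratch_register_on_entry):

 .LPSRL0:
 cmpq %r11, %rsp
 je .LPSRE0
 subq $4096, %rsp
 orq $0, (%rsp)
 jmp .LPSRL0
 .LPSRE0: */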
9907
9908 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9909 inclusive. These are offsets from the current stack pointer. */
9910
9911 static void
9912 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9913 {
9914 /* See if we have a constant small number of probes to generate. If so,
9915 that's the easy case. The run-time loop is made up of 7 insns in the
9916 generic case while the compile-time loop is made up of n insns for n #
9917 of intervals. */
9918 if (size <= 7 * PROBE_INTERVAL)
9919 {
9920 HOST_WIDE_INT i;
9921
9922 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9923 it exceeds SIZE. If only one probe is needed, this will not
9924 generate any code. Then probe at FIRST + SIZE. */
9925 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9926 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9927 -(first + i)));
9928
9929 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9930 -(first + size)));
9931 }
9932
9933 /* Otherwise, do the same as above, but in a loop. Note that we must be
9934 extra careful with variables wrapping around because we might be at
9935 the very top (or the very bottom) of the address space and we have
9936 to be able to handle this case properly; in particular, we use an
9937 equality test for the loop condition. */
9938 else
9939 {
9940 HOST_WIDE_INT rounded_size, last;
9941 struct scratch_reg sr;
9942
9943 get_scratch_register_on_entry (&sr);
9944
9945
9946 /* Step 1: round SIZE to the previous multiple of the interval. */
9947
9948 rounded_size = size & -PROBE_INTERVAL;
9949
9950
9951 /* Step 2: compute initial and final value of the loop counter. */
9952
9953 /* TEST_OFFSET = FIRST. */
9954 emit_move_insn (sr.reg, GEN_INT (-first));
9955
9956 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9957 last = first + rounded_size;
9958
9959
9960 /* Step 3: the loop
9961
9962 while (TEST_ADDR != LAST_ADDR)
9963 {
9964 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9965 probe at TEST_ADDR
9966 }
9967
9968 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9969 until it is equal to ROUNDED_SIZE. */
9970
9971 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9972
9973
9974 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9975 that SIZE is equal to ROUNDED_SIZE. */
9976
9977 if (size != rounded_size)
9978 emit_stack_probe (plus_constant (Pmode,
9979 gen_rtx_PLUS (Pmode,
9980 stack_pointer_rtx,
9981 sr.reg),
9982 rounded_size - size));
9983
9984 release_scratch_register_on_entry (&sr);
9985 }
9986
9987 /* Make sure nothing is scheduled before we are done. */
9988 emit_insn (gen_blockage ());
9989 }
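
 /* A small worked example for the constant branch above: with a 4096-byte
 PROBE_INTERVAL, FIRST == 4096 and SIZE == 8192, two probes are emitted,
 at sp - 8192 (FIRST + PROBE_INTERVAL) and at sp - 12288 (FIRST + SIZE). */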
9990
9991 /* Probe a range of stack addresses from REG to END, inclusive. These are
9992 offsets from the current stack pointer. */
9993
9994 const char *
9995 output_probe_stack_range (rtx reg, rtx end)
9996 {
9997 static int labelno = 0;
9998 char loop_lab[32], end_lab[32];
9999 rtx xops[3];
10000
10001 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10002 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10003
10004 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10005
10006 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10007 xops[0] = reg;
10008 xops[1] = end;
10009 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10010 fputs ("\tje\t", asm_out_file);
10011 assemble_name_raw (asm_out_file, end_lab);
10012 fputc ('\n', asm_out_file);
10013
10014 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10015 xops[1] = GEN_INT (PROBE_INTERVAL);
10016 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10017
10018 /* Probe at TEST_ADDR. */
10019 xops[0] = stack_pointer_rtx;
10020 xops[1] = reg;
10021 xops[2] = const0_rtx;
10022 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10023
10024 fprintf (asm_out_file, "\tjmp\t");
10025 assemble_name_raw (asm_out_file, loop_lab);
10026 fputc ('\n', asm_out_file);
10027
10028 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10029
10030 return "";
10031 }
10032
10033 /* Finalize the stack_realign_needed flag, which guides generation of the
10034 prologue/epilogue in the correct form. */
10035 static void
10036 ix86_finalize_stack_realign_flags (void)
10037 {
10038 /* Check if stack realign is really needed after reload, and
10039 store the result in cfun. */
10040 unsigned int incoming_stack_boundary
10041 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10042 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10043 unsigned int stack_realign = (incoming_stack_boundary
10044 < (current_function_is_leaf
10045 ? crtl->max_used_stack_slot_alignment
10046 : crtl->stack_alignment_needed));
10047
10048 if (crtl->stack_realign_finalized)
10049 {
10050 /* After stack_realign_needed is finalized, we can no longer
10051 change it. */
10052 gcc_assert (crtl->stack_realign_needed == stack_realign);
10053 return;
10054 }
10055
10056 /* If the only reason for frame_pointer_needed is that we conservatively
10057 assumed stack realignment might be needed, but in the end nothing that
10058 needed the stack alignment had been spilled, clear frame_pointer_needed
10059 and say we don't need stack realignment. */
10060 if (stack_realign
10061 && !crtl->need_drap
10062 && frame_pointer_needed
10063 && current_function_is_leaf
10064 && flag_omit_frame_pointer
10065 && current_function_sp_is_unchanging
10066 && !ix86_current_function_calls_tls_descriptor
10067 && !crtl->accesses_prior_frames
10068 && !cfun->calls_alloca
10069 && !crtl->calls_eh_return
10070 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10071 && !ix86_frame_pointer_required ()
10072 && get_frame_size () == 0
10073 && ix86_nsaved_sseregs () == 0
10074 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10075 {
10076 HARD_REG_SET set_up_by_prologue, prologue_used;
10077 basic_block bb;
10078
10079 CLEAR_HARD_REG_SET (prologue_used);
10080 CLEAR_HARD_REG_SET (set_up_by_prologue);
10081 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10082 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10083 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10084 HARD_FRAME_POINTER_REGNUM);
10085 FOR_EACH_BB (bb)
10086 {
10087 rtx insn;
10088 FOR_BB_INSNS (bb, insn)
10089 if (NONDEBUG_INSN_P (insn)
10090 && requires_stack_frame_p (insn, prologue_used,
10091 set_up_by_prologue))
10092 {
10093 crtl->stack_realign_needed = stack_realign;
10094 crtl->stack_realign_finalized = true;
10095 return;
10096 }
10097 }
10098
10099 frame_pointer_needed = false;
10100 stack_realign = false;
10101 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10102 crtl->stack_alignment_needed = incoming_stack_boundary;
10103 crtl->stack_alignment_estimated = incoming_stack_boundary;
10104 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10105 crtl->preferred_stack_boundary = incoming_stack_boundary;
10106 df_finish_pass (true);
10107 df_scan_alloc (NULL);
10108 df_scan_blocks ();
10109 df_compute_regs_ever_live (true);
10110 df_analyze ();
10111 }
10112
10113 crtl->stack_realign_needed = stack_realign;
10114 crtl->stack_realign_finalized = true;
10115 }
10116
10117 /* Expand the prologue into a bunch of separate insns. */
10118
10119 void
10120 ix86_expand_prologue (void)
10121 {
10122 struct machine_function *m = cfun->machine;
10123 rtx insn, t;
10124 bool pic_reg_used;
10125 struct ix86_frame frame;
10126 HOST_WIDE_INT allocate;
10127 bool int_registers_saved;
10128
10129 ix86_finalize_stack_realign_flags ();
10130
10131 /* DRAP should not coexist with stack_realign_fp */
10132 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10133
10134 memset (&m->fs, 0, sizeof (m->fs));
10135
10136 /* Initialize CFA state for before the prologue. */
10137 m->fs.cfa_reg = stack_pointer_rtx;
10138 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10139
10140 /* Track SP offset to the CFA. We continue tracking this after we've
10141 swapped the CFA register away from SP. In the case of re-alignment
10142 this is fudged; we're interested in offsets within the local frame. */
10143 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10144 m->fs.sp_valid = true;
10145
10146 ix86_compute_frame_layout (&frame);
10147
10148 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10149 {
10150 /* We should have already generated an error for any use of
10151 ms_hook on a nested function. */
10152 gcc_checking_assert (!ix86_static_chain_on_stack);
10153
10154 /* Check if profiling is active and we shall use profiling before
10155 prologue variant. If so, issue a "sorry" diagnostic. */
10156 if (crtl->profile && flag_fentry != 0)
10157 sorry ("ms_hook_prologue attribute isn%'t compatible "
10158 "with -mfentry for 32-bit");
10159
10160 /* In ix86_asm_output_function_label we emitted:
10161 8b ff movl.s %edi,%edi
10162 55 push %ebp
10163 8b ec movl.s %esp,%ebp
10164
10165 This matches the hookable function prologue in Win32 API
10166 functions in Microsoft Windows XP Service Pack 2 and newer.
10167 Wine uses this to enable Windows apps to hook the Win32 API
10168 functions provided by Wine.
10169
10170 What that means is that we've already set up the frame pointer. */
10171
10172 if (frame_pointer_needed
10173 && !(crtl->drap_reg && crtl->stack_realign_needed))
10174 {
10175 rtx push, mov;
10176
10177 /* We've decided to use the frame pointer already set up.
10178 Describe this to the unwinder by pretending that both
10179 push and mov insns happen right here.
10180
10181 Putting the unwind info here at the end of the ms_hook
10182 is done so that we can make absolutely certain we get
10183 the required byte sequence at the start of the function,
10184 rather than relying on an assembler that can produce
10185 the exact encoding required.
10186
10187 However it does mean (in the unpatched case) that we have
10188 a 1 insn window where the asynchronous unwind info is
10189 incorrect. However, if we placed the unwind info at
10190 its correct location we would have incorrect unwind info
10191 in the patched case. Which is probably all moot since
10192 I don't expect Wine generates dwarf2 unwind info for the
10193 system libraries that use this feature. */
10194
10195 insn = emit_insn (gen_blockage ());
10196
10197 push = gen_push (hard_frame_pointer_rtx);
10198 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10199 stack_pointer_rtx);
10200 RTX_FRAME_RELATED_P (push) = 1;
10201 RTX_FRAME_RELATED_P (mov) = 1;
10202
10203 RTX_FRAME_RELATED_P (insn) = 1;
10204 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10205 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10206
10207 /* Note that gen_push incremented m->fs.cfa_offset, even
10208 though we didn't emit the push insn here. */
10209 m->fs.cfa_reg = hard_frame_pointer_rtx;
10210 m->fs.fp_offset = m->fs.cfa_offset;
10211 m->fs.fp_valid = true;
10212 }
10213 else
10214 {
10215 /* The frame pointer is not needed so pop %ebp again.
10216 This leaves us with a pristine state. */
10217 emit_insn (gen_pop (hard_frame_pointer_rtx));
10218 }
10219 }
10220
10221 /* The first insn of a function that accepts its static chain on the
10222 stack is to push the register that would be filled in by a direct
10223 call. This insn will be skipped by the trampoline. */
10224 else if (ix86_static_chain_on_stack)
10225 {
10226 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10227 emit_insn (gen_blockage ());
10228
10229 /* We don't want to interpret this push insn as a register save,
10230 only as a stack adjustment. The real copy of the register as
10231 a save will be done later, if needed. */
10232 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10233 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10234 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10235 RTX_FRAME_RELATED_P (insn) = 1;
10236 }
10237
10238 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10239 DRAP is needed and stack realignment is really needed after reload. */
10240 if (stack_realign_drap)
10241 {
10242 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10243
10244 /* Only need to push parameter pointer reg if it is caller saved. */
10245 if (!call_used_regs[REGNO (crtl->drap_reg)])
10246 {
10247 /* Push arg pointer reg */
10248 insn = emit_insn (gen_push (crtl->drap_reg));
10249 RTX_FRAME_RELATED_P (insn) = 1;
10250 }
10251
10252 /* Grab the argument pointer. */
10253 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10254 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10255 RTX_FRAME_RELATED_P (insn) = 1;
10256 m->fs.cfa_reg = crtl->drap_reg;
10257 m->fs.cfa_offset = 0;
10258
10259 /* Align the stack. */
10260 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10261 stack_pointer_rtx,
10262 GEN_INT (-align_bytes)));
10263 RTX_FRAME_RELATED_P (insn) = 1;
10264
10265 /* Replicate the return address on the stack so that return
10266 address can be reached via (argp - 1) slot. This is needed
10267 to implement macro RETURN_ADDR_RTX and intrinsic function
10268 expand_builtin_return_addr etc. */
10269 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10270 t = gen_frame_mem (word_mode, t);
10271 insn = emit_insn (gen_push (t));
10272 RTX_FRAME_RELATED_P (insn) = 1;
10273
10274 /* For the purposes of frame and register save area addressing,
10275 we've started over with a new frame. */
10276 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10277 m->fs.realigned = true;
10278 }
10279
10280 if (frame_pointer_needed && !m->fs.fp_valid)
10281 {
10282 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10283 slower on all targets. Also sdb doesn't like it. */
10284 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10285 RTX_FRAME_RELATED_P (insn) = 1;
10286
10287 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10288 {
10289 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10290 RTX_FRAME_RELATED_P (insn) = 1;
10291
10292 if (m->fs.cfa_reg == stack_pointer_rtx)
10293 m->fs.cfa_reg = hard_frame_pointer_rtx;
10294 m->fs.fp_offset = m->fs.sp_offset;
10295 m->fs.fp_valid = true;
10296 }
10297 }
10298
10299 int_registers_saved = (frame.nregs == 0);
10300
10301 if (!int_registers_saved)
10302 {
10303 /* If saving registers via PUSH, do so now. */
10304 if (!frame.save_regs_using_mov)
10305 {
10306 ix86_emit_save_regs ();
10307 int_registers_saved = true;
10308 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10309 }
10310
10311 /* When using red zone we may start register saving before allocating
10312 the stack frame saving one cycle of the prologue. However, avoid
10313 doing this if we have to probe the stack; at least on x86_64 the
10314 stack probe can turn into a call that clobbers a red zone location. */
10315 else if (ix86_using_red_zone ()
10316 && (! TARGET_STACK_PROBE
10317 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10318 {
10319 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10320 int_registers_saved = true;
10321 }
10322 }
10323
10324 if (stack_realign_fp)
10325 {
10326 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10327 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10328
10329 /* The computation of the size of the re-aligned stack frame means
10330 that we must allocate the size of the register save area before
10331 performing the actual alignment. Otherwise we cannot guarantee
10332 that there's enough storage above the realignment point. */
10333 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10334 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10335 GEN_INT (m->fs.sp_offset
10336 - frame.sse_reg_save_offset),
10337 -1, false);
10338
10339 /* Align the stack. */
10340 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10341 stack_pointer_rtx,
10342 GEN_INT (-align_bytes)));
10343
10344 /* For the purposes of register save area addressing, the stack
10345 pointer is no longer valid. As for the value of sp_offset,
10346 see ix86_compute_frame_layout, which we need to match in order
10347 to pass verification of stack_pointer_offset at the end. */
10348 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10349 m->fs.sp_valid = false;
10350 }
10351
10352 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10353
10354 if (flag_stack_usage_info)
10355 {
10356 /* We start to count from ARG_POINTER. */
10357 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10358
10359 /* If it was realigned, take into account the fake frame. */
10360 if (stack_realign_drap)
10361 {
10362 if (ix86_static_chain_on_stack)
10363 stack_size += UNITS_PER_WORD;
10364
10365 if (!call_used_regs[REGNO (crtl->drap_reg)])
10366 stack_size += UNITS_PER_WORD;
10367
10368 /* This over-estimates by 1 minimal-stack-alignment-unit but
10369 mitigates that by counting in the new return address slot. */
10370 current_function_dynamic_stack_size
10371 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10372 }
10373
10374 current_function_static_stack_size = stack_size;
10375 }
10376
10377 /* The stack has already been decremented by the instruction calling us
10378 so probe if the size is non-negative to preserve the protection area. */
10379 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10380 {
10381 /* We expect the registers to be saved when probes are used. */
10382 gcc_assert (int_registers_saved);
10383
10384 if (STACK_CHECK_MOVING_SP)
10385 {
10386 ix86_adjust_stack_and_probe (allocate);
10387 allocate = 0;
10388 }
10389 else
10390 {
10391 HOST_WIDE_INT size = allocate;
10392
10393 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10394 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10395
10396 if (TARGET_STACK_PROBE)
10397 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10398 else
10399 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10400 }
10401 }
10402
10403 if (allocate == 0)
10404 ;
10405 else if (!ix86_target_stack_probe ()
10406 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10407 {
10408 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10409 GEN_INT (-allocate), -1,
10410 m->fs.cfa_reg == stack_pointer_rtx);
10411 }
10412 else
10413 {
10414 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10415 rtx r10 = NULL;
10416 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10417
10418 bool eax_live = false;
10419 bool r10_live = false;
10420
10421 if (TARGET_64BIT)
10422 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10423 if (!TARGET_64BIT_MS_ABI)
10424 eax_live = ix86_eax_live_at_start_p ();
10425
10426 if (eax_live)
10427 {
10428 emit_insn (gen_push (eax));
10429 allocate -= UNITS_PER_WORD;
10430 }
10431 if (r10_live)
10432 {
10433 r10 = gen_rtx_REG (Pmode, R10_REG);
10434 emit_insn (gen_push (r10));
10435 allocate -= UNITS_PER_WORD;
10436 }
10437
10438 emit_move_insn (eax, GEN_INT (allocate));
10439 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
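/* On targets that require stack probing, this worker pattern typically
   expands to a call to a chkstk-style helper (the exact routine is
   OS/ABI dependent, so this is only a sketch of the usual behaviour);
   the requested size is passed in AX. */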
10440
10441 /* Use the fact that AX still contains ALLOCATE. */
10442 adjust_stack_insn = (Pmode == DImode
10443 ? gen_pro_epilogue_adjust_stack_di_sub
10444 : gen_pro_epilogue_adjust_stack_si_sub);
10445
10446 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10447 stack_pointer_rtx, eax));
10448
10449 /* Note that SEH directives need to continue tracking the stack
10450 pointer even after the frame pointer has been set up. */
10451 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10452 {
10453 if (m->fs.cfa_reg == stack_pointer_rtx)
10454 m->fs.cfa_offset += allocate;
10455
10456 RTX_FRAME_RELATED_P (insn) = 1;
10457 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10458 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10459 plus_constant (Pmode, stack_pointer_rtx,
10460 -allocate)));
10461 }
10462 m->fs.sp_offset += allocate;
10463
10464 if (r10_live && eax_live)
10465 {
10466 t = choose_baseaddr (m->fs.sp_offset - allocate);
10467 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10468 gen_frame_mem (word_mode, t));
10469 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10470 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10471 gen_frame_mem (word_mode, t));
10472 }
10473 else if (eax_live || r10_live)
10474 {
10475 t = choose_baseaddr (m->fs.sp_offset - allocate);
10476 emit_move_insn (gen_rtx_REG (word_mode,
10477 (eax_live ? AX_REG : R10_REG)),
10478 gen_frame_mem (word_mode, t));
10479 }
10480 }
10481 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10482
10483 /* If we haven't already set up the frame pointer, do so now. */
10484 if (frame_pointer_needed && !m->fs.fp_valid)
10485 {
10486 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10487 GEN_INT (frame.stack_pointer_offset
10488 - frame.hard_frame_pointer_offset));
10489 insn = emit_insn (insn);
10490 RTX_FRAME_RELATED_P (insn) = 1;
10491 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10492
10493 if (m->fs.cfa_reg == stack_pointer_rtx)
10494 m->fs.cfa_reg = hard_frame_pointer_rtx;
10495 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10496 m->fs.fp_valid = true;
10497 }
10498
10499 if (!int_registers_saved)
10500 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10501 if (frame.nsseregs)
10502 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10503
10504 pic_reg_used = false;
10505 if (pic_offset_table_rtx
10506 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10507 || crtl->profile))
10508 {
10509 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10510
10511 if (alt_pic_reg_used != INVALID_REGNUM)
10512 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10513
10514 pic_reg_used = true;
10515 }
10516
10517 if (pic_reg_used)
10518 {
10519 if (TARGET_64BIT)
10520 {
10521 if (ix86_cmodel == CM_LARGE_PIC)
10522 {
10523 rtx label, tmp_reg;
10524
10525 gcc_assert (Pmode == DImode);
10526 label = gen_label_rtx ();
10527 emit_label (label);
10528 LABEL_PRESERVE_P (label) = 1;
10529 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10530 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10531 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10532 label));
10533 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10534 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10535 pic_offset_table_rtx, tmp_reg));
10536 }
10537 else
10538 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10539 }
10540 else
10541 {
10542 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10543 RTX_FRAME_RELATED_P (insn) = 1;
10544 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10545 }
10546 }
10547
10548 /* In the pic_reg_used case, make sure that the got load isn't deleted
10549 when mcount needs it. Blockage to avoid call movement across mcount
10550 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10551 note. */
10552 if (crtl->profile && !flag_fentry && pic_reg_used)
10553 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10554
10555 if (crtl->drap_reg && !crtl->stack_realign_needed)
10556 {
10557 /* vDRAP is set up, but after reload it turns out that stack realignment
10558 isn't necessary. Here we emit the prologue to set up DRAP
10559 without the stack realignment adjustment. */
10560 t = choose_baseaddr (0);
10561 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10562 }
10563
10564 /* Prevent instructions from being scheduled into register save push
10565 sequence when access to the redzone area is done through frame pointer.
10566 The offset between the frame pointer and the stack pointer is calculated
10567 relative to the value of the stack pointer at the end of the function
10568 prologue, and moving instructions that access redzone area via frame
10569 pointer inside push sequence violates this assumption. */
10570 if (frame_pointer_needed && frame.red_zone_size)
10571 emit_insn (gen_memory_blockage ());
10572
10573 /* Emit cld instruction if stringops are used in the function. */
10574 if (TARGET_CLD && ix86_current_function_needs_cld)
10575 emit_insn (gen_cld ());
10576
10577 /* SEH requires that the prologue end within 256 bytes of the start of
10578 the function. Prevent instruction schedules that would extend that.
10579 Further, prevent alloca modifications to the stack pointer from being
10580 combined with prologue modifications. */
10581 if (TARGET_SEH)
10582 emit_insn (gen_prologue_use (stack_pointer_rtx));
10583 }
10584
10585 /* Emit code to restore REG using a POP insn. */
10586
10587 static void
10588 ix86_emit_restore_reg_using_pop (rtx reg)
10589 {
10590 struct machine_function *m = cfun->machine;
10591 rtx insn = emit_insn (gen_pop (reg));
10592
10593 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10594 m->fs.sp_offset -= UNITS_PER_WORD;
10595
10596 if (m->fs.cfa_reg == crtl->drap_reg
10597 && REGNO (reg) == REGNO (crtl->drap_reg))
10598 {
10599 /* Previously we'd represented the CFA as an expression
10600 like *(%ebp - 8). We've just popped that value from
10601 the stack, which means we need to reset the CFA to
10602 the drap register. This will remain until we restore
10603 the stack pointer. */
10604 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10605 RTX_FRAME_RELATED_P (insn) = 1;
10606
10607 /* This means that the DRAP register is valid for addressing too. */
10608 m->fs.drap_valid = true;
10609 return;
10610 }
10611
10612 if (m->fs.cfa_reg == stack_pointer_rtx)
10613 {
10614 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10615 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10616 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10617 RTX_FRAME_RELATED_P (insn) = 1;
10618
10619 m->fs.cfa_offset -= UNITS_PER_WORD;
10620 }
10621
10622 /* When the frame pointer is the CFA, and we pop it, we are
10623 swapping back to the stack pointer as the CFA. This happens
10624 for stack frames that don't allocate other data, so we assume
10625 the stack pointer is now pointing at the return address, i.e.
10626 the function entry state, which makes the offset be 1 word. */
10627 if (reg == hard_frame_pointer_rtx)
10628 {
10629 m->fs.fp_valid = false;
10630 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10631 {
10632 m->fs.cfa_reg = stack_pointer_rtx;
10633 m->fs.cfa_offset -= UNITS_PER_WORD;
10634
10635 add_reg_note (insn, REG_CFA_DEF_CFA,
10636 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10637 GEN_INT (m->fs.cfa_offset)));
10638 RTX_FRAME_RELATED_P (insn) = 1;
10639 }
10640 }
10641 }
10642
10643 /* Emit code to restore saved registers using POP insns. */
10644
10645 static void
10646 ix86_emit_restore_regs_using_pop (void)
10647 {
10648 unsigned int regno;
10649
10650 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10651 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10652 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10653 }
10654
10655 /* Emit code and notes for the LEAVE instruction. */
10656
10657 static void
10658 ix86_emit_leave (void)
10659 {
10660 struct machine_function *m = cfun->machine;
10661 rtx insn = emit_insn (ix86_gen_leave ());
10662
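/* leave behaves like movq %rbp, %rsp; popq %rbp (movl/popl on 32-bit),
   which is why sp becomes valid again at fp_offset - UNITS_PER_WORD. */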
10663 ix86_add_queued_cfa_restore_notes (insn);
10664
10665 gcc_assert (m->fs.fp_valid);
10666 m->fs.sp_valid = true;
10667 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10668 m->fs.fp_valid = false;
10669
10670 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10671 {
10672 m->fs.cfa_reg = stack_pointer_rtx;
10673 m->fs.cfa_offset = m->fs.sp_offset;
10674
10675 add_reg_note (insn, REG_CFA_DEF_CFA,
10676 plus_constant (Pmode, stack_pointer_rtx,
10677 m->fs.sp_offset));
10678 RTX_FRAME_RELATED_P (insn) = 1;
10679 }
10680 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10681 m->fs.fp_offset);
10682 }
10683
10684 /* Emit code to restore saved registers using MOV insns.
10685 First register is restored from CFA - CFA_OFFSET. */
10686 static void
10687 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10688 bool maybe_eh_return)
10689 {
10690 struct machine_function *m = cfun->machine;
10691 unsigned int regno;
10692
10693 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10694 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10695 {
10696 rtx reg = gen_rtx_REG (word_mode, regno);
10697 rtx insn, mem;
10698
10699 mem = choose_baseaddr (cfa_offset);
10700 mem = gen_frame_mem (word_mode, mem);
10701 insn = emit_move_insn (reg, mem);
10702
10703 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10704 {
10705 /* Previously we'd represented the CFA as an expression
10706 like *(%ebp - 8). We've just popped that value from
10707 the stack, which means we need to reset the CFA to
10708 the drap register. This will remain until we restore
10709 the stack pointer. */
10710 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10711 RTX_FRAME_RELATED_P (insn) = 1;
10712
10713 /* This means that the DRAP register is valid for addressing. */
10714 m->fs.drap_valid = true;
10715 }
10716 else
10717 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10718
10719 cfa_offset -= UNITS_PER_WORD;
10720 }
10721 }
10722
10723 /* Emit code to restore saved SSE registers using MOV insns.
10724 First register is restored from CFA - CFA_OFFSET. */
10725 static void
10726 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10727 bool maybe_eh_return)
10728 {
10729 unsigned int regno;
10730
10731 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10732 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10733 {
10734 rtx reg = gen_rtx_REG (V4SFmode, regno);
10735 rtx mem;
10736
10737 mem = choose_baseaddr (cfa_offset);
10738 mem = gen_rtx_MEM (V4SFmode, mem);
10739 set_mem_align (mem, 128);
10740 emit_move_insn (reg, mem);
10741
10742 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10743
10744 cfa_offset -= 16;
10745 }
10746 }
10747
10748 /* Emit vzeroupper if needed. */
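/* (Zeroing the upper halves of the YMM registers avoids the AVX/SSE
   transition penalty in callers that only use legacy SSE code.) */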
10749
10750 void
10751 ix86_maybe_emit_epilogue_vzeroupper (void)
10752 {
10753 if (TARGET_VZEROUPPER
10754 && !TREE_THIS_VOLATILE (cfun->decl)
10755 && !cfun->machine->caller_return_avx256_p)
10756 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
10757 }
10758
10759 /* Restore function stack, frame, and registers. */
10760
10761 void
10762 ix86_expand_epilogue (int style)
10763 {
10764 struct machine_function *m = cfun->machine;
10765 struct machine_frame_state frame_state_save = m->fs;
10766 struct ix86_frame frame;
10767 bool restore_regs_via_mov;
10768 bool using_drap;
10769
10770 ix86_finalize_stack_realign_flags ();
10771 ix86_compute_frame_layout (&frame);
10772
10773 m->fs.sp_valid = (!frame_pointer_needed
10774 || (current_function_sp_is_unchanging
10775 && !stack_realign_fp));
10776 gcc_assert (!m->fs.sp_valid
10777 || m->fs.sp_offset == frame.stack_pointer_offset);
10778
10779 /* The FP must be valid if the frame pointer is present. */
10780 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10781 gcc_assert (!m->fs.fp_valid
10782 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10783
10784 /* We must have *some* valid pointer to the stack frame. */
10785 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10786
10787 /* The DRAP is never valid at this point. */
10788 gcc_assert (!m->fs.drap_valid);
10789
10790 /* See the comment about red zone and frame
10791 pointer usage in ix86_expand_prologue. */
10792 if (frame_pointer_needed && frame.red_zone_size)
10793 emit_insn (gen_memory_blockage ());
10794
10795 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10796 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10797
10798 /* Determine the CFA offset of the end of the red-zone. */
10799 m->fs.red_zone_offset = 0;
10800 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10801 {
10802 /* The red-zone begins below the return address. */
10803 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10804
10805 /* When the register save area is in the aligned portion of
10806 the stack, determine the maximum runtime displacement that
10807 matches up with the aligned frame. */
10808 if (stack_realign_drap)
10809 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10810 + UNITS_PER_WORD);
10811 }
10812
10813 /* Special care must be taken for the normal return case of a function
10814 using eh_return: the eax and edx registers are marked as saved, but
10815 not restored along this path. Adjust the save location to match. */
10816 if (crtl->calls_eh_return && style != 2)
10817 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10818
10819 /* EH_RETURN requires the use of moves to function properly. */
10820 if (crtl->calls_eh_return)
10821 restore_regs_via_mov = true;
10822 /* SEH requires the use of pops to identify the epilogue. */
10823 else if (TARGET_SEH)
10824 restore_regs_via_mov = false;
10825 /* If we're only restoring one register and sp is not valid, then
10826 use a move instruction to restore the register, since it's
10827 less work than reloading sp and popping the register. */
10828 else if (!m->fs.sp_valid && frame.nregs <= 1)
10829 restore_regs_via_mov = true;
10830 else if (TARGET_EPILOGUE_USING_MOVE
10831 && cfun->machine->use_fast_prologue_epilogue
10832 && (frame.nregs > 1
10833 || m->fs.sp_offset != frame.reg_save_offset))
10834 restore_regs_via_mov = true;
10835 else if (frame_pointer_needed
10836 && !frame.nregs
10837 && m->fs.sp_offset != frame.reg_save_offset)
10838 restore_regs_via_mov = true;
10839 else if (frame_pointer_needed
10840 && TARGET_USE_LEAVE
10841 && cfun->machine->use_fast_prologue_epilogue
10842 && frame.nregs == 1)
10843 restore_regs_via_mov = true;
10844 else
10845 restore_regs_via_mov = false;
10846
10847 if (restore_regs_via_mov || frame.nsseregs)
10848 {
10849 /* Ensure that the entire register save area is addressable via
10850 the stack pointer, if we will restore via sp. */
10851 if (TARGET_64BIT
10852 && m->fs.sp_offset > 0x7fffffff
10853 && !(m->fs.fp_valid || m->fs.drap_valid)
10854 && (frame.nsseregs + frame.nregs) != 0)
10855 {
10856 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10857 GEN_INT (m->fs.sp_offset
10858 - frame.sse_reg_save_offset),
10859 style,
10860 m->fs.cfa_reg == stack_pointer_rtx);
10861 }
10862 }
10863
10864 /* If there are any SSE registers to restore, then we have to do it
10865 via moves, since there's obviously no pop for SSE regs. */
10866 if (frame.nsseregs)
10867 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10868 style == 2);
10869
10870 if (restore_regs_via_mov)
10871 {
10872 rtx t;
10873
10874 if (frame.nregs)
10875 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10876
10877 /* eh_return epilogues need %ecx added to the stack pointer. */
10878 if (style == 2)
10879 {
10880 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10881
10882 /* Stack align doesn't work with eh_return. */
10883 gcc_assert (!stack_realign_drap);
10884 /* Neither do regparm nested functions. */
10885 gcc_assert (!ix86_static_chain_on_stack);
10886
10887 if (frame_pointer_needed)
10888 {
10889 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10890 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
10891 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10892
10893 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10894 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10895
10896 /* Note that we use SA as a temporary CFA, as the return
10897 address is at the proper place relative to it. We
10898 pretend this happens at the FP restore insn because
10899 prior to this insn the FP would be stored at the wrong
10900 offset relative to SA, and after this insn we have no
10901 other reasonable register to use for the CFA. We don't
10902 bother resetting the CFA to the SP for the duration of
10903 the return insn. */
10904 add_reg_note (insn, REG_CFA_DEF_CFA,
10905 plus_constant (Pmode, sa, UNITS_PER_WORD));
10906 ix86_add_queued_cfa_restore_notes (insn);
10907 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10908 RTX_FRAME_RELATED_P (insn) = 1;
10909
10910 m->fs.cfa_reg = sa;
10911 m->fs.cfa_offset = UNITS_PER_WORD;
10912 m->fs.fp_valid = false;
10913
10914 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10915 const0_rtx, style, false);
10916 }
10917 else
10918 {
10919 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10920 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
10921 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10922 ix86_add_queued_cfa_restore_notes (insn);
10923
10924 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10925 if (m->fs.cfa_offset != UNITS_PER_WORD)
10926 {
10927 m->fs.cfa_offset = UNITS_PER_WORD;
10928 add_reg_note (insn, REG_CFA_DEF_CFA,
10929 plus_constant (Pmode, stack_pointer_rtx,
10930 UNITS_PER_WORD));
10931 RTX_FRAME_RELATED_P (insn) = 1;
10932 }
10933 }
10934 m->fs.sp_offset = UNITS_PER_WORD;
10935 m->fs.sp_valid = true;
10936 }
10937 }
10938 else
10939 {
10940 /* SEH requires that the function end with (1) a stack adjustment
10941 if necessary, (2) a sequence of pops, and (3) a return or
10942 jump instruction. Prevent insns from the function body from
10943 being scheduled into this sequence. */
10944 if (TARGET_SEH)
10945 {
10946 /* Prevent a catch region from being adjacent to the standard
10947 epilogue sequence. Unfortunately neither crtl->uses_eh_lsda nor
10948 several other flags that would be interesting to test are
10949 yet set up. */
10950 if (flag_non_call_exceptions)
10951 emit_insn (gen_nops (const1_rtx));
10952 else
10953 emit_insn (gen_blockage ());
10954 }
10955
10956 /* First step is to deallocate the stack frame so that we can
10957 pop the registers. */
10958 if (!m->fs.sp_valid)
10959 {
10960 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
10961 GEN_INT (m->fs.fp_offset
10962 - frame.reg_save_offset),
10963 style, false);
10964 }
10965 else if (m->fs.sp_offset != frame.reg_save_offset)
10966 {
10967 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10968 GEN_INT (m->fs.sp_offset
10969 - frame.reg_save_offset),
10970 style,
10971 m->fs.cfa_reg == stack_pointer_rtx);
10972 }
10973
10974 ix86_emit_restore_regs_using_pop ();
10975 }
10976
10977 /* If we used a frame pointer and haven't already got rid of it,
10978 then do so now. */
10979 if (m->fs.fp_valid)
10980 {
10981 /* If the stack pointer is valid and pointing at the frame
10982 pointer store address, then we only need a pop. */
10983 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
10984 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10985 /* Leave results in shorter dependency chains on CPUs that are
10986 able to grok it fast. */
10987 else if (TARGET_USE_LEAVE
10988 || optimize_function_for_size_p (cfun)
10989 || !cfun->machine->use_fast_prologue_epilogue)
10990 ix86_emit_leave ();
10991 else
10992 {
10993 pro_epilogue_adjust_stack (stack_pointer_rtx,
10994 hard_frame_pointer_rtx,
10995 const0_rtx, style, !using_drap);
10996 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10997 }
10998 }
10999
11000 if (using_drap)
11001 {
11002 int param_ptr_offset = UNITS_PER_WORD;
11003 rtx insn;
11004
11005 gcc_assert (stack_realign_drap);
11006
11007 if (ix86_static_chain_on_stack)
11008 param_ptr_offset += UNITS_PER_WORD;
11009 if (!call_used_regs[REGNO (crtl->drap_reg)])
11010 param_ptr_offset += UNITS_PER_WORD;
11011
11012 insn = emit_insn (gen_rtx_SET
11013 (VOIDmode, stack_pointer_rtx,
11014 gen_rtx_PLUS (Pmode,
11015 crtl->drap_reg,
11016 GEN_INT (-param_ptr_offset))));
11017 m->fs.cfa_reg = stack_pointer_rtx;
11018 m->fs.cfa_offset = param_ptr_offset;
11019 m->fs.sp_offset = param_ptr_offset;
11020 m->fs.realigned = false;
11021
11022 add_reg_note (insn, REG_CFA_DEF_CFA,
11023 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11024 GEN_INT (param_ptr_offset)));
11025 RTX_FRAME_RELATED_P (insn) = 1;
11026
11027 if (!call_used_regs[REGNO (crtl->drap_reg)])
11028 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11029 }
11030
11031 /* At this point the stack pointer must be valid, and we must have
11032 restored all of the registers. We may not have deallocated the
11033 entire stack frame. We've delayed this until now because it may
11034 be possible to merge the local stack deallocation with the
11035 deallocation forced by ix86_static_chain_on_stack. */
11036 gcc_assert (m->fs.sp_valid);
11037 gcc_assert (!m->fs.fp_valid);
11038 gcc_assert (!m->fs.realigned);
11039 if (m->fs.sp_offset != UNITS_PER_WORD)
11040 {
11041 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11042 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11043 style, true);
11044 }
11045 else
11046 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11047
11048 /* Sibcall epilogues don't want a return instruction. */
11049 if (style == 0)
11050 {
11051 m->fs = frame_state_save;
11052 return;
11053 }
11054
11055 /* Emit vzeroupper if needed. */
11056 ix86_maybe_emit_epilogue_vzeroupper ();
11057
11058 if (crtl->args.pops_args && crtl->args.size)
11059 {
11060 rtx popc = GEN_INT (crtl->args.pops_args);
11061
11062 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
11063 address, do an explicit add, and jump indirectly to the caller. */
11064
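/* Roughly, the >= 64K path below expands to
   popl %ecx; addl $pops_args, %esp; jmp *%ecx. */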
11065 if (crtl->args.pops_args >= 65536)
11066 {
11067 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11068 rtx insn;
11069
11070 /* There is no "pascal" calling convention in any 64bit ABI. */
11071 gcc_assert (!TARGET_64BIT);
11072
11073 insn = emit_insn (gen_pop (ecx));
11074 m->fs.cfa_offset -= UNITS_PER_WORD;
11075 m->fs.sp_offset -= UNITS_PER_WORD;
11076
11077 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11078 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11079 add_reg_note (insn, REG_CFA_REGISTER,
11080 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11081 RTX_FRAME_RELATED_P (insn) = 1;
11082
11083 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11084 popc, -1, true);
11085 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11086 }
11087 else
11088 emit_jump_insn (gen_simple_return_pop_internal (popc));
11089 }
11090 else
11091 emit_jump_insn (gen_simple_return_internal ());
11092
11093 /* Restore the state back to the state from the prologue,
11094 so that it's correct for the next epilogue. */
11095 m->fs = frame_state_save;
11096 }
11097
11098 /* Reset from the function's potential modifications. */
11099
11100 static void
11101 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11102 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11103 {
11104 if (pic_offset_table_rtx)
11105 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11106 #if TARGET_MACHO
11107 /* Mach-O doesn't support labels at the end of objects, so if
11108 it looks like we might want one, insert a NOP. */
11109 {
11110 rtx insn = get_last_insn ();
11111 rtx deleted_debug_label = NULL_RTX;
11112 while (insn
11113 && NOTE_P (insn)
11114 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11115 {
11116 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11117 notes; instead set their CODE_LABEL_NUMBER to -1,
11118 otherwise there would be code generation differences
11119 between -g and -g0. */
11120 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11121 deleted_debug_label = insn;
11122 insn = PREV_INSN (insn);
11123 }
11124 if (insn
11125 && (LABEL_P (insn)
11126 || (NOTE_P (insn)
11127 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11128 fputs ("\tnop\n", file);
11129 else if (deleted_debug_label)
11130 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11131 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11132 CODE_LABEL_NUMBER (insn) = -1;
11133 }
11134 #endif
11135
11136 }
11137
11138 /* Return a scratch register to use in the split stack prologue. The
11139 split stack prologue is used for -fsplit-stack. Its instructions come
11140 first in the function, even before the regular prologue.
11141 The scratch register can be any caller-saved register which is not
11142 used for parameters or for the static chain. */
11143
11144 static unsigned int
11145 split_stack_prologue_scratch_regno (void)
11146 {
11147 if (TARGET_64BIT)
11148 return R11_REG;
11149 else
11150 {
11151 bool is_fastcall;
11152 int regparm;
11153
11154 is_fastcall = (lookup_attribute ("fastcall",
11155 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11156 != NULL);
11157 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11158
11159 if (is_fastcall)
11160 {
11161 if (DECL_STATIC_CHAIN (cfun->decl))
11162 {
11163 sorry ("-fsplit-stack does not support fastcall with "
11164 "nested function");
11165 return INVALID_REGNUM;
11166 }
11167 return AX_REG;
11168 }
11169 else if (regparm < 3)
11170 {
11171 if (!DECL_STATIC_CHAIN (cfun->decl))
11172 return CX_REG;
11173 else
11174 {
11175 if (regparm >= 2)
11176 {
11177 sorry ("-fsplit-stack does not support 2 register "
11178 " parameters for a nested function");
11179 return INVALID_REGNUM;
11180 }
11181 return DX_REG;
11182 }
11183 }
11184 else
11185 {
11186 /* FIXME: We could make this work by pushing a register
11187 around the addition and comparison. */
11188 sorry ("-fsplit-stack does not support 3 register parameters");
11189 return INVALID_REGNUM;
11190 }
11191 }
11192 }
11193
11194 /* A SYMBOL_REF for the function which allocates new stack space for
11195 -fsplit-stack. */
11196
11197 static GTY(()) rtx split_stack_fn;
11198
11199 /* A SYMBOL_REF for the stack allocation function used with the large
11200 code model. */
11201
11202 static GTY(()) rtx split_stack_fn_large;
11203
11204 /* Handle -fsplit-stack. These are the first instructions in the
11205 function, even before the regular prologue. */
11206
11207 void
11208 ix86_expand_split_stack_prologue (void)
11209 {
11210 struct ix86_frame frame;
11211 HOST_WIDE_INT allocate;
11212 unsigned HOST_WIDE_INT args_size;
11213 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11214 rtx scratch_reg = NULL_RTX;
11215 rtx varargs_label = NULL_RTX;
11216 rtx fn;
11217
11218 gcc_assert (flag_split_stack && reload_completed);
11219
11220 ix86_finalize_stack_realign_flags ();
11221 ix86_compute_frame_layout (&frame);
11222 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11223
11224 /* This is the label we will branch to if we have enough stack
11225 space. We expect the basic block reordering pass to reverse this
11226 branch if optimizing, so that we branch in the unlikely case. */
11227 label = gen_label_rtx ();
11228
11229 /* We need to compare the stack pointer minus the frame size with
11230 the stack boundary in the TCB. The stack boundary always gives
11231 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11232 can compare directly. Otherwise we need to do an addition. */
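/* On typical TLS configurations the resulting comparison looks roughly
   like cmp %fs:<guard offset>, <sp or scratch>; jae .Lenough, with the
   guard slot addressed through the UNSPEC_STACK_CHECK constant built
   below (the exact segment and offset are target dependent). */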
11233
11234 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11235 UNSPEC_STACK_CHECK);
11236 limit = gen_rtx_CONST (Pmode, limit);
11237 limit = gen_rtx_MEM (Pmode, limit);
11238 if (allocate < SPLIT_STACK_AVAILABLE)
11239 current = stack_pointer_rtx;
11240 else
11241 {
11242 unsigned int scratch_regno;
11243 rtx offset;
11244
11245 /* We need a scratch register to hold the stack pointer minus
11246 the required frame size. Since this is the very start of the
11247 function, the scratch register can be any caller-saved
11248 register which is not used for parameters. */
11249 offset = GEN_INT (- allocate);
11250 scratch_regno = split_stack_prologue_scratch_regno ();
11251 if (scratch_regno == INVALID_REGNUM)
11252 return;
11253 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11254 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11255 {
11256 /* We don't use ix86_gen_add3 in this case because it will
11257 want to split to lea, but when not optimizing the insn
11258 will not be split after this point. */
11259 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11260 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11261 offset)));
11262 }
11263 else
11264 {
11265 emit_move_insn (scratch_reg, offset);
11266 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11267 stack_pointer_rtx));
11268 }
11269 current = scratch_reg;
11270 }
11271
11272 ix86_expand_branch (GEU, current, limit, label);
11273 jump_insn = get_last_insn ();
11274 JUMP_LABEL (jump_insn) = label;
11275
11276 /* Mark the jump as very likely to be taken. */
11277 add_reg_note (jump_insn, REG_BR_PROB,
11278 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11279
11280 if (split_stack_fn == NULL_RTX)
11281 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11282 fn = split_stack_fn;
11283
11284 /* Get more stack space. We pass in the desired stack space and the
11285 size of the arguments to copy to the new stack. In 32-bit mode
11286 we push the parameters; __morestack will return on a new stack
11287 anyhow. In 64-bit mode we pass the parameters in r10 and
11288 r11. */
11289 allocate_rtx = GEN_INT (allocate);
11290 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11291 call_fusage = NULL_RTX;
11292 if (TARGET_64BIT)
11293 {
11294 rtx reg10, reg11;
11295
11296 reg10 = gen_rtx_REG (Pmode, R10_REG);
11297 reg11 = gen_rtx_REG (Pmode, R11_REG);
11298
11299 /* If this function uses a static chain, it will be in %r10.
11300 Preserve it across the call to __morestack. */
11301 if (DECL_STATIC_CHAIN (cfun->decl))
11302 {
11303 rtx rax;
11304
11305 rax = gen_rtx_REG (word_mode, AX_REG);
11306 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11307 use_reg (&call_fusage, rax);
11308 }
11309
11310 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11311 {
11312 HOST_WIDE_INT argval;
11313
11314 gcc_assert (Pmode == DImode);
11315 /* When using the large model we need to load the address
11316 into a register, and we've run out of registers. So we
11317 switch to a different calling convention, and we call a
11318 different function: __morestack_large. We pass the
11319 argument size in the upper 32 bits of r10 and pass the
11320 frame size in the lower 32 bits. */
11321 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11322 gcc_assert ((args_size & 0xffffffff) == args_size);
11323
11324 if (split_stack_fn_large == NULL_RTX)
11325 split_stack_fn_large =
11326 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11327
11328 if (ix86_cmodel == CM_LARGE_PIC)
11329 {
11330 rtx label, x;
11331
11332 label = gen_label_rtx ();
11333 emit_label (label);
11334 LABEL_PRESERVE_P (label) = 1;
11335 emit_insn (gen_set_rip_rex64 (reg10, label));
11336 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11337 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11338 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11339 UNSPEC_GOT);
11340 x = gen_rtx_CONST (Pmode, x);
11341 emit_move_insn (reg11, x);
11342 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11343 x = gen_const_mem (Pmode, x);
11344 emit_move_insn (reg11, x);
11345 }
11346 else
11347 emit_move_insn (reg11, split_stack_fn_large);
11348
11349 fn = reg11;
11350
11351 argval = ((args_size << 16) << 16) + allocate;
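/* For instance (illustrative values), args_size == 0x20 and
   allocate == 0x1000 give argval == 0x0000002000001000. */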
11352 emit_move_insn (reg10, GEN_INT (argval));
11353 }
11354 else
11355 {
11356 emit_move_insn (reg10, allocate_rtx);
11357 emit_move_insn (reg11, GEN_INT (args_size));
11358 use_reg (&call_fusage, reg11);
11359 }
11360
11361 use_reg (&call_fusage, reg10);
11362 }
11363 else
11364 {
11365 emit_insn (gen_push (GEN_INT (args_size)));
11366 emit_insn (gen_push (allocate_rtx));
11367 }
11368 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11369 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11370 NULL_RTX, false);
11371 add_function_usage_to (call_insn, call_fusage);
11372
11373 /* In order to make call/return prediction work right, we now need
11374 to execute a return instruction. See
11375 libgcc/config/i386/morestack.S for the details on how this works.
11376
11377 For flow purposes gcc must not see this as a return
11378 instruction--we need control flow to continue at the subsequent
11379 label. Therefore, we use an unspec. */
11380 gcc_assert (crtl->args.pops_args < 65536);
11381 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11382
11383 /* If we are in 64-bit mode and this function uses a static chain,
11384 we saved %r10 in %rax before calling __morestack. */
11385 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11386 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11387 gen_rtx_REG (word_mode, AX_REG));
11388
11389 /* If this function calls va_start, we need to store a pointer to
11390 the arguments on the old stack, because they may not have been
11391 all copied to the new stack. At this point the old stack can be
11392 found at the frame pointer value used by __morestack, because
11393 __morestack has set that up before calling back to us. Here we
11394 store that pointer in a scratch register, and in
11395 ix86_expand_prologue we store the scratch register in a stack
11396 slot. */
11397 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11398 {
11399 unsigned int scratch_regno;
11400 rtx frame_reg;
11401 int words;
11402
11403 scratch_regno = split_stack_prologue_scratch_regno ();
11404 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11405 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11406
11407 /* 64-bit:
11408 fp -> old fp value
11409 return address within this function
11410 return address of caller of this function
11411 stack arguments
11412 So we add three words to get to the stack arguments.
11413
11414 32-bit:
11415 fp -> old fp value
11416 return address within this function
11417 first argument to __morestack
11418 second argument to __morestack
11419 return address of caller of this function
11420 stack arguments
11421 So we add five words to get to the stack arguments.
11422 */
11423 words = TARGET_64BIT ? 3 : 5;
11424 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11425 gen_rtx_PLUS (Pmode, frame_reg,
11426 GEN_INT (words * UNITS_PER_WORD))));
11427
11428 varargs_label = gen_label_rtx ();
11429 emit_jump_insn (gen_jump (varargs_label));
11430 JUMP_LABEL (get_last_insn ()) = varargs_label;
11431
11432 emit_barrier ();
11433 }
11434
11435 emit_label (label);
11436 LABEL_NUSES (label) = 1;
11437
11438 /* If this function calls va_start, we now have to set the scratch
11439 register for the case where we do not call __morestack. In this
11440 case we need to set it based on the stack pointer. */
11441 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11442 {
11443 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11444 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11445 GEN_INT (UNITS_PER_WORD))));
11446
11447 emit_label (varargs_label);
11448 LABEL_NUSES (varargs_label) = 1;
11449 }
11450 }
11451
11452 /* We may have to tell the dataflow pass that the split stack prologue
11453 is initializing a scratch register. */
11454
11455 static void
11456 ix86_live_on_entry (bitmap regs)
11457 {
11458 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11459 {
11460 gcc_assert (flag_split_stack);
11461 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11462 }
11463 }
11464 \f
11465 /* Determine if OP is a suitable SUBREG RTX for use in an address. */
11466
11467 static bool
11468 ix86_address_subreg_operand (rtx op)
11469 {
11470 enum machine_mode mode;
11471
11472 if (!REG_P (op))
11473 return false;
11474
11475 mode = GET_MODE (op);
11476
11477 if (GET_MODE_CLASS (mode) != MODE_INT)
11478 return false;
11479
11480 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11481 failures when the register is one word out of a two word structure. */
11482 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11483 return false;
11484
11485 /* Allow only SUBREGs of non-eliminable hard registers. */
11486 return register_no_elim_operand (op, mode);
11487 }
11488
11489 /* Extract the parts of an RTL expression that is a valid memory address
11490 for an instruction. Return 0 if the structure of the address is
11491 grossly off. Return -1 if the address contains ASHIFT, so it is not
11492 strictly valid, but still used for computing length of lea instruction. */
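/* For example (illustrative), (plus:SI (reg %eax)
   (mult:SI (reg %ebx) (const_int 4)))
   decomposes into base = %eax, index = %ebx, scale = 4 and a null
   displacement, i.e. roughly the operand of an address like (%eax,%ebx,4). */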
11493
11494 int
11495 ix86_decompose_address (rtx addr, struct ix86_address *out)
11496 {
11497 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11498 rtx base_reg, index_reg;
11499 HOST_WIDE_INT scale = 1;
11500 rtx scale_rtx = NULL_RTX;
11501 rtx tmp;
11502 int retval = 1;
11503 enum ix86_address_seg seg = SEG_DEFAULT;
11504
11505 /* Allow zero-extended SImode addresses;
11506 they will be emitted with the addr32 prefix. */
11507 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11508 {
11509 if (GET_CODE (addr) == ZERO_EXTEND
11510 && GET_MODE (XEXP (addr, 0)) == SImode)
11511 addr = XEXP (addr, 0);
11512 else if (GET_CODE (addr) == AND
11513 && const_32bit_mask (XEXP (addr, 1), DImode))
11514 {
11515 addr = XEXP (addr, 0);
11516
11517 /* Adjust SUBREGs. */
11518 if (GET_CODE (addr) == SUBREG
11519 && GET_MODE (SUBREG_REG (addr)) == SImode)
11520 addr = SUBREG_REG (addr);
11521 else if (GET_MODE (addr) == DImode)
11522 addr = gen_rtx_SUBREG (SImode, addr, 0);
11523 else if (GET_MODE (addr) != VOIDmode)
11524 return 0;
11525 }
11526 }
11527
11528 if (REG_P (addr))
11529 base = addr;
11530 else if (GET_CODE (addr) == SUBREG)
11531 {
11532 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11533 base = addr;
11534 else
11535 return 0;
11536 }
11537 else if (GET_CODE (addr) == PLUS)
11538 {
11539 rtx addends[4], op;
11540 int n = 0, i;
11541
11542 op = addr;
11543 do
11544 {
11545 if (n >= 4)
11546 return 0;
11547 addends[n++] = XEXP (op, 1);
11548 op = XEXP (op, 0);
11549 }
11550 while (GET_CODE (op) == PLUS);
11551 if (n >= 4)
11552 return 0;
11553 addends[n] = op;
11554
11555 for (i = n; i >= 0; --i)
11556 {
11557 op = addends[i];
11558 switch (GET_CODE (op))
11559 {
11560 case MULT:
11561 if (index)
11562 return 0;
11563 index = XEXP (op, 0);
11564 scale_rtx = XEXP (op, 1);
11565 break;
11566
11567 case ASHIFT:
11568 if (index)
11569 return 0;
11570 index = XEXP (op, 0);
11571 tmp = XEXP (op, 1);
11572 if (!CONST_INT_P (tmp))
11573 return 0;
11574 scale = INTVAL (tmp);
11575 if ((unsigned HOST_WIDE_INT) scale > 3)
11576 return 0;
11577 scale = 1 << scale;
11578 break;
11579
11580 case ZERO_EXTEND:
11581 op = XEXP (op, 0);
11582 if (GET_CODE (op) != UNSPEC)
11583 return 0;
11584 /* FALLTHRU */
11585
11586 case UNSPEC:
11587 if (XINT (op, 1) == UNSPEC_TP
11588 && TARGET_TLS_DIRECT_SEG_REFS
11589 && seg == SEG_DEFAULT)
11590 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11591 else
11592 return 0;
11593 break;
11594
11595 case SUBREG:
11596 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11597 return 0;
11598 /* FALLTHRU */
11599
11600 case REG:
11601 if (!base)
11602 base = op;
11603 else if (!index)
11604 index = op;
11605 else
11606 return 0;
11607 break;
11608
11609 case CONST:
11610 case CONST_INT:
11611 case SYMBOL_REF:
11612 case LABEL_REF:
11613 if (disp)
11614 return 0;
11615 disp = op;
11616 break;
11617
11618 default:
11619 return 0;
11620 }
11621 }
11622 }
11623 else if (GET_CODE (addr) == MULT)
11624 {
11625 index = XEXP (addr, 0); /* index*scale */
11626 scale_rtx = XEXP (addr, 1);
11627 }
11628 else if (GET_CODE (addr) == ASHIFT)
11629 {
11630 /* We're called for lea too, which implements ashift on occasion. */
11631 index = XEXP (addr, 0);
11632 tmp = XEXP (addr, 1);
11633 if (!CONST_INT_P (tmp))
11634 return 0;
11635 scale = INTVAL (tmp);
11636 if ((unsigned HOST_WIDE_INT) scale > 3)
11637 return 0;
11638 scale = 1 << scale;
11639 retval = -1;
11640 }
11641 else
11642 disp = addr; /* displacement */
11643
11644 if (index)
11645 {
11646 if (REG_P (index))
11647 ;
11648 else if (GET_CODE (index) == SUBREG
11649 && ix86_address_subreg_operand (SUBREG_REG (index)))
11650 ;
11651 else
11652 return 0;
11653 }
11654
11655 /* Address override works only on the (%reg) part of %fs:(%reg). */
11656 if (seg != SEG_DEFAULT
11657 && ((base && GET_MODE (base) != word_mode)
11658 || (index && GET_MODE (index) != word_mode)))
11659 return 0;
11660
11661 /* Extract the integral value of scale. */
11662 if (scale_rtx)
11663 {
11664 if (!CONST_INT_P (scale_rtx))
11665 return 0;
11666 scale = INTVAL (scale_rtx);
11667 }
11668
11669 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11670 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11671
11672 /* Avoid useless 0 displacement. */
11673 if (disp == const0_rtx && (base || index))
11674 disp = NULL_RTX;
11675
11676 /* Allow arg pointer and stack pointer as index if there is no scaling. */
11677 if (base_reg && index_reg && scale == 1
11678 && (index_reg == arg_pointer_rtx
11679 || index_reg == frame_pointer_rtx
11680 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11681 {
11682 rtx tmp;
11683 tmp = base, base = index, index = tmp;
11684 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11685 }
11686
11687 /* Special case: %ebp cannot be encoded as a base without a displacement.
11688 Similarly %r13. */
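/* (With mod == 00 that base encoding is reused for displacement-only,
   and in 64-bit mode RIP-relative, addressing, so a zero displacement
   byte is emitted instead, e.g. 0(%ebp) or 0(%r13).) */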
11689 if (!disp
11690 && base_reg
11691 && (base_reg == hard_frame_pointer_rtx
11692 || base_reg == frame_pointer_rtx
11693 || base_reg == arg_pointer_rtx
11694 || (REG_P (base_reg)
11695 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11696 || REGNO (base_reg) == R13_REG))))
11697 disp = const0_rtx;
11698
11699 /* Special case: on K6, [%esi] forces the instruction to be vector decoded.
11700 Avoid this by transforming to [%esi+0].
11701 Reload calls address legitimization without cfun defined, so we need
11702 to test cfun for being non-NULL. */
11703 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11704 && base_reg && !index_reg && !disp
11705 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11706 disp = const0_rtx;
11707
11708 /* Special case: encode reg+reg instead of reg*2. */
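/* E.g. (,%eax,2) is represented as (%eax,%eax,1), which avoids the
   32-bit displacement that a base-less scaled index would require. */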
11709 if (!base && index && scale == 2)
11710 base = index, base_reg = index_reg, scale = 1;
11711
11712 /* Special case: scaling cannot be encoded without base or displacement. */
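/* E.g. a lone (,%eax,4) must be emitted with an explicit zero
   displacement, as 0(,%eax,4). */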
11713 if (!base && !disp && index && scale != 1)
11714 disp = const0_rtx;
11715
11716 out->base = base;
11717 out->index = index;
11718 out->disp = disp;
11719 out->scale = scale;
11720 out->seg = seg;
11721
11722 return retval;
11723 }
11724 \f
11725 /* Return the cost of the memory address X.
11726 For i386, it is better to use a complex address than let gcc copy
11727 the address into a reg and make a new pseudo. But not if the address
11728 requires two regs - that would mean more pseudos with longer
11729 lifetimes. */
11730 static int
11731 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
11732 {
11733 struct ix86_address parts;
11734 int cost = 1;
11735 int ok = ix86_decompose_address (x, &parts);
11736
11737 gcc_assert (ok);
11738
11739 if (parts.base && GET_CODE (parts.base) == SUBREG)
11740 parts.base = SUBREG_REG (parts.base);
11741 if (parts.index && GET_CODE (parts.index) == SUBREG)
11742 parts.index = SUBREG_REG (parts.index);
11743
11744 /* Attempt to minimize number of registers in the address. */
11745 if ((parts.base
11746 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11747 || (parts.index
11748 && (!REG_P (parts.index)
11749 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11750 cost++;
11751
11752 if (parts.base
11753 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11754 && parts.index
11755 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11756 && parts.base != parts.index)
11757 cost++;
11758
11759 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11760 since its predecode logic can't detect the length of instructions
11761 and decoding degenerates to the vector decoder. Increase the cost of
11762 such addresses here. The penalty is minimally 2 cycles. It may be
11763 worthwhile to split such addresses or even refuse them at all.
11764 
11765 The following addressing modes are affected:
11766 [base+scale*index]
11767 [scale*index+disp]
11768 [base+index]
11769
11770 The first and last case may be avoidable by explicitly coding the zero in
11771 the memory address, but I don't have an AMD-K6 machine handy to check this
11772 theory. */
11773
11774 if (TARGET_K6
11775 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11776 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11777 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11778 cost += 10;
11779
11780 return cost;
11781 }
11782 \f
11783 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
11784 this is used to form addresses to local data when -fPIC is in
11785 use. */
11786
11787 static bool
11788 darwin_local_data_pic (rtx disp)
11789 {
11790 return (GET_CODE (disp) == UNSPEC
11791 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11792 }
11793
11794 /* Determine if a given RTX is a valid constant. We already know this
11795 satisfies CONSTANT_P. */
11796
11797 static bool
11798 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11799 {
11800 switch (GET_CODE (x))
11801 {
11802 case CONST:
11803 x = XEXP (x, 0);
11804
11805 if (GET_CODE (x) == PLUS)
11806 {
11807 if (!CONST_INT_P (XEXP (x, 1)))
11808 return false;
11809 x = XEXP (x, 0);
11810 }
11811
11812 if (TARGET_MACHO && darwin_local_data_pic (x))
11813 return true;
11814
11815 /* Only some unspecs are valid as "constants". */
11816 if (GET_CODE (x) == UNSPEC)
11817 switch (XINT (x, 1))
11818 {
11819 case UNSPEC_GOT:
11820 case UNSPEC_GOTOFF:
11821 case UNSPEC_PLTOFF:
11822 return TARGET_64BIT;
11823 case UNSPEC_TPOFF:
11824 case UNSPEC_NTPOFF:
11825 x = XVECEXP (x, 0, 0);
11826 return (GET_CODE (x) == SYMBOL_REF
11827 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11828 case UNSPEC_DTPOFF:
11829 x = XVECEXP (x, 0, 0);
11830 return (GET_CODE (x) == SYMBOL_REF
11831 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11832 default:
11833 return false;
11834 }
11835
11836 /* We must have drilled down to a symbol. */
11837 if (GET_CODE (x) == LABEL_REF)
11838 return true;
11839 if (GET_CODE (x) != SYMBOL_REF)
11840 return false;
11841 /* FALLTHRU */
11842
11843 case SYMBOL_REF:
11844 /* TLS symbols are never valid. */
11845 if (SYMBOL_REF_TLS_MODEL (x))
11846 return false;
11847
11848 /* DLLIMPORT symbols are never valid. */
11849 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11850 && SYMBOL_REF_DLLIMPORT_P (x))
11851 return false;
11852
11853 #if TARGET_MACHO
11854 /* mdynamic-no-pic */
11855 if (MACHO_DYNAMIC_NO_PIC_P)
11856 return machopic_symbol_defined_p (x);
11857 #endif
11858 break;
11859
11860 case CONST_DOUBLE:
11861 if (GET_MODE (x) == TImode
11862 && x != CONST0_RTX (TImode)
11863 && !TARGET_64BIT)
11864 return false;
11865 break;
11866
11867 case CONST_VECTOR:
11868 if (!standard_sse_constant_p (x))
11869 return false;
11870
11871 default:
11872 break;
11873 }
11874
11875 /* Otherwise we handle everything else in the move patterns. */
11876 return true;
11877 }
11878
11879 /* Determine if it's legal to put X into the constant pool. This
11880 is not possible for the address of thread-local symbols, which
11881 is checked above. */
11882
11883 static bool
11884 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11885 {
11886 /* We can always put integral constants and vectors in memory. */
11887 switch (GET_CODE (x))
11888 {
11889 case CONST_INT:
11890 case CONST_DOUBLE:
11891 case CONST_VECTOR:
11892 return false;
11893
11894 default:
11895 break;
11896 }
11897 return !ix86_legitimate_constant_p (mode, x);
11898 }
11899
11900
11901 /* Nonzero if the constant value X is a legitimate general operand
11902 when generating PIC code. It is given that flag_pic is on and
11903 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11904
11905 bool
11906 legitimate_pic_operand_p (rtx x)
11907 {
11908 rtx inner;
11909
11910 switch (GET_CODE (x))
11911 {
11912 case CONST:
11913 inner = XEXP (x, 0);
11914 if (GET_CODE (inner) == PLUS
11915 && CONST_INT_P (XEXP (inner, 1)))
11916 inner = XEXP (inner, 0);
11917
11918 /* Only some unspecs are valid as "constants". */
11919 if (GET_CODE (inner) == UNSPEC)
11920 switch (XINT (inner, 1))
11921 {
11922 case UNSPEC_GOT:
11923 case UNSPEC_GOTOFF:
11924 case UNSPEC_PLTOFF:
11925 return TARGET_64BIT;
11926 case UNSPEC_TPOFF:
11927 x = XVECEXP (inner, 0, 0);
11928 return (GET_CODE (x) == SYMBOL_REF
11929 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11930 case UNSPEC_MACHOPIC_OFFSET:
11931 return legitimate_pic_address_disp_p (x);
11932 default:
11933 return false;
11934 }
11935 /* FALLTHRU */
11936
11937 case SYMBOL_REF:
11938 case LABEL_REF:
11939 return legitimate_pic_address_disp_p (x);
11940
11941 default:
11942 return true;
11943 }
11944 }
11945
11946 /* Determine if a given CONST RTX is a valid memory displacement
11947 in PIC mode. */
11948
11949 bool
11950 legitimate_pic_address_disp_p (rtx disp)
11951 {
11952 bool saw_plus;
11953
11954 /* In 64bit mode we can allow direct addresses of symbols and labels
11955 when they are not dynamic symbols. */
11956 if (TARGET_64BIT)
11957 {
11958 rtx op0 = disp, op1;
11959
11960 switch (GET_CODE (disp))
11961 {
11962 case LABEL_REF:
11963 return true;
11964
11965 case CONST:
11966 if (GET_CODE (XEXP (disp, 0)) != PLUS)
11967 break;
11968 op0 = XEXP (XEXP (disp, 0), 0);
11969 op1 = XEXP (XEXP (disp, 0), 1);
11970 if (!CONST_INT_P (op1)
11971 || INTVAL (op1) >= 16*1024*1024
11972 || INTVAL (op1) < -16*1024*1024)
11973 break;
11974 if (GET_CODE (op0) == LABEL_REF)
11975 return true;
11976 if (GET_CODE (op0) == CONST
11977 && GET_CODE (XEXP (op0, 0)) == UNSPEC
11978 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
11979 return true;
11980 if (GET_CODE (op0) == UNSPEC
11981 && XINT (op0, 1) == UNSPEC_PCREL)
11982 return true;
11983 if (GET_CODE (op0) != SYMBOL_REF)
11984 break;
11985 /* FALLTHRU */
11986
11987 case SYMBOL_REF:
11988 /* TLS references should always be enclosed in UNSPEC. */
11989 if (SYMBOL_REF_TLS_MODEL (op0))
11990 return false;
11991 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
11992 && ix86_cmodel != CM_LARGE_PIC)
11993 return true;
11994 break;
11995
11996 default:
11997 break;
11998 }
11999 }
12000 if (GET_CODE (disp) != CONST)
12001 return false;
12002 disp = XEXP (disp, 0);
12003
12004 if (TARGET_64BIT)
12005 {
12006 /* It is unsafe to allow PLUS expressions here; that would limit the
12007 allowed distance of GOT references. We should not need these anyway. */
12008 if (GET_CODE (disp) != UNSPEC
12009 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12010 && XINT (disp, 1) != UNSPEC_GOTOFF
12011 && XINT (disp, 1) != UNSPEC_PCREL
12012 && XINT (disp, 1) != UNSPEC_PLTOFF))
12013 return false;
12014
12015 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12016 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12017 return false;
12018 return true;
12019 }
12020
12021 saw_plus = false;
12022 if (GET_CODE (disp) == PLUS)
12023 {
12024 if (!CONST_INT_P (XEXP (disp, 1)))
12025 return false;
12026 disp = XEXP (disp, 0);
12027 saw_plus = true;
12028 }
12029
12030 if (TARGET_MACHO && darwin_local_data_pic (disp))
12031 return true;
12032
12033 if (GET_CODE (disp) != UNSPEC)
12034 return false;
12035
12036 switch (XINT (disp, 1))
12037 {
12038 case UNSPEC_GOT:
12039 if (saw_plus)
12040 return false;
12041 /* We need to check for both symbols and labels because VxWorks loads
12042 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12043 details. */
12044 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12045 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12046 case UNSPEC_GOTOFF:
12047 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12048 While the ABI also specifies a 32bit relocation, we don't produce
12049 it in the small PIC model at all. */
12050 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12051 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12052 && !TARGET_64BIT)
12053 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12054 return false;
12055 case UNSPEC_GOTTPOFF:
12056 case UNSPEC_GOTNTPOFF:
12057 case UNSPEC_INDNTPOFF:
12058 if (saw_plus)
12059 return false;
12060 disp = XVECEXP (disp, 0, 0);
12061 return (GET_CODE (disp) == SYMBOL_REF
12062 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12063 case UNSPEC_NTPOFF:
12064 disp = XVECEXP (disp, 0, 0);
12065 return (GET_CODE (disp) == SYMBOL_REF
12066 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12067 case UNSPEC_DTPOFF:
12068 disp = XVECEXP (disp, 0, 0);
12069 return (GET_CODE (disp) == SYMBOL_REF
12070 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12071 }
12072
12073 return false;
12074 }
12075
12076 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12077 replace the input X, or the original X if no replacement is called for.
12078 The output parameter *WIN is 1 if the calling macro should goto WIN,
12079 0 if it should not. */
12080
12081 bool
12082 ix86_legitimize_reload_address (rtx x,
12083 enum machine_mode mode ATTRIBUTE_UNUSED,
12084 int opnum, int type,
12085 int ind_levels ATTRIBUTE_UNUSED)
12086 {
12087 /* Reload can generate:
12088
12089 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12090 (reg:DI 97))
12091 (reg:DI 2 cx))
12092
12093 This RTX is rejected from ix86_legitimate_address_p due to
12094 non-strictness of base register 97. Following this rejection,
12095 reload pushes all three components into separate registers,
12096 creating invalid memory address RTX.
12097
12098 Following code reloads only the invalid part of the
12099 memory address RTX. */
12100
12101 if (GET_CODE (x) == PLUS
12102 && REG_P (XEXP (x, 1))
12103 && GET_CODE (XEXP (x, 0)) == PLUS
12104 && REG_P (XEXP (XEXP (x, 0), 1)))
12105 {
12106 rtx base, index;
12107 bool something_reloaded = false;
12108
12109 base = XEXP (XEXP (x, 0), 1);
12110 if (!REG_OK_FOR_BASE_STRICT_P (base))
12111 {
12112 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12113 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12114 opnum, (enum reload_type) type);
12115 something_reloaded = true;
12116 }
12117
12118 index = XEXP (x, 1);
12119 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12120 {
12121 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12122 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12123 opnum, (enum reload_type) type);
12124 something_reloaded = true;
12125 }
12126
12127 gcc_assert (something_reloaded);
12128 return true;
12129 }
12130
12131 return false;
12132 }
12133
12134 /* Recognizes RTL expressions that are valid memory addresses for an
12135 instruction. The MODE argument is the machine mode for the MEM
12136 expression that wants to use this address.
12137
12138 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12139 convert common non-canonical forms to canonical form so that they will
12140 be recognized. */
12141
12142 static bool
12143 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12144 rtx addr, bool strict)
12145 {
12146 struct ix86_address parts;
12147 rtx base, index, disp;
12148 HOST_WIDE_INT scale;
12149
12150 /* Since a constant address in x32 is sign-extended to 64bit,
12151 we have to reject addresses in the range 0x80000000 to 0xffffffff. */
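/* E.g. the constant address 0x80000000 would sign-extend to
   0xffffffff80000000, outside the 32-bit address space. */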
12152 if (TARGET_X32
12153 && CONST_INT_P (addr)
12154 && INTVAL (addr) < 0)
12155 return false;
12156
12157 if (ix86_decompose_address (addr, &parts) <= 0)
12158 /* Decomposition failed. */
12159 return false;
12160
12161 base = parts.base;
12162 index = parts.index;
12163 disp = parts.disp;
12164 scale = parts.scale;
12165
12166 /* Validate base register. */
12167 if (base)
12168 {
12169 rtx reg;
12170
12171 if (REG_P (base))
12172 reg = base;
12173 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12174 reg = SUBREG_REG (base);
12175 else
12176 /* Base is not a register. */
12177 return false;
12178
12179 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12180 return false;
12181
12182 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12183 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12184 /* Base is not valid. */
12185 return false;
12186 }
12187
12188 /* Validate index register. */
12189 if (index)
12190 {
12191 rtx reg;
12192
12193 if (REG_P (index))
12194 reg = index;
12195 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12196 reg = SUBREG_REG (index);
12197 else
12198 /* Index is not a register. */
12199 return false;
12200
12201 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12202 return false;
12203
12204 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12205 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12206 /* Index is not valid. */
12207 return false;
12208 }
12209
12210 /* Index and base should have the same mode. */
12211 if (base && index
12212 && GET_MODE (base) != GET_MODE (index))
12213 return false;
12214
12215 /* Validate scale factor. */
12216 if (scale != 1)
12217 {
12218 if (!index)
12219 /* Scale without index. */
12220 return false;
12221
12222 if (scale != 2 && scale != 4 && scale != 8)
12223 /* Scale is not a valid multiplier. */
12224 return false;
12225 }
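/* At this point the operand matches the canonical x86 address shape
   base + index*scale + disp, with scale 1, 2, 4 or 8; only the
   displacement remains to be checked below. */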
12226
12227 /* Validate displacement. */
12228 if (disp)
12229 {
12230 if (GET_CODE (disp) == CONST
12231 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12232 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12233 switch (XINT (XEXP (disp, 0), 1))
12234 {
12235 /* Refuse GOTOFF and GOT in 64bit mode since they are always 64bit when
12236 used. While the ABI also specifies 32bit relocations, we don't produce
12237 them at all and use IP-relative addressing instead. */
12238 case UNSPEC_GOT:
12239 case UNSPEC_GOTOFF:
12240 gcc_assert (flag_pic);
12241 if (!TARGET_64BIT)
12242 goto is_legitimate_pic;
12243
12244 /* 64bit address unspec. */
12245 return false;
12246
12247 case UNSPEC_GOTPCREL:
12248 case UNSPEC_PCREL:
12249 gcc_assert (flag_pic);
12250 goto is_legitimate_pic;
12251
12252 case UNSPEC_GOTTPOFF:
12253 case UNSPEC_GOTNTPOFF:
12254 case UNSPEC_INDNTPOFF:
12255 case UNSPEC_NTPOFF:
12256 case UNSPEC_DTPOFF:
12257 break;
12258
12259 case UNSPEC_STACK_CHECK:
12260 gcc_assert (flag_split_stack);
12261 break;
12262
12263 default:
12264 /* Invalid address unspec. */
12265 return false;
12266 }
12267
12268 else if (SYMBOLIC_CONST (disp)
12269 && (flag_pic
12270 || (TARGET_MACHO
12271 #if TARGET_MACHO
12272 && MACHOPIC_INDIRECT
12273 && !machopic_operand_p (disp)
12274 #endif
12275 )))
12276 {
12277
12278 is_legitimate_pic:
12279 if (TARGET_64BIT && (index || base))
12280 {
12281 /* foo@dtpoff(%rX) is ok. */
12282 if (GET_CODE (disp) != CONST
12283 || GET_CODE (XEXP (disp, 0)) != PLUS
12284 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12285 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12286 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12287 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12288 /* Non-constant pic memory reference. */
12289 return false;
12290 }
12291 else if ((!TARGET_MACHO || flag_pic)
12292 && ! legitimate_pic_address_disp_p (disp))
12293 /* Displacement is an invalid pic construct. */
12294 return false;
12295 #if TARGET_MACHO
12296 else if (MACHO_DYNAMIC_NO_PIC_P
12297 && !ix86_legitimate_constant_p (Pmode, disp))
12298 /* Displacement must be referenced via non_lazy_pointer. */
12299 return false;
12300 #endif
12301
12302 /* This code used to verify that a symbolic pic displacement
12303 includes the pic_offset_table_rtx register.
12304
12305 While this is a good idea, unfortunately these constructs may
12306 be created by the "adds using lea" optimization for incorrect
12307 code like:
12308
12309 int a;
12310 int foo(int i)
12311 {
12312 return *(&a+i);
12313 }
12314
12315 This code is nonsensical, but results in addressing the
12316 GOT table with a pic_offset_table_rtx base. We can't
12317 just refuse it easily, since it gets matched by the
12318 "addsi3" pattern, which later gets split to lea when the
12319 output register differs from the input. While this
12320 could be handled by a separate addsi pattern for this case
12321 that never results in lea, disabling this test seems to be
12322 the easier and correct fix for the crash. */
12323 }
12324 else if (GET_CODE (disp) != LABEL_REF
12325 && !CONST_INT_P (disp)
12326 && (GET_CODE (disp) != CONST
12327 || !ix86_legitimate_constant_p (Pmode, disp))
12328 && (GET_CODE (disp) != SYMBOL_REF
12329 || !ix86_legitimate_constant_p (Pmode, disp)))
12330 /* Displacement is not constant. */
12331 return false;
12332 else if (TARGET_64BIT
12333 && !x86_64_immediate_operand (disp, VOIDmode))
12334 /* Displacement is out of range. */
12335 return false;
12336 }
12337
12338 /* Everything looks valid. */
12339 return true;
12340 }
12341
12342 /* Determine if a given RTX is a valid constant address. */
12343
12344 bool
12345 constant_address_p (rtx x)
12346 {
12347 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12348 }
12349 \f
12350 /* Return a unique alias set for the GOT. */
12351
12352 static alias_set_type
12353 ix86_GOT_alias_set (void)
12354 {
12355 static alias_set_type set = -1;
12356 if (set == -1)
12357 set = new_alias_set ();
12358 return set;
12359 }
12360
12361 /* Return a legitimate reference for ORIG (an address) using the
12362 register REG. If REG is 0, a new pseudo is generated.
12363
12364 There are two types of references that must be handled:
12365
12366 1. Global data references must load the address from the GOT, via
12367 the PIC reg. An insn is emitted to do this load, and the reg is
12368 returned.
12369
12370 2. Static data references, constant pool addresses, and code labels
12371 compute the address as an offset from the GOT, whose base is in
12372 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12373 differentiate them from global data objects. The returned
12374 address is the PIC reg + an unspec constant.
12375
12376 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12377 reg also appears in the address. */
12378
12379 static rtx
12380 legitimize_pic_address (rtx orig, rtx reg)
12381 {
12382 rtx addr = orig;
12383 rtx new_rtx = orig;
12384 rtx base;
12385
12386 #if TARGET_MACHO
12387 if (TARGET_MACHO && !TARGET_64BIT)
12388 {
12389 if (reg == 0)
12390 reg = gen_reg_rtx (Pmode);
12391 /* Use the generic Mach-O PIC machinery. */
12392 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12393 }
12394 #endif
12395
12396 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12397 new_rtx = addr;
12398 else if (TARGET_64BIT
12399 && ix86_cmodel != CM_SMALL_PIC
12400 && gotoff_operand (addr, Pmode))
12401 {
12402 rtx tmpreg;
12403 /* This symbol may be referenced via a displacement from the PIC
12404 base address (@GOTOFF). */
12405
12406 if (reload_in_progress)
12407 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12408 if (GET_CODE (addr) == CONST)
12409 addr = XEXP (addr, 0);
12410 if (GET_CODE (addr) == PLUS)
12411 {
12412 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12413 UNSPEC_GOTOFF);
12414 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12415 }
12416 else
12417 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12418 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12419 if (!reg)
12420 tmpreg = gen_reg_rtx (Pmode);
12421 else
12422 tmpreg = reg;
12423 emit_move_insn (tmpreg, new_rtx);
12424
12425 if (reg != 0)
12426 {
12427 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12428 tmpreg, 1, OPTAB_DIRECT);
12429 new_rtx = reg;
12430 }
12431 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12432 }
12433 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12434 {
12435 /* This symbol may be referenced via a displacement from the PIC
12436 base address (@GOTOFF). */
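/* The result built below has the shape
   (plus pic_offset_table_rtx (const (unspec [sym] UNSPEC_GOTOFF))),
   optionally with a constant offset inside the CONST, which
   ix86_legitimate_address_p accepts as a PIC displacement. */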
12437
12438 if (reload_in_progress)
12439 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12440 if (GET_CODE (addr) == CONST)
12441 addr = XEXP (addr, 0);
12442 if (GET_CODE (addr) == PLUS)
12443 {
12444 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12445 UNSPEC_GOTOFF);
12446 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12447 }
12448 else
12449 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12450 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12451 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12452
12453 if (reg != 0)
12454 {
12455 emit_move_insn (reg, new_rtx);
12456 new_rtx = reg;
12457 }
12458 }
12459 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12460 /* We can't use @GOTOFF for text labels on VxWorks;
12461 see gotoff_operand. */
12462 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12463 {
12464 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12465 {
12466 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12467 return legitimize_dllimport_symbol (addr, true);
12468 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12469 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12470 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12471 {
12472 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12473 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12474 }
12475 }
12476
12477 /* For x64 PE-COFF there is no GOT table, so we use the address
12478 directly. */
12479 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12480 {
12481 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12482 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12483
12484 if (reg == 0)
12485 reg = gen_reg_rtx (Pmode);
12486 emit_move_insn (reg, new_rtx);
12487 new_rtx = reg;
12488 }
12489 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12490 {
12491 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12492 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12493 new_rtx = gen_const_mem (Pmode, new_rtx);
12494 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12495
12496 if (reg == 0)
12497 reg = gen_reg_rtx (Pmode);
12498 /* Use gen_movsi directly, otherwise the address is loaded
12499 into a register for CSE. We don't want to CSE these addresses;
12500 instead we CSE addresses loaded from the GOT table, so skip this. */
12501 emit_insn (gen_movsi (reg, new_rtx));
12502 new_rtx = reg;
12503 }
12504 else
12505 {
12506 /* This symbol must be referenced via a load from the
12507 Global Offset Table (@GOT). */
12508
12509 if (reload_in_progress)
12510 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12511 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12512 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12513 if (TARGET_64BIT)
12514 new_rtx = force_reg (Pmode, new_rtx);
12515 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12516 new_rtx = gen_const_mem (Pmode, new_rtx);
12517 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12518
12519 if (reg == 0)
12520 reg = gen_reg_rtx (Pmode);
12521 emit_move_insn (reg, new_rtx);
12522 new_rtx = reg;
12523 }
12524 }
12525 else
12526 {
12527 if (CONST_INT_P (addr)
12528 && !x86_64_immediate_operand (addr, VOIDmode))
12529 {
12530 if (reg)
12531 {
12532 emit_move_insn (reg, addr);
12533 new_rtx = reg;
12534 }
12535 else
12536 new_rtx = force_reg (Pmode, addr);
12537 }
12538 else if (GET_CODE (addr) == CONST)
12539 {
12540 addr = XEXP (addr, 0);
12541
12542 /* We must match stuff we generated earlier. Assume the only
12543 unspecs that can get here are ours. Not that we could do
12544 anything with them anyway.... */
12545 if (GET_CODE (addr) == UNSPEC
12546 || (GET_CODE (addr) == PLUS
12547 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12548 return orig;
12549 gcc_assert (GET_CODE (addr) == PLUS);
12550 }
12551 if (GET_CODE (addr) == PLUS)
12552 {
12553 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12554
12555 /* Check first to see if this is a constant offset from a @GOTOFF
12556 symbol reference. */
12557 if (gotoff_operand (op0, Pmode)
12558 && CONST_INT_P (op1))
12559 {
12560 if (!TARGET_64BIT)
12561 {
12562 if (reload_in_progress)
12563 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12564 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12565 UNSPEC_GOTOFF);
12566 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12567 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12568 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12569
12570 if (reg != 0)
12571 {
12572 emit_move_insn (reg, new_rtx);
12573 new_rtx = reg;
12574 }
12575 }
12576 else
12577 {
12578 if (INTVAL (op1) < -16*1024*1024
12579 || INTVAL (op1) >= 16*1024*1024)
12580 {
12581 if (!x86_64_immediate_operand (op1, Pmode))
12582 op1 = force_reg (Pmode, op1);
12583 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12584 }
12585 }
12586 }
12587 else
12588 {
12589 base = legitimize_pic_address (XEXP (addr, 0), reg);
12590 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12591 base == reg ? NULL_RTX : reg);
12592
12593 if (CONST_INT_P (new_rtx))
12594 new_rtx = plus_constant (Pmode, base, INTVAL (new_rtx));
12595 else
12596 {
12597 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12598 {
12599 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12600 new_rtx = XEXP (new_rtx, 1);
12601 }
12602 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12603 }
12604 }
12605 }
12606 }
12607 return new_rtx;
12608 }
12609 \f
12610 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12611
12612 static rtx
12613 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12614 {
12615 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
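/* UNSPEC_TP stands for the thread pointer itself; on GNU/Linux targets it
   is typically addressed through the %gs (32-bit) or %fs (64-bit) segment
   base rather than a general register. */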
12616
12617 if (GET_MODE (tp) != tp_mode)
12618 {
12619 gcc_assert (GET_MODE (tp) == SImode);
12620 gcc_assert (tp_mode == DImode);
12621
12622 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12623 }
12624
12625 if (to_reg)
12626 tp = copy_to_mode_reg (tp_mode, tp);
12627
12628 return tp;
12629 }
12630
12631 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12632
12633 static GTY(()) rtx ix86_tls_symbol;
12634
12635 static rtx
12636 ix86_tls_get_addr (void)
12637 {
12638 if (!ix86_tls_symbol)
12639 {
12640 const char *sym
12641 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12642 ? "___tls_get_addr" : "__tls_get_addr");
12643
12644 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12645 }
12646
12647 return ix86_tls_symbol;
12648 }
12649
12650 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12651
12652 static GTY(()) rtx ix86_tls_module_base_symbol;
12653
12654 rtx
12655 ix86_tls_module_base (void)
12656 {
12657 if (!ix86_tls_module_base_symbol)
12658 {
12659 ix86_tls_module_base_symbol
12660 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12661
12662 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12663 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12664 }
12665
12666 return ix86_tls_module_base_symbol;
12667 }
12668
12669 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12670 false if we expect this to be used for a memory address and true if
12671 we expect to load the address into a register. */
12672
12673 static rtx
12674 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12675 {
12676 rtx dest, base, off;
12677 rtx pic = NULL_RTX, tp = NULL_RTX;
12678 enum machine_mode tp_mode = Pmode;
12679 int type;
12680
12681 switch (model)
12682 {
12683 case TLS_MODEL_GLOBAL_DYNAMIC:
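/* Global dynamic: with GNU2 TLS descriptors the offset is produced by the
   tls_dynamic_gnu2 patterns and the thread pointer is added explicitly;
   otherwise the address comes from a call to __tls_get_addr. */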
12684 dest = gen_reg_rtx (Pmode);
12685
12686 if (!TARGET_64BIT)
12687 {
12688 if (flag_pic)
12689 pic = pic_offset_table_rtx;
12690 else
12691 {
12692 pic = gen_reg_rtx (Pmode);
12693 emit_insn (gen_set_got (pic));
12694 }
12695 }
12696
12697 if (TARGET_GNU2_TLS)
12698 {
12699 if (TARGET_64BIT)
12700 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12701 else
12702 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12703
12704 tp = get_thread_pointer (Pmode, true);
12705 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12706
12707 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12708 }
12709 else
12710 {
12711 rtx caddr = ix86_tls_get_addr ();
12712
12713 if (TARGET_64BIT)
12714 {
12715 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
12716
12717 start_sequence ();
12718 emit_call_insn (ix86_gen_tls_global_dynamic_64 (rax, x,
12719 caddr));
12720 insns = get_insns ();
12721 end_sequence ();
12722
12723 RTL_CONST_CALL_P (insns) = 1;
12724 emit_libcall_block (insns, dest, rax, x);
12725 }
12726 else
12727 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12728 }
12729 break;
12730
12731 case TLS_MODEL_LOCAL_DYNAMIC:
12732 base = gen_reg_rtx (Pmode);
12733
12734 if (!TARGET_64BIT)
12735 {
12736 if (flag_pic)
12737 pic = pic_offset_table_rtx;
12738 else
12739 {
12740 pic = gen_reg_rtx (Pmode);
12741 emit_insn (gen_set_got (pic));
12742 }
12743 }
12744
12745 if (TARGET_GNU2_TLS)
12746 {
12747 rtx tmp = ix86_tls_module_base ();
12748
12749 if (TARGET_64BIT)
12750 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12751 else
12752 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12753
12754 tp = get_thread_pointer (Pmode, true);
12755 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12756 gen_rtx_MINUS (Pmode, tmp, tp));
12757 }
12758 else
12759 {
12760 rtx caddr = ix86_tls_get_addr ();
12761
12762 if (TARGET_64BIT)
12763 {
12764 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
12765
12766 start_sequence ();
12767 emit_call_insn (ix86_gen_tls_local_dynamic_base_64 (rax,
12768 caddr));
12769 insns = get_insns ();
12770 end_sequence ();
12771
12772 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12773 share the LD_BASE result with other LD model accesses. */
12774 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12775 UNSPEC_TLS_LD_BASE);
12776
12777 RTL_CONST_CALL_P (insns) = 1;
12778 emit_libcall_block (insns, base, rax, eqv);
12779 }
12780 else
12781 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12782 }
12783
12784 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12785 off = gen_rtx_CONST (Pmode, off);
12786
12787 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12788
12789 if (TARGET_GNU2_TLS)
12790 {
12791 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12792
12793 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12794 }
12795 break;
12796
12797 case TLS_MODEL_INITIAL_EXEC:
12798 if (TARGET_64BIT)
12799 {
12800 if (TARGET_SUN_TLS)
12801 {
12802 /* The Sun linker took the AMD64 TLS spec literally
12803 and can only handle %rax as the destination of the
12804 initial-exec code sequence. */
12805
12806 dest = gen_reg_rtx (Pmode);
12807 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12808 return dest;
12809 }
12810
12811 /* Generate DImode references to avoid %fs:(%reg32)
12812 problems and the linker IE->LE relaxation bug. */
12813 tp_mode = DImode;
12814 pic = NULL;
12815 type = UNSPEC_GOTNTPOFF;
12816 }
12817 else if (flag_pic)
12818 {
12819 if (reload_in_progress)
12820 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12821 pic = pic_offset_table_rtx;
12822 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12823 }
12824 else if (!TARGET_ANY_GNU_TLS)
12825 {
12826 pic = gen_reg_rtx (Pmode);
12827 emit_insn (gen_set_got (pic));
12828 type = UNSPEC_GOTTPOFF;
12829 }
12830 else
12831 {
12832 pic = NULL;
12833 type = UNSPEC_INDNTPOFF;
12834 }
12835
12836 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
12837 off = gen_rtx_CONST (tp_mode, off);
12838 if (pic)
12839 off = gen_rtx_PLUS (tp_mode, pic, off);
12840 off = gen_const_mem (tp_mode, off);
12841 set_mem_alias_set (off, ix86_GOT_alias_set ());
12842
12843 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12844 {
12845 base = get_thread_pointer (tp_mode,
12846 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12847 off = force_reg (tp_mode, off);
12848 return gen_rtx_PLUS (tp_mode, base, off);
12849 }
12850 else
12851 {
12852 base = get_thread_pointer (Pmode, true);
12853 dest = gen_reg_rtx (Pmode);
12854 emit_insn (ix86_gen_sub3 (dest, base, off));
12855 }
12856 break;
12857
12858 case TLS_MODEL_LOCAL_EXEC:
12859 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12860 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12861 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12862 off = gen_rtx_CONST (Pmode, off);
12863
12864 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12865 {
12866 base = get_thread_pointer (Pmode,
12867 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12868 return gen_rtx_PLUS (Pmode, base, off);
12869 }
12870 else
12871 {
12872 base = get_thread_pointer (Pmode, true);
12873 dest = gen_reg_rtx (Pmode);
12874 emit_insn (ix86_gen_sub3 (dest, base, off));
12875 }
12876 break;
12877
12878 default:
12879 gcc_unreachable ();
12880 }
12881
12882 return dest;
12883 }
12884
12885 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12886 to symbol DECL. */
12887
12888 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12889 htab_t dllimport_map;
12890
12891 static tree
12892 get_dllimport_decl (tree decl)
12893 {
12894 struct tree_map *h, in;
12895 void **loc;
12896 const char *name;
12897 const char *prefix;
12898 size_t namelen, prefixlen;
12899 char *imp_name;
12900 tree to;
12901 rtx rtl;
12902
12903 if (!dllimport_map)
12904 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12905
12906 in.hash = htab_hash_pointer (decl);
12907 in.base.from = decl;
12908 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12909 h = (struct tree_map *) *loc;
12910 if (h)
12911 return h->to;
12912
12913 *loc = h = ggc_alloc_tree_map ();
12914 h->hash = in.hash;
12915 h->base.from = decl;
12916 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
12917 VAR_DECL, NULL, ptr_type_node);
12918 DECL_ARTIFICIAL (to) = 1;
12919 DECL_IGNORED_P (to) = 1;
12920 DECL_EXTERNAL (to) = 1;
12921 TREE_READONLY (to) = 1;
12922
12923 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
12924 name = targetm.strip_name_encoding (name);
12925 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
12926 ? "*__imp_" : "*__imp__";
12927 namelen = strlen (name);
12928 prefixlen = strlen (prefix);
12929 imp_name = (char *) alloca (namelen + prefixlen + 1);
12930 memcpy (imp_name, prefix, prefixlen);
12931 memcpy (imp_name + prefixlen, name, namelen + 1);
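/* E.g. a decl named "foo" yields "*__imp__foo", or "*__imp_foo" when there
   is no user label prefix or the name already carries the fastcall prefix. */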
12932
12933 name = ggc_alloc_string (imp_name, namelen + prefixlen);
12934 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
12935 SET_SYMBOL_REF_DECL (rtl, to);
12936 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
12937
12938 rtl = gen_const_mem (Pmode, rtl);
12939 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
12940
12941 SET_DECL_RTL (to, rtl);
12942 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
12943
12944 return to;
12945 }
12946
12947 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
12948 true if we require the result be a register. */
12949
12950 static rtx
12951 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
12952 {
12953 tree imp_decl;
12954 rtx x;
12955
12956 gcc_assert (SYMBOL_REF_DECL (symbol));
12957 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
12958
12959 x = DECL_RTL (imp_decl);
12960 if (want_reg)
12961 x = force_reg (Pmode, x);
12962 return x;
12963 }
12964
12965 /* Try machine-dependent ways of modifying an illegitimate address
12966 to be legitimate. If we find one, return the new, valid address.
12967 This macro is used in only one place: `memory_address' in explow.c.
12968
12969 OLDX is the address as it was before break_out_memory_refs was called.
12970 In some cases it is useful to look at this to decide what needs to be done.
12971
12972 It is always safe for this macro to do nothing. It exists to recognize
12973 opportunities to optimize the output.
12974
12975 For the 80386, we handle X+REG by loading X into a register R and
12976 using R+REG. R will go in a general reg and indexing will be used.
12977 However, if REG is a broken-out memory address or multiplication,
12978 nothing needs to be done because REG can certainly go in a general reg.
12979
12980 When -fpic is used, special handling is needed for symbolic references.
12981 See comments by legitimize_pic_address in i386.c for details. */
12982
12983 static rtx
12984 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
12985 enum machine_mode mode)
12986 {
12987 int changed = 0;
12988 unsigned log;
12989
12990 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
12991 if (log)
12992 return legitimize_tls_address (x, (enum tls_model) log, false);
12993 if (GET_CODE (x) == CONST
12994 && GET_CODE (XEXP (x, 0)) == PLUS
12995 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12996 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
12997 {
12998 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
12999 (enum tls_model) log, false);
13000 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13001 }
13002
13003 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13004 {
13005 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
13006 return legitimize_dllimport_symbol (x, true);
13007 if (GET_CODE (x) == CONST
13008 && GET_CODE (XEXP (x, 0)) == PLUS
13009 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13010 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
13011 {
13012 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
13013 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13014 }
13015 }
13016
13017 if (flag_pic && SYMBOLIC_CONST (x))
13018 return legitimize_pic_address (x, 0);
13019
13020 #if TARGET_MACHO
13021 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13022 return machopic_indirect_data_reference (x, 0);
13023 #endif
13024
13025 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
13026 if (GET_CODE (x) == ASHIFT
13027 && CONST_INT_P (XEXP (x, 1))
13028 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13029 {
13030 changed = 1;
13031 log = INTVAL (XEXP (x, 1));
13032 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13033 GEN_INT (1 << log));
13034 }
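/* E.g. (ashift (reg) (const_int 2)) becomes (mult (reg) (const_int 4)),
   which matches the index*scale part of an x86 address. */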
13035
13036 if (GET_CODE (x) == PLUS)
13037 {
13038 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13039
13040 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13041 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13042 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13043 {
13044 changed = 1;
13045 log = INTVAL (XEXP (XEXP (x, 0), 1));
13046 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13047 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13048 GEN_INT (1 << log));
13049 }
13050
13051 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13052 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13053 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13054 {
13055 changed = 1;
13056 log = INTVAL (XEXP (XEXP (x, 1), 1));
13057 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13058 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13059 GEN_INT (1 << log));
13060 }
13061
13062 /* Put multiply first if it isn't already. */
13063 if (GET_CODE (XEXP (x, 1)) == MULT)
13064 {
13065 rtx tmp = XEXP (x, 0);
13066 XEXP (x, 0) = XEXP (x, 1);
13067 XEXP (x, 1) = tmp;
13068 changed = 1;
13069 }
13070
13071 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13072 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13073 created by virtual register instantiation, register elimination, and
13074 similar optimizations. */
13075 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13076 {
13077 changed = 1;
13078 x = gen_rtx_PLUS (Pmode,
13079 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13080 XEXP (XEXP (x, 1), 0)),
13081 XEXP (XEXP (x, 1), 1));
13082 }
13083
13084 /* Canonicalize
13085 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13086 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13087 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13088 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13089 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13090 && CONSTANT_P (XEXP (x, 1)))
13091 {
13092 rtx constant;
13093 rtx other = NULL_RTX;
13094
13095 if (CONST_INT_P (XEXP (x, 1)))
13096 {
13097 constant = XEXP (x, 1);
13098 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13099 }
13100 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13101 {
13102 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13103 other = XEXP (x, 1);
13104 }
13105 else
13106 constant = 0;
13107
13108 if (constant)
13109 {
13110 changed = 1;
13111 x = gen_rtx_PLUS (Pmode,
13112 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13113 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13114 plus_constant (Pmode, other,
13115 INTVAL (constant)));
13116 }
13117 }
13118
13119 if (changed && ix86_legitimate_address_p (mode, x, false))
13120 return x;
13121
13122 if (GET_CODE (XEXP (x, 0)) == MULT)
13123 {
13124 changed = 1;
13125 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13126 }
13127
13128 if (GET_CODE (XEXP (x, 1)) == MULT)
13129 {
13130 changed = 1;
13131 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13132 }
13133
13134 if (changed
13135 && REG_P (XEXP (x, 1))
13136 && REG_P (XEXP (x, 0)))
13137 return x;
13138
13139 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13140 {
13141 changed = 1;
13142 x = legitimize_pic_address (x, 0);
13143 }
13144
13145 if (changed && ix86_legitimate_address_p (mode, x, false))
13146 return x;
13147
13148 if (REG_P (XEXP (x, 0)))
13149 {
13150 rtx temp = gen_reg_rtx (Pmode);
13151 rtx val = force_operand (XEXP (x, 1), temp);
13152 if (val != temp)
13153 {
13154 if (GET_MODE (val) != Pmode)
13155 val = convert_to_mode (Pmode, val, 1);
13156 emit_move_insn (temp, val);
13157 }
13158
13159 XEXP (x, 1) = temp;
13160 return x;
13161 }
13162
13163 else if (REG_P (XEXP (x, 1)))
13164 {
13165 rtx temp = gen_reg_rtx (Pmode);
13166 rtx val = force_operand (XEXP (x, 0), temp);
13167 if (val != temp)
13168 {
13169 if (GET_MODE (val) != Pmode)
13170 val = convert_to_mode (Pmode, val, 1);
13171 emit_move_insn (temp, val);
13172 }
13173
13174 XEXP (x, 0) = temp;
13175 return x;
13176 }
13177 }
13178
13179 return x;
13180 }
13181 \f
13182 /* Print an integer constant expression in assembler syntax. Addition
13183 and subtraction are the only arithmetic that may appear in these
13184 expressions. FILE is the stdio stream to write to, X is the rtx, and
13185 CODE is the operand print code from the output string. */
13186
13187 static void
13188 output_pic_addr_const (FILE *file, rtx x, int code)
13189 {
13190 char buf[256];
13191
13192 switch (GET_CODE (x))
13193 {
13194 case PC:
13195 gcc_assert (flag_pic);
13196 putc ('.', file);
13197 break;
13198
13199 case SYMBOL_REF:
13200 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13201 output_addr_const (file, x);
13202 else
13203 {
13204 const char *name = XSTR (x, 0);
13205
13206 /* Mark the decl as referenced so that cgraph will
13207 output the function. */
13208 if (SYMBOL_REF_DECL (x))
13209 mark_decl_referenced (SYMBOL_REF_DECL (x));
13210
13211 #if TARGET_MACHO
13212 if (MACHOPIC_INDIRECT
13213 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13214 name = machopic_indirection_name (x, /*stub_p=*/true);
13215 #endif
13216 assemble_name (file, name);
13217 }
13218 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13219 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13220 fputs ("@PLT", file);
13221 break;
13222
13223 case LABEL_REF:
13224 x = XEXP (x, 0);
13225 /* FALLTHRU */
13226 case CODE_LABEL:
13227 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13228 assemble_name (asm_out_file, buf);
13229 break;
13230
13231 case CONST_INT:
13232 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13233 break;
13234
13235 case CONST:
13236 /* This used to output parentheses around the expression,
13237 but that does not work on the 386 (either ATT or BSD assembler). */
13238 output_pic_addr_const (file, XEXP (x, 0), code);
13239 break;
13240
13241 case CONST_DOUBLE:
13242 if (GET_MODE (x) == VOIDmode)
13243 {
13244 /* We can use %d if the number is <32 bits and positive. */
13245 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13246 fprintf (file, "0x%lx%08lx",
13247 (unsigned long) CONST_DOUBLE_HIGH (x),
13248 (unsigned long) CONST_DOUBLE_LOW (x));
13249 else
13250 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13251 }
13252 else
13253 /* We can't handle floating point constants;
13254 TARGET_PRINT_OPERAND must handle them. */
13255 output_operand_lossage ("floating constant misused");
13256 break;
13257
13258 case PLUS:
13259 /* Some assemblers need integer constants to appear first. */
13260 if (CONST_INT_P (XEXP (x, 0)))
13261 {
13262 output_pic_addr_const (file, XEXP (x, 0), code);
13263 putc ('+', file);
13264 output_pic_addr_const (file, XEXP (x, 1), code);
13265 }
13266 else
13267 {
13268 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13269 output_pic_addr_const (file, XEXP (x, 1), code);
13270 putc ('+', file);
13271 output_pic_addr_const (file, XEXP (x, 0), code);
13272 }
13273 break;
13274
13275 case MINUS:
13276 if (!TARGET_MACHO)
13277 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13278 output_pic_addr_const (file, XEXP (x, 0), code);
13279 putc ('-', file);
13280 output_pic_addr_const (file, XEXP (x, 1), code);
13281 if (!TARGET_MACHO)
13282 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13283 break;
13284
13285 case UNSPEC:
13286 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13287 {
13288 bool f = i386_asm_output_addr_const_extra (file, x);
13289 gcc_assert (f);
13290 break;
13291 }
13292
13293 gcc_assert (XVECLEN (x, 0) == 1);
13294 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13295 switch (XINT (x, 1))
13296 {
13297 case UNSPEC_GOT:
13298 fputs ("@GOT", file);
13299 break;
13300 case UNSPEC_GOTOFF:
13301 fputs ("@GOTOFF", file);
13302 break;
13303 case UNSPEC_PLTOFF:
13304 fputs ("@PLTOFF", file);
13305 break;
13306 case UNSPEC_PCREL:
13307 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13308 "(%rip)" : "[rip]", file);
13309 break;
13310 case UNSPEC_GOTPCREL:
13311 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13312 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13313 break;
13314 case UNSPEC_GOTTPOFF:
13315 /* FIXME: This might be @TPOFF in Sun ld too. */
13316 fputs ("@gottpoff", file);
13317 break;
13318 case UNSPEC_TPOFF:
13319 fputs ("@tpoff", file);
13320 break;
13321 case UNSPEC_NTPOFF:
13322 if (TARGET_64BIT)
13323 fputs ("@tpoff", file);
13324 else
13325 fputs ("@ntpoff", file);
13326 break;
13327 case UNSPEC_DTPOFF:
13328 fputs ("@dtpoff", file);
13329 break;
13330 case UNSPEC_GOTNTPOFF:
13331 if (TARGET_64BIT)
13332 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13333 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13334 else
13335 fputs ("@gotntpoff", file);
13336 break;
13337 case UNSPEC_INDNTPOFF:
13338 fputs ("@indntpoff", file);
13339 break;
13340 #if TARGET_MACHO
13341 case UNSPEC_MACHOPIC_OFFSET:
13342 putc ('-', file);
13343 machopic_output_function_base_name (file);
13344 break;
13345 #endif
13346 default:
13347 output_operand_lossage ("invalid UNSPEC as operand");
13348 break;
13349 }
13350 break;
13351
13352 default:
13353 output_operand_lossage ("invalid expression as operand");
13354 }
13355 }
13356
13357 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13358 We need to emit DTP-relative relocations. */
13359
13360 static void ATTRIBUTE_UNUSED
13361 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13362 {
13363 fputs (ASM_LONG, file);
13364 output_addr_const (file, x);
13365 fputs ("@dtpoff", file);
13366 switch (size)
13367 {
13368 case 4:
13369 break;
13370 case 8:
13371 fputs (", 0", file);
13372 break;
13373 default:
13374 gcc_unreachable ();
13375 }
13376 }
13377
13378 /* Return true if X is a representation of the PIC register. This copes
13379 with calls from ix86_find_base_term, where the register might have
13380 been replaced by a cselib value. */
13381
13382 static bool
13383 ix86_pic_register_p (rtx x)
13384 {
13385 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13386 return (pic_offset_table_rtx
13387 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13388 else
13389 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13390 }
13391
13392 /* Helper function for ix86_delegitimize_address.
13393 Attempt to delegitimize TLS local-exec accesses. */
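/* Such an access is a %fs- or %gs-based address whose displacement is a
   CONST wrapping an UNSPEC_NTPOFF; this routine recovers the underlying
   SYMBOL_REF and re-applies any base, index and constant offset. */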
13394
13395 static rtx
13396 ix86_delegitimize_tls_address (rtx orig_x)
13397 {
13398 rtx x = orig_x, unspec;
13399 struct ix86_address addr;
13400
13401 if (!TARGET_TLS_DIRECT_SEG_REFS)
13402 return orig_x;
13403 if (MEM_P (x))
13404 x = XEXP (x, 0);
13405 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13406 return orig_x;
13407 if (ix86_decompose_address (x, &addr) == 0
13408 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13409 || addr.disp == NULL_RTX
13410 || GET_CODE (addr.disp) != CONST)
13411 return orig_x;
13412 unspec = XEXP (addr.disp, 0);
13413 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13414 unspec = XEXP (unspec, 0);
13415 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13416 return orig_x;
13417 x = XVECEXP (unspec, 0, 0);
13418 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13419 if (unspec != XEXP (addr.disp, 0))
13420 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13421 if (addr.index)
13422 {
13423 rtx idx = addr.index;
13424 if (addr.scale != 1)
13425 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13426 x = gen_rtx_PLUS (Pmode, idx, x);
13427 }
13428 if (addr.base)
13429 x = gen_rtx_PLUS (Pmode, addr.base, x);
13430 if (MEM_P (orig_x))
13431 x = replace_equiv_address_nv (orig_x, x);
13432 return x;
13433 }
13434
13435 /* In the name of slightly smaller debug output, and to cater to
13436 general assembler lossage, recognize PIC+GOTOFF and turn it back
13437 into a direct symbol reference.
13438
13439 On Darwin, this is necessary to avoid a crash, because Darwin
13440 has a different PIC label for each routine but the DWARF debugging
13441 information is not associated with any particular routine, so it's
13442 necessary to remove references to the PIC label from RTL stored by
13443 the DWARF output code. */
13444
13445 static rtx
13446 ix86_delegitimize_address (rtx x)
13447 {
13448 rtx orig_x = delegitimize_mem_from_attrs (x);
13449 /* addend is NULL or some rtx if x is something+GOTOFF where
13450 something doesn't include the PIC register. */
13451 rtx addend = NULL_RTX;
13452 /* reg_addend is NULL or a multiple of some register. */
13453 rtx reg_addend = NULL_RTX;
13454 /* const_addend is NULL or a const_int. */
13455 rtx const_addend = NULL_RTX;
13456 /* This is the result, or NULL. */
13457 rtx result = NULL_RTX;
13458
13459 x = orig_x;
13460
13461 if (MEM_P (x))
13462 x = XEXP (x, 0);
13463
13464 if (TARGET_64BIT)
13465 {
13466 if (GET_CODE (x) == CONST
13467 && GET_CODE (XEXP (x, 0)) == PLUS
13468 && GET_MODE (XEXP (x, 0)) == Pmode
13469 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13470 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13471 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13472 {
13473 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13474 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13475 if (MEM_P (orig_x))
13476 x = replace_equiv_address_nv (orig_x, x);
13477 return x;
13478 }
13479 if (GET_CODE (x) != CONST
13480 || GET_CODE (XEXP (x, 0)) != UNSPEC
13481 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13482 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13483 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13484 return ix86_delegitimize_tls_address (orig_x);
13485 x = XVECEXP (XEXP (x, 0), 0, 0);
13486 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13487 {
13488 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13489 GET_MODE (x), 0);
13490 if (x == NULL_RTX)
13491 return orig_x;
13492 }
13493 return x;
13494 }
13495
13496 if (GET_CODE (x) != PLUS
13497 || GET_CODE (XEXP (x, 1)) != CONST)
13498 return ix86_delegitimize_tls_address (orig_x);
13499
13500 if (ix86_pic_register_p (XEXP (x, 0)))
13501 /* %ebx + GOT/GOTOFF */
13502 ;
13503 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13504 {
13505 /* %ebx + %reg * scale + GOT/GOTOFF */
13506 reg_addend = XEXP (x, 0);
13507 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13508 reg_addend = XEXP (reg_addend, 1);
13509 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13510 reg_addend = XEXP (reg_addend, 0);
13511 else
13512 {
13513 reg_addend = NULL_RTX;
13514 addend = XEXP (x, 0);
13515 }
13516 }
13517 else
13518 addend = XEXP (x, 0);
13519
13520 x = XEXP (XEXP (x, 1), 0);
13521 if (GET_CODE (x) == PLUS
13522 && CONST_INT_P (XEXP (x, 1)))
13523 {
13524 const_addend = XEXP (x, 1);
13525 x = XEXP (x, 0);
13526 }
13527
13528 if (GET_CODE (x) == UNSPEC
13529 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13530 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13531 result = XVECEXP (x, 0, 0);
13532
13533 if (TARGET_MACHO && darwin_local_data_pic (x)
13534 && !MEM_P (orig_x))
13535 result = XVECEXP (x, 0, 0);
13536
13537 if (! result)
13538 return ix86_delegitimize_tls_address (orig_x);
13539
13540 if (const_addend)
13541 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13542 if (reg_addend)
13543 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13544 if (addend)
13545 {
13546 /* If the rest of the original X doesn't involve the PIC register, add
13547 the addend and subtract pic_offset_table_rtx. This can happen e.g.
13548 for code like:
13549 leal (%ebx, %ecx, 4), %ecx
13550 ...
13551 movl foo@GOTOFF(%ecx), %edx
13552 in which case we return (%ecx - %ebx) + foo. */
13553 if (pic_offset_table_rtx)
13554 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13555 pic_offset_table_rtx),
13556 result);
13557 else
13558 return orig_x;
13559 }
13560 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13561 {
13562 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13563 if (result == NULL_RTX)
13564 return orig_x;
13565 }
13566 return result;
13567 }
13568
13569 /* If X is a machine specific address (i.e. a symbol or label being
13570 referenced as a displacement from the GOT implemented using an
13571 UNSPEC), then return the base term. Otherwise return X. */
13572
13573 rtx
13574 ix86_find_base_term (rtx x)
13575 {
13576 rtx term;
13577
13578 if (TARGET_64BIT)
13579 {
13580 if (GET_CODE (x) != CONST)
13581 return x;
13582 term = XEXP (x, 0);
13583 if (GET_CODE (term) == PLUS
13584 && (CONST_INT_P (XEXP (term, 1))
13585 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13586 term = XEXP (term, 0);
13587 if (GET_CODE (term) != UNSPEC
13588 || (XINT (term, 1) != UNSPEC_GOTPCREL
13589 && XINT (term, 1) != UNSPEC_PCREL))
13590 return x;
13591
13592 return XVECEXP (term, 0, 0);
13593 }
13594
13595 return ix86_delegitimize_address (x);
13596 }
13597 \f
13598 static void
13599 put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
13600 int fp, FILE *file)
13601 {
13602 const char *suffix;
13603
13604 if (mode == CCFPmode || mode == CCFPUmode)
13605 {
13606 code = ix86_fp_compare_code_to_integer (code);
13607 mode = CCmode;
13608 }
13609 if (reverse)
13610 code = reverse_condition (code);
13611
13612 switch (code)
13613 {
13614 case EQ:
13615 switch (mode)
13616 {
13617 case CCAmode:
13618 suffix = "a";
13619 break;
13620
13621 case CCCmode:
13622 suffix = "c";
13623 break;
13624
13625 case CCOmode:
13626 suffix = "o";
13627 break;
13628
13629 case CCSmode:
13630 suffix = "s";
13631 break;
13632
13633 default:
13634 suffix = "e";
13635 }
13636 break;
13637 case NE:
13638 switch (mode)
13639 {
13640 case CCAmode:
13641 suffix = "na";
13642 break;
13643
13644 case CCCmode:
13645 suffix = "nc";
13646 break;
13647
13648 case CCOmode:
13649 suffix = "no";
13650 break;
13651
13652 case CCSmode:
13653 suffix = "ns";
13654 break;
13655
13656 default:
13657 suffix = "ne";
13658 }
13659 break;
13660 case GT:
13661 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13662 suffix = "g";
13663 break;
13664 case GTU:
13665 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13666 Those same assemblers have the same but opposite lossage on cmov. */
13667 if (mode == CCmode)
13668 suffix = fp ? "nbe" : "a";
13669 else if (mode == CCCmode)
13670 suffix = "b";
13671 else
13672 gcc_unreachable ();
13673 break;
13674 case LT:
13675 switch (mode)
13676 {
13677 case CCNOmode:
13678 case CCGOCmode:
13679 suffix = "s";
13680 break;
13681
13682 case CCmode:
13683 case CCGCmode:
13684 suffix = "l";
13685 break;
13686
13687 default:
13688 gcc_unreachable ();
13689 }
13690 break;
13691 case LTU:
13692 gcc_assert (mode == CCmode || mode == CCCmode);
13693 suffix = "b";
13694 break;
13695 case GE:
13696 switch (mode)
13697 {
13698 case CCNOmode:
13699 case CCGOCmode:
13700 suffix = "ns";
13701 break;
13702
13703 case CCmode:
13704 case CCGCmode:
13705 suffix = "ge";
13706 break;
13707
13708 default:
13709 gcc_unreachable ();
13710 }
13711 break;
13712 case GEU:
13713 /* ??? As above. */
13714 gcc_assert (mode == CCmode || mode == CCCmode);
13715 suffix = fp ? "nb" : "ae";
13716 break;
13717 case LE:
13718 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13719 suffix = "le";
13720 break;
13721 case LEU:
13722 /* ??? As above. */
13723 if (mode == CCmode)
13724 suffix = "be";
13725 else if (mode == CCCmode)
13726 suffix = fp ? "nb" : "ae";
13727 else
13728 gcc_unreachable ();
13729 break;
13730 case UNORDERED:
13731 suffix = fp ? "u" : "p";
13732 break;
13733 case ORDERED:
13734 suffix = fp ? "nu" : "np";
13735 break;
13736 default:
13737 gcc_unreachable ();
13738 }
13739 fputs (suffix, file);
13740 }
13741
13742 /* Print the name of register X to FILE based on its machine mode and number.
13743 If CODE is 'w', pretend the mode is HImode.
13744 If CODE is 'b', pretend the mode is QImode.
13745 If CODE is 'k', pretend the mode is SImode.
13746 If CODE is 'q', pretend the mode is DImode.
13747 If CODE is 'x', pretend the mode is V4SFmode.
13748 If CODE is 't', pretend the mode is V8SFmode.
13749 If CODE is 'h', pretend the reg is the 'high' byte register.
13750 If CODE is 'y', print "st(0)" instead of "st", if the reg is a stack op.
13751 If CODE is 'd', duplicate the operand for an AVX instruction.
13752 */
13753
13754 void
13755 print_reg (rtx x, int code, FILE *file)
13756 {
13757 const char *reg;
13758 bool duplicated = code == 'd' && TARGET_AVX;
13759
13760 gcc_assert (x == pc_rtx
13761 || (REGNO (x) != ARG_POINTER_REGNUM
13762 && REGNO (x) != FRAME_POINTER_REGNUM
13763 && REGNO (x) != FLAGS_REG
13764 && REGNO (x) != FPSR_REG
13765 && REGNO (x) != FPCR_REG));
13766
13767 if (ASSEMBLER_DIALECT == ASM_ATT)
13768 putc ('%', file);
13769
13770 if (x == pc_rtx)
13771 {
13772 gcc_assert (TARGET_64BIT);
13773 fputs ("rip", file);
13774 return;
13775 }
13776
13777 if (code == 'w' || MMX_REG_P (x))
13778 code = 2;
13779 else if (code == 'b')
13780 code = 1;
13781 else if (code == 'k')
13782 code = 4;
13783 else if (code == 'q')
13784 code = 8;
13785 else if (code == 'y')
13786 code = 3;
13787 else if (code == 'h')
13788 code = 0;
13789 else if (code == 'x')
13790 code = 16;
13791 else if (code == 't')
13792 code = 32;
13793 else
13794 code = GET_MODE_SIZE (GET_MODE (x));
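/* From here on CODE holds the operand size in bytes (1, 2, 4, 8, 16, 32),
   with 0 meaning a high byte register and 3 meaning x87 stack-top naming. */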
13795
13796 /* Irritatingly, AMD extended registers use a different naming convention
13797 from the normal registers: "r%d[bwd]". */
13798 if (REX_INT_REG_P (x))
13799 {
13800 gcc_assert (TARGET_64BIT);
13801 putc ('r', file);
13802 fprint_ul (file, REGNO (x) - FIRST_REX_INT_REG + 8);
13803 switch (code)
13804 {
13805 case 0:
13806 error ("extended registers have no high halves");
13807 break;
13808 case 1:
13809 putc ('b', file);
13810 break;
13811 case 2:
13812 putc ('w', file);
13813 break;
13814 case 4:
13815 putc ('d', file);
13816 break;
13817 case 8:
13818 /* no suffix */
13819 break;
13820 default:
13821 error ("unsupported operand size for extended register");
13822 break;
13823 }
13824 return;
13825 }
13826
13827 reg = NULL;
13828 switch (code)
13829 {
13830 case 3:
13831 if (STACK_TOP_P (x))
13832 {
13833 reg = "st(0)";
13834 break;
13835 }
13836 /* FALLTHRU */
13837 case 8:
13838 case 4:
13839 case 12:
13840 if (! ANY_FP_REG_P (x))
13841 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13842 /* FALLTHRU */
13843 case 16:
13844 case 2:
13845 normal:
13846 reg = hi_reg_name[REGNO (x)];
13847 break;
13848 case 1:
13849 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
13850 goto normal;
13851 reg = qi_reg_name[REGNO (x)];
13852 break;
13853 case 0:
13854 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
13855 goto normal;
13856 reg = qi_high_reg_name[REGNO (x)];
13857 break;
13858 case 32:
13859 if (SSE_REG_P (x))
13860 {
13861 gcc_assert (!duplicated);
13862 putc ('y', file);
13863 fputs (hi_reg_name[REGNO (x)] + 1, file);
13864 return;
13865 }
13866 break;
13867 default:
13868 gcc_unreachable ();
13869 }
13870
13871 fputs (reg, file);
13872 if (duplicated)
13873 {
13874 if (ASSEMBLER_DIALECT == ASM_ATT)
13875 fprintf (file, ", %%%s", reg);
13876 else
13877 fprintf (file, ", %s", reg);
13878 }
13879 }
13880
13881 /* Locate some local-dynamic symbol still in use by this function
13882 so that we can print its name in some tls_local_dynamic_base
13883 pattern. */
13884
13885 static int
13886 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13887 {
13888 rtx x = *px;
13889
13890 if (GET_CODE (x) == SYMBOL_REF
13891 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13892 {
13893 cfun->machine->some_ld_name = XSTR (x, 0);
13894 return 1;
13895 }
13896
13897 return 0;
13898 }
13899
13900 static const char *
13901 get_some_local_dynamic_name (void)
13902 {
13903 rtx insn;
13904
13905 if (cfun->machine->some_ld_name)
13906 return cfun->machine->some_ld_name;
13907
13908 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13909 if (NONDEBUG_INSN_P (insn)
13910 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13911 return cfun->machine->some_ld_name;
13912
13913 return NULL;
13914 }
13915
13916 /* Meaning of CODE:
13917 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
13918 C -- print opcode suffix for set/cmov insn.
13919 c -- like C, but print reversed condition
13920 F,f -- likewise, but for floating-point.
13921 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
13922 otherwise nothing
13923 R -- print the prefix for register names.
13924 z -- print the opcode suffix for the size of the current operand.
13925 Z -- likewise, with special suffixes for x87 instructions.
13926 * -- print a star (in certain assembler syntax)
13927 A -- print an absolute memory reference.
13928 E -- print address with DImode register names if TARGET_64BIT.
13929 w -- print the operand as if it's a "word" (HImode) even if it isn't.
13930 s -- print a shift double count, followed by the assembler's argument
13931 delimiter.
13932 b -- print the QImode name of the register for the indicated operand.
13933 %b0 would print %al if operands[0] is reg 0.
13934 w -- likewise, print the HImode name of the register.
13935 k -- likewise, print the SImode name of the register.
13936 q -- likewise, print the DImode name of the register.
13937 x -- likewise, print the V4SFmode name of the register.
13938 t -- likewise, print the V8SFmode name of the register.
13939 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
13940 y -- print "st(0)" instead of "st" as a register.
13941 d -- print duplicated register operand for AVX instruction.
13942 D -- print condition for SSE cmp instruction.
13943 P -- if PIC, print an @PLT suffix.
13944 p -- print raw symbol name.
13945 X -- don't print any sort of PIC '@' suffix for a symbol.
13946 & -- print some in-use local-dynamic symbol name.
13947 H -- print a memory address offset by 8; used for sse high-parts
13948 Y -- print condition for XOP pcom* instruction.
13949 + -- print a branch hint as 'cs' or 'ds' prefix
13950 ; -- print a semicolon (after prefixes due to bug in older gas).
13951 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
13952 @ -- print a segment register of thread base pointer load
13953 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
13954 */
13955
13956 void
13957 ix86_print_operand (FILE *file, rtx x, int code)
13958 {
13959 if (code)
13960 {
13961 switch (code)
13962 {
13963 case '*':
13964 if (ASSEMBLER_DIALECT == ASM_ATT)
13965 putc ('*', file);
13966 return;
13967
13968 case '&':
13969 {
13970 const char *name = get_some_local_dynamic_name ();
13971 if (name == NULL)
13972 output_operand_lossage ("'%%&' used without any "
13973 "local dynamic TLS references");
13974 else
13975 assemble_name (file, name);
13976 return;
13977 }
13978
13979 case 'A':
13980 switch (ASSEMBLER_DIALECT)
13981 {
13982 case ASM_ATT:
13983 putc ('*', file);
13984 break;
13985
13986 case ASM_INTEL:
13987 /* Intel syntax. For absolute addresses, registers should not
13988 be surrounded by brackets. */
13989 if (!REG_P (x))
13990 {
13991 putc ('[', file);
13992 ix86_print_operand (file, x, 0);
13993 putc (']', file);
13994 return;
13995 }
13996 break;
13997
13998 default:
13999 gcc_unreachable ();
14000 }
14001
14002 ix86_print_operand (file, x, 0);
14003 return;
14004
14005 case 'E':
14006 /* Wrap address in an UNSPEC to declare special handling. */
14007 if (TARGET_64BIT)
14008 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14009
14010 output_address (x);
14011 return;
14012
14013 case 'L':
14014 if (ASSEMBLER_DIALECT == ASM_ATT)
14015 putc ('l', file);
14016 return;
14017
14018 case 'W':
14019 if (ASSEMBLER_DIALECT == ASM_ATT)
14020 putc ('w', file);
14021 return;
14022
14023 case 'B':
14024 if (ASSEMBLER_DIALECT == ASM_ATT)
14025 putc ('b', file);
14026 return;
14027
14028 case 'Q':
14029 if (ASSEMBLER_DIALECT == ASM_ATT)
14030 putc ('l', file);
14031 return;
14032
14033 case 'S':
14034 if (ASSEMBLER_DIALECT == ASM_ATT)
14035 putc ('s', file);
14036 return;
14037
14038 case 'T':
14039 if (ASSEMBLER_DIALECT == ASM_ATT)
14040 putc ('t', file);
14041 return;
14042
14043 case 'z':
14044 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14045 {
14046 /* Opcodes don't get size suffixes if using Intel opcodes. */
14047 if (ASSEMBLER_DIALECT == ASM_INTEL)
14048 return;
14049
14050 switch (GET_MODE_SIZE (GET_MODE (x)))
14051 {
14052 case 1:
14053 putc ('b', file);
14054 return;
14055
14056 case 2:
14057 putc ('w', file);
14058 return;
14059
14060 case 4:
14061 putc ('l', file);
14062 return;
14063
14064 case 8:
14065 putc ('q', file);
14066 return;
14067
14068 default:
14069 output_operand_lossage
14070 ("invalid operand size for operand code '%c'", code);
14071 return;
14072 }
14073 }
14074
14075 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14076 warning
14077 (0, "non-integer operand used with operand code '%c'", code);
14078 /* FALLTHRU */
14079
14080 case 'Z':
14081 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
14082 if (ASSEMBLER_DIALECT == ASM_INTEL)
14083 return;
14084
14085 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14086 {
14087 switch (GET_MODE_SIZE (GET_MODE (x)))
14088 {
14089 case 2:
14090 #ifdef HAVE_AS_IX86_FILDS
14091 putc ('s', file);
14092 #endif
14093 return;
14094
14095 case 4:
14096 putc ('l', file);
14097 return;
14098
14099 case 8:
14100 #ifdef HAVE_AS_IX86_FILDQ
14101 putc ('q', file);
14102 #else
14103 fputs ("ll", file);
14104 #endif
14105 return;
14106
14107 default:
14108 break;
14109 }
14110 }
14111 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14112 {
14113 /* 387 opcodes don't get size suffixes
14114 if the operands are registers. */
14115 if (STACK_REG_P (x))
14116 return;
14117
14118 switch (GET_MODE_SIZE (GET_MODE (x)))
14119 {
14120 case 4:
14121 putc ('s', file);
14122 return;
14123
14124 case 8:
14125 putc ('l', file);
14126 return;
14127
14128 case 12:
14129 case 16:
14130 putc ('t', file);
14131 return;
14132
14133 default:
14134 break;
14135 }
14136 }
14137 else
14138 {
14139 output_operand_lossage
14140 ("invalid operand type used with operand code '%c'", code);
14141 return;
14142 }
14143
14144 output_operand_lossage
14145 ("invalid operand size for operand code '%c'", code);
14146 return;
14147
14148 case 'd':
14149 case 'b':
14150 case 'w':
14151 case 'k':
14152 case 'q':
14153 case 'h':
14154 case 't':
14155 case 'y':
14156 case 'x':
14157 case 'X':
14158 case 'P':
14159 case 'p':
14160 break;
14161
14162 case 's':
14163 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14164 {
14165 ix86_print_operand (file, x, 0);
14166 fputs (", ", file);
14167 }
14168 return;
14169
14170 case 'D':
14171 /* Little bit of braindamage here. The SSE compare instructions
14172 use completely different names for the comparisons than the
14173 fp conditional moves do. */
14174 if (TARGET_AVX)
14175 {
14176 switch (GET_CODE (x))
14177 {
14178 case EQ:
14179 fputs ("eq", file);
14180 break;
14181 case UNEQ:
14182 fputs ("eq_us", file);
14183 break;
14184 case LT:
14185 fputs ("lt", file);
14186 break;
14187 case UNLT:
14188 fputs ("nge", file);
14189 break;
14190 case LE:
14191 fputs ("le", file);
14192 break;
14193 case UNLE:
14194 fputs ("ngt", file);
14195 break;
14196 case UNORDERED:
14197 fputs ("unord", file);
14198 break;
14199 case NE:
14200 fputs ("neq", file);
14201 break;
14202 case LTGT:
14203 fputs ("neq_oq", file);
14204 break;
14205 case GE:
14206 fputs ("ge", file);
14207 break;
14208 case UNGE:
14209 fputs ("nlt", file);
14210 break;
14211 case GT:
14212 fputs ("gt", file);
14213 break;
14214 case UNGT:
14215 fputs ("nle", file);
14216 break;
14217 case ORDERED:
14218 fputs ("ord", file);
14219 break;
14220 default:
14221 output_operand_lossage ("operand is not a condition code, "
14222 "invalid operand code 'D'");
14223 return;
14224 }
14225 }
14226 else
14227 {
14228 switch (GET_CODE (x))
14229 {
14230 case EQ:
14231 case UNEQ:
14232 fputs ("eq", file);
14233 break;
14234 case LT:
14235 case UNLT:
14236 fputs ("lt", file);
14237 break;
14238 case LE:
14239 case UNLE:
14240 fputs ("le", file);
14241 break;
14242 case UNORDERED:
14243 fputs ("unord", file);
14244 break;
14245 case NE:
14246 case LTGT:
14247 fputs ("neq", file);
14248 break;
14249 case UNGE:
14250 case GE:
14251 fputs ("nlt", file);
14252 break;
14253 case UNGT:
14254 case GT:
14255 fputs ("nle", file);
14256 break;
14257 case ORDERED:
14258 fputs ("ord", file);
14259 break;
14260 default:
14261 output_operand_lossage ("operand is not a condition code, "
14262 "invalid operand code 'D'");
14263 return;
14264 }
14265 }
14266 return;
14267 case 'O':
14268 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14269 if (ASSEMBLER_DIALECT == ASM_ATT)
14270 {
14271 switch (GET_MODE (x))
14272 {
14273 case HImode: putc ('w', file); break;
14274 case SImode:
14275 case SFmode: putc ('l', file); break;
14276 case DImode:
14277 case DFmode: putc ('q', file); break;
14278 default: gcc_unreachable ();
14279 }
14280 putc ('.', file);
14281 }
14282 #endif
14283 return;
14284 case 'C':
14285 if (!COMPARISON_P (x))
14286 {
14287 output_operand_lossage ("operand is neither a constant nor a "
14288 "condition code, invalid operand code "
14289 "'C'");
14290 return;
14291 }
14292 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 0, file);
14293 return;
14294 case 'F':
14295 if (!COMPARISON_P (x))
14296 {
14297 output_operand_lossage ("operand is neither a constant nor a "
14298 "condition code, invalid operand code "
14299 "'F'");
14300 return;
14301 }
14302 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14303 if (ASSEMBLER_DIALECT == ASM_ATT)
14304 putc ('.', file);
14305 #endif
14306 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 1, file);
14307 return;
14308
14309 /* Like above, but with the condition reversed. */
14310 case 'c':
14311 /* Check to see if argument to %c is really a constant
14312 and not a condition code which needs to be reversed. */
14313 if (!COMPARISON_P (x))
14314 {
14315 output_operand_lossage ("operand is neither a constant nor a "
14316 "condition code, invalid operand "
14317 "code 'c'");
14318 return;
14319 }
14320 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 0, file);
14321 return;
14322 case 'f':
14323 if (!COMPARISON_P (x))
14324 {
14325 output_operand_lossage ("operand is neither a constant nor a "
14326 "condition code, invalid operand "
14327 "code 'f'");
14328 return;
14329 }
14330 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14331 if (ASSEMBLER_DIALECT == ASM_ATT)
14332 putc ('.', file);
14333 #endif
14334 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
14335 return;
14336
14337 case 'H':
14338 if (!offsettable_memref_p (x))
14339 {
14340 output_operand_lossage ("operand is not an offsettable memory "
14341 "reference, invalid operand "
14342 "code 'H'");
14343 return;
14344 }
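/* %H addresses the operand 8 bytes past its start, so patterns can
   refer to the high half of a two-word (16-byte) memory operand.  */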
14345 /* It doesn't actually matter what mode we use here, as we're
14346 only going to use this for printing. */
14347 x = adjust_address_nv (x, DImode, 8);
14348 break;
14349
14350 case 'K':
14351 gcc_assert (CONST_INT_P (x));
14352
14353 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14354 #ifdef HAVE_AS_IX86_HLE
14355 fputs ("xacquire ", file);
14356 #else
14357 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14358 #endif
14359 else if (INTVAL (x) & IX86_HLE_RELEASE)
14360 #ifdef HAVE_AS_IX86_HLE
14361 fputs ("xrelease ", file);
14362 #else
14363 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14364 #endif
14365 /* We do not want to print the value of the operand. */
14366 return;
14367
14368 case '+':
14369 {
14370 rtx x;
14371
14372 if (!optimize
14373 || optimize_function_for_size_p (cfun)
14374 || !TARGET_BRANCH_PREDICTION_HINTS)
14375 return;
14376
14377 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14378 if (x)
14379 {
14380 int pred_val = INTVAL (XEXP (x, 0));
14381
14382 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14383 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14384 {
14385 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14386 bool cputaken
14387 = final_forward_branch_p (current_output_insn) == 0;
14388
14389 /* Emit hints only when the default branch prediction
14390 heuristics would fail. */
14391 if (taken != cputaken)
14392 {
14393 /* We use 3e (DS) prefix for taken branches and
14394 2e (CS) prefix for not taken branches. */
14395 if (taken)
14396 fputs ("ds ; ", file);
14397 else
14398 fputs ("cs ; ", file);
14399 }
14400 }
14401 }
14402 return;
14403 }
14404
14405 case 'Y':
14406 switch (GET_CODE (x))
14407 {
14408 case NE:
14409 fputs ("neq", file);
14410 break;
14411 case EQ:
14412 fputs ("eq", file);
14413 break;
14414 case GE:
14415 case GEU:
14416 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14417 break;
14418 case GT:
14419 case GTU:
14420 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14421 break;
14422 case LE:
14423 case LEU:
14424 fputs ("le", file);
14425 break;
14426 case LT:
14427 case LTU:
14428 fputs ("lt", file);
14429 break;
14430 case UNORDERED:
14431 fputs ("unord", file);
14432 break;
14433 case ORDERED:
14434 fputs ("ord", file);
14435 break;
14436 case UNEQ:
14437 fputs ("ueq", file);
14438 break;
14439 case UNGE:
14440 fputs ("nlt", file);
14441 break;
14442 case UNGT:
14443 fputs ("nle", file);
14444 break;
14445 case UNLE:
14446 fputs ("ule", file);
14447 break;
14448 case UNLT:
14449 fputs ("ult", file);
14450 break;
14451 case LTGT:
14452 fputs ("une", file);
14453 break;
14454 default:
14455 output_operand_lossage ("operand is not a condition code, "
14456 "invalid operand code 'Y'");
14457 return;
14458 }
14459 return;
14460
14461 case ';':
14462 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14463 putc (';', file);
14464 #endif
14465 return;
14466
14467 case '@':
14468 if (ASSEMBLER_DIALECT == ASM_ATT)
14469 putc ('%', file);
14470
14471 /* The kernel uses a different segment register for performance
14472 reasons; a system call would not have to trash the userspace
14473 segment register, which would be expensive. */
14474 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14475 fputs ("fs", file);
14476 else
14477 fputs ("gs", file);
14478 return;
14479
14480 case '~':
14481 putc (TARGET_AVX2 ? 'i' : 'f', file);
14482 return;
14483
14484 case '^':
14485 if (TARGET_64BIT && Pmode != word_mode)
14486 fputs ("addr32 ", file);
14487 return;
14488
14489 default:
14490 output_operand_lossage ("invalid operand code '%c'", code);
14491 }
14492 }
14493
14494 if (REG_P (x))
14495 print_reg (x, code, file);
14496
14497 else if (MEM_P (x))
14498 {
14499 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14500 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14501 && GET_MODE (x) != BLKmode)
14502 {
14503 const char * size;
14504 switch (GET_MODE_SIZE (GET_MODE (x)))
14505 {
14506 case 1: size = "BYTE"; break;
14507 case 2: size = "WORD"; break;
14508 case 4: size = "DWORD"; break;
14509 case 8: size = "QWORD"; break;
14510 case 12: size = "TBYTE"; break;
14511 case 16:
14512 if (GET_MODE (x) == XFmode)
14513 size = "TBYTE";
14514 else
14515 size = "XMMWORD";
14516 break;
14517 case 32: size = "YMMWORD"; break;
14518 default:
14519 gcc_unreachable ();
14520 }
14521
14522 /* Check for explicit size override (codes 'b', 'w', 'k',
14523 'q' and 'x') */
14524 if (code == 'b')
14525 size = "BYTE";
14526 else if (code == 'w')
14527 size = "WORD";
14528 else if (code == 'k')
14529 size = "DWORD";
14530 else if (code == 'q')
14531 size = "QWORD";
14532 else if (code == 'x')
14533 size = "XMMWORD";
14534
14535 fputs (size, file);
14536 fputs (" PTR ", file);
14537 }
14538
14539 x = XEXP (x, 0);
14540 /* Avoid (%rip) for call operands. */
14541 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14542 && !CONST_INT_P (x))
14543 output_addr_const (file, x);
14544 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14545 output_operand_lossage ("invalid constraints for operand");
14546 else
14547 output_address (x);
14548 }
14549
14550 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14551 {
14552 REAL_VALUE_TYPE r;
14553 long l;
14554
14555 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14556 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14557
14558 if (ASSEMBLER_DIALECT == ASM_ATT)
14559 putc ('$', file);
14560 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14561 if (code == 'q')
14562 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14563 else
14564 fprintf (file, "0x%08x", (unsigned int) l);
14565 }
14566
14567 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14568 {
14569 REAL_VALUE_TYPE r;
14570 long l[2];
14571
14572 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14573 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14574
14575 if (ASSEMBLER_DIALECT == ASM_ATT)
14576 putc ('$', file);
14577 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14578 }
14579
14580 /* These float cases don't actually occur as immediate operands. */
14581 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14582 {
14583 char dstr[30];
14584
14585 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14586 fputs (dstr, file);
14587 }
14588
14589 else
14590 {
14591 /* We have patterns that allow zero sets of memory, for instance.
14592 In 64-bit mode, we should probably support all 8-byte vectors,
14593 since we can in fact encode that into an immediate. */
14594 if (GET_CODE (x) == CONST_VECTOR)
14595 {
14596 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14597 x = const0_rtx;
14598 }
14599
14600 if (code != 'P' && code != 'p')
14601 {
14602 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14603 {
14604 if (ASSEMBLER_DIALECT == ASM_ATT)
14605 putc ('$', file);
14606 }
14607 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14608 || GET_CODE (x) == LABEL_REF)
14609 {
14610 if (ASSEMBLER_DIALECT == ASM_ATT)
14611 putc ('$', file);
14612 else
14613 fputs ("OFFSET FLAT:", file);
14614 }
14615 }
14616 if (CONST_INT_P (x))
14617 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14618 else if (flag_pic || MACHOPIC_INDIRECT)
14619 output_pic_addr_const (file, x, code);
14620 else
14621 output_addr_const (file, x);
14622 }
14623 }
14624
14625 static bool
14626 ix86_print_operand_punct_valid_p (unsigned char code)
14627 {
14628 return (code == '@' || code == '*' || code == '+' || code == '&'
14629 || code == ';' || code == '~' || code == '^');
14630 }
14631 \f
14632 /* Print a memory operand whose address is ADDR. */
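/* For example, an address with base %rax, index %rbx, scale 4 and
   displacement 8 is printed as "8(%rax,%rbx,4)" in AT&T syntax and as
   "[rax+rbx*4+8]" in Intel syntax.  */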
14633
14634 static void
14635 ix86_print_operand_address (FILE *file, rtx addr)
14636 {
14637 struct ix86_address parts;
14638 rtx base, index, disp;
14639 int scale;
14640 int ok;
14641 bool vsib = false;
14642 int code = 0;
14643
14644 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14645 {
14646 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14647 gcc_assert (parts.index == NULL_RTX);
14648 parts.index = XVECEXP (addr, 0, 1);
14649 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14650 addr = XVECEXP (addr, 0, 0);
14651 vsib = true;
14652 }
14653 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14654 {
14655 gcc_assert (TARGET_64BIT);
14656 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14657 code = 'q';
14658 }
14659 else
14660 ok = ix86_decompose_address (addr, &parts);
14661
14662 gcc_assert (ok);
14663
14664 if (parts.base && GET_CODE (parts.base) == SUBREG)
14665 {
14666 rtx tmp = SUBREG_REG (parts.base);
14667 parts.base = simplify_subreg (GET_MODE (parts.base),
14668 tmp, GET_MODE (tmp), 0);
14669 }
14670
14671 if (parts.index && GET_CODE (parts.index) == SUBREG)
14672 {
14673 rtx tmp = SUBREG_REG (parts.index);
14674 parts.index = simplify_subreg (GET_MODE (parts.index),
14675 tmp, GET_MODE (tmp), 0);
14676 }
14677
14678 base = parts.base;
14679 index = parts.index;
14680 disp = parts.disp;
14681 scale = parts.scale;
14682
14683 switch (parts.seg)
14684 {
14685 case SEG_DEFAULT:
14686 break;
14687 case SEG_FS:
14688 case SEG_GS:
14689 if (ASSEMBLER_DIALECT == ASM_ATT)
14690 putc ('%', file);
14691 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14692 break;
14693 default:
14694 gcc_unreachable ();
14695 }
14696
14697 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14698 if (TARGET_64BIT && !base && !index)
14699 {
14700 rtx symbol = disp;
14701
14702 if (GET_CODE (disp) == CONST
14703 && GET_CODE (XEXP (disp, 0)) == PLUS
14704 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14705 symbol = XEXP (XEXP (disp, 0), 0);
14706
14707 if (GET_CODE (symbol) == LABEL_REF
14708 || (GET_CODE (symbol) == SYMBOL_REF
14709 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14710 base = pc_rtx;
14711 }
14712 if (!base && !index)
14713 {
14714 /* Displacement-only addresses require special attention. */
14715
14716 if (CONST_INT_P (disp))
14717 {
14718 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14719 fputs ("ds:", file);
14720 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14721 }
14722 else if (flag_pic)
14723 output_pic_addr_const (file, disp, 0);
14724 else
14725 output_addr_const (file, disp);
14726 }
14727 else
14728 {
14729 /* Print SImode register names for zero-extended
14730 addresses to force addr32 prefix. */
14731 if (TARGET_64BIT
14732 && (GET_CODE (addr) == ZERO_EXTEND
14733 || GET_CODE (addr) == AND))
14734 {
14735 gcc_assert (!code);
14736 code = 'l';
14737 }
14738
14739 if (ASSEMBLER_DIALECT == ASM_ATT)
14740 {
14741 if (disp)
14742 {
14743 if (flag_pic)
14744 output_pic_addr_const (file, disp, 0);
14745 else if (GET_CODE (disp) == LABEL_REF)
14746 output_asm_label (disp);
14747 else
14748 output_addr_const (file, disp);
14749 }
14750
14751 putc ('(', file);
14752 if (base)
14753 print_reg (base, code, file);
14754 if (index)
14755 {
14756 putc (',', file);
14757 print_reg (index, vsib ? 0 : code, file);
14758 if (scale != 1 || vsib)
14759 fprintf (file, ",%d", scale);
14760 }
14761 putc (')', file);
14762 }
14763 else
14764 {
14765 rtx offset = NULL_RTX;
14766
14767 if (disp)
14768 {
14769 /* Pull out the offset of a symbol; print any symbol itself. */
14770 if (GET_CODE (disp) == CONST
14771 && GET_CODE (XEXP (disp, 0)) == PLUS
14772 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14773 {
14774 offset = XEXP (XEXP (disp, 0), 1);
14775 disp = gen_rtx_CONST (VOIDmode,
14776 XEXP (XEXP (disp, 0), 0));
14777 }
14778
14779 if (flag_pic)
14780 output_pic_addr_const (file, disp, 0);
14781 else if (GET_CODE (disp) == LABEL_REF)
14782 output_asm_label (disp);
14783 else if (CONST_INT_P (disp))
14784 offset = disp;
14785 else
14786 output_addr_const (file, disp);
14787 }
14788
14789 putc ('[', file);
14790 if (base)
14791 {
14792 print_reg (base, code, file);
14793 if (offset)
14794 {
14795 if (INTVAL (offset) >= 0)
14796 putc ('+', file);
14797 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14798 }
14799 }
14800 else if (offset)
14801 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14802 else
14803 putc ('0', file);
14804
14805 if (index)
14806 {
14807 putc ('+', file);
14808 print_reg (index, vsib ? 0 : code, file);
14809 if (scale != 1 || vsib)
14810 fprintf (file, "*%d", scale);
14811 }
14812 putc (']', file);
14813 }
14814 }
14815 }
14816
14817 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14818
14819 static bool
14820 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14821 {
14822 rtx op;
14823
14824 if (GET_CODE (x) != UNSPEC)
14825 return false;
14826
14827 op = XVECEXP (x, 0, 0);
14828 switch (XINT (x, 1))
14829 {
14830 case UNSPEC_GOTTPOFF:
14831 output_addr_const (file, op);
14832 /* FIXME: This might be @TPOFF in Sun ld. */
14833 fputs ("@gottpoff", file);
14834 break;
14835 case UNSPEC_TPOFF:
14836 output_addr_const (file, op);
14837 fputs ("@tpoff", file);
14838 break;
14839 case UNSPEC_NTPOFF:
14840 output_addr_const (file, op);
14841 if (TARGET_64BIT)
14842 fputs ("@tpoff", file);
14843 else
14844 fputs ("@ntpoff", file);
14845 break;
14846 case UNSPEC_DTPOFF:
14847 output_addr_const (file, op);
14848 fputs ("@dtpoff", file);
14849 break;
14850 case UNSPEC_GOTNTPOFF:
14851 output_addr_const (file, op);
14852 if (TARGET_64BIT)
14853 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14854 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14855 else
14856 fputs ("@gotntpoff", file);
14857 break;
14858 case UNSPEC_INDNTPOFF:
14859 output_addr_const (file, op);
14860 fputs ("@indntpoff", file);
14861 break;
14862 #if TARGET_MACHO
14863 case UNSPEC_MACHOPIC_OFFSET:
14864 output_addr_const (file, op);
14865 putc ('-', file);
14866 machopic_output_function_base_name (file);
14867 break;
14868 #endif
14869
14870 case UNSPEC_STACK_CHECK:
14871 {
14872 int offset;
14873
14874 gcc_assert (flag_split_stack);
14875
14876 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14877 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14878 #else
14879 gcc_unreachable ();
14880 #endif
14881
14882 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14883 }
14884 break;
14885
14886 default:
14887 return false;
14888 }
14889
14890 return true;
14891 }
14892 \f
14893 /* Split one or more double-mode RTL references into pairs of half-mode
14894 references. The RTL can be REG, offsettable MEM, integer constant, or
14895 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14896 split and "num" is its length. lo_half and hi_half are output arrays
14897 that parallel "operands". */
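/* For example, splitting a DImode pseudo on ia32 yields
   (subreg:SI (reg:DI x) 0) for the low half and (subreg:SI (reg:DI x) 4)
   for the high half, while a DImode MEM is split into the same memory
   reference at offsets 0 and 4.  */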
14898
14899 void
14900 split_double_mode (enum machine_mode mode, rtx operands[],
14901 int num, rtx lo_half[], rtx hi_half[])
14902 {
14903 enum machine_mode half_mode;
14904 unsigned int byte;
14905
14906 switch (mode)
14907 {
14908 case TImode:
14909 half_mode = DImode;
14910 break;
14911 case DImode:
14912 half_mode = SImode;
14913 break;
14914 default:
14915 gcc_unreachable ();
14916 }
14917
14918 byte = GET_MODE_SIZE (half_mode);
14919
14920 while (num--)
14921 {
14922 rtx op = operands[num];
14923
14924 /* simplify_subreg refuses to split volatile memory references,
14925 but we still have to handle them. */
14926 if (MEM_P (op))
14927 {
14928 lo_half[num] = adjust_address (op, half_mode, 0);
14929 hi_half[num] = adjust_address (op, half_mode, byte);
14930 }
14931 else
14932 {
14933 lo_half[num] = simplify_gen_subreg (half_mode, op,
14934 GET_MODE (op) == VOIDmode
14935 ? mode : GET_MODE (op), 0);
14936 hi_half[num] = simplify_gen_subreg (half_mode, op,
14937 GET_MODE (op) == VOIDmode
14938 ? mode : GET_MODE (op), byte);
14939 }
14940 }
14941 }
14942 \f
14943 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
14944 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
14945 is the expression of the binary operation. The output may either be
14946 emitted here, or returned to the caller, like all output_* functions.
14947
14948 There is no guarantee that the operands are the same mode, as they
14949 might be within FLOAT or FLOAT_EXTEND expressions. */
14950
14951 #ifndef SYSV386_COMPAT
14952 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
14953 wants to fix the assemblers because that causes incompatibility
14954 with gcc. No-one wants to fix gcc because that causes
14955 incompatibility with assemblers... You can use the option of
14956 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
14957 #define SYSV386_COMPAT 1
14958 #endif
14959
14960 const char *
14961 output_387_binary_op (rtx insn, rtx *operands)
14962 {
14963 static char buf[40];
14964 const char *p;
14965 const char *ssep;
14966 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
14967
14968 #ifdef ENABLE_CHECKING
14969 /* Even if we do not want to check the inputs, this documents the input
14970 constraints, which helps in understanding the following code. */
14971 if (STACK_REG_P (operands[0])
14972 && ((REG_P (operands[1])
14973 && REGNO (operands[0]) == REGNO (operands[1])
14974 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
14975 || (REG_P (operands[2])
14976 && REGNO (operands[0]) == REGNO (operands[2])
14977 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
14978 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
14979 ; /* ok */
14980 else
14981 gcc_assert (is_sse);
14982 #endif
14983
14984 switch (GET_CODE (operands[3]))
14985 {
14986 case PLUS:
14987 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14988 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14989 p = "fiadd";
14990 else
14991 p = "fadd";
14992 ssep = "vadd";
14993 break;
14994
14995 case MINUS:
14996 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14997 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14998 p = "fisub";
14999 else
15000 p = "fsub";
15001 ssep = "vsub";
15002 break;
15003
15004 case MULT:
15005 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15006 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15007 p = "fimul";
15008 else
15009 p = "fmul";
15010 ssep = "vmul";
15011 break;
15012
15013 case DIV:
15014 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15015 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15016 p = "fidiv";
15017 else
15018 p = "fdiv";
15019 ssep = "vdiv";
15020 break;
15021
15022 default:
15023 gcc_unreachable ();
15024 }
15025
15026 if (is_sse)
15027 {
15028 if (TARGET_AVX)
15029 {
15030 strcpy (buf, ssep);
15031 if (GET_MODE (operands[0]) == SFmode)
15032 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15033 else
15034 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15035 }
15036 else
15037 {
15038 strcpy (buf, ssep + 1);
15039 if (GET_MODE (operands[0]) == SFmode)
15040 strcat (buf, "ss\t{%2, %0|%0, %2}");
15041 else
15042 strcat (buf, "sd\t{%2, %0|%0, %2}");
15043 }
15044 return buf;
15045 }
15046 strcpy (buf, p);
15047
15048 switch (GET_CODE (operands[3]))
15049 {
15050 case MULT:
15051 case PLUS:
15052 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15053 {
15054 rtx temp = operands[2];
15055 operands[2] = operands[1];
15056 operands[1] = temp;
15057 }
15058
15059 /* We know operands[0] == operands[1]. */
15060
15061 if (MEM_P (operands[2]))
15062 {
15063 p = "%Z2\t%2";
15064 break;
15065 }
15066
15067 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15068 {
15069 if (STACK_TOP_P (operands[0]))
15070 /* How is it that we are storing to a dead operand[2]?
15071 Well, presumably operands[1] is dead too. We can't
15072 store the result to st(0) as st(0) gets popped on this
15073 instruction. Instead store to operands[2] (which I
15074 think has to be st(1)). st(1) will be popped later.
15075 gcc <= 2.8.1 didn't have this check and generated
15076 assembly code that the Unixware assembler rejected. */
15077 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15078 else
15079 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15080 break;
15081 }
15082
15083 if (STACK_TOP_P (operands[0]))
15084 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15085 else
15086 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15087 break;
15088
15089 case MINUS:
15090 case DIV:
15091 if (MEM_P (operands[1]))
15092 {
15093 p = "r%Z1\t%1";
15094 break;
15095 }
15096
15097 if (MEM_P (operands[2]))
15098 {
15099 p = "%Z2\t%2";
15100 break;
15101 }
15102
15103 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15104 {
15105 #if SYSV386_COMPAT
15106 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15107 derived assemblers, confusingly reverse the direction of
15108 the operation for fsub{r} and fdiv{r} when the
15109 destination register is not st(0). The Intel assembler
15110 doesn't have this brain damage. Read !SYSV386_COMPAT to
15111 figure out what the hardware really does. */
15112 if (STACK_TOP_P (operands[0]))
15113 p = "{p\t%0, %2|rp\t%2, %0}";
15114 else
15115 p = "{rp\t%2, %0|p\t%0, %2}";
15116 #else
15117 if (STACK_TOP_P (operands[0]))
15118 /* As above for fmul/fadd, we can't store to st(0). */
15119 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15120 else
15121 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15122 #endif
15123 break;
15124 }
15125
15126 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15127 {
15128 #if SYSV386_COMPAT
15129 if (STACK_TOP_P (operands[0]))
15130 p = "{rp\t%0, %1|p\t%1, %0}";
15131 else
15132 p = "{p\t%1, %0|rp\t%0, %1}";
15133 #else
15134 if (STACK_TOP_P (operands[0]))
15135 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15136 else
15137 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15138 #endif
15139 break;
15140 }
15141
15142 if (STACK_TOP_P (operands[0]))
15143 {
15144 if (STACK_TOP_P (operands[1]))
15145 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15146 else
15147 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15148 break;
15149 }
15150 else if (STACK_TOP_P (operands[1]))
15151 {
15152 #if SYSV386_COMPAT
15153 p = "{\t%1, %0|r\t%0, %1}";
15154 #else
15155 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15156 #endif
15157 }
15158 else
15159 {
15160 #if SYSV386_COMPAT
15161 p = "{r\t%2, %0|\t%0, %2}";
15162 #else
15163 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15164 #endif
15165 }
15166 break;
15167
15168 default:
15169 gcc_unreachable ();
15170 }
15171
15172 strcat (buf, p);
15173 return buf;
15174 }
15175
15176 /* Return needed mode for entity in optimize_mode_switching pass. */
15177
15178 int
15179 ix86_mode_needed (int entity, rtx insn)
15180 {
15181 enum attr_i387_cw mode;
15182
15183 /* The mode UNINITIALIZED is used to store the control word after a
15184 function call or ASM pattern. The mode ANY specifies that the function
15185 has no requirements on the control word and makes no changes to the
15186 bits we are interested in. */
15187
15188 if (CALL_P (insn)
15189 || (NONJUMP_INSN_P (insn)
15190 && (asm_noperands (PATTERN (insn)) >= 0
15191 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15192 return I387_CW_UNINITIALIZED;
15193
15194 if (recog_memoized (insn) < 0)
15195 return I387_CW_ANY;
15196
15197 mode = get_attr_i387_cw (insn);
15198
15199 switch (entity)
15200 {
15201 case I387_TRUNC:
15202 if (mode == I387_CW_TRUNC)
15203 return mode;
15204 break;
15205
15206 case I387_FLOOR:
15207 if (mode == I387_CW_FLOOR)
15208 return mode;
15209 break;
15210
15211 case I387_CEIL:
15212 if (mode == I387_CW_CEIL)
15213 return mode;
15214 break;
15215
15216 case I387_MASK_PM:
15217 if (mode == I387_CW_MASK_PM)
15218 return mode;
15219 break;
15220
15221 default:
15222 gcc_unreachable ();
15223 }
15224
15225 return I387_CW_ANY;
15226 }
15227
15228 /* Output code to initialize the control word copies used by the trunc?f?i
15229 and rounding patterns. MODE selects which copy to set up: the current
15230 control word is saved and a modified copy is stored in the corresponding stack slot. */
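/* For reference, bits 10-11 of the x87 control word select the rounding
   mode (00 = to nearest, 01 = down, 10 = up, 11 = toward zero) and bit 5
   masks the precision exception; hence the 0x0c00, 0x0400, 0x0800 and
   0x0020 constants used below.  */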
15231
15232 void
15233 emit_i387_cw_initialization (int mode)
15234 {
15235 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15236 rtx new_mode;
15237
15238 enum ix86_stack_slot slot;
15239
15240 rtx reg = gen_reg_rtx (HImode);
15241
15242 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15243 emit_move_insn (reg, copy_rtx (stored_mode));
15244
15245 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15246 || optimize_function_for_size_p (cfun))
15247 {
15248 switch (mode)
15249 {
15250 case I387_CW_TRUNC:
15251 /* round toward zero (truncate) */
15252 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15253 slot = SLOT_CW_TRUNC;
15254 break;
15255
15256 case I387_CW_FLOOR:
15257 /* round down toward -oo */
15258 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15259 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15260 slot = SLOT_CW_FLOOR;
15261 break;
15262
15263 case I387_CW_CEIL:
15264 /* round up toward +oo */
15265 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15266 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15267 slot = SLOT_CW_CEIL;
15268 break;
15269
15270 case I387_CW_MASK_PM:
15271 /* mask precision exception for nearbyint() */
15272 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15273 slot = SLOT_CW_MASK_PM;
15274 break;
15275
15276 default:
15277 gcc_unreachable ();
15278 }
15279 }
15280 else
15281 {
15282 switch (mode)
15283 {
15284 case I387_CW_TRUNC:
15285 /* round toward zero (truncate) */
15286 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15287 slot = SLOT_CW_TRUNC;
15288 break;
15289
15290 case I387_CW_FLOOR:
15291 /* round down toward -oo */
15292 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15293 slot = SLOT_CW_FLOOR;
15294 break;
15295
15296 case I387_CW_CEIL:
15297 /* round up toward +oo */
15298 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15299 slot = SLOT_CW_CEIL;
15300 break;
15301
15302 case I387_CW_MASK_PM:
15303 /* mask precision exception for nearbyint() */
15304 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15305 slot = SLOT_CW_MASK_PM;
15306 break;
15307
15308 default:
15309 gcc_unreachable ();
15310 }
15311 }
15312
15313 gcc_assert (slot < MAX_386_STACK_LOCALS);
15314
15315 new_mode = assign_386_stack_local (HImode, slot);
15316 emit_move_insn (new_mode, reg);
15317 }
15318
15319 /* Output code for INSN to convert a float to a signed int. OPERANDS
15320 are the insn operands. The output may be [HSD]Imode and the input
15321 operand may be [SDX]Fmode. */
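/* When a rounding-mode change is needed, the non-fisttp path brackets
   the store with control word switches, e.g. "fldcw %3" (new control
   word), "fistp%Z0 %0", "fldcw %2" (saved control word).  */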
15322
15323 const char *
15324 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15325 {
15326 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15327 int dimode_p = GET_MODE (operands[0]) == DImode;
15328 int round_mode = get_attr_i387_cw (insn);
15329
15330 /* Jump through a hoop or two for DImode, since the hardware has no
15331 non-popping instruction. We used to do this a different way, but
15332 that was somewhat fragile and broke with post-reload splitters. */
15333 if ((dimode_p || fisttp) && !stack_top_dies)
15334 output_asm_insn ("fld\t%y1", operands);
15335
15336 gcc_assert (STACK_TOP_P (operands[1]));
15337 gcc_assert (MEM_P (operands[0]));
15338 gcc_assert (GET_MODE (operands[1]) != TFmode);
15339
15340 if (fisttp)
15341 output_asm_insn ("fisttp%Z0\t%0", operands);
15342 else
15343 {
15344 if (round_mode != I387_CW_ANY)
15345 output_asm_insn ("fldcw\t%3", operands);
15346 if (stack_top_dies || dimode_p)
15347 output_asm_insn ("fistp%Z0\t%0", operands);
15348 else
15349 output_asm_insn ("fist%Z0\t%0", operands);
15350 if (round_mode != I387_CW_ANY)
15351 output_asm_insn ("fldcw\t%2", operands);
15352 }
15353
15354 return "";
15355 }
15356
15357 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15358 have the values zero or one, indicates the ffreep insn's operand
15359 from the OPERANDS array. */
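/* When the assembler does not understand "ffreep", the raw encoding is
   emitted instead: e.g. the ASM_SHORT value 0xc0df below stores the
   little-endian bytes 0xdf 0xc0, which is "ffreep %st(0)".  */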
15360
15361 static const char *
15362 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15363 {
15364 if (TARGET_USE_FFREEP)
15365 #ifdef HAVE_AS_IX86_FFREEP
15366 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15367 #else
15368 {
15369 static char retval[32];
15370 int regno = REGNO (operands[opno]);
15371
15372 gcc_assert (FP_REGNO_P (regno));
15373
15374 regno -= FIRST_STACK_REG;
15375
15376 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15377 return retval;
15378 }
15379 #endif
15380
15381 return opno ? "fstp\t%y1" : "fstp\t%y0";
15382 }
15383
15384
15385 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15386 should be used. UNORDERED_P is true when fucom should be used. */
15387
15388 const char *
15389 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15390 {
15391 int stack_top_dies;
15392 rtx cmp_op0, cmp_op1;
15393 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15394
15395 if (eflags_p)
15396 {
15397 cmp_op0 = operands[0];
15398 cmp_op1 = operands[1];
15399 }
15400 else
15401 {
15402 cmp_op0 = operands[1];
15403 cmp_op1 = operands[2];
15404 }
15405
15406 if (is_sse)
15407 {
15408 if (GET_MODE (operands[0]) == SFmode)
15409 if (unordered_p)
15410 return "%vucomiss\t{%1, %0|%0, %1}";
15411 else
15412 return "%vcomiss\t{%1, %0|%0, %1}";
15413 else
15414 if (unordered_p)
15415 return "%vucomisd\t{%1, %0|%0, %1}";
15416 else
15417 return "%vcomisd\t{%1, %0|%0, %1}";
15418 }
15419
15420 gcc_assert (STACK_TOP_P (cmp_op0));
15421
15422 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15423
15424 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15425 {
15426 if (stack_top_dies)
15427 {
15428 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15429 return output_387_ffreep (operands, 1);
15430 }
15431 else
15432 return "ftst\n\tfnstsw\t%0";
15433 }
15434
15435 if (STACK_REG_P (cmp_op1)
15436 && stack_top_dies
15437 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15438 && REGNO (cmp_op1) != FIRST_STACK_REG)
15439 {
15440 /* If the top of the 387 stack dies, and the other operand
15441 is also a stack register that dies, then this must be a
15442 `fcompp' float compare. */
15443
15444 if (eflags_p)
15445 {
15446 /* There is no double popping fcomi variant. Fortunately,
15447 eflags is immune from the fstp's cc clobbering. */
15448 if (unordered_p)
15449 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15450 else
15451 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15452 return output_387_ffreep (operands, 0);
15453 }
15454 else
15455 {
15456 if (unordered_p)
15457 return "fucompp\n\tfnstsw\t%0";
15458 else
15459 return "fcompp\n\tfnstsw\t%0";
15460 }
15461 }
15462 else
15463 {
15464 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
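/* For example, mask 11 (eflags_p and unordered_p set, stack top dying,
   non-integer operand) selects "fucomip" below.  */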
15465
15466 static const char * const alt[16] =
15467 {
15468 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15469 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15470 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15471 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15472
15473 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15474 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15475 NULL,
15476 NULL,
15477
15478 "fcomi\t{%y1, %0|%0, %y1}",
15479 "fcomip\t{%y1, %0|%0, %y1}",
15480 "fucomi\t{%y1, %0|%0, %y1}",
15481 "fucomip\t{%y1, %0|%0, %y1}",
15482
15483 NULL,
15484 NULL,
15485 NULL,
15486 NULL
15487 };
15488
15489 int mask;
15490 const char *ret;
15491
15492 mask = eflags_p << 3;
15493 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15494 mask |= unordered_p << 1;
15495 mask |= stack_top_dies;
15496
15497 gcc_assert (mask < 16);
15498 ret = alt[mask];
15499 gcc_assert (ret);
15500
15501 return ret;
15502 }
15503 }
15504
15505 void
15506 ix86_output_addr_vec_elt (FILE *file, int value)
15507 {
15508 const char *directive = ASM_LONG;
15509
15510 #ifdef ASM_QUAD
15511 if (TARGET_LP64)
15512 directive = ASM_QUAD;
15513 #else
15514 gcc_assert (!TARGET_64BIT);
15515 #endif
15516
15517 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15518 }
15519
15520 void
15521 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15522 {
15523 const char *directive = ASM_LONG;
15524
15525 #ifdef ASM_QUAD
15526 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15527 directive = ASM_QUAD;
15528 #else
15529 gcc_assert (!TARGET_64BIT);
15530 #endif
15531 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15532 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15533 fprintf (file, "%s%s%d-%s%d\n",
15534 directive, LPREFIX, value, LPREFIX, rel);
15535 else if (HAVE_AS_GOTOFF_IN_DATA)
15536 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15537 #if TARGET_MACHO
15538 else if (TARGET_MACHO)
15539 {
15540 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15541 machopic_output_function_base_name (file);
15542 putc ('\n', file);
15543 }
15544 #endif
15545 else
15546 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15547 GOT_SYMBOL_NAME, LPREFIX, value);
15548 }
15549 \f
15550 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15551 for the target. */
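/* For example, clearing %eax is normally emitted as "xorl %eax, %eax",
   which is shorter than "movl $0, %eax" and breaks the dependency on the
   previous value, at the cost of clobbering the flags.  */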
15552
15553 void
15554 ix86_expand_clear (rtx dest)
15555 {
15556 rtx tmp;
15557
15558 /* We play register width games, which are only valid after reload. */
15559 gcc_assert (reload_completed);
15560
15561 /* Avoid HImode and its attendant prefix byte. */
15562 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15563 dest = gen_rtx_REG (SImode, REGNO (dest));
15564 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15565
15566 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15567 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15568 {
15569 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15570 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15571 }
15572
15573 emit_insn (tmp);
15574 }
15575
15576 /* X is an unchanging MEM. If it is a constant pool reference, return
15577 the constant pool rtx, else NULL. */
15578
15579 rtx
15580 maybe_get_pool_constant (rtx x)
15581 {
15582 x = ix86_delegitimize_address (XEXP (x, 0));
15583
15584 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15585 return get_pool_constant (x);
15586
15587 return NULL_RTX;
15588 }
15589
15590 void
15591 ix86_expand_move (enum machine_mode mode, rtx operands[])
15592 {
15593 rtx op0, op1;
15594 enum tls_model model;
15595
15596 op0 = operands[0];
15597 op1 = operands[1];
15598
15599 if (GET_CODE (op1) == SYMBOL_REF)
15600 {
15601 model = SYMBOL_REF_TLS_MODEL (op1);
15602 if (model)
15603 {
15604 op1 = legitimize_tls_address (op1, model, true);
15605 op1 = force_operand (op1, op0);
15606 if (op1 == op0)
15607 return;
15608 if (GET_MODE (op1) != mode)
15609 op1 = convert_to_mode (mode, op1, 1);
15610 }
15611 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15612 && SYMBOL_REF_DLLIMPORT_P (op1))
15613 op1 = legitimize_dllimport_symbol (op1, false);
15614 }
15615 else if (GET_CODE (op1) == CONST
15616 && GET_CODE (XEXP (op1, 0)) == PLUS
15617 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15618 {
15619 rtx addend = XEXP (XEXP (op1, 0), 1);
15620 rtx symbol = XEXP (XEXP (op1, 0), 0);
15621 rtx tmp = NULL;
15622
15623 model = SYMBOL_REF_TLS_MODEL (symbol);
15624 if (model)
15625 tmp = legitimize_tls_address (symbol, model, true);
15626 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15627 && SYMBOL_REF_DLLIMPORT_P (symbol))
15628 tmp = legitimize_dllimport_symbol (symbol, true);
15629
15630 if (tmp)
15631 {
15632 tmp = force_operand (tmp, NULL);
15633 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15634 op0, 1, OPTAB_DIRECT);
15635 if (tmp == op0)
15636 return;
15637 if (GET_MODE (tmp) != mode)
15638 op1 = convert_to_mode (mode, tmp, 1);
15639 }
15640 }
15641
15642 if ((flag_pic || MACHOPIC_INDIRECT)
15643 && symbolic_operand (op1, mode))
15644 {
15645 if (TARGET_MACHO && !TARGET_64BIT)
15646 {
15647 #if TARGET_MACHO
15648 /* dynamic-no-pic */
15649 if (MACHOPIC_INDIRECT)
15650 {
15651 rtx temp = ((reload_in_progress
15652 || ((op0 && REG_P (op0))
15653 && mode == Pmode))
15654 ? op0 : gen_reg_rtx (Pmode));
15655 op1 = machopic_indirect_data_reference (op1, temp);
15656 if (MACHOPIC_PURE)
15657 op1 = machopic_legitimize_pic_address (op1, mode,
15658 temp == op1 ? 0 : temp);
15659 }
15660 if (op0 != op1 && GET_CODE (op0) != MEM)
15661 {
15662 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15663 emit_insn (insn);
15664 return;
15665 }
15666 if (GET_CODE (op0) == MEM)
15667 op1 = force_reg (Pmode, op1);
15668 else
15669 {
15670 rtx temp = op0;
15671 if (GET_CODE (temp) != REG)
15672 temp = gen_reg_rtx (Pmode);
15673 temp = legitimize_pic_address (op1, temp);
15674 if (temp == op0)
15675 return;
15676 op1 = temp;
15677 }
15678 /* dynamic-no-pic */
15679 #endif
15680 }
15681 else
15682 {
15683 if (MEM_P (op0))
15684 op1 = force_reg (mode, op1);
15685 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15686 {
15687 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15688 op1 = legitimize_pic_address (op1, reg);
15689 if (op0 == op1)
15690 return;
15691 if (GET_MODE (op1) != mode)
15692 op1 = convert_to_mode (mode, op1, 1);
15693 }
15694 }
15695 }
15696 else
15697 {
15698 if (MEM_P (op0)
15699 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15700 || !push_operand (op0, mode))
15701 && MEM_P (op1))
15702 op1 = force_reg (mode, op1);
15703
15704 if (push_operand (op0, mode)
15705 && ! general_no_elim_operand (op1, mode))
15706 op1 = copy_to_mode_reg (mode, op1);
15707
15708 /* Force large constants in 64bit compilation into registers
15709 to get them CSEd. */
15710 if (can_create_pseudo_p ()
15711 && (mode == DImode) && TARGET_64BIT
15712 && immediate_operand (op1, mode)
15713 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15714 && !register_operand (op0, mode)
15715 && optimize)
15716 op1 = copy_to_mode_reg (mode, op1);
15717
15718 if (can_create_pseudo_p ()
15719 && FLOAT_MODE_P (mode)
15720 && GET_CODE (op1) == CONST_DOUBLE)
15721 {
15722 /* If we are loading a floating point constant to a register,
15723 force the value to memory now, since we'll get better code
15724 out of the back end. */
15725
15726 op1 = validize_mem (force_const_mem (mode, op1));
15727 if (!register_operand (op0, mode))
15728 {
15729 rtx temp = gen_reg_rtx (mode);
15730 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15731 emit_move_insn (op0, temp);
15732 return;
15733 }
15734 }
15735 }
15736
15737 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15738 }
15739
15740 void
15741 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15742 {
15743 rtx op0 = operands[0], op1 = operands[1];
15744 unsigned int align = GET_MODE_ALIGNMENT (mode);
15745
15746 /* Force constants other than zero into memory. We do not know how
15747 the instructions used to build constants modify the upper 64 bits
15748 of the register; once we have that information we may be able
15749 to handle some of them more efficiently. */
15750 if (can_create_pseudo_p ()
15751 && register_operand (op0, mode)
15752 && (CONSTANT_P (op1)
15753 || (GET_CODE (op1) == SUBREG
15754 && CONSTANT_P (SUBREG_REG (op1))))
15755 && !standard_sse_constant_p (op1))
15756 op1 = validize_mem (force_const_mem (mode, op1));
15757
15758 /* We need to check memory alignment for SSE modes since attributes
15759 can make operands unaligned. */
15760 if (can_create_pseudo_p ()
15761 && SSE_REG_MODE_P (mode)
15762 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15763 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15764 {
15765 rtx tmp[2];
15766
15767 /* ix86_expand_vector_move_misalign() does not like constants ... */
15768 if (CONSTANT_P (op1)
15769 || (GET_CODE (op1) == SUBREG
15770 && CONSTANT_P (SUBREG_REG (op1))))
15771 op1 = validize_mem (force_const_mem (mode, op1));
15772
15773 /* ... nor both arguments in memory. */
15774 if (!register_operand (op0, mode)
15775 && !register_operand (op1, mode))
15776 op1 = force_reg (mode, op1);
15777
15778 tmp[0] = op0; tmp[1] = op1;
15779 ix86_expand_vector_move_misalign (mode, tmp);
15780 return;
15781 }
15782
15783 /* Make operand1 a register if it isn't already. */
15784 if (can_create_pseudo_p ()
15785 && !register_operand (op0, mode)
15786 && !register_operand (op1, mode))
15787 {
15788 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15789 return;
15790 }
15791
15792 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15793 }
15794
15795 /* Split 32-byte AVX unaligned load and store if needed. */
15796
15797 static void
15798 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15799 {
15800 rtx m;
15801 rtx (*extract) (rtx, rtx, rtx);
15802 rtx (*move_unaligned) (rtx, rtx);
15803 enum machine_mode mode;
15804
15805 switch (GET_MODE (op0))
15806 {
15807 default:
15808 gcc_unreachable ();
15809 case V32QImode:
15810 extract = gen_avx_vextractf128v32qi;
15811 move_unaligned = gen_avx_movdqu256;
15812 mode = V16QImode;
15813 break;
15814 case V8SFmode:
15815 extract = gen_avx_vextractf128v8sf;
15816 move_unaligned = gen_avx_movups256;
15817 mode = V4SFmode;
15818 break;
15819 case V4DFmode:
15820 extract = gen_avx_vextractf128v4df;
15821 move_unaligned = gen_avx_movupd256;
15822 mode = V2DFmode;
15823 break;
15824 }
15825
15826 if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
15827 {
15828 rtx r = gen_reg_rtx (mode);
15829 m = adjust_address (op1, mode, 0);
15830 emit_move_insn (r, m);
15831 m = adjust_address (op1, mode, 16);
15832 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
15833 emit_move_insn (op0, r);
15834 }
15835 else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
15836 {
15837 m = adjust_address (op0, mode, 0);
15838 emit_insn (extract (m, op1, const0_rtx));
15839 m = adjust_address (op0, mode, 16);
15840 emit_insn (extract (m, op1, const1_rtx));
15841 }
15842 else
15843 emit_insn (move_unaligned (op0, op1));
15844 }
15845
15846 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
15847 straight to ix86_expand_vector_move. */
15848 /* Code generation for scalar reg-reg moves of single and double precision data:
15849 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
15850 movaps reg, reg
15851 else
15852 movss reg, reg
15853 if (x86_sse_partial_reg_dependency == true)
15854 movapd reg, reg
15855 else
15856 movsd reg, reg
15857
15858 Code generation for scalar loads of double precision data:
15859 if (x86_sse_split_regs == true)
15860 movlpd mem, reg (gas syntax)
15861 else
15862 movsd mem, reg
15863
15864 Code generation for unaligned packed loads of single precision data
15865 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
15866 if (x86_sse_unaligned_move_optimal)
15867 movups mem, reg
15868
15869 if (x86_sse_partial_reg_dependency == true)
15870 {
15871 xorps reg, reg
15872 movlps mem, reg
15873 movhps mem+8, reg
15874 }
15875 else
15876 {
15877 movlps mem, reg
15878 movhps mem+8, reg
15879 }
15880
15881 Code generation for unaligned packed loads of double precision data
15882 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
15883 if (x86_sse_unaligned_move_optimal)
15884 movupd mem, reg
15885
15886 if (x86_sse_split_regs == true)
15887 {
15888 movlpd mem, reg
15889 movhpd mem+8, reg
15890 }
15891 else
15892 {
15893 movsd mem, reg
15894 movhpd mem+8, reg
15895 }
15896 */
15897
15898 void
15899 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
15900 {
15901 rtx op0, op1, m;
15902
15903 op0 = operands[0];
15904 op1 = operands[1];
15905
15906 if (TARGET_AVX
15907 && GET_MODE_SIZE (mode) == 32)
15908 {
15909 switch (GET_MODE_CLASS (mode))
15910 {
15911 case MODE_VECTOR_INT:
15912 case MODE_INT:
15913 op0 = gen_lowpart (V32QImode, op0);
15914 op1 = gen_lowpart (V32QImode, op1);
15915 /* FALLTHRU */
15916
15917 case MODE_VECTOR_FLOAT:
15918 ix86_avx256_split_vector_move_misalign (op0, op1);
15919 break;
15920
15921 default:
15922 gcc_unreachable ();
15923 }
15924
15925 return;
15926 }
15927
15928 if (MEM_P (op1))
15929 {
15930 /* ??? If we have typed data, then it would appear that using
15931 movdqu is the only way to get unaligned data loaded with
15932 integer type. */
15933 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15934 {
15935 op0 = gen_lowpart (V16QImode, op0);
15936 op1 = gen_lowpart (V16QImode, op1);
15937 /* We will eventually emit movups based on insn attributes. */
15938 emit_insn (gen_sse2_movdqu (op0, op1));
15939 }
15940 else if (TARGET_SSE2 && mode == V2DFmode)
15941 {
15942 rtx zero;
15943
15944 if (TARGET_AVX
15945 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
15946 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
15947 || optimize_function_for_size_p (cfun))
15948 {
15949 /* We will eventually emit movups based on insn attributes. */
15950 emit_insn (gen_sse2_movupd (op0, op1));
15951 return;
15952 }
15953
15954 /* When SSE registers are split into halves, we can avoid
15955 writing to the top half twice. */
15956 if (TARGET_SSE_SPLIT_REGS)
15957 {
15958 emit_clobber (op0);
15959 zero = op0;
15960 }
15961 else
15962 {
15963 /* ??? Not sure about the best option for the Intel chips.
15964 The following would seem to satisfy; the register is
15965 entirely cleared, breaking the dependency chain. We
15966 then store to the upper half, with a dependency depth
15967 of one. A rumor has it that Intel recommends two movsd
15968 followed by an unpacklpd, but this is unconfirmed. And
15969 given that the dependency depth of the unpacklpd would
15970 still be one, I'm not sure why this would be better. */
15971 zero = CONST0_RTX (V2DFmode);
15972 }
15973
15974 m = adjust_address (op1, DFmode, 0);
15975 emit_insn (gen_sse2_loadlpd (op0, zero, m));
15976 m = adjust_address (op1, DFmode, 8);
15977 emit_insn (gen_sse2_loadhpd (op0, op0, m));
15978 }
15979 else
15980 {
15981 if (TARGET_AVX
15982 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
15983 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
15984 || optimize_function_for_size_p (cfun))
15985 {
15986 op0 = gen_lowpart (V4SFmode, op0);
15987 op1 = gen_lowpart (V4SFmode, op1);
15988 emit_insn (gen_sse_movups (op0, op1));
15989 return;
15990 }
15991
15992 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
15993 emit_move_insn (op0, CONST0_RTX (mode));
15994 else
15995 emit_clobber (op0);
15996
15997 if (mode != V4SFmode)
15998 op0 = gen_lowpart (V4SFmode, op0);
15999
16000 m = adjust_address (op1, V2SFmode, 0);
16001 emit_insn (gen_sse_loadlps (op0, op0, m));
16002 m = adjust_address (op1, V2SFmode, 8);
16003 emit_insn (gen_sse_loadhps (op0, op0, m));
16004 }
16005 }
16006 else if (MEM_P (op0))
16007 {
16008 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16009 {
16010 op0 = gen_lowpart (V16QImode, op0);
16011 op1 = gen_lowpart (V16QImode, op1);
16012 /* We will eventually emit movups based on insn attributes. */
16013 emit_insn (gen_sse2_movdqu (op0, op1));
16014 }
16015 else if (TARGET_SSE2 && mode == V2DFmode)
16016 {
16017 if (TARGET_AVX
16018 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16019 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16020 || optimize_function_for_size_p (cfun))
16021 /* We will eventually emit movups based on insn attributes. */
16022 emit_insn (gen_sse2_movupd (op0, op1));
16023 else
16024 {
16025 m = adjust_address (op0, DFmode, 0);
16026 emit_insn (gen_sse2_storelpd (m, op1));
16027 m = adjust_address (op0, DFmode, 8);
16028 emit_insn (gen_sse2_storehpd (m, op1));
16029 }
16030 }
16031 else
16032 {
16033 if (mode != V4SFmode)
16034 op1 = gen_lowpart (V4SFmode, op1);
16035
16036 if (TARGET_AVX
16037 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16038 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16039 || optimize_function_for_size_p (cfun))
16040 {
16041 op0 = gen_lowpart (V4SFmode, op0);
16042 emit_insn (gen_sse_movups (op0, op1));
16043 }
16044 else
16045 {
16046 m = adjust_address (op0, V2SFmode, 0);
16047 emit_insn (gen_sse_storelps (m, op1));
16048 m = adjust_address (op0, V2SFmode, 8);
16049 emit_insn (gen_sse_storehps (m, op1));
16050 }
16051 }
16052 }
16053 else
16054 gcc_unreachable ();
16055 }
16056
16057 /* Expand a push in MODE. This is some mode for which we do not support
16058 proper push instructions, at least from the registers that we expect
16059 the value to live in. */
16060
16061 void
16062 ix86_expand_push (enum machine_mode mode, rtx x)
16063 {
16064 rtx tmp;
16065
16066 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16067 GEN_INT (-GET_MODE_SIZE (mode)),
16068 stack_pointer_rtx, 1, OPTAB_DIRECT);
16069 if (tmp != stack_pointer_rtx)
16070 emit_move_insn (stack_pointer_rtx, tmp);
16071
16072 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16073
16074 /* When we push an operand onto the stack, it has to be aligned at least
16075 at the function argument boundary. However, since we don't have
16076 the argument type, we can't determine the actual argument
16077 boundary. */
16078 emit_move_insn (tmp, x);
16079 }
16080
16081 /* Helper function of ix86_fixup_binary_operands to canonicalize
16082 operand order. Returns true if the operands should be swapped. */
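/* For example, for a commutative PLUS whose destination matches src2,
   swapping the sources makes src1 match the destination, so the
   two-operand "add" form can be used without an extra copy.  */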
16083
16084 static bool
16085 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16086 rtx operands[])
16087 {
16088 rtx dst = operands[0];
16089 rtx src1 = operands[1];
16090 rtx src2 = operands[2];
16091
16092 /* If the operation is not commutative, we can't do anything. */
16093 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16094 return false;
16095
16096 /* Highest priority is that src1 should match dst. */
16097 if (rtx_equal_p (dst, src1))
16098 return false;
16099 if (rtx_equal_p (dst, src2))
16100 return true;
16101
16102 /* Next highest priority is that immediate constants come second. */
16103 if (immediate_operand (src2, mode))
16104 return false;
16105 if (immediate_operand (src1, mode))
16106 return true;
16107
16108 /* Lowest priority is that memory references should come second. */
16109 if (MEM_P (src2))
16110 return false;
16111 if (MEM_P (src1))
16112 return true;
16113
16114 return false;
16115 }
16116
16117
16118 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16119 destination to use for the operation. If different from the true
16120 destination in operands[0], a copy operation will be required. */
16121
16122 rtx
16123 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16124 rtx operands[])
16125 {
16126 rtx dst = operands[0];
16127 rtx src1 = operands[1];
16128 rtx src2 = operands[2];
16129
16130 /* Canonicalize operand order. */
16131 if (ix86_swap_binary_operands_p (code, mode, operands))
16132 {
16133 rtx temp;
16134
16135 /* It is invalid to swap operands of different modes. */
16136 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16137
16138 temp = src1;
16139 src1 = src2;
16140 src2 = temp;
16141 }
16142
16143 /* Both source operands cannot be in memory. */
16144 if (MEM_P (src1) && MEM_P (src2))
16145 {
16146 /* Optimization: Only read from memory once. */
16147 if (rtx_equal_p (src1, src2))
16148 {
16149 src2 = force_reg (mode, src2);
16150 src1 = src2;
16151 }
16152 else
16153 src2 = force_reg (mode, src2);
16154 }
16155
16156 /* If the destination is memory, and we do not have matching source
16157 operands, do things in registers. */
16158 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16159 dst = gen_reg_rtx (mode);
16160
16161 /* Source 1 cannot be a constant. */
16162 if (CONSTANT_P (src1))
16163 src1 = force_reg (mode, src1);
16164
16165 /* Source 1 cannot be a non-matching memory. */
16166 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16167 src1 = force_reg (mode, src1);
16168
16169 /* Improve address combine. */
16170 if (code == PLUS
16171 && GET_MODE_CLASS (mode) == MODE_INT
16172 && MEM_P (src2))
16173 src2 = force_reg (mode, src2);
16174
16175 operands[1] = src1;
16176 operands[2] = src2;
16177 return dst;
16178 }
16179
16180 /* Similarly, but assume that the destination has already been
16181 set up properly. */
16182
16183 void
16184 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16185 enum machine_mode mode, rtx operands[])
16186 {
16187 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16188 gcc_assert (dst == operands[0]);
16189 }
16190
16191 /* Attempt to expand a binary operator. Make the expansion closer to the
16192 actual machine than just general_operand, which would allow 3 separate
16193 memory references (one output, two input) in a single insn. */
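/* For example, the machine "add" is a two-operand instruction, so
   (set (reg A) (plus (reg A) (op B))) maps directly onto "add B, A";
   forms with three distinct operands, or with both sources in memory,
   have to be fixed up first.  */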
16194
16195 void
16196 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16197 rtx operands[])
16198 {
16199 rtx src1, src2, dst, op, clob;
16200
16201 dst = ix86_fixup_binary_operands (code, mode, operands);
16202 src1 = operands[1];
16203 src2 = operands[2];
16204
16205 /* Emit the instruction. */
16206
16207 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16208 if (reload_in_progress)
16209 {
16210 /* Reload doesn't know about the flags register, and doesn't know that
16211 it doesn't want to clobber it. We can only do this with PLUS. */
16212 gcc_assert (code == PLUS);
16213 emit_insn (op);
16214 }
16215 else if (reload_completed
16216 && code == PLUS
16217 && !rtx_equal_p (dst, src1))
16218 {
16219 /* This is going to be an LEA; avoid splitting it later. */
16220 emit_insn (op);
16221 }
16222 else
16223 {
16224 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16225 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16226 }
16227
16228 /* Fix up the destination if needed. */
16229 if (dst != operands[0])
16230 emit_move_insn (operands[0], dst);
16231 }
16232
16233 /* Return TRUE or FALSE depending on whether the binary operator meets the
16234 appropriate constraints. */
16235
16236 bool
16237 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16238 rtx operands[3])
16239 {
16240 rtx dst = operands[0];
16241 rtx src1 = operands[1];
16242 rtx src2 = operands[2];
16243
16244 /* Both source operands cannot be in memory. */
16245 if (MEM_P (src1) && MEM_P (src2))
16246 return false;
16247
16248 /* Canonicalize operand order for commutative operators. */
16249 if (ix86_swap_binary_operands_p (code, mode, operands))
16250 {
16251 rtx temp = src1;
16252 src1 = src2;
16253 src2 = temp;
16254 }
16255
16256 /* If the destination is memory, we must have a matching source operand. */
16257 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16258 return false;
16259
16260 /* Source 1 cannot be a constant. */
16261 if (CONSTANT_P (src1))
16262 return false;
16263
16264 /* Source 1 cannot be a non-matching memory. */
16265 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16266 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16267 return (code == AND
16268 && (mode == HImode
16269 || mode == SImode
16270 || (TARGET_64BIT && mode == DImode))
16271 && satisfies_constraint_L (src2));
16272
16273 return true;
16274 }
16275
16276 /* Attempt to expand a unary operator.  Make the expansion closer to the
16277 actual machine than just general_operand, which would allow 2 separate
16278 memory references (one output, one input) in a single insn. */
16279
16280 void
16281 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16282 rtx operands[])
16283 {
16284 int matching_memory;
16285 rtx src, dst, op, clob;
16286
16287 dst = operands[0];
16288 src = operands[1];
16289
16290 /* If the destination is memory, and we do not have matching source
16291 operands, do things in registers. */
16292 matching_memory = 0;
16293 if (MEM_P (dst))
16294 {
16295 if (rtx_equal_p (dst, src))
16296 matching_memory = 1;
16297 else
16298 dst = gen_reg_rtx (mode);
16299 }
16300
16301 /* When source operand is memory, destination must match. */
16302 if (MEM_P (src) && !matching_memory)
16303 src = force_reg (mode, src);
16304
16305 /* Emit the instruction. */
16306
16307 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16308 if (reload_in_progress || code == NOT)
16309 {
16310 /* Reload doesn't know about the flags register, and doesn't know that
16311 it doesn't want to clobber it. */
16312 gcc_assert (code == NOT);
16313 emit_insn (op);
16314 }
16315 else
16316 {
16317 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16318 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16319 }
16320
16321 /* Fix up the destination if needed. */
16322 if (dst != operands[0])
16323 emit_move_insn (operands[0], dst);
16324 }
16325
16326 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16327 divisor are within the range [0-255]. */
16328
16329 void
16330 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16331 bool signed_p)
16332 {
16333 rtx end_label, qimode_label;
16334 rtx insn, div, mod;
16335 rtx scratch, tmp0, tmp1, tmp2;
16336 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16337 rtx (*gen_zero_extend) (rtx, rtx);
16338 rtx (*gen_test_ccno_1) (rtx, rtx);
16339
16340 switch (mode)
16341 {
16342 case SImode:
16343 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16344 gen_test_ccno_1 = gen_testsi_ccno_1;
16345 gen_zero_extend = gen_zero_extendqisi2;
16346 break;
16347 case DImode:
16348 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16349 gen_test_ccno_1 = gen_testdi_ccno_1;
16350 gen_zero_extend = gen_zero_extendqidi2;
16351 break;
16352 default:
16353 gcc_unreachable ();
16354 }
16355
16356 end_label = gen_label_rtx ();
16357 qimode_label = gen_label_rtx ();
16358
16359 scratch = gen_reg_rtx (mode);
16360
16361 /* Use 8bit unsigned divmod if dividend and divisor are within
16362 the range [0-255]. */
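/* ORing the two values and testing the result against -0x100 checks that
   neither of them has any bits set above bit 7, i.e. that both already fit
   in the [0-255] range handled by the 8bit divide below.  */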
16363 emit_move_insn (scratch, operands[2]);
16364 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16365 scratch, 1, OPTAB_DIRECT);
16366 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16367 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16368 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16369 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16370 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16371 pc_rtx);
16372 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16373 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16374 JUMP_LABEL (insn) = qimode_label;
16375
16376 /* Generate original signed/unsigned divmod. */
16377 div = gen_divmod4_1 (operands[0], operands[1],
16378 operands[2], operands[3]);
16379 emit_insn (div);
16380
16381 /* Branch to the end. */
16382 emit_jump_insn (gen_jump (end_label));
16383 emit_barrier ();
16384
16385 /* Generate 8bit unsigned divide. */
16386 emit_label (qimode_label);
16387 /* Don't use operands[0] for result of 8bit divide since not all
16388 registers support QImode ZERO_EXTRACT. */
16389 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16390 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16391 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16392 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16393
16394 if (signed_p)
16395 {
16396 div = gen_rtx_DIV (mode, operands[2], operands[3]);
16397 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
16398 }
16399 else
16400 {
16401 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
16402 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
16403 }
16404
16405 /* Extract remainder from AH. */
16406 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16407 if (REG_P (operands[1]))
16408 insn = emit_move_insn (operands[1], tmp1);
16409 else
16410 {
16411 /* Need a new scratch register since the old one has result
16412 of 8bit divide. */
16413 scratch = gen_reg_rtx (mode);
16414 emit_move_insn (scratch, tmp1);
16415 insn = emit_move_insn (operands[1], scratch);
16416 }
16417 set_unique_reg_note (insn, REG_EQUAL, mod);
16418
16419 /* Zero extend quotient from AL. */
16420 tmp1 = gen_lowpart (QImode, tmp0);
16421 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16422 set_unique_reg_note (insn, REG_EQUAL, div);
16423
16424 emit_label (end_label);
16425 }
16426
16427 #define LEA_MAX_STALL (3)
16428 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
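/* Distances below are measured in half-cycles, so this search threshold
   corresponds to LEA_MAX_STALL full cycles.  */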
16429
16430 /* Increase given DISTANCE in half-cycles according to
16431 dependencies between PREV and NEXT instructions.
16432 Add 1 half-cycle if there is no dependency and
16433 go to the next cycle if there is some dependency. */
16434
16435 static unsigned int
16436 increase_distance (rtx prev, rtx next, unsigned int distance)
16437 {
16438 df_ref *use_rec;
16439 df_ref *def_rec;
16440
16441 if (!prev || !next)
16442 return distance + (distance & 1) + 2;
16443
16444 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16445 return distance + 1;
16446
16447 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16448 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16449 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16450 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16451 return distance + (distance & 1) + 2;
16452
16453 return distance + 1;
16454 }
16455
16456 /* Return true if instruction INSN defines register number
16457 REGNO1 or REGNO2. */
16458
16459 static bool
16460 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16461 rtx insn)
16462 {
16463 df_ref *def_rec;
16464
16465 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16466 if (DF_REF_REG_DEF_P (*def_rec)
16467 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16468 && (regno1 == DF_REF_REGNO (*def_rec)
16469 || regno2 == DF_REF_REGNO (*def_rec)))
16470 {
16471 return true;
16472 }
16473
16474 return false;
16475 }
16476
16477 /* Return true if instruction INSN uses register number
16478 REGNO as part of an address expression. */
16479
16480 static bool
16481 insn_uses_reg_mem (unsigned int regno, rtx insn)
16482 {
16483 df_ref *use_rec;
16484
16485 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16486 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16487 return true;
16488
16489 return false;
16490 }
16491
16492 /* Search backward for a non-agu definition of register number REGNO1
16493 or register number REGNO2 in the basic block, starting from instruction
16494 START and going up to the head of the basic block or to instruction INSN.
16495 
16496 Put true into *FOUND if a definition was found
16497 and false otherwise.
16498 
16499 The distance in half-cycles between START and the found instruction or
16500 the head of the BB is added to DISTANCE and returned. */
16501
16502 static int
16503 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16504 rtx insn, int distance,
16505 rtx start, bool *found)
16506 {
16507 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16508 rtx prev = start;
16509 rtx next = NULL;
16510
16511 *found = false;
16512
16513 while (prev
16514 && prev != insn
16515 && distance < LEA_SEARCH_THRESHOLD)
16516 {
16517 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16518 {
16519 distance = increase_distance (prev, next, distance);
16520 if (insn_defines_reg (regno1, regno2, prev))
16521 {
16522 if (recog_memoized (prev) < 0
16523 || get_attr_type (prev) != TYPE_LEA)
16524 {
16525 *found = true;
16526 return distance;
16527 }
16528 }
16529
16530 next = prev;
16531 }
16532 if (prev == BB_HEAD (bb))
16533 break;
16534
16535 prev = PREV_INSN (prev);
16536 }
16537
16538 return distance;
16539 }
16540
16541 /* Search backward for a non-agu definition of register number REGNO1
16542 or register number REGNO2 in INSN's basic block until we
16543 1. pass LEA_SEARCH_THRESHOLD instructions, or
16544 2. reach a neighbouring BB's boundary, or
16545 3. reach an agu definition.
16546 Return the distance between the non-agu definition point and INSN.
16547 If there is no definition point, return -1. */
16548
16549 static int
16550 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16551 rtx insn)
16552 {
16553 basic_block bb = BLOCK_FOR_INSN (insn);
16554 int distance = 0;
16555 bool found = false;
16556
16557 if (insn != BB_HEAD (bb))
16558 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16559 distance, PREV_INSN (insn),
16560 &found);
16561
16562 if (!found && distance < LEA_SEARCH_THRESHOLD)
16563 {
16564 edge e;
16565 edge_iterator ei;
16566 bool simple_loop = false;
16567
16568 FOR_EACH_EDGE (e, ei, bb->preds)
16569 if (e->src == bb)
16570 {
16571 simple_loop = true;
16572 break;
16573 }
16574
16575 if (simple_loop)
16576 distance = distance_non_agu_define_in_bb (regno1, regno2,
16577 insn, distance,
16578 BB_END (bb), &found);
16579 else
16580 {
16581 int shortest_dist = -1;
16582 bool found_in_bb = false;
16583
16584 FOR_EACH_EDGE (e, ei, bb->preds)
16585 {
16586 int bb_dist
16587 = distance_non_agu_define_in_bb (regno1, regno2,
16588 insn, distance,
16589 BB_END (e->src),
16590 &found_in_bb);
16591 if (found_in_bb)
16592 {
16593 if (shortest_dist < 0)
16594 shortest_dist = bb_dist;
16595 else if (bb_dist > 0)
16596 shortest_dist = MIN (bb_dist, shortest_dist);
16597
16598 found = true;
16599 }
16600 }
16601
16602 distance = shortest_dist;
16603 }
16604 }
16605
16606 /* get_attr_type may modify recog data. We want to make sure
16607 that recog data is valid for instruction INSN, on which
16608 distance_non_agu_define is called. INSN is unchanged here. */
16609 extract_insn_cached (insn);
16610
16611 if (!found)
16612 return -1;
16613
16614 return distance >> 1;
16615 }
16616
16617 /* Return the distance in half-cycles between INSN and the next
16618 insn that uses register number REGNO in a memory address, added
16619 to DISTANCE. Return -1 if REGNO is set.
16620 
16621 Put true into *FOUND if a register use was found and
16622 false otherwise.
16623 Put true into *REDEFINED if a register redefinition was
16624 found and false otherwise. */
16625
16626 static int
16627 distance_agu_use_in_bb (unsigned int regno,
16628 rtx insn, int distance, rtx start,
16629 bool *found, bool *redefined)
16630 {
16631 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16632 rtx next = start;
16633 rtx prev = NULL;
16634
16635 *found = false;
16636 *redefined = false;
16637
16638 while (next
16639 && next != insn
16640 && distance < LEA_SEARCH_THRESHOLD)
16641 {
16642 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16643 {
16644 distance = increase_distance(prev, next, distance);
16645 if (insn_uses_reg_mem (regno, next))
16646 {
16647 /* Return DISTANCE if OP0 is used in memory
16648 address in NEXT. */
16649 *found = true;
16650 return distance;
16651 }
16652
16653 if (insn_defines_reg (regno, INVALID_REGNUM, next))
16654 {
16655 /* Return -1 if OP0 is set in NEXT. */
16656 *redefined = true;
16657 return -1;
16658 }
16659
16660 prev = next;
16661 }
16662
16663 if (next == BB_END (bb))
16664 break;
16665
16666 next = NEXT_INSN (next);
16667 }
16668
16669 return distance;
16670 }
16671
16672 /* Return the distance between INSN and the next insn that uses
16673 register number REGNO0 in a memory address. Return -1 if no such
16674 use is found within LEA_SEARCH_THRESHOLD or if REGNO0 is set. */
16675
16676 static int
16677 distance_agu_use (unsigned int regno0, rtx insn)
16678 {
16679 basic_block bb = BLOCK_FOR_INSN (insn);
16680 int distance = 0;
16681 bool found = false;
16682 bool redefined = false;
16683
16684 if (insn != BB_END (bb))
16685 distance = distance_agu_use_in_bb (regno0, insn, distance,
16686 NEXT_INSN (insn),
16687 &found, &redefined);
16688
16689 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16690 {
16691 edge e;
16692 edge_iterator ei;
16693 bool simple_loop = false;
16694
16695 FOR_EACH_EDGE (e, ei, bb->succs)
16696 if (e->dest == bb)
16697 {
16698 simple_loop = true;
16699 break;
16700 }
16701
16702 if (simple_loop)
16703 distance = distance_agu_use_in_bb (regno0, insn,
16704 distance, BB_HEAD (bb),
16705 &found, &redefined);
16706 else
16707 {
16708 int shortest_dist = -1;
16709 bool found_in_bb = false;
16710 bool redefined_in_bb = false;
16711
16712 FOR_EACH_EDGE (e, ei, bb->succs)
16713 {
16714 int bb_dist
16715 = distance_agu_use_in_bb (regno0, insn,
16716 distance, BB_HEAD (e->dest),
16717 &found_in_bb, &redefined_in_bb);
16718 if (found_in_bb)
16719 {
16720 if (shortest_dist < 0)
16721 shortest_dist = bb_dist;
16722 else if (bb_dist > 0)
16723 shortest_dist = MIN (bb_dist, shortest_dist);
16724
16725 found = true;
16726 }
16727 }
16728
16729 distance = shortest_dist;
16730 }
16731 }
16732
16733 if (!found || redefined)
16734 return -1;
16735
16736 return distance >> 1;
16737 }
16738
16739 /* Define this macro to tune LEA priority vs ADD; it takes effect when
16740 there is a dilemma of choosing LEA or ADD.
16741 Negative value: ADD is preferred over LEA
16742 Zero: Neutral
16743 Positive value: LEA is preferred over ADD.  */
16744 #define IX86_LEA_PRIORITY 0
16745
16746 /* Return true if using lea INSN has a performance advantage
16747 over a sequence of instructions. The instruction sequence has
16748 SPLIT_COST cycles higher latency than the lea latency. */
16749
16750 bool
16751 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
16752 unsigned int regno2, unsigned int split_cost)
16753 {
16754 int dist_define, dist_use;
16755
16756 dist_define = distance_non_agu_define (regno1, regno2, insn);
16757 dist_use = distance_agu_use (regno0, insn);
16758
16759 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
16760 {
16761 /* If there is no non-AGU operand definition, no AGU
16762 operand use and the split cost is 0, then both the lea
16763 and non-lea variants have the same priority. Currently
16764 we prefer lea for 64-bit code and non-lea for 32-bit
16765 code. */
16766 if (dist_use < 0 && split_cost == 0)
16767 return TARGET_64BIT || IX86_LEA_PRIORITY;
16768 else
16769 return true;
16770 }
16771
16772 /* With a longer definition distance, lea is more preferable.
16773 Here we adjust the distance to take the splitting cost and
16774 lea priority into account. */
16775 dist_define += split_cost + IX86_LEA_PRIORITY;
16776
16777 /* If there is no use in a memory address then we just check
16778 that the split cost does not exceed the AGU stall. */
16779 if (dist_use < 0)
16780 return dist_define >= LEA_MAX_STALL;
16781
16782 /* If this insn has both backward non-agu dependence and forward
16783 agu dependence, the one with short distance takes effect. */
16784 return dist_define >= dist_use;
16785 }
16786
16787 /* Return true if it is legal to clobber flags by INSN and
16788 false otherwise. */
16789
16790 static bool
16791 ix86_ok_to_clobber_flags (rtx insn)
16792 {
16793 basic_block bb = BLOCK_FOR_INSN (insn);
16794 df_ref *use;
16795 bitmap live;
16796
16797 while (insn)
16798 {
16799 if (NONDEBUG_INSN_P (insn))
16800 {
16801 for (use = DF_INSN_USES (insn); *use; use++)
16802 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
16803 return false;
16804
16805 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
16806 return true;
16807 }
16808
16809 if (insn == BB_END (bb))
16810 break;
16811
16812 insn = NEXT_INSN (insn);
16813 }
16814
16815 live = df_get_live_out(bb);
16816 return !REGNO_REG_SET_P (live, FLAGS_REG);
16817 }
16818
16819 /* Return true if we need to split op0 = op1 + op2 into a sequence of
16820 move and add to avoid AGU stalls. */
16821
16822 bool
16823 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
16824 {
16825 unsigned int regno0 = true_regnum (operands[0]);
16826 unsigned int regno1 = true_regnum (operands[1]);
16827 unsigned int regno2 = true_regnum (operands[2]);
16828
16829 /* Check if we need to optimize. */
16830 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16831 return false;
16832
16833 /* Check it is correct to split here. */
16834 if (!ix86_ok_to_clobber_flags(insn))
16835 return false;
16836
16837 /* We need to split only adds with a non-destructive
16838 destination operand. */
16839 if (regno0 == regno1 || regno0 == regno2)
16840 return false;
16841 else
16842 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
16843 }
16844
16845 /* Return true if we should emit lea instruction instead of mov
16846 instruction. */
16847
16848 bool
16849 ix86_use_lea_for_mov (rtx insn, rtx operands[])
16850 {
16851 unsigned int regno0;
16852 unsigned int regno1;
16853
16854 /* Check if we need to optimize. */
16855 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16856 return false;
16857
16858 /* Use lea for reg to reg moves only. */
16859 if (!REG_P (operands[0]) || !REG_P (operands[1]))
16860 return false;
16861
16862 regno0 = true_regnum (operands[0]);
16863 regno1 = true_regnum (operands[1]);
16864
16865 return ix86_lea_outperforms (insn, regno0, regno1, -1, 0);
16866 }
16867
16868 /* Return true if we need to split lea into a sequence of
16869 instructions to avoid AGU stalls. */
16870
16871 bool
16872 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
16873 {
16874 unsigned int regno0 = true_regnum (operands[0]) ;
16875 unsigned int regno1 = -1;
16876 unsigned int regno2 = -1;
16877 unsigned int split_cost = 0;
16878 struct ix86_address parts;
16879 int ok;
16880
16881 /* Check we need to optimize. */
16882 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16883 return false;
16884
16885 /* Check it is correct to split here. */
16886 if (!ix86_ok_to_clobber_flags(insn))
16887 return false;
16888
16889 ok = ix86_decompose_address (operands[1], &parts);
16890 gcc_assert (ok);
16891
16892 /* We should not split the lea into adds if a non-legitimate PIC
16893 operand is used as the displacement. */
16894 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
16895 return false;
16896
16897 if (parts.base)
16898 regno1 = true_regnum (parts.base);
16899 if (parts.index)
16900 regno2 = true_regnum (parts.index);
16901
16902 /* Compute how many cycles we will add to the execution time
16903 if we split the lea into a sequence of instructions. */
16904 if (parts.base || parts.index)
16905 {
16906 /* Have to use a mov instruction if the non-destructive
16907 destination form is used. */
16908 if (regno1 != regno0 && regno2 != regno0)
16909 split_cost += 1;
16910
16911 /* Have to add index to base if both exist. */
16912 if (parts.base && parts.index)
16913 split_cost += 1;
16914
16915 /* Have to use shift and adds if scale is 2 or greater. */
16916 if (parts.scale > 1)
16917 {
16918 if (regno0 != regno1)
16919 split_cost += 1;
16920 else if (regno2 == regno0)
16921 split_cost += 4;
16922 else
16923 split_cost += parts.scale;
16924 }
16925
16926 /* Have to use an add instruction with an immediate if
16927 disp is non-zero. */
16928 if (parts.disp && parts.disp != const0_rtx)
16929 split_cost += 1;
16930
16931 /* Subtract the price of lea. */
16932 split_cost -= 1;
16933 }
16934
16935 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
16936 }
16937
16938 /* Emit x86 binary operator CODE in mode MODE, where the first operand
16939 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
16940
16941 static void
16942 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
16943 rtx dst, rtx src)
16944 {
16945 rtx op, clob;
16946
16947 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
16948 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16949
16950 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16951 }
16952
16953 /* Split a lea instruction into a sequence of instructions
16954 which are executed on the ALU to avoid AGU stalls.
16955 It is assumed that it is allowed to clobber the flags register
16956 at the lea position. */
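/* As an illustration (one possible path through the code below, assuming
   the three registers involved are all distinct):
     lea 0x4(%rbx,%rcx,2), %rax
   may be split into
     mov %rcx, %rax
     sal $0x1, %rax
     add %rbx, %rax
     add $0x4, %rax  */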
16957
16958 void
16959 ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
16960 {
16961 unsigned int regno0 = true_regnum (operands[0]) ;
16962 unsigned int regno1 = INVALID_REGNUM;
16963 unsigned int regno2 = INVALID_REGNUM;
16964 struct ix86_address parts;
16965 rtx tmp;
16966 int ok, adds;
16967
16968 ok = ix86_decompose_address (operands[1], &parts);
16969 gcc_assert (ok);
16970
16971 if (parts.base)
16972 {
16973 if (GET_MODE (parts.base) != mode)
16974 parts.base = gen_rtx_SUBREG (mode, parts.base, 0);
16975 regno1 = true_regnum (parts.base);
16976 }
16977
16978 if (parts.index)
16979 {
16980 if (GET_MODE (parts.index) != mode)
16981 parts.index = gen_rtx_SUBREG (mode, parts.index, 0);
16982 regno2 = true_regnum (parts.index);
16983 }
16984
16985 if (parts.scale > 1)
16986 {
16987 /* Case r1 = r1 + ... */
16988 if (regno1 == regno0)
16989 {
16990 /* If we have a case like r1 = r1 + C * r1 then we
16991 would have to use multiplication, which is very
16992 expensive. Assume the cost model is wrong if we
16993 get such a case here. */
16994 gcc_assert (regno2 != regno0);
16995
16996 for (adds = parts.scale; adds > 0; adds--)
16997 ix86_emit_binop (PLUS, mode, operands[0], parts.index);
16998 }
16999 else
17000 {
17001 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17002 if (regno0 != regno2)
17003 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
17004
17005 /* Use shift for scaling. */
17006 ix86_emit_binop (ASHIFT, mode, operands[0],
17007 GEN_INT (exact_log2 (parts.scale)));
17008
17009 if (parts.base)
17010 ix86_emit_binop (PLUS, mode, operands[0], parts.base);
17011
17012 if (parts.disp && parts.disp != const0_rtx)
17013 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
17014 }
17015 }
17016 else if (!parts.base && !parts.index)
17017 {
17018 gcc_assert(parts.disp);
17019 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.disp));
17020 }
17021 else
17022 {
17023 if (!parts.base)
17024 {
17025 if (regno0 != regno2)
17026 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
17027 }
17028 else if (!parts.index)
17029 {
17030 if (regno0 != regno1)
17031 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
17032 }
17033 else
17034 {
17035 if (regno0 == regno1)
17036 tmp = parts.index;
17037 else if (regno0 == regno2)
17038 tmp = parts.base;
17039 else
17040 {
17041 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
17042 tmp = parts.index;
17043 }
17044
17045 ix86_emit_binop (PLUS, mode, operands[0], tmp);
17046 }
17047
17048 if (parts.disp && parts.disp != const0_rtx)
17049 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
17050 }
17051 }
17052
17053 /* Return true if it is ok to optimize an ADD operation to an LEA
17054 operation to avoid flag register consumption. For most processors,
17055 ADD is faster than LEA. For processors like Atom, if the
17056 destination register of the LEA holds an actual address which will be
17057 used soon, LEA is better; otherwise ADD is better. */
17058
17059 bool
17060 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17061 {
17062 unsigned int regno0 = true_regnum (operands[0]);
17063 unsigned int regno1 = true_regnum (operands[1]);
17064 unsigned int regno2 = true_regnum (operands[2]);
17065
17066 /* If a = b + c, (a!=b && a!=c), must use lea form. */
17067 if (regno0 != regno1 && regno0 != regno2)
17068 return true;
17069
17070 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17071 return false;
17072
17073 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17074 }
17075
17076 /* Return true if destination reg of SET_BODY is shift count of
17077 USE_BODY. */
17078
17079 static bool
17080 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17081 {
17082 rtx set_dest;
17083 rtx shift_rtx;
17084 int i;
17085
17086 /* Retrieve destination of SET_BODY. */
17087 switch (GET_CODE (set_body))
17088 {
17089 case SET:
17090 set_dest = SET_DEST (set_body);
17091 if (!set_dest || !REG_P (set_dest))
17092 return false;
17093 break;
17094 case PARALLEL:
17095 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17096 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17097 use_body))
17098 return true;
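/* FALLTHRU */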
17099 default:
17100 return false;
17101 break;
17102 }
17103
17104 /* Retrieve shift count of USE_BODY. */
17105 switch (GET_CODE (use_body))
17106 {
17107 case SET:
17108 shift_rtx = XEXP (use_body, 1);
17109 break;
17110 case PARALLEL:
17111 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17112 if (ix86_dep_by_shift_count_body (set_body,
17113 XVECEXP (use_body, 0, i)))
17114 return true;
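/* FALLTHRU */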
17115 default:
17116 return false;
17117 break;
17118 }
17119
17120 if (shift_rtx
17121 && (GET_CODE (shift_rtx) == ASHIFT
17122 || GET_CODE (shift_rtx) == LSHIFTRT
17123 || GET_CODE (shift_rtx) == ASHIFTRT
17124 || GET_CODE (shift_rtx) == ROTATE
17125 || GET_CODE (shift_rtx) == ROTATERT))
17126 {
17127 rtx shift_count = XEXP (shift_rtx, 1);
17128
17129 /* Return true if shift count is dest of SET_BODY. */
17130 if (REG_P (shift_count)
17131 && true_regnum (set_dest) == true_regnum (shift_count))
17132 return true;
17133 }
17134
17135 return false;
17136 }
17137
17138 /* Return true if destination reg of SET_INSN is shift count of
17139 USE_INSN. */
17140
17141 bool
17142 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17143 {
17144 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17145 PATTERN (use_insn));
17146 }
17147
17148 /* Return TRUE or FALSE depending on whether the unary operator meets the
17149 appropriate constraints. */
17150
17151 bool
17152 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17153 enum machine_mode mode ATTRIBUTE_UNUSED,
17154 rtx operands[2] ATTRIBUTE_UNUSED)
17155 {
17156 /* If one of operands is memory, source and destination must match. */
17157 if ((MEM_P (operands[0])
17158 || MEM_P (operands[1]))
17159 && ! rtx_equal_p (operands[0], operands[1]))
17160 return false;
17161 return true;
17162 }
17163
17164 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17165 are ok, keeping in mind the possible movddup alternative. */
17166
17167 bool
17168 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17169 {
17170 if (MEM_P (operands[0]))
17171 return rtx_equal_p (operands[0], operands[1 + high]);
17172 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17173 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17174 return true;
17175 }
17176
17177 /* Post-reload splitter for converting an SFmode or DFmode value in an
17178 SSE register into an unsigned SImode value. */
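/* The input is compared against 2**31; where it is >= 2**31, 2**31 is
   subtracted before the signed truncating conversion, and bit 31 of the
   integer result is then flipped back in with the final XOR.  */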
17179
17180 void
17181 ix86_split_convert_uns_si_sse (rtx operands[])
17182 {
17183 enum machine_mode vecmode;
17184 rtx value, large, zero_or_two31, input, two31, x;
17185
17186 large = operands[1];
17187 zero_or_two31 = operands[2];
17188 input = operands[3];
17189 two31 = operands[4];
17190 vecmode = GET_MODE (large);
17191 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17192
17193 /* Load up the value into the low element. We must ensure that the other
17194 elements are valid floats -- zero is the easiest such value. */
17195 if (MEM_P (input))
17196 {
17197 if (vecmode == V4SFmode)
17198 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17199 else
17200 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17201 }
17202 else
17203 {
17204 input = gen_rtx_REG (vecmode, REGNO (input));
17205 emit_move_insn (value, CONST0_RTX (vecmode));
17206 if (vecmode == V4SFmode)
17207 emit_insn (gen_sse_movss (value, value, input));
17208 else
17209 emit_insn (gen_sse2_movsd (value, value, input));
17210 }
17211
17212 emit_move_insn (large, two31);
17213 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17214
17215 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17216 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17217
17218 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17219 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17220
17221 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17222 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17223
17224 large = gen_rtx_REG (V4SImode, REGNO (large));
17225 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17226
17227 x = gen_rtx_REG (V4SImode, REGNO (value));
17228 if (vecmode == V4SFmode)
17229 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17230 else
17231 emit_insn (gen_sse2_cvttpd2dq (x, value));
17232 value = x;
17233
17234 emit_insn (gen_xorv4si3 (value, value, large));
17235 }
17236
17237 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17238 Expects the 64-bit DImode to be supplied in a pair of integral
17239 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17240 -mfpmath=sse, !optimize_size only. */
17241
17242 void
17243 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17244 {
17245 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17246 rtx int_xmm, fp_xmm;
17247 rtx biases, exponents;
17248 rtx x;
17249
17250 int_xmm = gen_reg_rtx (V4SImode);
17251 if (TARGET_INTER_UNIT_MOVES)
17252 emit_insn (gen_movdi_to_sse (int_xmm, input));
17253 else if (TARGET_SSE_SPLIT_REGS)
17254 {
17255 emit_clobber (int_xmm);
17256 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17257 }
17258 else
17259 {
17260 x = gen_reg_rtx (V2DImode);
17261 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17262 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17263 }
17264
17265 x = gen_rtx_CONST_VECTOR (V4SImode,
17266 gen_rtvec (4, GEN_INT (0x43300000UL),
17267 GEN_INT (0x45300000UL),
17268 const0_rtx, const0_rtx));
17269 exponents = validize_mem (force_const_mem (V4SImode, x));
17270
17271 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17272 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17273
17274 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17275 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17276 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17277 (0x1.0p84 + double(fp_value_hi_xmm)).
17278 Note these exponents differ by 32. */
17279
17280 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17281
17282 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17283 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17284 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17285 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17286 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17287 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17288 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17289 biases = validize_mem (force_const_mem (V2DFmode, biases));
17290 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17291
17292 /* Add the upper and lower DFmode values together. */
17293 if (TARGET_SSE3)
17294 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17295 else
17296 {
17297 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17298 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17299 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17300 }
17301
17302 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17303 }
17304
17305 /* Not used, but eases macroization of patterns. */
17306 void
17307 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17308 rtx input ATTRIBUTE_UNUSED)
17309 {
17310 gcc_unreachable ();
17311 }
17312
17313 /* Convert an unsigned SImode value into a DFmode. Only currently used
17314 for SSE, but applicable anywhere. */
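/* The input is biased by subtracting 2**31 (a wrap-around addition of
   INT_MIN), converted with the signed SImode->DFmode conversion, and then
   2**31.0 is added back to the DFmode result.  */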
17315
17316 void
17317 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17318 {
17319 REAL_VALUE_TYPE TWO31r;
17320 rtx x, fp;
17321
17322 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17323 NULL, 1, OPTAB_DIRECT);
17324
17325 fp = gen_reg_rtx (DFmode);
17326 emit_insn (gen_floatsidf2 (fp, x));
17327
17328 real_ldexp (&TWO31r, &dconst1, 31);
17329 x = const_double_from_real_value (TWO31r, DFmode);
17330
17331 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17332 if (x != target)
17333 emit_move_insn (target, x);
17334 }
17335
17336 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17337 32-bit mode; otherwise we have a direct convert instruction. */
17338
17339 void
17340 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17341 {
17342 REAL_VALUE_TYPE TWO32r;
17343 rtx fp_lo, fp_hi, x;
17344
17345 fp_lo = gen_reg_rtx (DFmode);
17346 fp_hi = gen_reg_rtx (DFmode);
17347
17348 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17349
17350 real_ldexp (&TWO32r, &dconst1, 32);
17351 x = const_double_from_real_value (TWO32r, DFmode);
17352 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17353
17354 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17355
17356 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17357 0, OPTAB_DIRECT);
17358 if (x != target)
17359 emit_move_insn (target, x);
17360 }
17361
17362 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17363 For x86_32, -mfpmath=sse, !optimize_size only. */
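/* The input is split into its low and high 16-bit halves, each of which
   converts to SFmode exactly; the result is recombined as
   float(hi) * 2**16 + float(lo).  */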
17364 void
17365 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17366 {
17367 REAL_VALUE_TYPE ONE16r;
17368 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17369
17370 real_ldexp (&ONE16r, &dconst1, 16);
17371 x = const_double_from_real_value (ONE16r, SFmode);
17372 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17373 NULL, 0, OPTAB_DIRECT);
17374 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17375 NULL, 0, OPTAB_DIRECT);
17376 fp_hi = gen_reg_rtx (SFmode);
17377 fp_lo = gen_reg_rtx (SFmode);
17378 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17379 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17380 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17381 0, OPTAB_DIRECT);
17382 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17383 0, OPTAB_DIRECT);
17384 if (!rtx_equal_p (target, fp_hi))
17385 emit_move_insn (target, fp_hi);
17386 }
17387
17388 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17389 a vector of unsigned ints VAL to vector of floats TARGET. */
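/* As in the scalar ix86_expand_convert_uns_sisf_sse above, each element is
   split into 16-bit halves which convert exactly, and the result is
   recombined as float(hi) * 2**16 + float(lo).  */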
17390
17391 void
17392 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17393 {
17394 rtx tmp[8];
17395 REAL_VALUE_TYPE TWO16r;
17396 enum machine_mode intmode = GET_MODE (val);
17397 enum machine_mode fltmode = GET_MODE (target);
17398 rtx (*cvt) (rtx, rtx);
17399
17400 if (intmode == V4SImode)
17401 cvt = gen_floatv4siv4sf2;
17402 else
17403 cvt = gen_floatv8siv8sf2;
17404 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17405 tmp[0] = force_reg (intmode, tmp[0]);
17406 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17407 OPTAB_DIRECT);
17408 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17409 NULL_RTX, 1, OPTAB_DIRECT);
17410 tmp[3] = gen_reg_rtx (fltmode);
17411 emit_insn (cvt (tmp[3], tmp[1]));
17412 tmp[4] = gen_reg_rtx (fltmode);
17413 emit_insn (cvt (tmp[4], tmp[2]));
17414 real_ldexp (&TWO16r, &dconst1, 16);
17415 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17416 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17417 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17418 OPTAB_DIRECT);
17419 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17420 OPTAB_DIRECT);
17421 if (tmp[7] != target)
17422 emit_move_insn (target, tmp[7]);
17423 }
17424
17425 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17426 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17427 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17428 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
17429
17430 rtx
17431 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17432 {
17433 REAL_VALUE_TYPE TWO31r;
17434 rtx two31r, tmp[4];
17435 enum machine_mode mode = GET_MODE (val);
17436 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17437 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17438 rtx (*cmp) (rtx, rtx, rtx, rtx);
17439 int i;
17440
17441 for (i = 0; i < 3; i++)
17442 tmp[i] = gen_reg_rtx (mode);
17443 real_ldexp (&TWO31r, &dconst1, 31);
17444 two31r = const_double_from_real_value (TWO31r, scalarmode);
17445 two31r = ix86_build_const_vector (mode, 1, two31r);
17446 two31r = force_reg (mode, two31r);
17447 switch (mode)
17448 {
17449 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17450 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17451 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17452 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17453 default: gcc_unreachable ();
17454 }
17455 tmp[3] = gen_rtx_LE (mode, two31r, val);
17456 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17457 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17458 0, OPTAB_DIRECT);
17459 if (intmode == V4SImode || TARGET_AVX2)
17460 *xorp = expand_simple_binop (intmode, ASHIFT,
17461 gen_lowpart (intmode, tmp[0]),
17462 GEN_INT (31), NULL_RTX, 0,
17463 OPTAB_DIRECT);
17464 else
17465 {
17466 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17467 two31 = ix86_build_const_vector (intmode, 1, two31);
17468 *xorp = expand_simple_binop (intmode, AND,
17469 gen_lowpart (intmode, tmp[0]),
17470 two31, NULL_RTX, 0,
17471 OPTAB_DIRECT);
17472 }
17473 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17474 0, OPTAB_DIRECT);
17475 }
17476
17477 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17478 then replicate the value for all elements of the vector
17479 register. */
17480
17481 rtx
17482 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17483 {
17484 int i, n_elt;
17485 rtvec v;
17486 enum machine_mode scalar_mode;
17487
17488 switch (mode)
17489 {
17490 case V32QImode:
17491 case V16QImode:
17492 case V16HImode:
17493 case V8HImode:
17494 case V8SImode:
17495 case V4SImode:
17496 case V4DImode:
17497 case V2DImode:
17498 gcc_assert (vect);
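/* FALLTHRU */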
17499 case V8SFmode:
17500 case V4SFmode:
17501 case V4DFmode:
17502 case V2DFmode:
17503 n_elt = GET_MODE_NUNITS (mode);
17504 v = rtvec_alloc (n_elt);
17505 scalar_mode = GET_MODE_INNER (mode);
17506
17507 RTVEC_ELT (v, 0) = value;
17508
17509 for (i = 1; i < n_elt; ++i)
17510 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17511
17512 return gen_rtx_CONST_VECTOR (mode, v);
17513
17514 default:
17515 gcc_unreachable ();
17516 }
17517 }
17518
17519 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17520 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17521 for an SSE register. If VECT is true, then replicate the mask for
17522 all elements of the vector register. If INVERT is true, then create
17523 a mask excluding the sign bit. */
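/* For V2DFmode with VECT true, for example, the result is the bit pattern
   0x8000000000000000 reinterpreted as DFmode and replicated into both
   elements; with INVERT true the complemented pattern is used instead.  */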
17524
17525 rtx
17526 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17527 {
17528 enum machine_mode vec_mode, imode;
17529 HOST_WIDE_INT hi, lo;
17530 int shift = 63;
17531 rtx v;
17532 rtx mask;
17533
17534 /* Find the sign bit, sign extended to 2*HWI. */
17535 switch (mode)
17536 {
17537 case V8SImode:
17538 case V4SImode:
17539 case V8SFmode:
17540 case V4SFmode:
17541 vec_mode = mode;
17542 mode = GET_MODE_INNER (mode);
17543 imode = SImode;
17544 lo = 0x80000000, hi = lo < 0;
17545 break;
17546
17547 case V4DImode:
17548 case V2DImode:
17549 case V4DFmode:
17550 case V2DFmode:
17551 vec_mode = mode;
17552 mode = GET_MODE_INNER (mode);
17553 imode = DImode;
17554 if (HOST_BITS_PER_WIDE_INT >= 64)
17555 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17556 else
17557 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17558 break;
17559
17560 case TImode:
17561 case TFmode:
17562 vec_mode = VOIDmode;
17563 if (HOST_BITS_PER_WIDE_INT >= 64)
17564 {
17565 imode = TImode;
17566 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17567 }
17568 else
17569 {
17570 rtvec vec;
17571
17572 imode = DImode;
17573 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17574
17575 if (invert)
17576 {
17577 lo = ~lo, hi = ~hi;
17578 v = constm1_rtx;
17579 }
17580 else
17581 v = const0_rtx;
17582
17583 mask = immed_double_const (lo, hi, imode);
17584
17585 vec = gen_rtvec (2, v, mask);
17586 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17587 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17588
17589 return v;
17590 }
17591 break;
17592
17593 default:
17594 gcc_unreachable ();
17595 }
17596
17597 if (invert)
17598 lo = ~lo, hi = ~hi;
17599
17600 /* Force this value into the low part of a fp vector constant. */
17601 mask = immed_double_const (lo, hi, imode);
17602 mask = gen_lowpart (mode, mask);
17603
17604 if (vec_mode == VOIDmode)
17605 return force_reg (mode, mask);
17606
17607 v = ix86_build_const_vector (vec_mode, vect, mask);
17608 return force_reg (vec_mode, v);
17609 }
17610
17611 /* Generate code for floating point ABS or NEG. */
17612
17613 void
17614 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17615 rtx operands[])
17616 {
17617 rtx mask, set, dst, src;
17618 bool use_sse = false;
17619 bool vector_mode = VECTOR_MODE_P (mode);
17620 enum machine_mode vmode = mode;
17621
17622 if (vector_mode)
17623 use_sse = true;
17624 else if (mode == TFmode)
17625 use_sse = true;
17626 else if (TARGET_SSE_MATH)
17627 {
17628 use_sse = SSE_FLOAT_MODE_P (mode);
17629 if (mode == SFmode)
17630 vmode = V4SFmode;
17631 else if (mode == DFmode)
17632 vmode = V2DFmode;
17633 }
17634
17635 /* NEG and ABS performed with SSE use bitwise mask operations.
17636 Create the appropriate mask now. */
17637 if (use_sse)
17638 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17639 else
17640 mask = NULL_RTX;
17641
17642 dst = operands[0];
17643 src = operands[1];
17644
17645 set = gen_rtx_fmt_e (code, mode, src);
17646 set = gen_rtx_SET (VOIDmode, dst, set);
17647
17648 if (mask)
17649 {
17650 rtx use, clob;
17651 rtvec par;
17652
17653 use = gen_rtx_USE (VOIDmode, mask);
17654 if (vector_mode)
17655 par = gen_rtvec (2, set, use);
17656 else
17657 {
17658 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17659 par = gen_rtvec (3, set, use, clob);
17660 }
17661 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
17662 }
17663 else
17664 emit_insn (set);
17665 }
17666
17667 /* Expand a copysign operation. Special case operand 0 being a constant. */
17668
17669 void
17670 ix86_expand_copysign (rtx operands[])
17671 {
17672 enum machine_mode mode, vmode;
17673 rtx dest, op0, op1, mask, nmask;
17674
17675 dest = operands[0];
17676 op0 = operands[1];
17677 op1 = operands[2];
17678
17679 mode = GET_MODE (dest);
17680
17681 if (mode == SFmode)
17682 vmode = V4SFmode;
17683 else if (mode == DFmode)
17684 vmode = V2DFmode;
17685 else
17686 vmode = mode;
17687
17688 if (GET_CODE (op0) == CONST_DOUBLE)
17689 {
17690 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
17691
17692 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
17693 op0 = simplify_unary_operation (ABS, mode, op0, mode);
17694
17695 if (mode == SFmode || mode == DFmode)
17696 {
17697 if (op0 == CONST0_RTX (mode))
17698 op0 = CONST0_RTX (vmode);
17699 else
17700 {
17701 rtx v = ix86_build_const_vector (vmode, false, op0);
17702
17703 op0 = force_reg (vmode, v);
17704 }
17705 }
17706 else if (op0 != CONST0_RTX (mode))
17707 op0 = force_reg (mode, op0);
17708
17709 mask = ix86_build_signbit_mask (vmode, 0, 0);
17710
17711 if (mode == SFmode)
17712 copysign_insn = gen_copysignsf3_const;
17713 else if (mode == DFmode)
17714 copysign_insn = gen_copysigndf3_const;
17715 else
17716 copysign_insn = gen_copysigntf3_const;
17717
17718 emit_insn (copysign_insn (dest, op0, op1, mask));
17719 }
17720 else
17721 {
17722 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
17723
17724 nmask = ix86_build_signbit_mask (vmode, 0, 1);
17725 mask = ix86_build_signbit_mask (vmode, 0, 0);
17726
17727 if (mode == SFmode)
17728 copysign_insn = gen_copysignsf3_var;
17729 else if (mode == DFmode)
17730 copysign_insn = gen_copysigndf3_var;
17731 else
17732 copysign_insn = gen_copysigntf3_var;
17733
17734 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
17735 }
17736 }
17737
17738 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
17739 be a constant, and so has already been expanded into a vector constant. */
17740
17741 void
17742 ix86_split_copysign_const (rtx operands[])
17743 {
17744 enum machine_mode mode, vmode;
17745 rtx dest, op0, mask, x;
17746
17747 dest = operands[0];
17748 op0 = operands[1];
17749 mask = operands[3];
17750
17751 mode = GET_MODE (dest);
17752 vmode = GET_MODE (mask);
17753
17754 dest = simplify_gen_subreg (vmode, dest, mode, 0);
17755 x = gen_rtx_AND (vmode, dest, mask);
17756 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17757
17758 if (op0 != CONST0_RTX (vmode))
17759 {
17760 x = gen_rtx_IOR (vmode, dest, op0);
17761 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17762 }
17763 }
17764
17765 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
17766 so we have to do two masks. */
17767
17768 void
17769 ix86_split_copysign_var (rtx operands[])
17770 {
17771 enum machine_mode mode, vmode;
17772 rtx dest, scratch, op0, op1, mask, nmask, x;
17773
17774 dest = operands[0];
17775 scratch = operands[1];
17776 op0 = operands[2];
17777 op1 = operands[3];
17778 nmask = operands[4];
17779 mask = operands[5];
17780
17781 mode = GET_MODE (dest);
17782 vmode = GET_MODE (mask);
17783
17784 if (rtx_equal_p (op0, op1))
17785 {
17786 /* Shouldn't happen often (it's useless, obviously), but when it does
17787 we'd generate incorrect code if we continue below. */
17788 emit_move_insn (dest, op0);
17789 return;
17790 }
17791
17792 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
17793 {
17794 gcc_assert (REGNO (op1) == REGNO (scratch));
17795
17796 x = gen_rtx_AND (vmode, scratch, mask);
17797 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17798
17799 dest = mask;
17800 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17801 x = gen_rtx_NOT (vmode, dest);
17802 x = gen_rtx_AND (vmode, x, op0);
17803 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17804 }
17805 else
17806 {
17807 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
17808 {
17809 x = gen_rtx_AND (vmode, scratch, mask);
17810 }
17811 else /* alternative 2,4 */
17812 {
17813 gcc_assert (REGNO (mask) == REGNO (scratch));
17814 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
17815 x = gen_rtx_AND (vmode, scratch, op1);
17816 }
17817 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17818
17819 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
17820 {
17821 dest = simplify_gen_subreg (vmode, op0, mode, 0);
17822 x = gen_rtx_AND (vmode, dest, nmask);
17823 }
17824 else /* alternative 3,4 */
17825 {
17826 gcc_assert (REGNO (nmask) == REGNO (dest));
17827 dest = nmask;
17828 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17829 x = gen_rtx_AND (vmode, dest, op0);
17830 }
17831 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17832 }
17833
17834 x = gen_rtx_IOR (vmode, dest, scratch);
17835 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17836 }
17837
17838 /* Return TRUE or FALSE depending on whether the first SET in INSN
17839 has source and destination with matching CC modes, and that the
17840 CC mode is at least as constrained as REQ_MODE. */
17841
17842 bool
17843 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
17844 {
17845 rtx set;
17846 enum machine_mode set_mode;
17847
17848 set = PATTERN (insn);
17849 if (GET_CODE (set) == PARALLEL)
17850 set = XVECEXP (set, 0, 0);
17851 gcc_assert (GET_CODE (set) == SET);
17852 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
17853
17854 set_mode = GET_MODE (SET_DEST (set));
17855 switch (set_mode)
17856 {
17857 case CCNOmode:
17858 if (req_mode != CCNOmode
17859 && (req_mode != CCmode
17860 || XEXP (SET_SRC (set), 1) != const0_rtx))
17861 return false;
17862 break;
17863 case CCmode:
17864 if (req_mode == CCGCmode)
17865 return false;
17866 /* FALLTHRU */
17867 case CCGCmode:
17868 if (req_mode == CCGOCmode || req_mode == CCNOmode)
17869 return false;
17870 /* FALLTHRU */
17871 case CCGOCmode:
17872 if (req_mode == CCZmode)
17873 return false;
17874 /* FALLTHRU */
17875 case CCZmode:
17876 break;
17877
17878 case CCAmode:
17879 case CCCmode:
17880 case CCOmode:
17881 case CCSmode:
17882 if (set_mode != req_mode)
17883 return false;
17884 break;
17885
17886 default:
17887 gcc_unreachable ();
17888 }
17889
17890 return GET_MODE (SET_SRC (set)) == set_mode;
17891 }
17892
17893 /* Generate insn patterns to do an integer compare of OPERANDS. */
17894
17895 static rtx
17896 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
17897 {
17898 enum machine_mode cmpmode;
17899 rtx tmp, flags;
17900
17901 cmpmode = SELECT_CC_MODE (code, op0, op1);
17902 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
17903
17904 /* This is very simple, but making the interface the same as in the
17905 FP case makes the rest of the code easier. */
17906 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
17907 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
17908
17909 /* Return the test that should be put into the flags user, i.e.
17910 the bcc, scc, or cmov instruction. */
17911 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
17912 }
17913
17914 /* Figure out whether to use ordered or unordered fp comparisons.
17915 Return the appropriate mode to use. */
17916
17917 enum machine_mode
17918 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
17919 {
17920 /* ??? In order to make all comparisons reversible, we do all comparisons
17921 non-trapping when compiling for IEEE. Once gcc is able to distinguish
17922 between the trapping and nontrapping forms of all comparisons, we can make
17923 inequality comparisons trapping again, since that results in better code
17924 when using FCOM based compares. */
17925 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
17926 }
17927
17928 enum machine_mode
17929 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
17930 {
17931 enum machine_mode mode = GET_MODE (op0);
17932
17933 if (SCALAR_FLOAT_MODE_P (mode))
17934 {
17935 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
17936 return ix86_fp_compare_mode (code);
17937 }
17938
17939 switch (code)
17940 {
17941 /* Only zero flag is needed. */
17942 case EQ: /* ZF=0 */
17943 case NE: /* ZF!=0 */
17944 return CCZmode;
17945 /* Codes needing carry flag. */
17946 case GEU: /* CF=0 */
17947 case LTU: /* CF=1 */
17948 /* Detect overflow checks. They need just the carry flag. */
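/* E.g. a GEU/LTU comparison of (a + b) against a: the carry flag alone
   tells whether the unsigned addition overflowed.  */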
17949 if (GET_CODE (op0) == PLUS
17950 && rtx_equal_p (op1, XEXP (op0, 0)))
17951 return CCCmode;
17952 else
17953 return CCmode;
17954 case GTU: /* CF=0 & ZF=0 */
17955 case LEU: /* CF=1 | ZF=1 */
17956 /* Detect overflow checks. They need just the carry flag. */
17957 if (GET_CODE (op0) == MINUS
17958 && rtx_equal_p (op1, XEXP (op0, 0)))
17959 return CCCmode;
17960 else
17961 return CCmode;
17962 /* Codes possibly doable only with sign flag when
17963 comparing against zero. */
17964 case GE: /* SF=OF or SF=0 */
17965 case LT: /* SF<>OF or SF=1 */
17966 if (op1 == const0_rtx)
17967 return CCGOCmode;
17968 else
17969 /* For other cases Carry flag is not required. */
17970 return CCGCmode;
17971 /* Codes doable only with the sign flag when comparing
17972 against zero, but we miss the jump instruction for it
17973 so we need to use relational tests against overflow,
17974 which thus needs to be zero. */
17975 case GT: /* ZF=0 & SF=OF */
17976 case LE: /* ZF=1 | SF<>OF */
17977 if (op1 == const0_rtx)
17978 return CCNOmode;
17979 else
17980 return CCGCmode;
17981 /* The strcmp pattern does (use flags) and combine may ask us for the
17982 proper mode. */
17983 case USE:
17984 return CCmode;
17985 default:
17986 gcc_unreachable ();
17987 }
17988 }
17989
17990 /* Return the fixed registers used for condition codes. */
17991
17992 static bool
17993 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
17994 {
17995 *p1 = FLAGS_REG;
17996 *p2 = FPSR_REG;
17997 return true;
17998 }
17999
18000 /* If two condition code modes are compatible, return a condition code
18001 mode which is compatible with both. Otherwise, return
18002 VOIDmode. */
18003
18004 static enum machine_mode
18005 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18006 {
18007 if (m1 == m2)
18008 return m1;
18009
18010 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18011 return VOIDmode;
18012
18013 if ((m1 == CCGCmode && m2 == CCGOCmode)
18014 || (m1 == CCGOCmode && m2 == CCGCmode))
18015 return CCGCmode;
18016
18017 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
18018 return m2;
18019 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
18020 return m1;
18021
18022 switch (m1)
18023 {
18024 default:
18025 gcc_unreachable ();
18026
18027 case CCmode:
18028 case CCGCmode:
18029 case CCGOCmode:
18030 case CCNOmode:
18031 case CCAmode:
18032 case CCCmode:
18033 case CCOmode:
18034 case CCSmode:
18035 case CCZmode:
18036 switch (m2)
18037 {
18038 default:
18039 return VOIDmode;
18040
18041 case CCmode:
18042 case CCGCmode:
18043 case CCGOCmode:
18044 case CCNOmode:
18045 case CCAmode:
18046 case CCCmode:
18047 case CCOmode:
18048 case CCSmode:
18049 case CCZmode:
18050 return CCmode;
18051 }
18052
18053 case CCFPmode:
18054 case CCFPUmode:
18055 /* These are only compatible with themselves, which we already
18056 checked above. */
18057 return VOIDmode;
18058 }
18059 }
18060
18061
18062 /* Return a comparison we can do that is equivalent to
18063 swap_condition (code), apart possibly from orderedness.
18064 But never change orderedness if TARGET_IEEE_FP, returning
18065 UNKNOWN in that case if necessary. */
18066
18067 static enum rtx_code
18068 ix86_fp_swap_condition (enum rtx_code code)
18069 {
18070 switch (code)
18071 {
18072 case GT: /* GTU - CF=0 & ZF=0 */
18073 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18074 case GE: /* GEU - CF=0 */
18075 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18076 case UNLT: /* LTU - CF=1 */
18077 return TARGET_IEEE_FP ? UNKNOWN : GT;
18078 case UNLE: /* LEU - CF=1 | ZF=1 */
18079 return TARGET_IEEE_FP ? UNKNOWN : GE;
18080 default:
18081 return swap_condition (code);
18082 }
18083 }
18084
18085 /* Return the cost of comparison CODE using the best strategy for performance.
18086 All the following functions use the number of instructions as a cost metric.
18087 In the future this should be tweaked to compute bytes for optimize_size and
18088 to take into account the performance of various instructions on various CPUs. */
18089
18090 static int
18091 ix86_fp_comparison_cost (enum rtx_code code)
18092 {
18093 int arith_cost;
18094
18095 /* The cost of code using bit-twiddling on %ah. */
18096 switch (code)
18097 {
18098 case UNLE:
18099 case UNLT:
18100 case LTGT:
18101 case GT:
18102 case GE:
18103 case UNORDERED:
18104 case ORDERED:
18105 case UNEQ:
18106 arith_cost = 4;
18107 break;
18108 case LT:
18109 case NE:
18110 case EQ:
18111 case UNGE:
18112 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18113 break;
18114 case LE:
18115 case UNGT:
18116 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18117 break;
18118 default:
18119 gcc_unreachable ();
18120 }
18121
18122 switch (ix86_fp_comparison_strategy (code))
18123 {
18124 case IX86_FPCMP_COMI:
18125 return arith_cost > 4 ? 3 : 2;
18126 case IX86_FPCMP_SAHF:
18127 return arith_cost > 4 ? 4 : 3;
18128 default:
18129 return arith_cost;
18130 }
18131 }
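
/* For example, with TARGET_IEEE_FP an LE comparison has an arithmetic
   cost of 6 in ix86_fp_comparison_cost above, so the function returns
   3 for IX86_FPCMP_COMI, 4 for IX86_FPCMP_SAHF and 6 for
   IX86_FPCMP_ARITH; an UNORDERED test costs 2, 3 and 4 respectively.  */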
18132
18133 /* Return the strategy to use for a floating-point comparison.  We assume that
18134    fcomi is always preferable where available, since that is also true when
18135    looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test).  */
18136
18137 enum ix86_fpcmp_strategy
18138 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18139 {
18140 /* Do fcomi/sahf based test when profitable. */
18141
18142 if (TARGET_CMOVE)
18143 return IX86_FPCMP_COMI;
18144
18145 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18146 return IX86_FPCMP_SAHF;
18147
18148 return IX86_FPCMP_ARITH;
18149 }
18150
18151 /* Swap, force into registers, or otherwise massage the two operands
18152 to a fp comparison. The operands are updated in place; the new
18153 comparison code is returned. */
18154
18155 static enum rtx_code
18156 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18157 {
18158 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18159 rtx op0 = *pop0, op1 = *pop1;
18160 enum machine_mode op_mode = GET_MODE (op0);
18161 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18162
18163 /* All of the unordered compare instructions only work on registers.
18164 The same is true of the fcomi compare instructions. The XFmode
18165 compare instructions require registers except when comparing
18166 against zero or when converting operand 1 from fixed point to
18167 floating point. */
18168
18169 if (!is_sse
18170 && (fpcmp_mode == CCFPUmode
18171 || (op_mode == XFmode
18172 && ! (standard_80387_constant_p (op0) == 1
18173 || standard_80387_constant_p (op1) == 1)
18174 && GET_CODE (op1) != FLOAT)
18175 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18176 {
18177 op0 = force_reg (op_mode, op0);
18178 op1 = force_reg (op_mode, op1);
18179 }
18180 else
18181 {
18182 	      /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
18183 	         things around if that appears profitable; otherwise force op0
18184 	         into a register.  */
18185
18186 if (standard_80387_constant_p (op0) == 0
18187 || (MEM_P (op0)
18188 && ! (standard_80387_constant_p (op1) == 0
18189 || MEM_P (op1))))
18190 {
18191 enum rtx_code new_code = ix86_fp_swap_condition (code);
18192 if (new_code != UNKNOWN)
18193 {
18194 rtx tmp;
18195 tmp = op0, op0 = op1, op1 = tmp;
18196 code = new_code;
18197 }
18198 }
18199
18200 if (!REG_P (op0))
18201 op0 = force_reg (op_mode, op0);
18202
18203 if (CONSTANT_P (op1))
18204 {
18205 int tmp = standard_80387_constant_p (op1);
18206 if (tmp == 0)
18207 op1 = validize_mem (force_const_mem (op_mode, op1));
18208 else if (tmp == 1)
18209 {
18210 if (TARGET_CMOVE)
18211 op1 = force_reg (op_mode, op1);
18212 }
18213 else
18214 op1 = force_reg (op_mode, op1);
18215 }
18216 }
18217
18218 /* Try to rearrange the comparison to make it cheaper. */
18219 if (ix86_fp_comparison_cost (code)
18220 > ix86_fp_comparison_cost (swap_condition (code))
18221 && (REG_P (op1) || can_create_pseudo_p ()))
18222 {
18223 rtx tmp;
18224 tmp = op0, op0 = op1, op1 = tmp;
18225 code = swap_condition (code);
18226 if (!REG_P (op0))
18227 op0 = force_reg (op_mode, op0);
18228 }
18229
18230 *pop0 = op0;
18231 *pop1 = op1;
18232 return code;
18233 }
18234
18235 /* Convert comparison codes we use to represent FP comparison to integer
18236 code that will result in proper branch. Return UNKNOWN if no such code
18237 is available. */
18238
18239 enum rtx_code
18240 ix86_fp_compare_code_to_integer (enum rtx_code code)
18241 {
18242 switch (code)
18243 {
18244 case GT:
18245 return GTU;
18246 case GE:
18247 return GEU;
18248 case ORDERED:
18249 case UNORDERED:
18250 return code;
18251 break;
18252 case UNEQ:
18253 return EQ;
18254 break;
18255 case UNLT:
18256 return LTU;
18257 break;
18258 case UNLE:
18259 return LEU;
18260 break;
18261 case LTGT:
18262 return NE;
18263 break;
18264 default:
18265 return UNKNOWN;
18266 }
18267 }
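
/* The mapping in ix86_fp_compare_code_to_integer above reflects how the
   FP compare leaves its result in the flags: fcomi/fucomi (and sahf
   after fnstsw) set ZF, PF and CF like an unsigned integer compare, so
   GT becomes GTU ("above": CF=0 and ZF=0), GE becomes GEU ("above or
   equal": CF=0), UNLT becomes LTU ("below": CF=1), UNLE becomes LEU,
   UNEQ becomes EQ (an unordered result also sets ZF) and LTGT becomes
   NE.  */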
18268
18269 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18270
18271 static rtx
18272 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18273 {
18274 enum machine_mode fpcmp_mode, intcmp_mode;
18275 rtx tmp, tmp2;
18276
18277 fpcmp_mode = ix86_fp_compare_mode (code);
18278 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18279
18280 /* Do fcomi/sahf based test when profitable. */
18281 switch (ix86_fp_comparison_strategy (code))
18282 {
18283 case IX86_FPCMP_COMI:
18284 intcmp_mode = fpcmp_mode;
18285 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18286 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18287 tmp);
18288 emit_insn (tmp);
18289 break;
18290
18291 case IX86_FPCMP_SAHF:
18292 intcmp_mode = fpcmp_mode;
18293 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18294 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18295 tmp);
18296
18297 if (!scratch)
18298 scratch = gen_reg_rtx (HImode);
18299 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18300 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18301 break;
18302
18303 case IX86_FPCMP_ARITH:
18304 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18305 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18306 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18307 if (!scratch)
18308 scratch = gen_reg_rtx (HImode);
18309 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18310
18311 /* In the unordered case, we have to check C2 for NaN's, which
18312 doesn't happen to work out to anything nice combination-wise.
18313 So do some bit twiddling on the value we've got in AH to come
18314 up with an appropriate set of condition codes. */
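	  /* As a reminder of the 387 status-word layout: after fnstsw the
	     condition bits land in %ah as C0 (bit 0), C2 (bit 2) and
	     C3 (bit 6), so the masks below read as 0x01 = C0, 0x04 = C2,
	     0x40 = C3, 0x05 = C0|C2 and 0x45 = C0|C2|C3.  The compare
	     sets C0 for "less", C3 for "equal" and C0|C2|C3 for
	     "unordered".  */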
18315
18316 intcmp_mode = CCNOmode;
18317 switch (code)
18318 {
18319 case GT:
18320 case UNGT:
18321 if (code == GT || !TARGET_IEEE_FP)
18322 {
18323 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18324 code = EQ;
18325 }
18326 else
18327 {
18328 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18329 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18330 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18331 intcmp_mode = CCmode;
18332 code = GEU;
18333 }
18334 break;
18335 case LT:
18336 case UNLT:
18337 if (code == LT && TARGET_IEEE_FP)
18338 {
18339 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18340 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18341 intcmp_mode = CCmode;
18342 code = EQ;
18343 }
18344 else
18345 {
18346 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18347 code = NE;
18348 }
18349 break;
18350 case GE:
18351 case UNGE:
18352 if (code == GE || !TARGET_IEEE_FP)
18353 {
18354 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18355 code = EQ;
18356 }
18357 else
18358 {
18359 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18360 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18361 code = NE;
18362 }
18363 break;
18364 case LE:
18365 case UNLE:
18366 if (code == LE && TARGET_IEEE_FP)
18367 {
18368 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18369 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18370 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18371 intcmp_mode = CCmode;
18372 code = LTU;
18373 }
18374 else
18375 {
18376 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18377 code = NE;
18378 }
18379 break;
18380 case EQ:
18381 case UNEQ:
18382 if (code == EQ && TARGET_IEEE_FP)
18383 {
18384 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18385 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18386 intcmp_mode = CCmode;
18387 code = EQ;
18388 }
18389 else
18390 {
18391 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18392 code = NE;
18393 }
18394 break;
18395 case NE:
18396 case LTGT:
18397 if (code == NE && TARGET_IEEE_FP)
18398 {
18399 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18400 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18401 GEN_INT (0x40)));
18402 code = NE;
18403 }
18404 else
18405 {
18406 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18407 code = EQ;
18408 }
18409 break;
18410
18411 case UNORDERED:
18412 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18413 code = NE;
18414 break;
18415 case ORDERED:
18416 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18417 code = EQ;
18418 break;
18419
18420 default:
18421 gcc_unreachable ();
18422 }
18423 break;
18424
18425 default:
18426 	      gcc_unreachable ();
18427 }
18428
18429 /* Return the test that should be put into the flags user, i.e.
18430 the bcc, scc, or cmov instruction. */
18431 return gen_rtx_fmt_ee (code, VOIDmode,
18432 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18433 const0_rtx);
18434 }
18435
18436 static rtx
18437 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18438 {
18439 rtx ret;
18440
18441 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18442 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18443
18444 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18445 {
18446 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18447 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18448 }
18449 else
18450 ret = ix86_expand_int_compare (code, op0, op1);
18451
18452 return ret;
18453 }
18454
18455 void
18456 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18457 {
18458 enum machine_mode mode = GET_MODE (op0);
18459 rtx tmp;
18460
18461 switch (mode)
18462 {
18463 case SFmode:
18464 case DFmode:
18465 case XFmode:
18466 case QImode:
18467 case HImode:
18468 case SImode:
18469 simple:
18470 tmp = ix86_expand_compare (code, op0, op1);
18471 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18472 gen_rtx_LABEL_REF (VOIDmode, label),
18473 pc_rtx);
18474 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18475 return;
18476
18477 case DImode:
18478 if (TARGET_64BIT)
18479 goto simple;
18480 case TImode:
18481 /* Expand DImode branch into multiple compare+branch. */
18482 {
18483 rtx lo[2], hi[2], label2;
18484 enum rtx_code code1, code2, code3;
18485 enum machine_mode submode;
18486
18487 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18488 {
18489 tmp = op0, op0 = op1, op1 = tmp;
18490 code = swap_condition (code);
18491 }
18492
18493 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18494 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18495
18496 submode = mode == DImode ? SImode : DImode;
18497
18498 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18499 avoid two branches. This costs one extra insn, so disable when
18500 optimizing for size. */
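	  /* As a sketch, on a 32-bit target "if (a == b) goto lab" with
	     DImode operands becomes roughly
	        xor  hi(a), hi(b) -> t1
	        xor  lo(a), lo(b) -> t2
	        or   t1, t2       -> t
	        branch on t == 0 to lab
	     with either xor dropped when the corresponding word of the
	     constant operand is zero.  */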
18501
18502 if ((code == EQ || code == NE)
18503 && (!optimize_insn_for_size_p ()
18504 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18505 {
18506 rtx xor0, xor1;
18507
18508 xor1 = hi[0];
18509 if (hi[1] != const0_rtx)
18510 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18511 NULL_RTX, 0, OPTAB_WIDEN);
18512
18513 xor0 = lo[0];
18514 if (lo[1] != const0_rtx)
18515 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18516 NULL_RTX, 0, OPTAB_WIDEN);
18517
18518 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18519 NULL_RTX, 0, OPTAB_WIDEN);
18520
18521 ix86_expand_branch (code, tmp, const0_rtx, label);
18522 return;
18523 }
18524
18525 	  /* Otherwise, if we are doing a less-than or greater-or-equal-than
18526 	     comparison, op1 is a constant and its low word is zero, then we
18527 	     can just examine the high word.  Similarly for a low word of -1
18528 	     and less-or-equal-than or greater-than.  */
18529
18530 if (CONST_INT_P (hi[1]))
18531 switch (code)
18532 {
18533 case LT: case LTU: case GE: case GEU:
18534 if (lo[1] == const0_rtx)
18535 {
18536 ix86_expand_branch (code, hi[0], hi[1], label);
18537 return;
18538 }
18539 break;
18540 case LE: case LEU: case GT: case GTU:
18541 if (lo[1] == constm1_rtx)
18542 {
18543 ix86_expand_branch (code, hi[0], hi[1], label);
18544 return;
18545 }
18546 break;
18547 default:
18548 break;
18549 }
18550
18551 /* Otherwise, we need two or three jumps. */
18552
18553 label2 = gen_label_rtx ();
18554
18555 code1 = code;
18556 code2 = swap_condition (code);
18557 code3 = unsigned_condition (code);
18558
18559 switch (code)
18560 {
18561 case LT: case GT: case LTU: case GTU:
18562 break;
18563
18564 case LE: code1 = LT; code2 = GT; break;
18565 case GE: code1 = GT; code2 = LT; break;
18566 case LEU: code1 = LTU; code2 = GTU; break;
18567 case GEU: code1 = GTU; code2 = LTU; break;
18568
18569 case EQ: code1 = UNKNOWN; code2 = NE; break;
18570 case NE: code2 = UNKNOWN; break;
18571
18572 default:
18573 gcc_unreachable ();
18574 }
18575
18576 /*
18577 * a < b =>
18578 * if (hi(a) < hi(b)) goto true;
18579 * if (hi(a) > hi(b)) goto false;
18580 * if (lo(a) < lo(b)) goto true;
18581 * false:
18582 */
18583
18584 if (code1 != UNKNOWN)
18585 ix86_expand_branch (code1, hi[0], hi[1], label);
18586 if (code2 != UNKNOWN)
18587 ix86_expand_branch (code2, hi[0], hi[1], label2);
18588
18589 ix86_expand_branch (code3, lo[0], lo[1], label);
18590
18591 if (code2 != UNKNOWN)
18592 emit_label (label2);
18593 return;
18594 }
18595
18596 default:
18597 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18598 goto simple;
18599 }
18600 }
18601
18602 /* Split branch based on floating point condition. */
18603 void
18604 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18605 rtx target1, rtx target2, rtx tmp, rtx pushed)
18606 {
18607 rtx condition;
18608 rtx i;
18609
18610 if (target2 != pc_rtx)
18611 {
18612 rtx tmp = target2;
18613 code = reverse_condition_maybe_unordered (code);
18614 target2 = target1;
18615 target1 = tmp;
18616 }
18617
18618 condition = ix86_expand_fp_compare (code, op1, op2,
18619 tmp);
18620
18621 /* Remove pushed operand from stack. */
18622 if (pushed)
18623 ix86_free_from_memory (GET_MODE (pushed));
18624
18625 i = emit_jump_insn (gen_rtx_SET
18626 (VOIDmode, pc_rtx,
18627 gen_rtx_IF_THEN_ELSE (VOIDmode,
18628 condition, target1, target2)));
18629 if (split_branch_probability >= 0)
18630 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18631 }
18632
18633 void
18634 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18635 {
18636 rtx ret;
18637
18638 gcc_assert (GET_MODE (dest) == QImode);
18639
18640 ret = ix86_expand_compare (code, op0, op1);
18641 PUT_MODE (ret, QImode);
18642 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
18643 }
18644
18645 /* Expand a comparison setting or clearing the carry flag.  Return true when
18646    successful and set *POP to the resulting comparison.  */
18647 static bool
18648 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
18649 {
18650 enum machine_mode mode =
18651 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
18652
18653 /* Do not handle double-mode compares that go through special path. */
18654 if (mode == (TARGET_64BIT ? TImode : DImode))
18655 return false;
18656
18657 if (SCALAR_FLOAT_MODE_P (mode))
18658 {
18659 rtx compare_op, compare_seq;
18660
18661 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18662
18663 	      /* Shortcut:  the following common codes never translate
18664 	         into carry-flag compares.  */
18665 if (code == EQ || code == NE || code == UNEQ || code == LTGT
18666 || code == ORDERED || code == UNORDERED)
18667 return false;
18668
18669 	      /* These comparisons require the zero flag; swap the operands so they no longer do.  */
18670 if ((code == GT || code == UNLE || code == LE || code == UNGT)
18671 && !TARGET_IEEE_FP)
18672 {
18673 rtx tmp = op0;
18674 op0 = op1;
18675 op1 = tmp;
18676 code = swap_condition (code);
18677 }
18678
18679 	      /* Try to expand the comparison and verify that we end up with
18680 	         a carry-flag-based comparison.  This fails to be true only
18681 	         when we decide to expand the comparison using arithmetic,
18682 	         which is not a common scenario.  */
18683 start_sequence ();
18684 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18685 compare_seq = get_insns ();
18686 end_sequence ();
18687
18688 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
18689 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
18690 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
18691 else
18692 code = GET_CODE (compare_op);
18693
18694 if (code != LTU && code != GEU)
18695 return false;
18696
18697 emit_insn (compare_seq);
18698 *pop = compare_op;
18699 return true;
18700 }
18701
18702 if (!INTEGRAL_MODE_P (mode))
18703 return false;
18704
18705 switch (code)
18706 {
18707 case LTU:
18708 case GEU:
18709 break;
18710
18711 /* Convert a==0 into (unsigned)a<1. */
18712 case EQ:
18713 case NE:
18714 if (op1 != const0_rtx)
18715 return false;
18716 op1 = const1_rtx;
18717 code = (code == EQ ? LTU : GEU);
18718 break;
18719
18720 	    /* Convert a>b into b<a or a>=b+1.  */
18721 case GTU:
18722 case LEU:
18723 if (CONST_INT_P (op1))
18724 {
18725 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
18726 	        /* Bail out on overflow.  We could still swap the operands, but
18727 	           that would force loading of the constant into a register.  */
18728 if (op1 == const0_rtx
18729 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
18730 return false;
18731 code = (code == GTU ? GEU : LTU);
18732 }
18733 else
18734 {
18735 rtx tmp = op1;
18736 op1 = op0;
18737 op0 = tmp;
18738 code = (code == GTU ? LTU : GEU);
18739 }
18740 break;
18741
18742 /* Convert a>=0 into (unsigned)a<0x80000000. */
18743 case LT:
18744 case GE:
18745 if (mode == DImode || op1 != const0_rtx)
18746 return false;
18747 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18748 code = (code == LT ? GEU : LTU);
18749 break;
18750 case LE:
18751 case GT:
18752 if (mode == DImode || op1 != constm1_rtx)
18753 return false;
18754 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18755 code = (code == LE ? GEU : LTU);
18756 break;
18757
18758 default:
18759 return false;
18760 }
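
  /* For instance, an unsigned "a <= 5" (the GTU/LEU case above) becomes
     "a < 6" tested via the carry flag, and a signed "a >= 0" in SImode
     becomes the unsigned test "a < 0x80000000", both of which map
     directly onto a cmp that leaves the result in CF.  */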
18761   /* Swapping operands may cause a constant to appear as the first operand.  */
18762 if (!nonimmediate_operand (op0, VOIDmode))
18763 {
18764 if (!can_create_pseudo_p ())
18765 return false;
18766 op0 = force_reg (mode, op0);
18767 }
18768 *pop = ix86_expand_compare (code, op0, op1);
18769 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
18770 return true;
18771 }
18772
18773 bool
18774 ix86_expand_int_movcc (rtx operands[])
18775 {
18776 enum rtx_code code = GET_CODE (operands[1]), compare_code;
18777 rtx compare_seq, compare_op;
18778 enum machine_mode mode = GET_MODE (operands[0]);
18779 bool sign_bit_compare_p = false;
18780 rtx op0 = XEXP (operands[1], 0);
18781 rtx op1 = XEXP (operands[1], 1);
18782
18783 if (GET_MODE (op0) == TImode
18784 || (GET_MODE (op0) == DImode
18785 && !TARGET_64BIT))
18786 return false;
18787
18788 start_sequence ();
18789 compare_op = ix86_expand_compare (code, op0, op1);
18790 compare_seq = get_insns ();
18791 end_sequence ();
18792
18793 compare_code = GET_CODE (compare_op);
18794
18795 if ((op1 == const0_rtx && (code == GE || code == LT))
18796 || (op1 == constm1_rtx && (code == GT || code == LE)))
18797 sign_bit_compare_p = true;
18798
18799 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
18800 HImode insns, we'd be swallowed in word prefix ops. */
18801
18802 if ((mode != HImode || TARGET_FAST_PREFIX)
18803 && (mode != (TARGET_64BIT ? TImode : DImode))
18804 && CONST_INT_P (operands[2])
18805 && CONST_INT_P (operands[3]))
18806 {
18807 rtx out = operands[0];
18808 HOST_WIDE_INT ct = INTVAL (operands[2]);
18809 HOST_WIDE_INT cf = INTVAL (operands[3]);
18810 HOST_WIDE_INT diff;
18811
18812 diff = ct - cf;
18813       /* Sign-bit compares are better done using shifts than by using
18814          sbb.  */
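      /* In the sign-bit case emit_store_flag below produces the 0/-1
         mask directly (typically with a "sarl $31"-style arithmetic
         shift), while the carry-flag case materializes it with sbb;
         either way the mask is then combined with ct/cf using the
         add/or/not/and sequences that follow.  */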
18815 if (sign_bit_compare_p
18816 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
18817 {
18818 /* Detect overlap between destination and compare sources. */
18819 rtx tmp = out;
18820
18821 if (!sign_bit_compare_p)
18822 {
18823 rtx flags;
18824 bool fpcmp = false;
18825
18826 compare_code = GET_CODE (compare_op);
18827
18828 flags = XEXP (compare_op, 0);
18829
18830 if (GET_MODE (flags) == CCFPmode
18831 || GET_MODE (flags) == CCFPUmode)
18832 {
18833 fpcmp = true;
18834 compare_code
18835 = ix86_fp_compare_code_to_integer (compare_code);
18836 }
18837
18838 	      /* To simplify the rest of the code, restrict to the GEU case.  */
18839 if (compare_code == LTU)
18840 {
18841 HOST_WIDE_INT tmp = ct;
18842 ct = cf;
18843 cf = tmp;
18844 compare_code = reverse_condition (compare_code);
18845 code = reverse_condition (code);
18846 }
18847 else
18848 {
18849 if (fpcmp)
18850 PUT_CODE (compare_op,
18851 reverse_condition_maybe_unordered
18852 (GET_CODE (compare_op)));
18853 else
18854 PUT_CODE (compare_op,
18855 reverse_condition (GET_CODE (compare_op)));
18856 }
18857 diff = ct - cf;
18858
18859 if (reg_overlap_mentioned_p (out, op0)
18860 || reg_overlap_mentioned_p (out, op1))
18861 tmp = gen_reg_rtx (mode);
18862
18863 if (mode == DImode)
18864 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
18865 else
18866 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
18867 flags, compare_op));
18868 }
18869 else
18870 {
18871 if (code == GT || code == GE)
18872 code = reverse_condition (code);
18873 else
18874 {
18875 HOST_WIDE_INT tmp = ct;
18876 ct = cf;
18877 cf = tmp;
18878 diff = ct - cf;
18879 }
18880 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
18881 }
18882
18883 if (diff == 1)
18884 {
18885 /*
18886 * cmpl op0,op1
18887 * sbbl dest,dest
18888 * [addl dest, ct]
18889 *
18890 * Size 5 - 8.
18891 */
18892 if (ct)
18893 tmp = expand_simple_binop (mode, PLUS,
18894 tmp, GEN_INT (ct),
18895 copy_rtx (tmp), 1, OPTAB_DIRECT);
18896 }
18897 else if (cf == -1)
18898 {
18899 /*
18900 * cmpl op0,op1
18901 * sbbl dest,dest
18902 * orl $ct, dest
18903 *
18904 * Size 8.
18905 */
18906 tmp = expand_simple_binop (mode, IOR,
18907 tmp, GEN_INT (ct),
18908 copy_rtx (tmp), 1, OPTAB_DIRECT);
18909 }
18910 else if (diff == -1 && ct)
18911 {
18912 /*
18913 * cmpl op0,op1
18914 * sbbl dest,dest
18915 * notl dest
18916 * [addl dest, cf]
18917 *
18918 * Size 8 - 11.
18919 */
18920 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18921 if (cf)
18922 tmp = expand_simple_binop (mode, PLUS,
18923 copy_rtx (tmp), GEN_INT (cf),
18924 copy_rtx (tmp), 1, OPTAB_DIRECT);
18925 }
18926 else
18927 {
18928 /*
18929 * cmpl op0,op1
18930 * sbbl dest,dest
18931 * [notl dest]
18932 * andl cf - ct, dest
18933 * [addl dest, ct]
18934 *
18935 * Size 8 - 11.
18936 */
18937
18938 if (cf == 0)
18939 {
18940 cf = ct;
18941 ct = 0;
18942 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18943 }
18944
18945 tmp = expand_simple_binop (mode, AND,
18946 copy_rtx (tmp),
18947 gen_int_mode (cf - ct, mode),
18948 copy_rtx (tmp), 1, OPTAB_DIRECT);
18949 if (ct)
18950 tmp = expand_simple_binop (mode, PLUS,
18951 copy_rtx (tmp), GEN_INT (ct),
18952 copy_rtx (tmp), 1, OPTAB_DIRECT);
18953 }
18954
18955 if (!rtx_equal_p (tmp, out))
18956 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
18957
18958 return true;
18959 }
18960
18961 if (diff < 0)
18962 {
18963 enum machine_mode cmp_mode = GET_MODE (op0);
18964
18965 HOST_WIDE_INT tmp;
18966 tmp = ct, ct = cf, cf = tmp;
18967 diff = -diff;
18968
18969 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18970 {
18971 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18972
18973 	      /* We may be reversing an unordered compare to a normal compare,
18974 	         which is not valid in general (we may convert a non-trapping
18975 	         condition to a trapping one); however, on i386 we currently
18976 	         emit all comparisons unordered.  */
18977 compare_code = reverse_condition_maybe_unordered (compare_code);
18978 code = reverse_condition_maybe_unordered (code);
18979 }
18980 else
18981 {
18982 compare_code = reverse_condition (compare_code);
18983 code = reverse_condition (code);
18984 }
18985 }
18986
18987 compare_code = UNKNOWN;
18988 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
18989 && CONST_INT_P (op1))
18990 {
18991 if (op1 == const0_rtx
18992 && (code == LT || code == GE))
18993 compare_code = code;
18994 else if (op1 == constm1_rtx)
18995 {
18996 if (code == LE)
18997 compare_code = LT;
18998 else if (code == GT)
18999 compare_code = GE;
19000 }
19001 }
19002
19003 /* Optimize dest = (op0 < 0) ? -1 : cf. */
19004 if (compare_code != UNKNOWN
19005 && GET_MODE (op0) == GET_MODE (out)
19006 && (cf == -1 || ct == -1))
19007 {
19008 	      /* If the lea code below could be used, only optimize
19009 	         if it results in a 2-insn sequence.  */
19010
19011 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19012 || diff == 3 || diff == 5 || diff == 9)
19013 || (compare_code == LT && ct == -1)
19014 || (compare_code == GE && cf == -1))
19015 {
19016 /*
19017 * notl op1 (if necessary)
19018 * sarl $31, op1
19019 * orl cf, op1
19020 */
19021 if (ct != -1)
19022 {
19023 cf = ct;
19024 ct = -1;
19025 code = reverse_condition (code);
19026 }
19027
19028 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19029
19030 out = expand_simple_binop (mode, IOR,
19031 out, GEN_INT (cf),
19032 out, 1, OPTAB_DIRECT);
19033 if (out != operands[0])
19034 emit_move_insn (operands[0], out);
19035
19036 return true;
19037 }
19038 }
19039
19040
19041 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19042 || diff == 3 || diff == 5 || diff == 9)
19043 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19044 && (mode != DImode
19045 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19046 {
19047 /*
19048 * xorl dest,dest
19049 * cmpl op1,op2
19050 * setcc dest
19051 * lea cf(dest*(ct-cf)),dest
19052 *
19053 * Size 14.
19054 *
19055 * This also catches the degenerate setcc-only case.
19056 */
19057
19058 rtx tmp;
19059 int nops;
19060
19061 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19062
19063 nops = 0;
19064 	  /* On x86_64 the lea instruction operates on Pmode, so we need
19065 	     to get the arithmetic done in the proper mode to match.  */
19066 if (diff == 1)
19067 tmp = copy_rtx (out);
19068 else
19069 {
19070 rtx out1;
19071 out1 = copy_rtx (out);
19072 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19073 nops++;
19074 if (diff & 1)
19075 {
19076 tmp = gen_rtx_PLUS (mode, tmp, out1);
19077 nops++;
19078 }
19079 }
19080 if (cf != 0)
19081 {
19082 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19083 nops++;
19084 }
19085 if (!rtx_equal_p (tmp, out))
19086 {
19087 if (nops == 1)
19088 out = force_operand (tmp, copy_rtx (out));
19089 else
19090 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19091 }
19092 if (!rtx_equal_p (out, operands[0]))
19093 emit_move_insn (operands[0], copy_rtx (out));
19094
19095 return true;
19096 }
19097
19098 /*
19099 * General case: Jumpful:
19100 * xorl dest,dest cmpl op1, op2
19101 * cmpl op1, op2 movl ct, dest
19102 * setcc dest jcc 1f
19103 * decl dest movl cf, dest
19104 * andl (cf-ct),dest 1:
19105 * addl ct,dest
19106 *
19107 * Size 20. Size 14.
19108 *
19109 * This is reasonably steep, but branch mispredict costs are
19110 * high on modern cpus, so consider failing only if optimizing
19111 * for space.
19112 */
19113
19114 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19115 && BRANCH_COST (optimize_insn_for_speed_p (),
19116 false) >= 2)
19117 {
19118 if (cf == 0)
19119 {
19120 enum machine_mode cmp_mode = GET_MODE (op0);
19121
19122 cf = ct;
19123 ct = 0;
19124
19125 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19126 {
19127 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19128
19129 	          /* We may be reversing an unordered compare to a normal
19130 	             compare, which is not valid in general (we may convert a
19131 	             non-trapping condition to a trapping one); however, on
19132 	             i386 we currently emit all comparisons unordered.  */
19133 code = reverse_condition_maybe_unordered (code);
19134 }
19135 else
19136 {
19137 code = reverse_condition (code);
19138 if (compare_code != UNKNOWN)
19139 compare_code = reverse_condition (compare_code);
19140 }
19141 }
19142
19143 if (compare_code != UNKNOWN)
19144 {
19145 /* notl op1 (if needed)
19146 sarl $31, op1
19147 andl (cf-ct), op1
19148 addl ct, op1
19149
19150 	     For x < 0 (resp. x <= -1) there will be no notl,
19151 	     so if possible swap the constants to get rid of the
19152 	     complement.
19153 	     True/false will be -1/0 while the code below (store flag
19154 	     followed by decrement) gives 0/-1, so the constants need
19155 	     to be exchanged once more.  */
19156
19157 if (compare_code == GE || !cf)
19158 {
19159 code = reverse_condition (code);
19160 compare_code = LT;
19161 }
19162 else
19163 {
19164 HOST_WIDE_INT tmp = cf;
19165 cf = ct;
19166 ct = tmp;
19167 }
19168
19169 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19170 }
19171 else
19172 {
19173 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19174
19175 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19176 constm1_rtx,
19177 copy_rtx (out), 1, OPTAB_DIRECT);
19178 }
19179
19180 out = expand_simple_binop (mode, AND, copy_rtx (out),
19181 gen_int_mode (cf - ct, mode),
19182 copy_rtx (out), 1, OPTAB_DIRECT);
19183 if (ct)
19184 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19185 copy_rtx (out), 1, OPTAB_DIRECT);
19186 if (!rtx_equal_p (out, operands[0]))
19187 emit_move_insn (operands[0], copy_rtx (out));
19188
19189 return true;
19190 }
19191 }
19192
19193 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19194 {
19195 /* Try a few things more with specific constants and a variable. */
19196
19197 optab op;
19198 rtx var, orig_out, out, tmp;
19199
19200 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19201 return false;
19202
19203       /* If one of the two operands is an interesting constant, load a 0/-1
19204          constant using the code above and mask the variable in with a logical operation.  */
19205
19206 if (CONST_INT_P (operands[2]))
19207 {
19208 var = operands[3];
19209 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19210 operands[3] = constm1_rtx, op = and_optab;
19211 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19212 operands[3] = const0_rtx, op = ior_optab;
19213 else
19214 return false;
19215 }
19216 else if (CONST_INT_P (operands[3]))
19217 {
19218 var = operands[2];
19219 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19220 operands[2] = constm1_rtx, op = and_optab;
19221 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
19222 operands[2] = const0_rtx, op = ior_optab;
19223 else
19224 return false;
19225 }
19226 else
19227 return false;
19228
19229 orig_out = operands[0];
19230 tmp = gen_reg_rtx (mode);
19231 operands[0] = tmp;
19232
19233 /* Recurse to get the constant loaded. */
19234 if (ix86_expand_int_movcc (operands) == 0)
19235 return false;
19236
19237 /* Mask in the interesting variable. */
19238 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19239 OPTAB_WIDEN);
19240 if (!rtx_equal_p (out, orig_out))
19241 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19242
19243 return true;
19244 }
19245
19246 /*
19247 * For comparison with above,
19248 *
19249 * movl cf,dest
19250 * movl ct,tmp
19251 * cmpl op1,op2
19252 * cmovcc tmp,dest
19253 *
19254 * Size 15.
19255 */
19256
19257 if (! nonimmediate_operand (operands[2], mode))
19258 operands[2] = force_reg (mode, operands[2]);
19259 if (! nonimmediate_operand (operands[3], mode))
19260 operands[3] = force_reg (mode, operands[3]);
19261
19262 if (! register_operand (operands[2], VOIDmode)
19263 && (mode == QImode
19264 || ! register_operand (operands[3], VOIDmode)))
19265 operands[2] = force_reg (mode, operands[2]);
19266
19267 if (mode == QImode
19268 && ! register_operand (operands[3], VOIDmode))
19269 operands[3] = force_reg (mode, operands[3]);
19270
19271 emit_insn (compare_seq);
19272 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19273 gen_rtx_IF_THEN_ELSE (mode,
19274 compare_op, operands[2],
19275 operands[3])));
19276 return true;
19277 }
19278
19279 /* Swap, force into registers, or otherwise massage the two operands
19280 to an sse comparison with a mask result. Thus we differ a bit from
19281 ix86_prepare_fp_compare_args which expects to produce a flags result.
19282
19283 The DEST operand exists to help determine whether to commute commutative
19284 operators. The POP0/POP1 operands are updated in place. The new
19285 comparison code is returned, or UNKNOWN if not implementable. */
19286
19287 static enum rtx_code
19288 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19289 rtx *pop0, rtx *pop1)
19290 {
19291 rtx tmp;
19292
19293 switch (code)
19294 {
19295 case LTGT:
19296 case UNEQ:
19297 /* AVX supports all the needed comparisons. */
19298 if (TARGET_AVX)
19299 break;
19300 /* We have no LTGT as an operator. We could implement it with
19301 NE & ORDERED, but this requires an extra temporary. It's
19302 not clear that it's worth it. */
19303 return UNKNOWN;
19304
19305 case LT:
19306 case LE:
19307 case UNGT:
19308 case UNGE:
19309 /* These are supported directly. */
19310 break;
19311
19312 case EQ:
19313 case NE:
19314 case UNORDERED:
19315 case ORDERED:
19316 /* AVX has 3 operand comparisons, no need to swap anything. */
19317 if (TARGET_AVX)
19318 break;
19319 /* For commutative operators, try to canonicalize the destination
19320 operand to be first in the comparison - this helps reload to
19321 avoid extra moves. */
19322 if (!dest || !rtx_equal_p (dest, *pop1))
19323 break;
19324 /* FALLTHRU */
19325
19326 case GE:
19327 case GT:
19328 case UNLE:
19329 case UNLT:
19330 /* These are not supported directly before AVX, and furthermore
19331 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19332 comparison operands to transform into something that is
19333 supported. */
19334 tmp = *pop0;
19335 *pop0 = *pop1;
19336 *pop1 = tmp;
19337 code = swap_condition (code);
19338 break;
19339
19340 default:
19341 gcc_unreachable ();
19342 }
19343
19344 return code;
19345 }
19346
19347 /* Detect conditional moves that exactly match min/max operational
19348 semantics. Note that this is IEEE safe, as long as we don't
19349 interchange the operands.
19350
19351 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19352 and TRUE if the operation is successful and instructions are emitted. */
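   Recall that minss/minps and maxss/maxps are not commutative: when the
   operands are unordered, or are zeros of opposite sign, the second
   source operand is returned, which is exactly the behaviour of
   "a < b ? a : b", so keeping the operand order is what makes the
   transformation IEEE safe.  */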
19353
19354 static bool
19355 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19356 rtx cmp_op1, rtx if_true, rtx if_false)
19357 {
19358 enum machine_mode mode;
19359 bool is_min;
19360 rtx tmp;
19361
19362 if (code == LT)
19363 ;
19364 else if (code == UNGE)
19365 {
19366 tmp = if_true;
19367 if_true = if_false;
19368 if_false = tmp;
19369 }
19370 else
19371 return false;
19372
19373 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19374 is_min = true;
19375 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19376 is_min = false;
19377 else
19378 return false;
19379
19380 mode = GET_MODE (dest);
19381
19382 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19383 but MODE may be a vector mode and thus not appropriate. */
19384 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19385 {
19386 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19387 rtvec v;
19388
19389 if_true = force_reg (mode, if_true);
19390 v = gen_rtvec (2, if_true, if_false);
19391 tmp = gen_rtx_UNSPEC (mode, v, u);
19392 }
19393 else
19394 {
19395 code = is_min ? SMIN : SMAX;
19396 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19397 }
19398
19399 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19400 return true;
19401 }
19402
19403 /* Expand an sse vector comparison. Return the register with the result. */
19404
19405 static rtx
19406 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19407 rtx op_true, rtx op_false)
19408 {
19409 enum machine_mode mode = GET_MODE (dest);
19410 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19411 rtx x;
19412
19413 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19414 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19415 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19416
19417 if (optimize
19418 || reg_overlap_mentioned_p (dest, op_true)
19419 || reg_overlap_mentioned_p (dest, op_false))
19420 dest = gen_reg_rtx (mode);
19421
19422 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19423 if (cmp_mode != mode)
19424 {
19425 x = force_reg (cmp_mode, x);
19426 convert_move (dest, x, false);
19427 }
19428 else
19429 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19430
19431 return dest;
19432 }
19433
19434 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19435 operations. This is used for both scalar and vector conditional moves. */
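/* Without a blend instruction the general fallback at the end of this
   function computes, in effect,
     dest = (cmp & op_true) | (~cmp & op_false)
   using two ANDs (one with the complemented mask) and an IOR.  */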
19436
19437 static void
19438 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19439 {
19440 enum machine_mode mode = GET_MODE (dest);
19441 rtx t2, t3, x;
19442
19443 if (vector_all_ones_operand (op_true, mode)
19444 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19445 {
19446 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19447 }
19448 else if (op_false == CONST0_RTX (mode))
19449 {
19450 op_true = force_reg (mode, op_true);
19451 x = gen_rtx_AND (mode, cmp, op_true);
19452 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19453 }
19454 else if (op_true == CONST0_RTX (mode))
19455 {
19456 op_false = force_reg (mode, op_false);
19457 x = gen_rtx_NOT (mode, cmp);
19458 x = gen_rtx_AND (mode, x, op_false);
19459 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19460 }
19461 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19462 {
19463 op_false = force_reg (mode, op_false);
19464 x = gen_rtx_IOR (mode, cmp, op_false);
19465 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19466 }
19467 else if (TARGET_XOP)
19468 {
19469 op_true = force_reg (mode, op_true);
19470
19471 if (!nonimmediate_operand (op_false, mode))
19472 op_false = force_reg (mode, op_false);
19473
19474 emit_insn (gen_rtx_SET (mode, dest,
19475 gen_rtx_IF_THEN_ELSE (mode, cmp,
19476 op_true,
19477 op_false)));
19478 }
19479 else
19480 {
19481 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19482
19483 if (!nonimmediate_operand (op_true, mode))
19484 op_true = force_reg (mode, op_true);
19485
19486 op_false = force_reg (mode, op_false);
19487
19488 switch (mode)
19489 {
19490 case V4SFmode:
19491 if (TARGET_SSE4_1)
19492 gen = gen_sse4_1_blendvps;
19493 break;
19494 case V2DFmode:
19495 if (TARGET_SSE4_1)
19496 gen = gen_sse4_1_blendvpd;
19497 break;
19498 case V16QImode:
19499 case V8HImode:
19500 case V4SImode:
19501 case V2DImode:
19502 if (TARGET_SSE4_1)
19503 {
19504 gen = gen_sse4_1_pblendvb;
19505 dest = gen_lowpart (V16QImode, dest);
19506 op_false = gen_lowpart (V16QImode, op_false);
19507 op_true = gen_lowpart (V16QImode, op_true);
19508 cmp = gen_lowpart (V16QImode, cmp);
19509 }
19510 break;
19511 case V8SFmode:
19512 if (TARGET_AVX)
19513 gen = gen_avx_blendvps256;
19514 break;
19515 case V4DFmode:
19516 if (TARGET_AVX)
19517 gen = gen_avx_blendvpd256;
19518 break;
19519 case V32QImode:
19520 case V16HImode:
19521 case V8SImode:
19522 case V4DImode:
19523 if (TARGET_AVX2)
19524 {
19525 gen = gen_avx2_pblendvb;
19526 dest = gen_lowpart (V32QImode, dest);
19527 op_false = gen_lowpart (V32QImode, op_false);
19528 op_true = gen_lowpart (V32QImode, op_true);
19529 cmp = gen_lowpart (V32QImode, cmp);
19530 }
19531 break;
19532 default:
19533 break;
19534 }
19535
19536 if (gen != NULL)
19537 emit_insn (gen (dest, op_false, op_true, cmp));
19538 else
19539 {
19540 op_true = force_reg (mode, op_true);
19541
19542 t2 = gen_reg_rtx (mode);
19543 if (optimize)
19544 t3 = gen_reg_rtx (mode);
19545 else
19546 t3 = dest;
19547
19548 x = gen_rtx_AND (mode, op_true, cmp);
19549 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19550
19551 x = gen_rtx_NOT (mode, cmp);
19552 x = gen_rtx_AND (mode, x, op_false);
19553 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19554
19555 x = gen_rtx_IOR (mode, t3, t2);
19556 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19557 }
19558 }
19559 }
19560
19561 /* Expand a floating-point conditional move. Return true if successful. */
19562
19563 bool
19564 ix86_expand_fp_movcc (rtx operands[])
19565 {
19566 enum machine_mode mode = GET_MODE (operands[0]);
19567 enum rtx_code code = GET_CODE (operands[1]);
19568 rtx tmp, compare_op;
19569 rtx op0 = XEXP (operands[1], 0);
19570 rtx op1 = XEXP (operands[1], 1);
19571
19572 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19573 {
19574 enum machine_mode cmode;
19575
19576 /* Since we've no cmove for sse registers, don't force bad register
19577 allocation just to gain access to it. Deny movcc when the
19578 comparison mode doesn't match the move mode. */
19579 cmode = GET_MODE (op0);
19580 if (cmode == VOIDmode)
19581 cmode = GET_MODE (op1);
19582 if (cmode != mode)
19583 return false;
19584
19585 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19586 if (code == UNKNOWN)
19587 return false;
19588
19589 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19590 operands[2], operands[3]))
19591 return true;
19592
19593 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19594 operands[2], operands[3]);
19595 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19596 return true;
19597 }
19598
19599 /* The floating point conditional move instructions don't directly
19600 support conditions resulting from a signed integer comparison. */
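  /* fcmov only exists for the conditions b, e, be and u (and their
     negations), i.e. those that test CF, ZF and PF, so signed
     conditions such as GT or LT coming from an integer-style comparison
     have to go through the setcc + "tmp != 0" fallback below.  */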
19601
19602 compare_op = ix86_expand_compare (code, op0, op1);
19603 if (!fcmov_comparison_operator (compare_op, VOIDmode))
19604 {
19605 tmp = gen_reg_rtx (QImode);
19606 ix86_expand_setcc (tmp, code, op0, op1);
19607
19608 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19609 }
19610
19611 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19612 gen_rtx_IF_THEN_ELSE (mode, compare_op,
19613 operands[2], operands[3])));
19614
19615 return true;
19616 }
19617
19618 /* Expand a floating-point vector conditional move; a vcond operation
19619 rather than a movcc operation. */
19620
19621 bool
19622 ix86_expand_fp_vcond (rtx operands[])
19623 {
19624 enum rtx_code code = GET_CODE (operands[3]);
19625 rtx cmp;
19626
19627 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19628 &operands[4], &operands[5]);
19629 if (code == UNKNOWN)
19630 {
19631 rtx temp;
19632 switch (GET_CODE (operands[3]))
19633 {
19634 case LTGT:
19635 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19636 operands[5], operands[0], operands[0]);
19637 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
19638 operands[5], operands[1], operands[2]);
19639 code = AND;
19640 break;
19641 case UNEQ:
19642 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
19643 operands[5], operands[0], operands[0]);
19644 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
19645 operands[5], operands[1], operands[2]);
19646 code = IOR;
19647 break;
19648 default:
19649 gcc_unreachable ();
19650 }
19651 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
19652 OPTAB_DIRECT);
19653 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19654 return true;
19655 }
19656
19657 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
19658 operands[5], operands[1], operands[2]))
19659 return true;
19660
19661 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
19662 operands[1], operands[2]);
19663 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19664 return true;
19665 }
19666
19667 /* Expand a signed/unsigned integral vector conditional move. */
19668
19669 bool
19670 ix86_expand_int_vcond (rtx operands[])
19671 {
19672 enum machine_mode data_mode = GET_MODE (operands[0]);
19673 enum machine_mode mode = GET_MODE (operands[4]);
19674 enum rtx_code code = GET_CODE (operands[3]);
19675 bool negate = false;
19676 rtx x, cop0, cop1;
19677
19678 cop0 = operands[4];
19679 cop1 = operands[5];
19680
19681 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
19682 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
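  /* Below, shift is the element width minus one (e.g. 31 for V4SImode,
     15 for V8HImode); an arithmetic right shift by that amount
     broadcasts the sign bit into an all-ones/all-zeros element, while a
     logical right shift leaves just 1 or 0, matching the two patterns
     above.  */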
19683 if ((code == LT || code == GE)
19684 && data_mode == mode
19685 && cop1 == CONST0_RTX (mode)
19686 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
19687 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
19688 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
19689 && (GET_MODE_SIZE (data_mode) == 16
19690 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
19691 {
19692 rtx negop = operands[2 - (code == LT)];
19693 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
19694 if (negop == CONST1_RTX (data_mode))
19695 {
19696 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
19697 operands[0], 1, OPTAB_DIRECT);
19698 if (res != operands[0])
19699 emit_move_insn (operands[0], res);
19700 return true;
19701 }
19702 else if (GET_MODE_INNER (data_mode) != DImode
19703 && vector_all_ones_operand (negop, data_mode))
19704 {
19705 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
19706 operands[0], 0, OPTAB_DIRECT);
19707 if (res != operands[0])
19708 emit_move_insn (operands[0], res);
19709 return true;
19710 }
19711 }
19712
19713 if (!nonimmediate_operand (cop1, mode))
19714 cop1 = force_reg (mode, cop1);
19715 if (!general_operand (operands[1], data_mode))
19716 operands[1] = force_reg (data_mode, operands[1]);
19717 if (!general_operand (operands[2], data_mode))
19718 operands[2] = force_reg (data_mode, operands[2]);
19719
19720 /* XOP supports all of the comparisons on all 128-bit vector int types. */
19721 if (TARGET_XOP
19722 && (mode == V16QImode || mode == V8HImode
19723 || mode == V4SImode || mode == V2DImode))
19724 ;
19725 else
19726 {
19727 /* Canonicalize the comparison to EQ, GT, GTU. */
19728 switch (code)
19729 {
19730 case EQ:
19731 case GT:
19732 case GTU:
19733 break;
19734
19735 case NE:
19736 case LE:
19737 case LEU:
19738 code = reverse_condition (code);
19739 negate = true;
19740 break;
19741
19742 case GE:
19743 case GEU:
19744 code = reverse_condition (code);
19745 negate = true;
19746 /* FALLTHRU */
19747
19748 case LT:
19749 case LTU:
19750 code = swap_condition (code);
19751 x = cop0, cop0 = cop1, cop1 = x;
19752 break;
19753
19754 default:
19755 gcc_unreachable ();
19756 }
19757
19758 /* Only SSE4.1/SSE4.2 supports V2DImode. */
19759 if (mode == V2DImode)
19760 {
19761 switch (code)
19762 {
19763 case EQ:
19764 /* SSE4.1 supports EQ. */
19765 if (!TARGET_SSE4_1)
19766 return false;
19767 break;
19768
19769 case GT:
19770 case GTU:
19771 /* SSE4.2 supports GT/GTU. */
19772 if (!TARGET_SSE4_2)
19773 return false;
19774 break;
19775
19776 default:
19777 gcc_unreachable ();
19778 }
19779 }
19780
19781 /* Unsigned parallel compare is not supported by the hardware.
19782 Play some tricks to turn this into a signed comparison
19783 against 0. */
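      /* Two tricks are used below: for SImode/DImode elements both
         operands are biased by subtracting the sign-bit constant, which
         preserves the unsigned order while making a signed GT valid,
         e.g. 0xffffffffU > 1U becomes 0x7fffffff > (int) 0x80000001;
         for QImode/HImode elements a saturating subtract makes "a > b"
         equivalent to "(a -us b) != 0".  */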
19784 if (code == GTU)
19785 {
19786 cop0 = force_reg (mode, cop0);
19787
19788 switch (mode)
19789 {
19790 case V8SImode:
19791 case V4DImode:
19792 case V4SImode:
19793 case V2DImode:
19794 {
19795 rtx t1, t2, mask;
19796 rtx (*gen_sub3) (rtx, rtx, rtx);
19797
19798 switch (mode)
19799 {
19800 case V8SImode: gen_sub3 = gen_subv8si3; break;
19801 case V4DImode: gen_sub3 = gen_subv4di3; break;
19802 case V4SImode: gen_sub3 = gen_subv4si3; break;
19803 case V2DImode: gen_sub3 = gen_subv2di3; break;
19804 default:
19805 gcc_unreachable ();
19806 }
19807 /* Subtract (-(INT MAX) - 1) from both operands to make
19808 them signed. */
19809 mask = ix86_build_signbit_mask (mode, true, false);
19810 t1 = gen_reg_rtx (mode);
19811 emit_insn (gen_sub3 (t1, cop0, mask));
19812
19813 t2 = gen_reg_rtx (mode);
19814 emit_insn (gen_sub3 (t2, cop1, mask));
19815
19816 cop0 = t1;
19817 cop1 = t2;
19818 code = GT;
19819 }
19820 break;
19821
19822 case V32QImode:
19823 case V16HImode:
19824 case V16QImode:
19825 case V8HImode:
19826 /* Perform a parallel unsigned saturating subtraction. */
19827 x = gen_reg_rtx (mode);
19828 emit_insn (gen_rtx_SET (VOIDmode, x,
19829 gen_rtx_US_MINUS (mode, cop0, cop1)));
19830
19831 cop0 = x;
19832 cop1 = CONST0_RTX (mode);
19833 code = EQ;
19834 negate = !negate;
19835 break;
19836
19837 default:
19838 gcc_unreachable ();
19839 }
19840 }
19841 }
19842
19843 /* Allow the comparison to be done in one mode, but the movcc to
19844 happen in another mode. */
19845 if (data_mode == mode)
19846 {
19847 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
19848 operands[1+negate], operands[2-negate]);
19849 }
19850 else
19851 {
19852 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
19853 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
19854 code, cop0, cop1,
19855 operands[1+negate], operands[2-negate]);
19856 x = gen_lowpart (data_mode, x);
19857 }
19858
19859 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
19860 operands[2-negate]);
19861 return true;
19862 }
19863
19864 /* Expand a variable vector permutation. */
19865
19866 void
19867 ix86_expand_vec_perm (rtx operands[])
19868 {
19869 rtx target = operands[0];
19870 rtx op0 = operands[1];
19871 rtx op1 = operands[2];
19872 rtx mask = operands[3];
19873 rtx t1, t2, t3, t4, vt, vt2, vec[32];
19874 enum machine_mode mode = GET_MODE (op0);
19875 enum machine_mode maskmode = GET_MODE (mask);
19876 int w, e, i;
19877 bool one_operand_shuffle = rtx_equal_p (op0, op1);
19878
19879 /* Number of elements in the vector. */
19880 w = GET_MODE_NUNITS (mode);
19881 e = GET_MODE_UNIT_SIZE (mode);
19882 gcc_assert (w <= 32);
19883
19884 if (TARGET_AVX2)
19885 {
19886 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
19887 {
19888 	  /* Unfortunately, the VPERMQ and VPERMPD instructions only support
19889 	     a constant shuffle operand.  With a tiny bit of effort we can
19890 	     use VPERMD instead.  A re-interpretation stall for V4DFmode is
19891 	     unfortunate but there's no avoiding it.
19892 	     Similarly, for V16HImode we don't have instructions for variable
19893 	     shuffling, while for V32QImode we can, after preparing suitable
19894 	     masks, use vpshufb; vpshufb; vpermq; vpor.  */
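	  /* Sketch of the V4DImode/V4DFmode case: a V4DI mask { A B C D }
	     is re-expressed as the V8SI index vector
	        { 2A, 2A+1, 2B, 2B+1, 2C, 2C+1, 2D, 2D+1 }
	     (replicate, double, add one to the odd positions below), after
	     which a single vpermd on the operands viewed as V8SI performs
	     the same 64-bit element shuffle.  */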
19895
19896 if (mode == V16HImode)
19897 {
19898 maskmode = mode = V32QImode;
19899 w = 32;
19900 e = 1;
19901 }
19902 else
19903 {
19904 maskmode = mode = V8SImode;
19905 w = 8;
19906 e = 4;
19907 }
19908 t1 = gen_reg_rtx (maskmode);
19909
19910 /* Replicate the low bits of the V4DImode mask into V8SImode:
19911 mask = { A B C D }
19912 t1 = { A A B B C C D D }. */
19913 for (i = 0; i < w / 2; ++i)
19914 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
19915 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19916 vt = force_reg (maskmode, vt);
19917 mask = gen_lowpart (maskmode, mask);
19918 if (maskmode == V8SImode)
19919 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
19920 else
19921 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
19922
19923 	      /* Multiply the shuffle indices by two.  */
19924 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
19925 OPTAB_DIRECT);
19926
19927 	      /* Add one to the odd shuffle indices:
19928 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
19929 for (i = 0; i < w / 2; ++i)
19930 {
19931 vec[i * 2] = const0_rtx;
19932 vec[i * 2 + 1] = const1_rtx;
19933 }
19934 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19935 vt = force_const_mem (maskmode, vt);
19936 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
19937 OPTAB_DIRECT);
19938
19939 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
19940 operands[3] = mask = t1;
19941 target = gen_lowpart (mode, target);
19942 op0 = gen_lowpart (mode, op0);
19943 op1 = gen_lowpart (mode, op1);
19944 }
19945
19946 switch (mode)
19947 {
19948 case V8SImode:
19949 /* The VPERMD and VPERMPS instructions already properly ignore
19950 the high bits of the shuffle elements. No need for us to
19951 perform an AND ourselves. */
19952 if (one_operand_shuffle)
19953 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
19954 else
19955 {
19956 t1 = gen_reg_rtx (V8SImode);
19957 t2 = gen_reg_rtx (V8SImode);
19958 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
19959 	      emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
19960 goto merge_two;
19961 }
19962 return;
19963
19964 case V8SFmode:
19965 mask = gen_lowpart (V8SFmode, mask);
19966 if (one_operand_shuffle)
19967 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
19968 else
19969 {
19970 t1 = gen_reg_rtx (V8SFmode);
19971 t2 = gen_reg_rtx (V8SFmode);
19972 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
19973 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
19974 goto merge_two;
19975 }
19976 return;
19977
19978 case V4SImode:
19979 /* By combining the two 128-bit input vectors into one 256-bit
19980 input vector, we can use VPERMD and VPERMPS for the full
19981 two-operand shuffle. */
19982 t1 = gen_reg_rtx (V8SImode);
19983 t2 = gen_reg_rtx (V8SImode);
19984 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
19985 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
19986 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
19987 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
19988 return;
19989
19990 case V4SFmode:
19991 t1 = gen_reg_rtx (V8SFmode);
19992 t2 = gen_reg_rtx (V8SFmode);
19993 mask = gen_lowpart (V4SFmode, mask);
19994 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
19995 emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask));
19996 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
19997 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
19998 return;
19999
20000 case V32QImode:
20001 t1 = gen_reg_rtx (V32QImode);
20002 t2 = gen_reg_rtx (V32QImode);
20003 t3 = gen_reg_rtx (V32QImode);
20004 vt2 = GEN_INT (128);
20005 for (i = 0; i < 32; i++)
20006 vec[i] = vt2;
20007 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20008 vt = force_reg (V32QImode, vt);
20009 for (i = 0; i < 32; i++)
20010 vec[i] = i < 16 ? vt2 : const0_rtx;
20011 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20012 vt2 = force_reg (V32QImode, vt2);
20013 /* From mask create two adjusted masks, which contain the same
20014 bits as mask in the low 7 bits of each vector element.
20015 The first mask will have the most significant bit clear
20016 if it requests element from the same 128-bit lane
20017 and MSB set if it requests element from the other 128-bit lane.
20018 The second mask will have the opposite values of the MSB,
20019 and additionally will have its 128-bit lanes swapped.
20020 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20021 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
20022 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20023 stands for other 12 bytes. */
20024 	  /* The bit that tells whether an element is from the same lane or the
20025 	     other lane is bit 4, so shift it up by 3 to the MSB position.  */
20026 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20027 gen_lowpart (V4DImode, mask),
20028 GEN_INT (3)));
20029 /* Clear MSB bits from the mask just in case it had them set. */
20030 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20031 /* After this t1 will have MSB set for elements from other lane. */
20032 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20033 /* Clear bits other than MSB. */
20034 emit_insn (gen_andv32qi3 (t1, t1, vt));
20035 /* Or in the lower bits from mask into t3. */
20036 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20037 /* And invert MSB bits in t1, so MSB is set for elements from the same
20038 lane. */
20039 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20040 /* Swap 128-bit lanes in t3. */
20041 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20042 gen_lowpart (V4DImode, t3),
20043 const2_rtx, GEN_INT (3),
20044 const0_rtx, const1_rtx));
20045 /* And or in the lower bits from mask into t1. */
20046 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20047 if (one_operand_shuffle)
20048 {
20049 /* Each of these shuffles will put 0s in places where
20050 element from the other 128-bit lane is needed, otherwise
20051 will shuffle in the requested value. */
20052 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20053 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20054 /* For t3 the 128-bit lanes are swapped again. */
20055 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20056 gen_lowpart (V4DImode, t3),
20057 const2_rtx, GEN_INT (3),
20058 const0_rtx, const1_rtx));
20059 /* And oring both together leads to the result. */
20060 emit_insn (gen_iorv32qi3 (target, t1, t3));
20061 return;
20062 }
20063
20064 t4 = gen_reg_rtx (V32QImode);
20065 	  /* Similarly to the above one_operand_shuffle code,
20066 	     just repeated twice for each operand.  The code at
20067 	     merge_two: will merge the two results together.  */
20068 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20069 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20070 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20071 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20072 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20073 gen_lowpart (V4DImode, t4),
20074 const2_rtx, GEN_INT (3),
20075 const0_rtx, const1_rtx));
20076 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20077 gen_lowpart (V4DImode, t3),
20078 const2_rtx, GEN_INT (3),
20079 const0_rtx, const1_rtx));
20080 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20081 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20082 t1 = t4;
20083 t2 = t3;
20084 goto merge_two;
20085
20086 default:
20087 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20088 break;
20089 }
20090 }
20091
20092 if (TARGET_XOP)
20093 {
20094 /* The XOP VPPERM insn supports three inputs. By ignoring the
20095 one_operand_shuffle special case, we avoid creating another
20096 set of constant vectors in memory. */
20097 one_operand_shuffle = false;
20098
20099 /* mask = mask & {2*w-1, ...} */
20100 vt = GEN_INT (2*w - 1);
20101 }
20102 else
20103 {
20104 /* mask = mask & {w-1, ...} */
20105 vt = GEN_INT (w - 1);
20106 }
20107
20108 for (i = 0; i < w; i++)
20109 vec[i] = vt;
20110 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20111 mask = expand_simple_binop (maskmode, AND, mask, vt,
20112 NULL_RTX, 0, OPTAB_DIRECT);
20113
20114 /* For non-QImode operations, convert the word permutation control
20115 into a byte permutation control. */
20116 if (mode != V16QImode)
20117 {
20118 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20119 GEN_INT (exact_log2 (e)),
20120 NULL_RTX, 0, OPTAB_DIRECT);
20121
20122 /* Convert mask to vector of chars. */
20123 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20124
20125 /* Replicate each of the input bytes into byte positions:
20126 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20127 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20128 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20129 for (i = 0; i < 16; ++i)
20130 vec[i] = GEN_INT (i/e * e);
20131 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20132 vt = force_const_mem (V16QImode, vt);
20133 if (TARGET_XOP)
20134 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20135 else
20136 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20137
20138 /* Convert it into the byte positions by doing
20139 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
20140 for (i = 0; i < 16; ++i)
20141 vec[i] = GEN_INT (i % e);
20142 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20143 vt = force_const_mem (V16QImode, vt);
20144 emit_insn (gen_addv16qi3 (mask, mask, vt));
20145 }
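  /* E.g. without XOP, a V4SImode control of { 1, 3, 0, 2 } ends up here as
     the byte control { 4,5,6,7, 12,13,14,15, 0,1,2,3, 8,9,10,11 }: each word
     index scaled to a byte offset and replicated across its element.  */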
20146
20147 /* The actual shuffle operations all operate on V16QImode. */
20148 op0 = gen_lowpart (V16QImode, op0);
20149 op1 = gen_lowpart (V16QImode, op1);
20150 target = gen_lowpart (V16QImode, target);
20151
20152 if (TARGET_XOP)
20153 {
20154 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20155 }
20156 else if (one_operand_shuffle)
20157 {
20158 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20159 }
20160 else
20161 {
20162 rtx xops[6];
20163 bool ok;
20164
20165 /* Shuffle the two input vectors independently. */
20166 t1 = gen_reg_rtx (V16QImode);
20167 t2 = gen_reg_rtx (V16QImode);
20168 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20169 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20170
20171 merge_two:
20172 /* Then merge them together. The key is whether any given control
20173 element contained a bit set that indicates the second word. */
20174 mask = operands[3];
20175 vt = GEN_INT (w);
20176 if (maskmode == V2DImode && !TARGET_SSE4_1)
20177 {
20178 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
20179 more shuffle to convert the V2DI input mask into a V4SI
20180 	     input mask.  At that point the masking done by expand_int_vcond
20181 	     will work as desired.  */
20182 rtx t3 = gen_reg_rtx (V4SImode);
20183 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20184 const0_rtx, const0_rtx,
20185 const2_rtx, const2_rtx));
20186 mask = t3;
20187 maskmode = V4SImode;
20188 e = w = 4;
20189 }
20190
20191 for (i = 0; i < w; i++)
20192 vec[i] = vt;
20193 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20194 vt = force_reg (maskmode, vt);
20195 mask = expand_simple_binop (maskmode, AND, mask, vt,
20196 NULL_RTX, 0, OPTAB_DIRECT);
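      /* MASK now holds, per element, only the bit that selects the second
	 shuffle input; the vector compare-and-select below effectively
	 computes target = (mask == vt) ? t2 : t1, picking t2 where the
	 original control requested an element from op1.  */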
20197
20198 xops[0] = gen_lowpart (mode, operands[0]);
20199 xops[1] = gen_lowpart (mode, t2);
20200 xops[2] = gen_lowpart (mode, t1);
20201 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20202 xops[4] = mask;
20203 xops[5] = vt;
20204 ok = ix86_expand_int_vcond (xops);
20205 gcc_assert (ok);
20206 }
20207 }
20208
20209 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
20210 true if we should do zero extension, else sign extension. HIGH_P is
20211 true if we want the N/2 high elements, else the low elements. */
20212
20213 void
20214 ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
20215 {
20216 enum machine_mode imode = GET_MODE (operands[1]);
20217 rtx tmp, dest;
20218
20219 if (TARGET_SSE4_1)
20220 {
20221 rtx (*unpack)(rtx, rtx);
20222 rtx (*extract)(rtx, rtx) = NULL;
20223 enum machine_mode halfmode = BLKmode;
20224
20225 switch (imode)
20226 {
20227 case V32QImode:
20228 if (unsigned_p)
20229 unpack = gen_avx2_zero_extendv16qiv16hi2;
20230 else
20231 unpack = gen_avx2_sign_extendv16qiv16hi2;
20232 halfmode = V16QImode;
20233 extract
20234 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20235 break;
20236 case V16HImode:
20237 if (unsigned_p)
20238 unpack = gen_avx2_zero_extendv8hiv8si2;
20239 else
20240 unpack = gen_avx2_sign_extendv8hiv8si2;
20241 halfmode = V8HImode;
20242 extract
20243 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20244 break;
20245 case V8SImode:
20246 if (unsigned_p)
20247 unpack = gen_avx2_zero_extendv4siv4di2;
20248 else
20249 unpack = gen_avx2_sign_extendv4siv4di2;
20250 halfmode = V4SImode;
20251 extract
20252 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20253 break;
20254 case V16QImode:
20255 if (unsigned_p)
20256 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20257 else
20258 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20259 break;
20260 case V8HImode:
20261 if (unsigned_p)
20262 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20263 else
20264 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20265 break;
20266 case V4SImode:
20267 if (unsigned_p)
20268 unpack = gen_sse4_1_zero_extendv2siv2di2;
20269 else
20270 unpack = gen_sse4_1_sign_extendv2siv2di2;
20271 break;
20272 default:
20273 gcc_unreachable ();
20274 }
20275
20276 if (GET_MODE_SIZE (imode) == 32)
20277 {
20278 tmp = gen_reg_rtx (halfmode);
20279 emit_insn (extract (tmp, operands[1]));
20280 }
20281 else if (high_p)
20282 {
20283 /* Shift higher 8 bytes to lower 8 bytes. */
20284 tmp = gen_reg_rtx (imode);
20285 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20286 gen_lowpart (V1TImode, operands[1]),
20287 GEN_INT (64)));
20288 }
20289 else
20290 tmp = operands[1];
20291
20292 emit_insn (unpack (operands[0], tmp));
20293 }
20294 else
20295 {
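      /* Without SSE4.1 there are no pmovsx/pmovzx instructions, so widen
	 by interleaving the input either with zeros (zero extension) or
	 with its sign mask, obtained below by comparing 0 > input
	 (sign extension).  */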
20296 rtx (*unpack)(rtx, rtx, rtx);
20297
20298 switch (imode)
20299 {
20300 case V16QImode:
20301 if (high_p)
20302 unpack = gen_vec_interleave_highv16qi;
20303 else
20304 unpack = gen_vec_interleave_lowv16qi;
20305 break;
20306 case V8HImode:
20307 if (high_p)
20308 unpack = gen_vec_interleave_highv8hi;
20309 else
20310 unpack = gen_vec_interleave_lowv8hi;
20311 break;
20312 case V4SImode:
20313 if (high_p)
20314 unpack = gen_vec_interleave_highv4si;
20315 else
20316 unpack = gen_vec_interleave_lowv4si;
20317 break;
20318 default:
20319 gcc_unreachable ();
20320 }
20321
20322 dest = gen_lowpart (imode, operands[0]);
20323
20324 if (unsigned_p)
20325 tmp = force_reg (imode, CONST0_RTX (imode));
20326 else
20327 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20328 operands[1], pc_rtx, pc_rtx);
20329
20330 emit_insn (unpack (dest, operands[1], tmp));
20331 }
20332 }
20333
20334 /* Expand conditional increment or decrement using adc/sbb instructions.
20335 The default case using setcc followed by the conditional move can be
20336 done by generic code. */
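/* E.g. an unsigned "if (a < b) x++;" can be compiled into "cmp a, b"
   followed by "adc x, 0", since the comparison leaves the carry flag
   set exactly when a < b.  */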
20337 bool
20338 ix86_expand_int_addcc (rtx operands[])
20339 {
20340 enum rtx_code code = GET_CODE (operands[1]);
20341 rtx flags;
20342 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20343 rtx compare_op;
20344 rtx val = const0_rtx;
20345 bool fpcmp = false;
20346 enum machine_mode mode;
20347 rtx op0 = XEXP (operands[1], 0);
20348 rtx op1 = XEXP (operands[1], 1);
20349
20350 if (operands[3] != const1_rtx
20351 && operands[3] != constm1_rtx)
20352 return false;
20353 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20354 return false;
20355 code = GET_CODE (compare_op);
20356
20357 flags = XEXP (compare_op, 0);
20358
20359 if (GET_MODE (flags) == CCFPmode
20360 || GET_MODE (flags) == CCFPUmode)
20361 {
20362 fpcmp = true;
20363 code = ix86_fp_compare_code_to_integer (code);
20364 }
20365
20366 if (code != LTU)
20367 {
20368 val = constm1_rtx;
20369 if (fpcmp)
20370 PUT_CODE (compare_op,
20371 reverse_condition_maybe_unordered
20372 (GET_CODE (compare_op)));
20373 else
20374 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20375 }
20376
20377 mode = GET_MODE (operands[0]);
20378
20379 /* Construct either adc or sbb insn. */
20380 if ((code == LTU) == (operands[3] == constm1_rtx))
20381 {
20382 switch (mode)
20383 {
20384 case QImode:
20385 insn = gen_subqi3_carry;
20386 break;
20387 case HImode:
20388 insn = gen_subhi3_carry;
20389 break;
20390 case SImode:
20391 insn = gen_subsi3_carry;
20392 break;
20393 case DImode:
20394 insn = gen_subdi3_carry;
20395 break;
20396 default:
20397 gcc_unreachable ();
20398 }
20399 }
20400 else
20401 {
20402 switch (mode)
20403 {
20404 case QImode:
20405 insn = gen_addqi3_carry;
20406 break;
20407 case HImode:
20408 insn = gen_addhi3_carry;
20409 break;
20410 case SImode:
20411 insn = gen_addsi3_carry;
20412 break;
20413 case DImode:
20414 insn = gen_adddi3_carry;
20415 break;
20416 default:
20417 gcc_unreachable ();
20418 }
20419 }
20420 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20421
20422 return true;
20423 }
20424
20425
20426 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
20427    but works for floating point parameters and non-offsettable memories.
20428    For pushes, it returns just stack offsets; the values will be saved
20429    in the right order.  At most four parts are generated.  */
20430
20431 static int
20432 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20433 {
20434 int size;
20435
20436 if (!TARGET_64BIT)
20437 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20438 else
20439 size = (GET_MODE_SIZE (mode) + 4) / 8;
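  /* E.g. without TARGET_64BIT a DImode or DFmode value splits into two
     SImode parts, XFmode into three and TFmode into four; with
     TARGET_64BIT, XFmode and TFmode each split into two parts.  */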
20440
20441 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20442 gcc_assert (size >= 2 && size <= 4);
20443
20444 /* Optimize constant pool reference to immediates. This is used by fp
20445      moves, which force all constants to memory to allow combining.  */
20446 if (MEM_P (operand) && MEM_READONLY_P (operand))
20447 {
20448 rtx tmp = maybe_get_pool_constant (operand);
20449 if (tmp)
20450 operand = tmp;
20451 }
20452
20453 if (MEM_P (operand) && !offsettable_memref_p (operand))
20454 {
20455       /* The only non-offsettable memories we handle are pushes.  */
20456 int ok = push_operand (operand, VOIDmode);
20457
20458 gcc_assert (ok);
20459
20460 operand = copy_rtx (operand);
20461 PUT_MODE (operand, word_mode);
20462 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20463 return size;
20464 }
20465
20466 if (GET_CODE (operand) == CONST_VECTOR)
20467 {
20468 enum machine_mode imode = int_mode_for_mode (mode);
20469 /* Caution: if we looked through a constant pool memory above,
20470 the operand may actually have a different mode now. That's
20471 ok, since we want to pun this all the way back to an integer. */
20472 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20473 gcc_assert (operand != NULL);
20474 mode = imode;
20475 }
20476
20477 if (!TARGET_64BIT)
20478 {
20479 if (mode == DImode)
20480 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20481 else
20482 {
20483 int i;
20484
20485 if (REG_P (operand))
20486 {
20487 gcc_assert (reload_completed);
20488 for (i = 0; i < size; i++)
20489 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20490 }
20491 else if (offsettable_memref_p (operand))
20492 {
20493 operand = adjust_address (operand, SImode, 0);
20494 parts[0] = operand;
20495 for (i = 1; i < size; i++)
20496 parts[i] = adjust_address (operand, SImode, 4 * i);
20497 }
20498 else if (GET_CODE (operand) == CONST_DOUBLE)
20499 {
20500 REAL_VALUE_TYPE r;
20501 long l[4];
20502
20503 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20504 switch (mode)
20505 {
20506 case TFmode:
20507 real_to_target (l, &r, mode);
20508 parts[3] = gen_int_mode (l[3], SImode);
20509 parts[2] = gen_int_mode (l[2], SImode);
20510 break;
20511 case XFmode:
20512 REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
20513 parts[2] = gen_int_mode (l[2], SImode);
20514 break;
20515 case DFmode:
20516 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20517 break;
20518 default:
20519 gcc_unreachable ();
20520 }
20521 parts[1] = gen_int_mode (l[1], SImode);
20522 parts[0] = gen_int_mode (l[0], SImode);
20523 }
20524 else
20525 gcc_unreachable ();
20526 }
20527 }
20528 else
20529 {
20530 if (mode == TImode)
20531 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20532 if (mode == XFmode || mode == TFmode)
20533 {
20534 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
20535 if (REG_P (operand))
20536 {
20537 gcc_assert (reload_completed);
20538 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20539 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20540 }
20541 else if (offsettable_memref_p (operand))
20542 {
20543 operand = adjust_address (operand, DImode, 0);
20544 parts[0] = operand;
20545 parts[1] = adjust_address (operand, upper_mode, 8);
20546 }
20547 else if (GET_CODE (operand) == CONST_DOUBLE)
20548 {
20549 REAL_VALUE_TYPE r;
20550 long l[4];
20551
20552 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20553 real_to_target (l, &r, mode);
20554
20555 /* Do not use shift by 32 to avoid warning on 32bit systems. */
20556 if (HOST_BITS_PER_WIDE_INT >= 64)
20557 parts[0]
20558 = gen_int_mode
20559 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20560 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20561 DImode);
20562 else
20563 parts[0] = immed_double_const (l[0], l[1], DImode);
20564
20565 if (upper_mode == SImode)
20566 parts[1] = gen_int_mode (l[2], SImode);
20567 else if (HOST_BITS_PER_WIDE_INT >= 64)
20568 parts[1]
20569 = gen_int_mode
20570 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20571 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20572 DImode);
20573 else
20574 parts[1] = immed_double_const (l[2], l[3], DImode);
20575 }
20576 else
20577 gcc_unreachable ();
20578 }
20579 }
20580
20581 return size;
20582 }
20583
20584 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
20585    All required insns are emitted by this function itself.  Operands 2
20586    and following receive the destination parts in the correct order;
20587    operands 6 and following hold the corresponding source parts.  */
20588
20589 void
20590 ix86_split_long_move (rtx operands[])
20591 {
20592 rtx part[2][4];
20593 int nparts, i, j;
20594 int push = 0;
20595 int collisions = 0;
20596 enum machine_mode mode = GET_MODE (operands[0]);
20597 bool collisionparts[4];
20598
20599   /* The DFmode expanders may ask us to move a double.
20600      For a 64bit target this is a single move.  By hiding that fact
20601      here we simplify the i386.md splitters.  */
20602 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20603 {
20604 /* Optimize constant pool reference to immediates. This is used by
20605 	 fp moves, which force all constants to memory to allow combining.  */
20606
20607 if (MEM_P (operands[1])
20608 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20609 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20610 operands[1] = get_pool_constant (XEXP (operands[1], 0));
20611 if (push_operand (operands[0], VOIDmode))
20612 {
20613 operands[0] = copy_rtx (operands[0]);
20614 PUT_MODE (operands[0], word_mode);
20615 }
20616 else
20617 operands[0] = gen_lowpart (DImode, operands[0]);
20618 operands[1] = gen_lowpart (DImode, operands[1]);
20619 emit_move_insn (operands[0], operands[1]);
20620 return;
20621 }
20622
20623 /* The only non-offsettable memory we handle is push. */
20624 if (push_operand (operands[0], VOIDmode))
20625 push = 1;
20626 else
20627 gcc_assert (!MEM_P (operands[0])
20628 || offsettable_memref_p (operands[0]));
20629
20630 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20631 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20632
20633 /* When emitting push, take care for source operands on the stack. */
20634 if (push && MEM_P (operands[1])
20635 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20636 {
20637 rtx src_base = XEXP (part[1][nparts - 1], 0);
20638
20639 /* Compensate for the stack decrement by 4. */
20640 if (!TARGET_64BIT && nparts == 3
20641 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
20642 src_base = plus_constant (Pmode, src_base, 4);
20643
20644 /* src_base refers to the stack pointer and is
20645 automatically decreased by emitted push. */
20646 for (i = 0; i < nparts; i++)
20647 part[1][i] = change_address (part[1][i],
20648 GET_MODE (part[1][i]), src_base);
20649 }
20650
20651 /* We need to do copy in the right order in case an address register
20652 of the source overlaps the destination. */
20653 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
20654 {
20655 rtx tmp;
20656
20657 for (i = 0; i < nparts; i++)
20658 {
20659 collisionparts[i]
20660 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
20661 if (collisionparts[i])
20662 collisions++;
20663 }
20664
20665 /* Collision in the middle part can be handled by reordering. */
20666 if (collisions == 1 && nparts == 3 && collisionparts [1])
20667 {
20668 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20669 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20670 }
20671 else if (collisions == 1
20672 && nparts == 4
20673 && (collisionparts [1] || collisionparts [2]))
20674 {
20675 if (collisionparts [1])
20676 {
20677 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20678 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20679 }
20680 else
20681 {
20682 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
20683 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
20684 }
20685 }
20686
20687 /* If there are more collisions, we can't handle it by reordering.
20688 Do an lea to the last part and use only one colliding move. */
20689 else if (collisions > 1)
20690 {
20691 rtx base;
20692
20693 collisions = 1;
20694
20695 base = part[0][nparts - 1];
20696
20697 /* Handle the case when the last part isn't valid for lea.
20698 Happens in 64-bit mode storing the 12-byte XFmode. */
20699 if (GET_MODE (base) != Pmode)
20700 base = gen_rtx_REG (Pmode, REGNO (base));
20701
20702 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
20703 part[1][0] = replace_equiv_address (part[1][0], base);
20704 for (i = 1; i < nparts; i++)
20705 {
20706 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
20707 part[1][i] = replace_equiv_address (part[1][i], tmp);
20708 }
20709 }
20710 }
20711
20712 if (push)
20713 {
20714 if (!TARGET_64BIT)
20715 {
20716 if (nparts == 3)
20717 {
20718 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
20719 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
20720 stack_pointer_rtx, GEN_INT (-4)));
20721 emit_move_insn (part[0][2], part[1][2]);
20722 }
20723 else if (nparts == 4)
20724 {
20725 emit_move_insn (part[0][3], part[1][3]);
20726 emit_move_insn (part[0][2], part[1][2]);
20727 }
20728 }
20729 else
20730 {
20731 	  /* In 64bit mode we don't have a 32bit push available.  In case this is
20732 	     a register, it is OK - we will just use the larger counterpart.  We
20733 	     also retype the memory - this comes from an attempt to avoid a REX
20734 	     prefix when moving the second half of a TFmode value.  */
20735 if (GET_MODE (part[1][1]) == SImode)
20736 {
20737 switch (GET_CODE (part[1][1]))
20738 {
20739 case MEM:
20740 part[1][1] = adjust_address (part[1][1], DImode, 0);
20741 break;
20742
20743 case REG:
20744 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
20745 break;
20746
20747 default:
20748 gcc_unreachable ();
20749 }
20750
20751 if (GET_MODE (part[1][0]) == SImode)
20752 part[1][0] = part[1][1];
20753 }
20754 }
20755 emit_move_insn (part[0][1], part[1][1]);
20756 emit_move_insn (part[0][0], part[1][0]);
20757 return;
20758 }
20759
20760 /* Choose correct order to not overwrite the source before it is copied. */
20761 if ((REG_P (part[0][0])
20762 && REG_P (part[1][1])
20763 && (REGNO (part[0][0]) == REGNO (part[1][1])
20764 || (nparts == 3
20765 && REGNO (part[0][0]) == REGNO (part[1][2]))
20766 || (nparts == 4
20767 && REGNO (part[0][0]) == REGNO (part[1][3]))))
20768 || (collisions > 0
20769 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
20770 {
20771 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
20772 {
20773 operands[2 + i] = part[0][j];
20774 operands[6 + i] = part[1][j];
20775 }
20776 }
20777 else
20778 {
20779 for (i = 0; i < nparts; i++)
20780 {
20781 operands[2 + i] = part[0][i];
20782 operands[6 + i] = part[1][i];
20783 }
20784 }
20785
20786 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
20787 if (optimize_insn_for_size_p ())
20788 {
20789 for (j = 0; j < nparts - 1; j++)
20790 if (CONST_INT_P (operands[6 + j])
20791 && operands[6 + j] != const0_rtx
20792 && REG_P (operands[2 + j]))
20793 for (i = j; i < nparts - 1; i++)
20794 if (CONST_INT_P (operands[7 + i])
20795 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
20796 operands[7 + i] = operands[2 + j];
20797 }
20798
20799 for (i = 0; i < nparts; i++)
20800 emit_move_insn (operands[2 + i], operands[6 + i]);
20801
20802 return;
20803 }
20804
20805 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
20806 left shift by a constant, either using a single shift or
20807 a sequence of add instructions. */
20808
20809 static void
20810 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
20811 {
20812 rtx (*insn)(rtx, rtx, rtx);
20813
20814 if (count == 1
20815 || (count * ix86_cost->add <= ix86_cost->shift_const
20816 && !optimize_insn_for_size_p ()))
20817 {
20818 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
20819 while (count-- > 0)
20820 emit_insn (insn (operand, operand, operand));
20821 }
20822 else
20823 {
20824 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20825 emit_insn (insn (operand, operand, GEN_INT (count)));
20826 }
20827 }
20828
20829 void
20830 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
20831 {
20832 rtx (*gen_ashl3)(rtx, rtx, rtx);
20833 rtx (*gen_shld)(rtx, rtx, rtx);
20834 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20835
20836 rtx low[2], high[2];
20837 int count;
20838
20839 if (CONST_INT_P (operands[2]))
20840 {
20841 split_double_mode (mode, operands, 2, low, high);
20842 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20843
20844 if (count >= half_width)
20845 {
20846 emit_move_insn (high[0], low[1]);
20847 emit_move_insn (low[0], const0_rtx);
20848
20849 if (count > half_width)
20850 ix86_expand_ashl_const (high[0], count - half_width, mode);
20851 }
20852 else
20853 {
20854 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20855
20856 if (!rtx_equal_p (operands[0], operands[1]))
20857 emit_move_insn (operands[0], operands[1]);
20858
20859 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
20860 ix86_expand_ashl_const (low[0], count, mode);
20861 }
20862 return;
20863 }
20864
20865 split_double_mode (mode, operands, 1, low, high);
20866
20867 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20868
20869 if (operands[1] == const1_rtx)
20870 {
20871       /* Assuming we've chosen QImode-capable registers, then 1 << N
20872 can be done with two 32/64-bit shifts, no branches, no cmoves. */
20873 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
20874 {
20875 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
20876
20877 ix86_expand_clear (low[0]);
20878 ix86_expand_clear (high[0]);
20879 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
20880
20881 d = gen_lowpart (QImode, low[0]);
20882 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20883 s = gen_rtx_EQ (QImode, flags, const0_rtx);
20884 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20885
20886 d = gen_lowpart (QImode, high[0]);
20887 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20888 s = gen_rtx_NE (QImode, flags, const0_rtx);
20889 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20890 }
20891
20892 /* Otherwise, we can get the same results by manually performing
20893 a bit extract operation on bit 5/6, and then performing the two
20894 shifts. The two methods of getting 0/1 into low/high are exactly
20895 the same size. Avoiding the shift in the bit extract case helps
20896 pentium4 a bit; no one else seems to care much either way. */
20897 else
20898 {
20899 enum machine_mode half_mode;
20900 rtx (*gen_lshr3)(rtx, rtx, rtx);
20901 rtx (*gen_and3)(rtx, rtx, rtx);
20902 rtx (*gen_xor3)(rtx, rtx, rtx);
20903 HOST_WIDE_INT bits;
20904 rtx x;
20905
20906 if (mode == DImode)
20907 {
20908 half_mode = SImode;
20909 gen_lshr3 = gen_lshrsi3;
20910 gen_and3 = gen_andsi3;
20911 gen_xor3 = gen_xorsi3;
20912 bits = 5;
20913 }
20914 else
20915 {
20916 half_mode = DImode;
20917 gen_lshr3 = gen_lshrdi3;
20918 gen_and3 = gen_anddi3;
20919 gen_xor3 = gen_xordi3;
20920 bits = 6;
20921 }
20922
20923 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
20924 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
20925 else
20926 x = gen_lowpart (half_mode, operands[2]);
20927 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
20928
20929 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
20930 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
20931 emit_move_insn (low[0], high[0]);
20932 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
20933 }
20934
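      /* Whichever half received the 1 is now shifted by the variable count;
	 the hardware shift only uses the count modulo the half width, so
	 together with the selection above this yields the full 1 << N.  */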
20935 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20936 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
20937 return;
20938 }
20939
20940 if (operands[1] == constm1_rtx)
20941 {
20942 /* For -1 << N, we can avoid the shld instruction, because we
20943 know that we're shifting 0...31/63 ones into a -1. */
20944 emit_move_insn (low[0], constm1_rtx);
20945 if (optimize_insn_for_size_p ())
20946 emit_move_insn (high[0], low[0]);
20947 else
20948 emit_move_insn (high[0], constm1_rtx);
20949 }
20950 else
20951 {
20952 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20953
20954 if (!rtx_equal_p (operands[0], operands[1]))
20955 emit_move_insn (operands[0], operands[1]);
20956
20957 split_double_mode (mode, operands, 1, low, high);
20958 emit_insn (gen_shld (high[0], low[0], operands[2]));
20959 }
20960
20961 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20962
20963 if (TARGET_CMOVE && scratch)
20964 {
20965 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20966 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20967
20968 ix86_expand_clear (scratch);
20969 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
20970 }
20971 else
20972 {
20973 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20974 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20975
20976 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
20977 }
20978 }
20979
20980 void
20981 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
20982 {
20983 rtx (*gen_ashr3)(rtx, rtx, rtx)
20984 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
20985 rtx (*gen_shrd)(rtx, rtx, rtx);
20986 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20987
20988 rtx low[2], high[2];
20989 int count;
20990
20991 if (CONST_INT_P (operands[2]))
20992 {
20993 split_double_mode (mode, operands, 2, low, high);
20994 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20995
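      /* An arithmetic right shift by the full width minus one just
	 replicates the sign bit through both halves, so compute the sign
	 mask from the high part once and copy it to the low part.  */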
20996 if (count == GET_MODE_BITSIZE (mode) - 1)
20997 {
20998 emit_move_insn (high[0], high[1]);
20999 emit_insn (gen_ashr3 (high[0], high[0],
21000 GEN_INT (half_width - 1)));
21001 emit_move_insn (low[0], high[0]);
21002
21003 }
21004 else if (count >= half_width)
21005 {
21006 emit_move_insn (low[0], high[1]);
21007 emit_move_insn (high[0], low[0]);
21008 emit_insn (gen_ashr3 (high[0], high[0],
21009 GEN_INT (half_width - 1)));
21010
21011 if (count > half_width)
21012 emit_insn (gen_ashr3 (low[0], low[0],
21013 GEN_INT (count - half_width)));
21014 }
21015 else
21016 {
21017 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21018
21019 if (!rtx_equal_p (operands[0], operands[1]))
21020 emit_move_insn (operands[0], operands[1]);
21021
21022 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21023 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21024 }
21025 }
21026 else
21027 {
21028 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21029
21030 if (!rtx_equal_p (operands[0], operands[1]))
21031 emit_move_insn (operands[0], operands[1]);
21032
21033 split_double_mode (mode, operands, 1, low, high);
21034
21035 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21036 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21037
21038 if (TARGET_CMOVE && scratch)
21039 {
21040 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21041 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21042
21043 emit_move_insn (scratch, high[0]);
21044 emit_insn (gen_ashr3 (scratch, scratch,
21045 GEN_INT (half_width - 1)));
21046 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21047 scratch));
21048 }
21049 else
21050 {
21051 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21052 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21053
21054 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
21055 }
21056 }
21057 }
21058
21059 void
21060 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21061 {
21062 rtx (*gen_lshr3)(rtx, rtx, rtx)
21063 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21064 rtx (*gen_shrd)(rtx, rtx, rtx);
21065 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21066
21067 rtx low[2], high[2];
21068 int count;
21069
21070 if (CONST_INT_P (operands[2]))
21071 {
21072 split_double_mode (mode, operands, 2, low, high);
21073 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21074
21075 if (count >= half_width)
21076 {
21077 emit_move_insn (low[0], high[1]);
21078 ix86_expand_clear (high[0]);
21079
21080 if (count > half_width)
21081 emit_insn (gen_lshr3 (low[0], low[0],
21082 GEN_INT (count - half_width)));
21083 }
21084 else
21085 {
21086 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21087
21088 if (!rtx_equal_p (operands[0], operands[1]))
21089 emit_move_insn (operands[0], operands[1]);
21090
21091 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21092 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21093 }
21094 }
21095 else
21096 {
21097 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21098
21099 if (!rtx_equal_p (operands[0], operands[1]))
21100 emit_move_insn (operands[0], operands[1]);
21101
21102 split_double_mode (mode, operands, 1, low, high);
21103
21104 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21105 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21106
21107 if (TARGET_CMOVE && scratch)
21108 {
21109 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21110 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21111
21112 ix86_expand_clear (scratch);
21113 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21114 scratch));
21115 }
21116 else
21117 {
21118 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21119 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21120
21121 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21122 }
21123 }
21124 }
21125
21126 /* Predict the just-emitted jump instruction to be taken with probability PROB.  */
21127 static void
21128 predict_jump (int prob)
21129 {
21130 rtx insn = get_last_insn ();
21131 gcc_assert (JUMP_P (insn));
21132 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21133 }
21134
21135 /* Helper function for the string operations below.  Test VARIABLE whether
21136    it is aligned to VALUE bytes.  If it is, jump to the label.  */
21137 static rtx
21138 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21139 {
21140 rtx label = gen_label_rtx ();
21141 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21142 if (GET_MODE (variable) == DImode)
21143 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21144 else
21145 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21146 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21147 1, label);
21148 if (epilogue)
21149 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21150 else
21151 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21152 return label;
21153 }
21154
21155 /* Decrease COUNTREG by VALUE.  */
21156 static void
21157 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21158 {
21159 rtx (*gen_add)(rtx, rtx, rtx)
21160 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21161
21162 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21163 }
21164
21165 /* Zero extend EXP, which may be in SImode, to a Pmode register.  */
21166 rtx
21167 ix86_zero_extend_to_Pmode (rtx exp)
21168 {
21169 if (GET_MODE (exp) != Pmode)
21170 exp = convert_to_mode (Pmode, exp, 1);
21171 return force_reg (Pmode, exp);
21172 }
21173
21174 /* Divide COUNTREG by SCALE. */
21175 static rtx
21176 scale_counter (rtx countreg, int scale)
21177 {
21178 rtx sc;
21179
21180 if (scale == 1)
21181 return countreg;
21182 if (CONST_INT_P (countreg))
21183 return GEN_INT (INTVAL (countreg) / scale);
21184 gcc_assert (REG_P (countreg));
21185
21186 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21187 GEN_INT (exact_log2 (scale)),
21188 NULL, 1, OPTAB_DIRECT);
21189 return sc;
21190 }
21191
21192 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21193 DImode for constant loop counts. */
21194
21195 static enum machine_mode
21196 counter_mode (rtx count_exp)
21197 {
21198 if (GET_MODE (count_exp) != VOIDmode)
21199 return GET_MODE (count_exp);
21200 if (!CONST_INT_P (count_exp))
21201 return Pmode;
21202 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21203 return DImode;
21204 return SImode;
21205 }
21206
21207 /* When SRCPTR is non-NULL, output a simple loop to move memory pointed
21208    to by SRCPTR to DESTPTR via chunks of MODE, unrolled UNROLL times;
21209    the overall size is COUNT, specified in bytes.  When SRCPTR is NULL,
21210    output the equivalent loop to set memory to VALUE (supposed to be in MODE).
21211 
21212    The size is rounded down to a whole number of chunks moved at once.
21213    SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info.  */
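/* Roughly, the emitted code has the shape (a sketch, not literal output):

     size = count & -piece_size;  iter = 0;
   top:
     copy or set one unrolled group of MODE chunks at DESTPTR + iter;
     iter += piece_size;
     if (iter < size) goto top;
     DESTPTR += iter;  SRCPTR += iter (when copying);  */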
21214
21215
21216 static void
21217 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21218 rtx destptr, rtx srcptr, rtx value,
21219 rtx count, enum machine_mode mode, int unroll,
21220 int expected_size)
21221 {
21222 rtx out_label, top_label, iter, tmp;
21223 enum machine_mode iter_mode = counter_mode (count);
21224 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21225 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21226 rtx size;
21227 rtx x_addr;
21228 rtx y_addr;
21229 int i;
21230
21231 top_label = gen_label_rtx ();
21232 out_label = gen_label_rtx ();
21233 iter = gen_reg_rtx (iter_mode);
21234
21235 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21236 NULL, 1, OPTAB_DIRECT);
21237 /* Those two should combine. */
21238 if (piece_size == const1_rtx)
21239 {
21240 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21241 true, out_label);
21242 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21243 }
21244 emit_move_insn (iter, const0_rtx);
21245
21246 emit_label (top_label);
21247
21248 tmp = convert_modes (Pmode, iter_mode, iter, true);
21249 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21250 destmem = change_address (destmem, mode, x_addr);
21251
21252 if (srcmem)
21253 {
21254 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21255 srcmem = change_address (srcmem, mode, y_addr);
21256
21257 /* When unrolling for chips that reorder memory reads and writes,
21258 	 we can save registers by using a single temporary.
21259 	 Also, using 4 temporaries is overkill in 32bit mode.  */
21260 if (!TARGET_64BIT && 0)
21261 {
21262 for (i = 0; i < unroll; i++)
21263 {
21264 if (i)
21265 {
21266 destmem =
21267 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21268 srcmem =
21269 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21270 }
21271 emit_move_insn (destmem, srcmem);
21272 }
21273 }
21274 else
21275 {
21276 rtx tmpreg[4];
21277 gcc_assert (unroll <= 4);
21278 for (i = 0; i < unroll; i++)
21279 {
21280 tmpreg[i] = gen_reg_rtx (mode);
21281 if (i)
21282 {
21283 srcmem =
21284 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21285 }
21286 emit_move_insn (tmpreg[i], srcmem);
21287 }
21288 for (i = 0; i < unroll; i++)
21289 {
21290 if (i)
21291 {
21292 destmem =
21293 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21294 }
21295 emit_move_insn (destmem, tmpreg[i]);
21296 }
21297 }
21298 }
21299 else
21300 for (i = 0; i < unroll; i++)
21301 {
21302 if (i)
21303 destmem =
21304 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21305 emit_move_insn (destmem, value);
21306 }
21307
21308 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21309 true, OPTAB_LIB_WIDEN);
21310 if (tmp != iter)
21311 emit_move_insn (iter, tmp);
21312
21313 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21314 true, top_label);
21315 if (expected_size != -1)
21316 {
21317 expected_size /= GET_MODE_SIZE (mode) * unroll;
21318 if (expected_size == 0)
21319 predict_jump (0);
21320 else if (expected_size > REG_BR_PROB_BASE)
21321 predict_jump (REG_BR_PROB_BASE - 1);
21322 else
21323 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21324 }
21325 else
21326 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21327 iter = ix86_zero_extend_to_Pmode (iter);
21328 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21329 true, OPTAB_LIB_WIDEN);
21330 if (tmp != destptr)
21331 emit_move_insn (destptr, tmp);
21332 if (srcptr)
21333 {
21334 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21335 true, OPTAB_LIB_WIDEN);
21336 if (tmp != srcptr)
21337 emit_move_insn (srcptr, tmp);
21338 }
21339 emit_label (out_label);
21340 }
21341
21342 /* Output a "rep; mov" instruction.
21343    Arguments have the same meaning as for the previous function.  */
21344 static void
21345 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21346 rtx destptr, rtx srcptr,
21347 rtx count,
21348 enum machine_mode mode)
21349 {
21350 rtx destexp;
21351 rtx srcexp;
21352 rtx countreg;
21353 HOST_WIDE_INT rounded_count;
21354
21355   /* If the size is known and a multiple of 4, it is shorter to use "rep movs" with SImode chunks.  */
21356 if (mode == QImode && CONST_INT_P (count)
21357 && !(INTVAL (count) & 3))
21358 mode = SImode;
21359
21360 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21361 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21362 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21363 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21364 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21365 if (mode != QImode)
21366 {
21367 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21368 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21369 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21370 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21371 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21372 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21373 }
21374 else
21375 {
21376 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21377 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21378 }
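  /* DESTEXP and SRCEXP describe the values the pointer registers are
     expected to hold after the "rep mov"; the rep_mov pattern uses them
     to express the pointer updates.  */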
21379 if (CONST_INT_P (count))
21380 {
21381 rounded_count = (INTVAL (count)
21382 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21383 destmem = shallow_copy_rtx (destmem);
21384 srcmem = shallow_copy_rtx (srcmem);
21385 set_mem_size (destmem, rounded_count);
21386 set_mem_size (srcmem, rounded_count);
21387 }
21388 else
21389 {
21390 if (MEM_SIZE_KNOWN_P (destmem))
21391 clear_mem_size (destmem);
21392 if (MEM_SIZE_KNOWN_P (srcmem))
21393 clear_mem_size (srcmem);
21394 }
21395 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21396 destexp, srcexp));
21397 }
21398
21399 /* Output a "rep; stos" instruction.
21400    Arguments have the same meaning as for the previous function.  */
21401 static void
21402 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21403 rtx count, enum machine_mode mode,
21404 rtx orig_value)
21405 {
21406 rtx destexp;
21407 rtx countreg;
21408 HOST_WIDE_INT rounded_count;
21409
21410 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21411 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21412 value = force_reg (mode, gen_lowpart (mode, value));
21413 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21414 if (mode != QImode)
21415 {
21416 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21417 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21418 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21419 }
21420 else
21421 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21422 if (orig_value == const0_rtx && CONST_INT_P (count))
21423 {
21424 rounded_count = (INTVAL (count)
21425 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21426 destmem = shallow_copy_rtx (destmem);
21427 set_mem_size (destmem, rounded_count);
21428 }
21429 else if (MEM_SIZE_KNOWN_P (destmem))
21430 clear_mem_size (destmem);
21431 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21432 }
21433
21434 static void
21435 emit_strmov (rtx destmem, rtx srcmem,
21436 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21437 {
21438 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21439 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21440 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21441 }
21442
21443 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
21444 static void
21445 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21446 rtx destptr, rtx srcptr, rtx count, int max_size)
21447 {
21448 rtx src, dest;
21449 if (CONST_INT_P (count))
21450 {
21451 HOST_WIDE_INT countval = INTVAL (count);
21452 int offset = 0;
21453
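      /* Only the low bits of COUNT below MAX_SIZE remain to be copied;
	 handle each set bit with a single move of the matching width,
	 from the widest piece down to a single byte.  */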
21454 if ((countval & 0x10) && max_size > 16)
21455 {
21456 if (TARGET_64BIT)
21457 {
21458 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21459 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21460 }
21461 else
21462 gcc_unreachable ();
21463 offset += 16;
21464 }
21465 if ((countval & 0x08) && max_size > 8)
21466 {
21467 if (TARGET_64BIT)
21468 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21469 else
21470 {
21471 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21472 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21473 }
21474 offset += 8;
21475 }
21476 if ((countval & 0x04) && max_size > 4)
21477 {
21478 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21479 offset += 4;
21480 }
21481 if ((countval & 0x02) && max_size > 2)
21482 {
21483 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21484 offset += 2;
21485 }
21486 if ((countval & 0x01) && max_size > 1)
21487 {
21488 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21489 offset += 1;
21490 }
21491 return;
21492 }
21493 if (max_size > 8)
21494 {
21495 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21496 count, 1, OPTAB_DIRECT);
21497 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21498 count, QImode, 1, 4);
21499 return;
21500 }
21501
21502   /* When single stringop instructions are available, we can cheaply advance
21503      the dest and src pointers.  Otherwise we save code size by maintaining
21504      an offset (zero is readily available from the preceding rep operation)
21505      and using x86 addressing modes.  */
21506 if (TARGET_SINGLE_STRINGOP)
21507 {
21508 if (max_size > 4)
21509 {
21510 rtx label = ix86_expand_aligntest (count, 4, true);
21511 src = change_address (srcmem, SImode, srcptr);
21512 dest = change_address (destmem, SImode, destptr);
21513 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21514 emit_label (label);
21515 LABEL_NUSES (label) = 1;
21516 }
21517 if (max_size > 2)
21518 {
21519 rtx label = ix86_expand_aligntest (count, 2, true);
21520 src = change_address (srcmem, HImode, srcptr);
21521 dest = change_address (destmem, HImode, destptr);
21522 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21523 emit_label (label);
21524 LABEL_NUSES (label) = 1;
21525 }
21526 if (max_size > 1)
21527 {
21528 rtx label = ix86_expand_aligntest (count, 1, true);
21529 src = change_address (srcmem, QImode, srcptr);
21530 dest = change_address (destmem, QImode, destptr);
21531 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21532 emit_label (label);
21533 LABEL_NUSES (label) = 1;
21534 }
21535 }
21536 else
21537 {
21538 rtx offset = force_reg (Pmode, const0_rtx);
21539 rtx tmp;
21540
21541 if (max_size > 4)
21542 {
21543 rtx label = ix86_expand_aligntest (count, 4, true);
21544 src = change_address (srcmem, SImode, srcptr);
21545 dest = change_address (destmem, SImode, destptr);
21546 emit_move_insn (dest, src);
21547 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21548 true, OPTAB_LIB_WIDEN);
21549 if (tmp != offset)
21550 emit_move_insn (offset, tmp);
21551 emit_label (label);
21552 LABEL_NUSES (label) = 1;
21553 }
21554 if (max_size > 2)
21555 {
21556 rtx label = ix86_expand_aligntest (count, 2, true);
21557 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21558 src = change_address (srcmem, HImode, tmp);
21559 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21560 dest = change_address (destmem, HImode, tmp);
21561 emit_move_insn (dest, src);
21562 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21563 true, OPTAB_LIB_WIDEN);
21564 if (tmp != offset)
21565 emit_move_insn (offset, tmp);
21566 emit_label (label);
21567 LABEL_NUSES (label) = 1;
21568 }
21569 if (max_size > 1)
21570 {
21571 rtx label = ix86_expand_aligntest (count, 1, true);
21572 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21573 src = change_address (srcmem, QImode, tmp);
21574 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21575 dest = change_address (destmem, QImode, tmp);
21576 emit_move_insn (dest, src);
21577 emit_label (label);
21578 LABEL_NUSES (label) = 1;
21579 }
21580 }
21581 }
21582
21583 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
21584 static void
21585 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21586 rtx count, int max_size)
21587 {
21588 count =
21589 expand_simple_binop (counter_mode (count), AND, count,
21590 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21591 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21592 gen_lowpart (QImode, value), count, QImode,
21593 1, max_size / 2);
21594 }
21595
21596 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
21597 static void
21598 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
21599 {
21600 rtx dest;
21601
21602 if (CONST_INT_P (count))
21603 {
21604 HOST_WIDE_INT countval = INTVAL (count);
21605 int offset = 0;
21606
21607 if ((countval & 0x10) && max_size > 16)
21608 {
21609 if (TARGET_64BIT)
21610 {
21611 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21612 emit_insn (gen_strset (destptr, dest, value));
21613 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
21614 emit_insn (gen_strset (destptr, dest, value));
21615 }
21616 else
21617 gcc_unreachable ();
21618 offset += 16;
21619 }
21620 if ((countval & 0x08) && max_size > 8)
21621 {
21622 if (TARGET_64BIT)
21623 {
21624 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21625 emit_insn (gen_strset (destptr, dest, value));
21626 }
21627 else
21628 {
21629 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21630 emit_insn (gen_strset (destptr, dest, value));
21631 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
21632 emit_insn (gen_strset (destptr, dest, value));
21633 }
21634 offset += 8;
21635 }
21636 if ((countval & 0x04) && max_size > 4)
21637 {
21638 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21639 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21640 offset += 4;
21641 }
21642 if ((countval & 0x02) && max_size > 2)
21643 {
21644 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
21645 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21646 offset += 2;
21647 }
21648 if ((countval & 0x01) && max_size > 1)
21649 {
21650 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
21651 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21652 offset += 1;
21653 }
21654 return;
21655 }
21656 if (max_size > 32)
21657 {
21658 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
21659 return;
21660 }
21661 if (max_size > 16)
21662 {
21663 rtx label = ix86_expand_aligntest (count, 16, true);
21664 if (TARGET_64BIT)
21665 {
21666 dest = change_address (destmem, DImode, destptr);
21667 emit_insn (gen_strset (destptr, dest, value));
21668 emit_insn (gen_strset (destptr, dest, value));
21669 }
21670 else
21671 {
21672 dest = change_address (destmem, SImode, destptr);
21673 emit_insn (gen_strset (destptr, dest, value));
21674 emit_insn (gen_strset (destptr, dest, value));
21675 emit_insn (gen_strset (destptr, dest, value));
21676 emit_insn (gen_strset (destptr, dest, value));
21677 }
21678 emit_label (label);
21679 LABEL_NUSES (label) = 1;
21680 }
21681 if (max_size > 8)
21682 {
21683 rtx label = ix86_expand_aligntest (count, 8, true);
21684 if (TARGET_64BIT)
21685 {
21686 dest = change_address (destmem, DImode, destptr);
21687 emit_insn (gen_strset (destptr, dest, value));
21688 }
21689 else
21690 {
21691 dest = change_address (destmem, SImode, destptr);
21692 emit_insn (gen_strset (destptr, dest, value));
21693 emit_insn (gen_strset (destptr, dest, value));
21694 }
21695 emit_label (label);
21696 LABEL_NUSES (label) = 1;
21697 }
21698 if (max_size > 4)
21699 {
21700 rtx label = ix86_expand_aligntest (count, 4, true);
21701 dest = change_address (destmem, SImode, destptr);
21702 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21703 emit_label (label);
21704 LABEL_NUSES (label) = 1;
21705 }
21706 if (max_size > 2)
21707 {
21708 rtx label = ix86_expand_aligntest (count, 2, true);
21709 dest = change_address (destmem, HImode, destptr);
21710 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21711 emit_label (label);
21712 LABEL_NUSES (label) = 1;
21713 }
21714 if (max_size > 1)
21715 {
21716 rtx label = ix86_expand_aligntest (count, 1, true);
21717 dest = change_address (destmem, QImode, destptr);
21718 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21719 emit_label (label);
21720 LABEL_NUSES (label) = 1;
21721 }
21722 }
21723
21724 /* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN,
21725    to DESIRED_ALIGNMENT.  */
21726 static void
21727 expand_movmem_prologue (rtx destmem, rtx srcmem,
21728 rtx destptr, rtx srcptr, rtx count,
21729 int align, int desired_alignment)
21730 {
21731 if (align <= 1 && desired_alignment > 1)
21732 {
21733 rtx label = ix86_expand_aligntest (destptr, 1, false);
21734 srcmem = change_address (srcmem, QImode, srcptr);
21735 destmem = change_address (destmem, QImode, destptr);
21736 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21737 ix86_adjust_counter (count, 1);
21738 emit_label (label);
21739 LABEL_NUSES (label) = 1;
21740 }
21741 if (align <= 2 && desired_alignment > 2)
21742 {
21743 rtx label = ix86_expand_aligntest (destptr, 2, false);
21744 srcmem = change_address (srcmem, HImode, srcptr);
21745 destmem = change_address (destmem, HImode, destptr);
21746 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21747 ix86_adjust_counter (count, 2);
21748 emit_label (label);
21749 LABEL_NUSES (label) = 1;
21750 }
21751 if (align <= 4 && desired_alignment > 4)
21752 {
21753 rtx label = ix86_expand_aligntest (destptr, 4, false);
21754 srcmem = change_address (srcmem, SImode, srcptr);
21755 destmem = change_address (destmem, SImode, destptr);
21756 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21757 ix86_adjust_counter (count, 4);
21758 emit_label (label);
21759 LABEL_NUSES (label) = 1;
21760 }
21761 gcc_assert (desired_alignment <= 8);
21762 }
21763
21764 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
21765    ALIGN_BYTES is how many bytes need to be copied.  */
21766 static rtx
21767 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
21768 int desired_align, int align_bytes)
21769 {
21770 rtx src = *srcp;
21771 rtx orig_dst = dst;
21772 rtx orig_src = src;
21773 int off = 0;
21774 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
21775 if (src_align_bytes >= 0)
21776 src_align_bytes = desired_align - src_align_bytes;
21777 if (align_bytes & 1)
21778 {
21779 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21780 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
21781 off = 1;
21782 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21783 }
21784 if (align_bytes & 2)
21785 {
21786 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21787 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
21788 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21789 set_mem_align (dst, 2 * BITS_PER_UNIT);
21790 if (src_align_bytes >= 0
21791 && (src_align_bytes & 1) == (align_bytes & 1)
21792 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
21793 set_mem_align (src, 2 * BITS_PER_UNIT);
21794 off = 2;
21795 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21796 }
21797 if (align_bytes & 4)
21798 {
21799 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21800 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
21801 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21802 set_mem_align (dst, 4 * BITS_PER_UNIT);
21803 if (src_align_bytes >= 0)
21804 {
21805 unsigned int src_align = 0;
21806 if ((src_align_bytes & 3) == (align_bytes & 3))
21807 src_align = 4;
21808 else if ((src_align_bytes & 1) == (align_bytes & 1))
21809 src_align = 2;
21810 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21811 set_mem_align (src, src_align * BITS_PER_UNIT);
21812 }
21813 off = 4;
21814 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21815 }
21816 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21817 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
21818 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21819 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21820 if (src_align_bytes >= 0)
21821 {
21822 unsigned int src_align = 0;
21823 if ((src_align_bytes & 7) == (align_bytes & 7))
21824 src_align = 8;
21825 else if ((src_align_bytes & 3) == (align_bytes & 3))
21826 src_align = 4;
21827 else if ((src_align_bytes & 1) == (align_bytes & 1))
21828 src_align = 2;
21829 if (src_align > (unsigned int) desired_align)
21830 src_align = desired_align;
21831 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21832 set_mem_align (src, src_align * BITS_PER_UNIT);
21833 }
21834 if (MEM_SIZE_KNOWN_P (orig_dst))
21835 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21836 if (MEM_SIZE_KNOWN_P (orig_src))
21837 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
21838 *srcp = src;
21839 return dst;
21840 }
21841
21842 /* Set enough bytes of DEST to align DEST, known to be aligned by ALIGN,
21843 to DESIRED_ALIGNMENT. */
21844 static void
21845 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
21846 int align, int desired_alignment)
21847 {
21848 if (align <= 1 && desired_alignment > 1)
21849 {
21850 rtx label = ix86_expand_aligntest (destptr, 1, false);
21851 destmem = change_address (destmem, QImode, destptr);
21852 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
21853 ix86_adjust_counter (count, 1);
21854 emit_label (label);
21855 LABEL_NUSES (label) = 1;
21856 }
21857 if (align <= 2 && desired_alignment > 2)
21858 {
21859 rtx label = ix86_expand_aligntest (destptr, 2, false);
21860 destmem = change_address (destmem, HImode, destptr);
21861 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
21862 ix86_adjust_counter (count, 2);
21863 emit_label (label);
21864 LABEL_NUSES (label) = 1;
21865 }
21866 if (align <= 4 && desired_alignment > 4)
21867 {
21868 rtx label = ix86_expand_aligntest (destptr, 4, false);
21869 destmem = change_address (destmem, SImode, destptr);
21870 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
21871 ix86_adjust_counter (count, 4);
21872 emit_label (label);
21873 LABEL_NUSES (label) = 1;
21874 }
21875 gcc_assert (desired_alignment <= 8);
21876 }
21877
21878 /* Set enough bytes of DST to align DST, known to be aligned by ALIGN, to
21879 DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
21880 static rtx
21881 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
21882 int desired_align, int align_bytes)
21883 {
21884 int off = 0;
21885 rtx orig_dst = dst;
21886 if (align_bytes & 1)
21887 {
21888 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21889 off = 1;
21890 emit_insn (gen_strset (destreg, dst,
21891 gen_lowpart (QImode, value)));
21892 }
21893 if (align_bytes & 2)
21894 {
21895 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21896 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21897 set_mem_align (dst, 2 * BITS_PER_UNIT);
21898 off = 2;
21899 emit_insn (gen_strset (destreg, dst,
21900 gen_lowpart (HImode, value)));
21901 }
21902 if (align_bytes & 4)
21903 {
21904 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21905 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21906 set_mem_align (dst, 4 * BITS_PER_UNIT);
21907 off = 4;
21908 emit_insn (gen_strset (destreg, dst,
21909 gen_lowpart (SImode, value)));
21910 }
21911 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21912 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21913 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21914 if (MEM_SIZE_KNOWN_P (orig_dst))
21915 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21916 return dst;
21917 }
21918
21919 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
21920 static enum stringop_alg
21921 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
21922 int *dynamic_check)
21923 {
21924 const struct stringop_algs * algs;
21925 bool optimize_for_speed;
21926 /* Algorithms using the rep prefix want at least edi and ecx;
21927 additionally, memset wants eax and memcpy wants esi. Don't
21928 consider such algorithms if the user has appropriated those
21929 registers for their own purposes. */
21930 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
21931 || (memset
21932 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
21933
21934 #define ALG_USABLE_P(alg) (rep_prefix_usable \
21935 || (alg != rep_prefix_1_byte \
21936 && alg != rep_prefix_4_byte \
21937 && alg != rep_prefix_8_byte))
21938 const struct processor_costs *cost;
21939
21940 /* Even if the string operation call is cold, we still might spend a lot
21941 of time processing large blocks. */
21942 if (optimize_function_for_size_p (cfun)
21943 || (optimize_insn_for_size_p ()
21944 && expected_size != -1 && expected_size < 256))
21945 optimize_for_speed = false;
21946 else
21947 optimize_for_speed = true;
21948
21949 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
21950
21951 *dynamic_check = -1;
21952 if (memset)
21953 algs = &cost->memset[TARGET_64BIT != 0];
21954 else
21955 algs = &cost->memcpy[TARGET_64BIT != 0];
21956 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
21957 return ix86_stringop_alg;
21958 /* rep; movq or rep; movl is the smallest variant. */
21959 else if (!optimize_for_speed)
21960 {
21961 if (!count || (count & 3))
21962 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
21963 else
21964 return rep_prefix_usable ? rep_prefix_4_byte : loop;
21965 }
21966 /* Very tiny blocks are best handled via the loop; REP is expensive to
21967 set up. */
21968 else if (expected_size != -1 && expected_size < 4)
21969 return loop_1_byte;
21970 else if (expected_size != -1)
21971 {
21972 unsigned int i;
21973 enum stringop_alg alg = libcall;
21974 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21975 {
21976 /* We get here if the algorithms that were not libcall-based
21977 were rep-prefix based and we are unable to use rep prefixes
21978 based on global register usage. Break out of the loop and
21979 use the heuristic below. */
21980 if (algs->size[i].max == 0)
21981 break;
21982 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
21983 {
21984 enum stringop_alg candidate = algs->size[i].alg;
21985
21986 if (candidate != libcall && ALG_USABLE_P (candidate))
21987 alg = candidate;
21988 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
21989 last non-libcall inline algorithm. */
21990 if (TARGET_INLINE_ALL_STRINGOPS)
21991 {
21992 /* When the current size is best to be copied by a libcall,
21993 but we are still forced to inline, run the heuristic below
21994 that will pick code for medium sized blocks. */
21995 if (alg != libcall)
21996 return alg;
21997 break;
21998 }
21999 else if (ALG_USABLE_P (candidate))
22000 return candidate;
22001 }
22002 }
22003 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22004 }
22005 /* When asked to inline the call anyway, try to pick a meaningful choice.
22006 We look for the maximal size of block that is faster to copy by hand and
22007 take blocks of at most that size, guessing that the average size will
22008 be roughly half of the block.
22009
22010 If this turns out to be bad, we might simply specify the preferred
22011 choice in ix86_costs. */
22012 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22013 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22014 {
22015 int max = -1;
22016 enum stringop_alg alg;
22017 int i;
22018 bool any_alg_usable_p = true;
22019
22020 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22021 {
22022 enum stringop_alg candidate = algs->size[i].alg;
22023 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22024
22025 if (candidate != libcall && candidate
22026 && ALG_USABLE_P (candidate))
22027 max = algs->size[i].max;
22028 }
22029 /* If there aren't any usable algorithms, then recursing on
22030 smaller sizes isn't going to find anything. Just return the
22031 simple byte-at-a-time copy loop. */
22032 if (!any_alg_usable_p)
22033 {
22034 /* Pick something reasonable. */
22035 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22036 *dynamic_check = 128;
22037 return loop_1_byte;
22038 }
22039 if (max == -1)
22040 max = 4096;
22041 alg = decide_alg (count, max / 2, memset, dynamic_check);
22042 gcc_assert (*dynamic_check == -1);
22043 gcc_assert (alg != libcall);
22044 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22045 *dynamic_check = max;
22046 return alg;
22047 }
22048 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22049 #undef ALG_USABLE_P
22050 }
22051
22052 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22053 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22054 static int
22055 decide_alignment (int align,
22056 enum stringop_alg alg,
22057 int expected_size)
22058 {
22059 int desired_align = 0;
22060 switch (alg)
22061 {
22062 case no_stringop:
22063 gcc_unreachable ();
22064 case loop:
22065 case unrolled_loop:
22066 desired_align = GET_MODE_SIZE (Pmode);
22067 break;
22068 case rep_prefix_8_byte:
22069 desired_align = 8;
22070 break;
22071 case rep_prefix_4_byte:
22072 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
22073 copying a whole cache line at once. */
22074 if (TARGET_PENTIUMPRO)
22075 desired_align = 8;
22076 else
22077 desired_align = 4;
22078 break;
22079 case rep_prefix_1_byte:
22080 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
22081 copying a whole cache line at once. */
22082 if (TARGET_PENTIUMPRO)
22083 desired_align = 8;
22084 else
22085 desired_align = 1;
22086 break;
22087 case loop_1_byte:
22088 desired_align = 1;
22089 break;
22090 case libcall:
22091 return 0;
22092 }
22093
22094 if (optimize_size)
22095 desired_align = 1;
22096 if (desired_align < align)
22097 desired_align = align;
22098 if (expected_size != -1 && expected_size < 4)
22099 desired_align = align;
22100 return desired_align;
22101 }
22102
22103 /* Return the smallest power of 2 greater than VAL. */
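   /* For example, smallest_pow2_greater_than (4) == 8, as is
      smallest_pow2_greater_than (7); the result is always strictly greater
      than VAL, never equal to it.  */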
22104 static int
22105 smallest_pow2_greater_than (int val)
22106 {
22107 int ret = 1;
22108 while (ret <= val)
22109 ret <<= 1;
22110 return ret;
22111 }
22112
22113 /* Expand string move (memcpy) operation. Use i386 string operations
22114 when profitable. ix86_expand_setmem contains similar code. The code
22115 depends upon architecture, block size and alignment, but always has
22116 the same overall structure:
22117
22118 1) Prologue guard: Conditional that jumps up to epilogues for small
22119 blocks that can be handled by epilogue alone. This is faster
22120 but also needed for correctness, since the prologue assumes the block
22121 is larger than the desired alignment.
22122
22123 Optional dynamic check for size and libcall for large
22124 blocks is emitted here too, with -minline-stringops-dynamically.
22125
22126 2) Prologue: copy first few bytes in order to get destination
22127 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22128 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22129 copied. We emit either a jump tree on power of two sized
22130 blocks, or a byte loop.
22131
22132 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22133 with specified algorithm.
22134
22135 4) Epilogue: code copying tail of the block that is too small to be
22136 handled by main body (or up to size guarded by prologue guard). */
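   /* As a rough, purely illustrative sketch (assuming a variable count and the
      plain loop algorithm), the emitted code has the shape:

	  if (count < epilogue_size_needed) goto epilogue;    1) guard
	  copy 1/2/4 bytes until dst reaches desired_align;    2) prologue
	  while (count >= size_needed) copy one chunk;         3) main body
	epilogue:
	  copy the remaining count & (epilogue_size_needed - 1) bytes.  */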
22137
22138 bool
22139 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22140 rtx expected_align_exp, rtx expected_size_exp)
22141 {
22142 rtx destreg;
22143 rtx srcreg;
22144 rtx label = NULL;
22145 rtx tmp;
22146 rtx jump_around_label = NULL;
22147 HOST_WIDE_INT align = 1;
22148 unsigned HOST_WIDE_INT count = 0;
22149 HOST_WIDE_INT expected_size = -1;
22150 int size_needed = 0, epilogue_size_needed;
22151 int desired_align = 0, align_bytes = 0;
22152 enum stringop_alg alg;
22153 int dynamic_check;
22154 bool need_zero_guard = false;
22155
22156 if (CONST_INT_P (align_exp))
22157 align = INTVAL (align_exp);
22158 /* i386 can do misaligned access at a reasonably increased cost. */
22159 if (CONST_INT_P (expected_align_exp)
22160 && INTVAL (expected_align_exp) > align)
22161 align = INTVAL (expected_align_exp);
22162 /* ALIGN is the minimum of destination and source alignment, but we care here
22163 just about destination alignment. */
22164 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22165 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22166
22167 if (CONST_INT_P (count_exp))
22168 count = expected_size = INTVAL (count_exp);
22169 if (CONST_INT_P (expected_size_exp) && count == 0)
22170 expected_size = INTVAL (expected_size_exp);
22171
22172 /* Make sure we don't need to care about overflow later on. */
22173 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22174 return false;
22175
22176 /* Step 0: Decide on preferred algorithm, desired alignment and
22177 size of chunks to be copied by main loop. */
22178
22179 alg = decide_alg (count, expected_size, false, &dynamic_check);
22180 desired_align = decide_alignment (align, alg, expected_size);
22181
22182 if (!TARGET_ALIGN_STRINGOPS)
22183 align = desired_align;
22184
22185 if (alg == libcall)
22186 return false;
22187 gcc_assert (alg != no_stringop);
22188 if (!count)
22189 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22190 destreg = copy_addr_to_reg (XEXP (dst, 0));
22191 srcreg = copy_addr_to_reg (XEXP (src, 0));
22192 switch (alg)
22193 {
22194 case libcall:
22195 case no_stringop:
22196 gcc_unreachable ();
22197 case loop:
22198 need_zero_guard = true;
22199 size_needed = GET_MODE_SIZE (word_mode);
22200 break;
22201 case unrolled_loop:
22202 need_zero_guard = true;
22203 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22204 break;
22205 case rep_prefix_8_byte:
22206 size_needed = 8;
22207 break;
22208 case rep_prefix_4_byte:
22209 size_needed = 4;
22210 break;
22211 case rep_prefix_1_byte:
22212 size_needed = 1;
22213 break;
22214 case loop_1_byte:
22215 need_zero_guard = true;
22216 size_needed = 1;
22217 break;
22218 }
22219
22220 epilogue_size_needed = size_needed;
22221
22222 /* Step 1: Prologue guard. */
22223
22224 /* Alignment code needs count to be in register. */
22225 if (CONST_INT_P (count_exp) && desired_align > align)
22226 {
22227 if (INTVAL (count_exp) > desired_align
22228 && INTVAL (count_exp) > size_needed)
22229 {
22230 align_bytes
22231 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22232 if (align_bytes <= 0)
22233 align_bytes = 0;
22234 else
22235 align_bytes = desired_align - align_bytes;
22236 }
22237 if (align_bytes == 0)
22238 count_exp = force_reg (counter_mode (count_exp), count_exp);
22239 }
22240 gcc_assert (desired_align >= 1 && align >= 1);
22241
22242 /* Ensure that alignment prologue won't copy past end of block. */
22243 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22244 {
22245 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22246 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22247 Make sure it is power of 2. */
22248 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22249
22250 if (count)
22251 {
22252 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22253 {
22254 /* If main algorithm works on QImode, no epilogue is needed.
22255 For small sizes just don't align anything. */
22256 if (size_needed == 1)
22257 desired_align = align;
22258 else
22259 goto epilogue;
22260 }
22261 }
22262 else
22263 {
22264 label = gen_label_rtx ();
22265 emit_cmp_and_jump_insns (count_exp,
22266 GEN_INT (epilogue_size_needed),
22267 LTU, 0, counter_mode (count_exp), 1, label);
22268 if (expected_size == -1 || expected_size < epilogue_size_needed)
22269 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22270 else
22271 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22272 }
22273 }
22274
22275 /* Emit code to decide on runtime whether library call or inline should be
22276 used. */
22277 if (dynamic_check != -1)
22278 {
22279 if (CONST_INT_P (count_exp))
22280 {
22281 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22282 {
22283 emit_block_move_via_libcall (dst, src, count_exp, false);
22284 count_exp = const0_rtx;
22285 goto epilogue;
22286 }
22287 }
22288 else
22289 {
22290 rtx hot_label = gen_label_rtx ();
22291 jump_around_label = gen_label_rtx ();
22292 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22293 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22294 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22295 emit_block_move_via_libcall (dst, src, count_exp, false);
22296 emit_jump (jump_around_label);
22297 emit_label (hot_label);
22298 }
22299 }
22300
22301 /* Step 2: Alignment prologue. */
22302
22303 if (desired_align > align)
22304 {
22305 if (align_bytes == 0)
22306 {
22307 /* Except for the first move in the epilogue, we no longer know
22308 the constant offset in the aliasing info. It doesn't seem worth
22309 the pain to maintain it for the first move, so throw away
22310 the info early. */
22311 src = change_address (src, BLKmode, srcreg);
22312 dst = change_address (dst, BLKmode, destreg);
22313 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22314 desired_align);
22315 }
22316 else
22317 {
22318 /* If we know how many bytes need to be stored before dst is
22319 sufficiently aligned, maintain aliasing info accurately. */
22320 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22321 desired_align, align_bytes);
22322 count_exp = plus_constant (counter_mode (count_exp),
22323 count_exp, -align_bytes);
22324 count -= align_bytes;
22325 }
22326 if (need_zero_guard
22327 && (count < (unsigned HOST_WIDE_INT) size_needed
22328 || (align_bytes == 0
22329 && count < ((unsigned HOST_WIDE_INT) size_needed
22330 + desired_align - align))))
22331 {
22332 /* It is possible that we copied enough so the main loop will not
22333 execute. */
22334 gcc_assert (size_needed > 1);
22335 if (label == NULL_RTX)
22336 label = gen_label_rtx ();
22337 emit_cmp_and_jump_insns (count_exp,
22338 GEN_INT (size_needed),
22339 LTU, 0, counter_mode (count_exp), 1, label);
22340 if (expected_size == -1
22341 || expected_size < (desired_align - align) / 2 + size_needed)
22342 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22343 else
22344 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22345 }
22346 }
22347 if (label && size_needed == 1)
22348 {
22349 emit_label (label);
22350 LABEL_NUSES (label) = 1;
22351 label = NULL;
22352 epilogue_size_needed = 1;
22353 }
22354 else if (label == NULL_RTX)
22355 epilogue_size_needed = size_needed;
22356
22357 /* Step 3: Main loop. */
22358
22359 switch (alg)
22360 {
22361 case libcall:
22362 case no_stringop:
22363 gcc_unreachable ();
22364 case loop_1_byte:
22365 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22366 count_exp, QImode, 1, expected_size);
22367 break;
22368 case loop:
22369 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22370 count_exp, word_mode, 1, expected_size);
22371 break;
22372 case unrolled_loop:
22373 /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
22374 registers for 4 temporaries anyway. */
22375 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22376 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22377 expected_size);
22378 break;
22379 case rep_prefix_8_byte:
22380 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22381 DImode);
22382 break;
22383 case rep_prefix_4_byte:
22384 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22385 SImode);
22386 break;
22387 case rep_prefix_1_byte:
22388 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22389 QImode);
22390 break;
22391 }
22392 /* Adjust properly the offset of src and dest memory for aliasing. */
22393 if (CONST_INT_P (count_exp))
22394 {
22395 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22396 (count / size_needed) * size_needed);
22397 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22398 (count / size_needed) * size_needed);
22399 }
22400 else
22401 {
22402 src = change_address (src, BLKmode, srcreg);
22403 dst = change_address (dst, BLKmode, destreg);
22404 }
22405
22406 /* Step 4: Epilogue to copy the remaining bytes. */
22407 epilogue:
22408 if (label)
22409 {
22410 /* When the main loop is done, COUNT_EXP might hold original count,
22411 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22412 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22413 bytes. Compensate if needed. */
22414
22415 if (size_needed < epilogue_size_needed)
22416 {
22417 tmp =
22418 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22419 GEN_INT (size_needed - 1), count_exp, 1,
22420 OPTAB_DIRECT);
22421 if (tmp != count_exp)
22422 emit_move_insn (count_exp, tmp);
22423 }
22424 emit_label (label);
22425 LABEL_NUSES (label) = 1;
22426 }
22427
22428 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22429 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22430 epilogue_size_needed);
22431 if (jump_around_label)
22432 emit_label (jump_around_label);
22433 return true;
22434 }
22435
22436 /* Helper function for memset. For the QImode value 0xXY produce
22437 0xXYXYXYXY of the width specified by MODE. This is essentially
22438 a * 0x01010101, but we can do slightly better than
22439 synth_mult by unwinding the sequence by hand on CPUs with
22440 slow multiply. */
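   /* For example, promoting 0xAB to SImode by hand computes
      0xAB | (0xAB << 8) == 0xABAB and then 0xABAB | (0xABAB << 16) == 0xABABABAB,
      which is exactly the shift-and-IOR sequence emitted below.  */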
22441 static rtx
22442 promote_duplicated_reg (enum machine_mode mode, rtx val)
22443 {
22444 enum machine_mode valmode = GET_MODE (val);
22445 rtx tmp;
22446 int nops = mode == DImode ? 3 : 2;
22447
22448 gcc_assert (mode == SImode || mode == DImode);
22449 if (val == const0_rtx)
22450 return copy_to_mode_reg (mode, const0_rtx);
22451 if (CONST_INT_P (val))
22452 {
22453 HOST_WIDE_INT v = INTVAL (val) & 255;
22454
22455 v |= v << 8;
22456 v |= v << 16;
22457 if (mode == DImode)
22458 v |= (v << 16) << 16;
22459 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22460 }
22461
22462 if (valmode == VOIDmode)
22463 valmode = QImode;
22464 if (valmode != QImode)
22465 val = gen_lowpart (QImode, val);
22466 if (mode == QImode)
22467 return val;
22468 if (!TARGET_PARTIAL_REG_STALL)
22469 nops--;
22470 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22471 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22472 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22473 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22474 {
22475 rtx reg = convert_modes (mode, QImode, val, true);
22476 tmp = promote_duplicated_reg (mode, const1_rtx);
22477 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22478 OPTAB_DIRECT);
22479 }
22480 else
22481 {
22482 rtx reg = convert_modes (mode, QImode, val, true);
22483
22484 if (!TARGET_PARTIAL_REG_STALL)
22485 if (mode == SImode)
22486 emit_insn (gen_movsi_insv_1 (reg, reg));
22487 else
22488 emit_insn (gen_movdi_insv_1 (reg, reg));
22489 else
22490 {
22491 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22492 NULL, 1, OPTAB_DIRECT);
22493 reg =
22494 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22495 }
22496 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22497 NULL, 1, OPTAB_DIRECT);
22498 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22499 if (mode == SImode)
22500 return reg;
22501 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22502 NULL, 1, OPTAB_DIRECT);
22503 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22504 return reg;
22505 }
22506 }
22507
22508 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
22509 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
22510 alignment from ALIGN to DESIRED_ALIGN. */
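   /* For example, on a 64-bit target with SIZE_NEEDED == 8 the byte value is
      duplicated into a DImode register (0xXY -> 0xXYXYXYXYXYXYXYXY), while
      SIZE_NEEDED == 4 with no extra alignment work only needs SImode.  */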
22511 static rtx
22512 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22513 {
22514 rtx promoted_val;
22515
22516 if (TARGET_64BIT
22517 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22518 promoted_val = promote_duplicated_reg (DImode, val);
22519 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22520 promoted_val = promote_duplicated_reg (SImode, val);
22521 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22522 promoted_val = promote_duplicated_reg (HImode, val);
22523 else
22524 promoted_val = val;
22525
22526 return promoted_val;
22527 }
22528
22529 /* Expand string set operation (memset). Use i386 string operations when
22530 profitable. See the ix86_expand_movmem comment for an explanation of the
22531 individual steps performed. */
22532 bool
22533 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22534 rtx expected_align_exp, rtx expected_size_exp)
22535 {
22536 rtx destreg;
22537 rtx label = NULL;
22538 rtx tmp;
22539 rtx jump_around_label = NULL;
22540 HOST_WIDE_INT align = 1;
22541 unsigned HOST_WIDE_INT count = 0;
22542 HOST_WIDE_INT expected_size = -1;
22543 int size_needed = 0, epilogue_size_needed;
22544 int desired_align = 0, align_bytes = 0;
22545 enum stringop_alg alg;
22546 rtx promoted_val = NULL;
22547 bool force_loopy_epilogue = false;
22548 int dynamic_check;
22549 bool need_zero_guard = false;
22550
22551 if (CONST_INT_P (align_exp))
22552 align = INTVAL (align_exp);
22553 /* i386 can do misaligned access at a reasonably increased cost. */
22554 if (CONST_INT_P (expected_align_exp)
22555 && INTVAL (expected_align_exp) > align)
22556 align = INTVAL (expected_align_exp);
22557 if (CONST_INT_P (count_exp))
22558 count = expected_size = INTVAL (count_exp);
22559 if (CONST_INT_P (expected_size_exp) && count == 0)
22560 expected_size = INTVAL (expected_size_exp);
22561
22562 /* Make sure we don't need to care about overflow later on. */
22563 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22564 return false;
22565
22566 /* Step 0: Decide on preferred algorithm, desired alignment and
22567 size of chunks to be copied by main loop. */
22568
22569 alg = decide_alg (count, expected_size, true, &dynamic_check);
22570 desired_align = decide_alignment (align, alg, expected_size);
22571
22572 if (!TARGET_ALIGN_STRINGOPS)
22573 align = desired_align;
22574
22575 if (alg == libcall)
22576 return false;
22577 gcc_assert (alg != no_stringop);
22578 if (!count)
22579 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22580 destreg = copy_addr_to_reg (XEXP (dst, 0));
22581 switch (alg)
22582 {
22583 case libcall:
22584 case no_stringop:
22585 gcc_unreachable ();
22586 case loop:
22587 need_zero_guard = true;
22588 size_needed = GET_MODE_SIZE (word_mode);
22589 break;
22590 case unrolled_loop:
22591 need_zero_guard = true;
22592 size_needed = GET_MODE_SIZE (word_mode) * 4;
22593 break;
22594 case rep_prefix_8_byte:
22595 size_needed = 8;
22596 break;
22597 case rep_prefix_4_byte:
22598 size_needed = 4;
22599 break;
22600 case rep_prefix_1_byte:
22601 size_needed = 1;
22602 break;
22603 case loop_1_byte:
22604 need_zero_guard = true;
22605 size_needed = 1;
22606 break;
22607 }
22608 epilogue_size_needed = size_needed;
22609
22610 /* Step 1: Prologue guard. */
22611
22612 /* Alignment code needs count to be in register. */
22613 if (CONST_INT_P (count_exp) && desired_align > align)
22614 {
22615 if (INTVAL (count_exp) > desired_align
22616 && INTVAL (count_exp) > size_needed)
22617 {
22618 align_bytes
22619 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22620 if (align_bytes <= 0)
22621 align_bytes = 0;
22622 else
22623 align_bytes = desired_align - align_bytes;
22624 }
22625 if (align_bytes == 0)
22626 {
22627 enum machine_mode mode = SImode;
22628 if (TARGET_64BIT && (count & ~0xffffffff))
22629 mode = DImode;
22630 count_exp = force_reg (mode, count_exp);
22631 }
22632 }
22633 /* Do the cheap promotion to allow better CSE across the
22634 main loop and epilogue (i.e. one load of the big constant in
22635 front of all the code). */
22636 if (CONST_INT_P (val_exp))
22637 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22638 desired_align, align);
22639 /* Ensure that alignment prologue won't copy past end of block. */
22640 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22641 {
22642 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22643 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22644 Make sure it is power of 2. */
22645 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22646
22647 /* To improve performance of small blocks, we jump around the VAL
22648 promoting code. This means that if the promoted VAL is not constant,
22649 we might not use it in the epilogue and have to fall back to the byte
22650 loop variant. */
22651 if (epilogue_size_needed > 2 && !promoted_val)
22652 force_loopy_epilogue = true;
22653 if (count)
22654 {
22655 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22656 {
22657 /* If main algorithm works on QImode, no epilogue is needed.
22658 For small sizes just don't align anything. */
22659 if (size_needed == 1)
22660 desired_align = align;
22661 else
22662 goto epilogue;
22663 }
22664 }
22665 else
22666 {
22667 label = gen_label_rtx ();
22668 emit_cmp_and_jump_insns (count_exp,
22669 GEN_INT (epilogue_size_needed),
22670 LTU, 0, counter_mode (count_exp), 1, label);
22671 if (expected_size == -1 || expected_size <= epilogue_size_needed)
22672 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22673 else
22674 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22675 }
22676 }
22677 if (dynamic_check != -1)
22678 {
22679 rtx hot_label = gen_label_rtx ();
22680 jump_around_label = gen_label_rtx ();
22681 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22682 LEU, 0, counter_mode (count_exp), 1, hot_label);
22683 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22684 set_storage_via_libcall (dst, count_exp, val_exp, false);
22685 emit_jump (jump_around_label);
22686 emit_label (hot_label);
22687 }
22688
22689 /* Step 2: Alignment prologue. */
22690
22691 /* Do the expensive promotion once we branched off the small blocks. */
22692 if (!promoted_val)
22693 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22694 desired_align, align);
22695 gcc_assert (desired_align >= 1 && align >= 1);
22696
22697 if (desired_align > align)
22698 {
22699 if (align_bytes == 0)
22700 {
22701 /* Except for the first move in the epilogue, we no longer know
22702 the constant offset in the aliasing info. It doesn't seem worth
22703 the pain to maintain it for the first move, so throw away
22704 the info early. */
22705 dst = change_address (dst, BLKmode, destreg);
22706 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
22707 desired_align);
22708 }
22709 else
22710 {
22711 /* If we know how many bytes need to be stored before dst is
22712 sufficiently aligned, maintain aliasing info accurately. */
22713 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
22714 desired_align, align_bytes);
22715 count_exp = plus_constant (counter_mode (count_exp),
22716 count_exp, -align_bytes);
22717 count -= align_bytes;
22718 }
22719 if (need_zero_guard
22720 && (count < (unsigned HOST_WIDE_INT) size_needed
22721 || (align_bytes == 0
22722 && count < ((unsigned HOST_WIDE_INT) size_needed
22723 + desired_align - align))))
22724 {
22725 /* It is possible that we copied enough so the main loop will not
22726 execute. */
22727 gcc_assert (size_needed > 1);
22728 if (label == NULL_RTX)
22729 label = gen_label_rtx ();
22730 emit_cmp_and_jump_insns (count_exp,
22731 GEN_INT (size_needed),
22732 LTU, 0, counter_mode (count_exp), 1, label);
22733 if (expected_size == -1
22734 || expected_size < (desired_align - align) / 2 + size_needed)
22735 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22736 else
22737 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22738 }
22739 }
22740 if (label && size_needed == 1)
22741 {
22742 emit_label (label);
22743 LABEL_NUSES (label) = 1;
22744 label = NULL;
22745 promoted_val = val_exp;
22746 epilogue_size_needed = 1;
22747 }
22748 else if (label == NULL_RTX)
22749 epilogue_size_needed = size_needed;
22750
22751 /* Step 3: Main loop. */
22752
22753 switch (alg)
22754 {
22755 case libcall:
22756 case no_stringop:
22757 gcc_unreachable ();
22758 case loop_1_byte:
22759 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22760 count_exp, QImode, 1, expected_size);
22761 break;
22762 case loop:
22763 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22764 count_exp, word_mode, 1, expected_size);
22765 break;
22766 case unrolled_loop:
22767 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22768 count_exp, word_mode, 4, expected_size);
22769 break;
22770 case rep_prefix_8_byte:
22771 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22772 DImode, val_exp);
22773 break;
22774 case rep_prefix_4_byte:
22775 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22776 SImode, val_exp);
22777 break;
22778 case rep_prefix_1_byte:
22779 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22780 QImode, val_exp);
22781 break;
22782 }
22783 /* Adjust properly the offset of src and dest memory for aliasing. */
22784 if (CONST_INT_P (count_exp))
22785 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22786 (count / size_needed) * size_needed);
22787 else
22788 dst = change_address (dst, BLKmode, destreg);
22789
22790 /* Step 4: Epilogue to copy the remaining bytes. */
22791
22792 if (label)
22793 {
22794 /* When the main loop is done, COUNT_EXP might hold original count,
22795 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22796 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22797 bytes. Compensate if needed. */
22798
22799 if (size_needed < epilogue_size_needed)
22800 {
22801 tmp =
22802 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22803 GEN_INT (size_needed - 1), count_exp, 1,
22804 OPTAB_DIRECT);
22805 if (tmp != count_exp)
22806 emit_move_insn (count_exp, tmp);
22807 }
22808 emit_label (label);
22809 LABEL_NUSES (label) = 1;
22810 }
22811 epilogue:
22812 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22813 {
22814 if (force_loopy_epilogue)
22815 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
22816 epilogue_size_needed);
22817 else
22818 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
22819 epilogue_size_needed);
22820 }
22821 if (jump_around_label)
22822 emit_label (jump_around_label);
22823 return true;
22824 }
22825
22826 /* Expand the appropriate insns for doing strlen if not just doing
22827 repnz; scasb
22828
22829 out = result, initialized with the start address
22830 align_rtx = alignment of the address.
22831 scratch = scratch register, initialized with the start address when
22832 not aligned, otherwise undefined
22833
22834 This is just the body. It needs the initializations mentioned above and
22835 some address computing at the end. These things are done in i386.md. */
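   /* In outline (a summary of the code below, not additional semantics):
      compare up to three leading bytes one at a time until OUT is 4-byte
      aligned, then scan four bytes per iteration using the
      (x - 0x01010101) & ~x & 0x80808080 zero-byte test, and finally adjust
      OUT back to the position of the terminating zero.  */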
22836
22837 static void
22838 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
22839 {
22840 int align;
22841 rtx tmp;
22842 rtx align_2_label = NULL_RTX;
22843 rtx align_3_label = NULL_RTX;
22844 rtx align_4_label = gen_label_rtx ();
22845 rtx end_0_label = gen_label_rtx ();
22846 rtx mem;
22847 rtx tmpreg = gen_reg_rtx (SImode);
22848 rtx scratch = gen_reg_rtx (SImode);
22849 rtx cmp;
22850
22851 align = 0;
22852 if (CONST_INT_P (align_rtx))
22853 align = INTVAL (align_rtx);
22854
22855 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
22856
22857 /* Is there a known alignment and is it less than 4? */
22858 if (align < 4)
22859 {
22860 rtx scratch1 = gen_reg_rtx (Pmode);
22861 emit_move_insn (scratch1, out);
22862 /* Is there a known alignment and is it not 2? */
22863 if (align != 2)
22864 {
22865 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
22866 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
22867
22868 /* Leave just the 3 lower bits. */
22869 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
22870 NULL_RTX, 0, OPTAB_WIDEN);
22871
22872 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22873 Pmode, 1, align_4_label);
22874 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
22875 Pmode, 1, align_2_label);
22876 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
22877 Pmode, 1, align_3_label);
22878 }
22879 else
22880 {
22881 /* Since the alignment is 2, we have to check 2 or 0 bytes;
22882 check whether it is aligned to 4 bytes. */
22883
22884 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
22885 NULL_RTX, 0, OPTAB_WIDEN);
22886
22887 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22888 Pmode, 1, align_4_label);
22889 }
22890
22891 mem = change_address (src, QImode, out);
22892
22893 /* Now compare the bytes. */
22894
22895 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
22896 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
22897 QImode, 1, end_0_label);
22898
22899 /* Increment the address. */
22900 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22901
22902 /* Not needed with an alignment of 2. */
22903 if (align != 2)
22904 {
22905 emit_label (align_2_label);
22906
22907 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22908 end_0_label);
22909
22910 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22911
22912 emit_label (align_3_label);
22913 }
22914
22915 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22916 end_0_label);
22917
22918 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22919 }
22920
22921 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
22922 align this loop; that only makes the program bigger and does not
22923 speed it up. */
22924 emit_label (align_4_label);
22925
22926 mem = change_address (src, SImode, out);
22927 emit_move_insn (scratch, mem);
22928 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
22929
22930 /* This formula yields a nonzero result iff one of the bytes is zero.
22931 This saves three branches inside the loop and many cycles. */
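      /* Concretely, tmpreg ends up as (x - 0x01010101) & ~x & 0x80808080 for
	 x = scratch: e.g. x = 0x7f654300 (low byte zero) gives 0x00000080,
	 while x = 0x12345678 (no zero byte) gives 0.  */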
22932
22933 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
22934 emit_insn (gen_one_cmplsi2 (scratch, scratch));
22935 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
22936 emit_insn (gen_andsi3 (tmpreg, tmpreg,
22937 gen_int_mode (0x80808080, SImode)));
22938 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
22939 align_4_label);
22940
22941 if (TARGET_CMOVE)
22942 {
22943 rtx reg = gen_reg_rtx (SImode);
22944 rtx reg2 = gen_reg_rtx (Pmode);
22945 emit_move_insn (reg, tmpreg);
22946 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
22947
22948 /* If zero is not in the first two bytes, move two bytes forward. */
22949 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22950 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22951 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22952 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
22953 gen_rtx_IF_THEN_ELSE (SImode, tmp,
22954 reg,
22955 tmpreg)));
22956 /* Emit lea manually to avoid clobbering of flags. */
22957 emit_insn (gen_rtx_SET (SImode, reg2,
22958 gen_rtx_PLUS (Pmode, out, const2_rtx)));
22959
22960 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22961 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22962 emit_insn (gen_rtx_SET (VOIDmode, out,
22963 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
22964 reg2,
22965 out)));
22966 }
22967 else
22968 {
22969 rtx end_2_label = gen_label_rtx ();
22970 /* Is zero in the first two bytes? */
22971
22972 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22973 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22974 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
22975 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22976 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
22977 pc_rtx);
22978 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
22979 JUMP_LABEL (tmp) = end_2_label;
22980
22981 /* Not in the first two. Move two bytes forward. */
22982 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
22983 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
22984
22985 emit_label (end_2_label);
22986
22987 }
22988
22989 /* Avoid branch in fixing the byte. */
22990 tmpreg = gen_lowpart (QImode, tmpreg);
22991 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
22992 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
22993 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
22994 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
22995
22996 emit_label (end_0_label);
22997 }
22998
22999 /* Expand strlen. */
23000
23001 bool
23002 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23003 {
23004 rtx addr, scratch1, scratch2, scratch3, scratch4;
23005
23006 /* The generic case of the strlen expander is long. Avoid expanding it
23007 unless TARGET_INLINE_ALL_STRINGOPS. */
23008
23009 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23010 && !TARGET_INLINE_ALL_STRINGOPS
23011 && !optimize_insn_for_size_p ()
23012 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23013 return false;
23014
23015 addr = force_reg (Pmode, XEXP (src, 0));
23016 scratch1 = gen_reg_rtx (Pmode);
23017
23018 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23019 && !optimize_insn_for_size_p ())
23020 {
23021 /* Well it seems that some optimizer does not combine a call like
23022 foo(strlen(bar), strlen(bar));
23023 when the move and the subtraction are done here. It does calculate
23024 the length just once when these instructions are done inside of
23025 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
23026 often used and I use one fewer register for the lifetime of
23027 output_strlen_unroll() this is better. */
23028
23029 emit_move_insn (out, addr);
23030
23031 ix86_expand_strlensi_unroll_1 (out, src, align);
23032
23033 /* strlensi_unroll_1 returns the address of the zero at the end of
23034 the string, like memchr(), so compute the length by subtracting
23035 the start address. */
23036 emit_insn (ix86_gen_sub3 (out, out, addr));
23037 }
23038 else
23039 {
23040 rtx unspec;
23041
23042 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23043 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23044 return false;
23045
23046 scratch2 = gen_reg_rtx (Pmode);
23047 scratch3 = gen_reg_rtx (Pmode);
23048 scratch4 = force_reg (Pmode, constm1_rtx);
23049
23050 emit_move_insn (scratch3, addr);
23051 eoschar = force_reg (QImode, eoschar);
23052
23053 src = replace_equiv_address_nv (src, scratch3);
23054
23055 /* If .md starts supporting :P, this can be done in .md. */
23056 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23057 scratch4), UNSPEC_SCAS);
23058 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23059 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23060 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23061 }
23062 return true;
23063 }
23064
23065 /* For a given symbol (function), construct code to compute the address of
23066 its PLT entry in the large x86-64 PIC model. */
23067 rtx
23068 construct_plt_address (rtx symbol)
23069 {
23070 rtx tmp, unspec;
23071
23072 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23073 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23074 gcc_assert (Pmode == DImode);
23075
23076 tmp = gen_reg_rtx (Pmode);
23077 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23078
23079 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23080 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23081 return tmp;
23082 }
23083
23084 rtx
23085 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23086 rtx callarg2,
23087 rtx pop, bool sibcall)
23088 {
23089 /* We need to represent that SI and DI registers are clobbered
23090 by SYSV calls. */
23091 static int clobbered_registers[] = {
23092 XMM6_REG, XMM7_REG, XMM8_REG,
23093 XMM9_REG, XMM10_REG, XMM11_REG,
23094 XMM12_REG, XMM13_REG, XMM14_REG,
23095 XMM15_REG, SI_REG, DI_REG
23096 };
23097 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23098 rtx use = NULL, call;
23099 unsigned int vec_len;
23100
23101 if (pop == const0_rtx)
23102 pop = NULL;
23103 gcc_assert (!TARGET_64BIT || !pop);
23104
23105 if (TARGET_MACHO && !TARGET_64BIT)
23106 {
23107 #if TARGET_MACHO
23108 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23109 fnaddr = machopic_indirect_call_target (fnaddr);
23110 #endif
23111 }
23112 else
23113 {
23114 /* Static functions and indirect calls don't need the pic register. */
23115 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23116 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23117 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23118 use_reg (&use, pic_offset_table_rtx);
23119 }
23120
23121 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23122 {
23123 rtx al = gen_rtx_REG (QImode, AX_REG);
23124 emit_move_insn (al, callarg2);
23125 use_reg (&use, al);
23126 }
23127
23128 if (ix86_cmodel == CM_LARGE_PIC
23129 && MEM_P (fnaddr)
23130 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23131 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23132 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23133 else if (sibcall
23134 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23135 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23136 {
23137 fnaddr = XEXP (fnaddr, 0);
23138 if (GET_MODE (fnaddr) != word_mode)
23139 fnaddr = convert_to_mode (word_mode, fnaddr, 1);
23140 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23141 }
23142
23143 vec_len = 0;
23144 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23145 if (retval)
23146 call = gen_rtx_SET (VOIDmode, retval, call);
23147 vec[vec_len++] = call;
23148
23149 if (pop)
23150 {
23151 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23152 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23153 vec[vec_len++] = pop;
23154 }
23155
23156 if (TARGET_64BIT_MS_ABI
23157 && (!callarg2 || INTVAL (callarg2) != -2))
23158 {
23159 unsigned i;
23160
23161 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23162 UNSPEC_MS_TO_SYSV_CALL);
23163
23164 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23165 vec[vec_len++]
23166 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
23167 ? TImode : DImode,
23168 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23169 ? TImode : DImode,
23170 clobbered_registers[i]));
23171 }
23172
23173 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
23174 if (TARGET_VZEROUPPER)
23175 {
23176 int avx256;
23177 if (cfun->machine->callee_pass_avx256_p)
23178 {
23179 if (cfun->machine->callee_return_avx256_p)
23180 avx256 = callee_return_pass_avx256;
23181 else
23182 avx256 = callee_pass_avx256;
23183 }
23184 else if (cfun->machine->callee_return_avx256_p)
23185 avx256 = callee_return_avx256;
23186 else
23187 avx256 = call_no_avx256;
23188
23189 if (reload_completed)
23190 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
23191 else
23192 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
23193 gen_rtvec (1, GEN_INT (avx256)),
23194 UNSPEC_CALL_NEEDS_VZEROUPPER);
23195 }
23196
23197 if (vec_len > 1)
23198 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23199 call = emit_call_insn (call);
23200 if (use)
23201 CALL_INSN_FUNCTION_USAGE (call) = use;
23202
23203 return call;
23204 }
23205
23206 void
23207 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
23208 {
23209 rtx pat = PATTERN (insn);
23210 rtvec vec = XVEC (pat, 0);
23211 int len = GET_NUM_ELEM (vec) - 1;
23212
23213 /* Strip off the last entry of the parallel. */
23214 gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
23215 gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
23216 if (len == 1)
23217 pat = RTVEC_ELT (vec, 0);
23218 else
23219 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
23220
23221 emit_insn (gen_avx_vzeroupper (vzeroupper));
23222 emit_call_insn (pat);
23223 }
23224
23225 /* Output the assembly for a call instruction. */
23226
23227 const char *
23228 ix86_output_call_insn (rtx insn, rtx call_op)
23229 {
23230 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23231 bool seh_nop_p = false;
23232 const char *xasm;
23233
23234 if (SIBLING_CALL_P (insn))
23235 {
23236 if (direct_p)
23237 xasm = "jmp\t%P0";
23238 /* SEH epilogue detection requires the indirect branch case
23239 to include REX.W. */
23240 else if (TARGET_SEH)
23241 xasm = "rex.W jmp %A0";
23242 else
23243 xasm = "jmp\t%A0";
23244
23245 output_asm_insn (xasm, &call_op);
23246 return "";
23247 }
23248
23249 /* SEH unwinding can require an extra nop to be emitted in several
23250 circumstances. Determine if we have one of those. */
23251 if (TARGET_SEH)
23252 {
23253 rtx i;
23254
23255 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23256 {
23257 /* If we get to another real insn, we don't need the nop. */
23258 if (INSN_P (i))
23259 break;
23260
23261 /* If we get to the epilogue note, prevent a catch region from
23262 being adjacent to the standard epilogue sequence. If non-call
23263 exceptions are enabled, we'll have done this during epilogue emission. */
23264 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23265 && !flag_non_call_exceptions
23266 && !can_throw_internal (insn))
23267 {
23268 seh_nop_p = true;
23269 break;
23270 }
23271 }
23272
23273 /* If we didn't find a real insn following the call, prevent the
23274 unwinder from looking into the next function. */
23275 if (i == NULL)
23276 seh_nop_p = true;
23277 }
23278
23279 if (direct_p)
23280 xasm = "call\t%P0";
23281 else
23282 xasm = "call\t%A0";
23283
23284 output_asm_insn (xasm, &call_op);
23285
23286 if (seh_nop_p)
23287 return "nop";
23288
23289 return "";
23290 }
23291 \f
23292 /* Clear stack slot assignments remembered from previous functions.
23293 This is called from INIT_EXPANDERS once before RTL is emitted for each
23294 function. */
23295
23296 static struct machine_function *
23297 ix86_init_machine_status (void)
23298 {
23299 struct machine_function *f;
23300
23301 f = ggc_alloc_cleared_machine_function ();
23302 f->use_fast_prologue_epilogue_nregs = -1;
23303 f->tls_descriptor_call_expanded_p = 0;
23304 f->call_abi = ix86_abi;
23305
23306 return f;
23307 }
23308
23309 /* Return a MEM corresponding to a stack slot with mode MODE.
23310 Allocate a new slot if necessary.
23311
23312 The RTL for a function can have several slots available: N is
23313 which slot to use. */
23314
23315 rtx
23316 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23317 {
23318 struct stack_local_entry *s;
23319
23320 gcc_assert (n < MAX_386_STACK_LOCALS);
23321
23322 /* Virtual slot is valid only before vregs are instantiated. */
23323 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
23324
23325 for (s = ix86_stack_locals; s; s = s->next)
23326 if (s->mode == mode && s->n == n)
23327 return validize_mem (copy_rtx (s->rtl));
23328
23329 s = ggc_alloc_stack_local_entry ();
23330 s->n = n;
23331 s->mode = mode;
23332 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23333
23334 s->next = ix86_stack_locals;
23335 ix86_stack_locals = s;
23336 return validize_mem (s->rtl);
23337 }
23338 \f
23339 /* Calculate the length of the memory address in the instruction encoding.
23340 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23341 or other prefixes. */
23342
23343 int
23344 memory_address_length (rtx addr)
23345 {
23346 struct ix86_address parts;
23347 rtx base, index, disp;
23348 int len;
23349 int ok;
23350
23351 if (GET_CODE (addr) == PRE_DEC
23352 || GET_CODE (addr) == POST_INC
23353 || GET_CODE (addr) == PRE_MODIFY
23354 || GET_CODE (addr) == POST_MODIFY)
23355 return 0;
23356
23357 ok = ix86_decompose_address (addr, &parts);
23358 gcc_assert (ok);
23359
23360 if (parts.base && GET_CODE (parts.base) == SUBREG)
23361 parts.base = SUBREG_REG (parts.base);
23362 if (parts.index && GET_CODE (parts.index) == SUBREG)
23363 parts.index = SUBREG_REG (parts.index);
23364
23365 base = parts.base;
23366 index = parts.index;
23367 disp = parts.disp;
23368
23369 /* Add length of addr32 prefix. */
23370 len = (GET_CODE (addr) == ZERO_EXTEND
23371 || GET_CODE (addr) == AND);
23372
23373 /* Rule of thumb:
23374 - esp as the base always wants an index,
23375 - ebp as the base always wants a displacement,
23376 - r12 as the base always wants an index,
23377 - r13 as the base always wants a displacement. */
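   /* For example, movl (%esp), %eax encodes as 8b 04 24 (a SIB byte is
      required), while movl (%ebp), %eax encodes as 8b 45 00 (a disp8 of zero
      is required); the cases below account for that extra byte.  */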
23378
23379 /* Register Indirect. */
23380 if (base && !index && !disp)
23381 {
23382 /* esp (for its index) and ebp (for its displacement) need
23383 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23384 code. */
23385 if (REG_P (addr)
23386 && (addr == arg_pointer_rtx
23387 || addr == frame_pointer_rtx
23388 || REGNO (addr) == SP_REG
23389 || REGNO (addr) == BP_REG
23390 || REGNO (addr) == R12_REG
23391 || REGNO (addr) == R13_REG))
23392 len = 1;
23393 }
23394
23395 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23396 is not disp32, but disp32(%rip), so for disp32
23397 SIB byte is needed, unless print_operand_address
23398 optimizes it into disp32(%rip) or (%rip) is implied
23399 by UNSPEC. */
23400 else if (disp && !base && !index)
23401 {
23402 len = 4;
23403 if (TARGET_64BIT)
23404 {
23405 rtx symbol = disp;
23406
23407 if (GET_CODE (disp) == CONST)
23408 symbol = XEXP (disp, 0);
23409 if (GET_CODE (symbol) == PLUS
23410 && CONST_INT_P (XEXP (symbol, 1)))
23411 symbol = XEXP (symbol, 0);
23412
23413 if (GET_CODE (symbol) != LABEL_REF
23414 && (GET_CODE (symbol) != SYMBOL_REF
23415 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23416 && (GET_CODE (symbol) != UNSPEC
23417 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23418 && XINT (symbol, 1) != UNSPEC_PCREL
23419 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23420 len += 1;
23421 }
23422 }
23423
23424 else
23425 {
23426 /* Find the length of the displacement constant. */
23427 if (disp)
23428 {
23429 if (base && satisfies_constraint_K (disp))
23430 len = 1;
23431 else
23432 len = 4;
23433 }
23434 /* ebp always wants a displacement. Similarly r13. */
23435 else if (base && REG_P (base)
23436 && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23437 len = 1;
23438
23439 /* An index requires the two-byte modrm form.... */
23440 if (index
23441 /* ...like esp (or r12), which always wants an index. */
23442 || base == arg_pointer_rtx
23443 || base == frame_pointer_rtx
23444 || (base && REG_P (base)
23445 && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23446 len += 1;
23447 }
23448
23449 switch (parts.seg)
23450 {
23451 case SEG_FS:
23452 case SEG_GS:
23453 len += 1;
23454 break;
23455 default:
23456 break;
23457 }
23458
23459 return len;
23460 }
23461
23462 /* Compute default value for "length_immediate" attribute. When SHORTFORM
23463 is set, expect that the insn has an 8-bit immediate alternative. */
23464 int
23465 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23466 {
23467 int len = 0;
23468 int i;
23469 extract_insn_cached (insn);
23470 for (i = recog_data.n_operands - 1; i >= 0; --i)
23471 if (CONSTANT_P (recog_data.operand[i]))
23472 {
23473 enum attr_mode mode = get_attr_mode (insn);
23474
23475 gcc_assert (!len);
23476 if (shortform && CONST_INT_P (recog_data.operand[i]))
23477 {
23478 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23479 switch (mode)
23480 {
23481 case MODE_QI:
23482 len = 1;
23483 continue;
23484 case MODE_HI:
23485 ival = trunc_int_for_mode (ival, HImode);
23486 break;
23487 case MODE_SI:
23488 ival = trunc_int_for_mode (ival, SImode);
23489 break;
23490 default:
23491 break;
23492 }
23493 if (IN_RANGE (ival, -128, 127))
23494 {
23495 len = 1;
23496 continue;
23497 }
23498 }
23499 switch (mode)
23500 {
23501 case MODE_QI:
23502 len = 1;
23503 break;
23504 case MODE_HI:
23505 len = 2;
23506 break;
23507 case MODE_SI:
23508 len = 4;
23509 break;
23510 /* Immediates for DImode instructions are encoded as 32-bit sign-extended values. */
23511 case MODE_DI:
23512 len = 4;
23513 break;
23514 default:
23515 fatal_insn ("unknown insn mode", insn);
23516 }
23517 }
23518 return len;
23519 }
23520 /* Compute default value for "length_address" attribute. */
23521 int
23522 ix86_attr_length_address_default (rtx insn)
23523 {
23524 int i;
23525
23526 if (get_attr_type (insn) == TYPE_LEA)
23527 {
23528 rtx set = PATTERN (insn), addr;
23529
23530 if (GET_CODE (set) == PARALLEL)
23531 set = XVECEXP (set, 0, 0);
23532
23533 gcc_assert (GET_CODE (set) == SET);
23534
23535 addr = SET_SRC (set);
23536 if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI)
23537 {
23538 if (GET_CODE (addr) == ZERO_EXTEND)
23539 addr = XEXP (addr, 0);
23540 if (GET_CODE (addr) == SUBREG)
23541 addr = SUBREG_REG (addr);
23542 }
23543
23544 return memory_address_length (addr);
23545 }
23546
23547 extract_insn_cached (insn);
23548 for (i = recog_data.n_operands - 1; i >= 0; --i)
23549 if (MEM_P (recog_data.operand[i]))
23550 {
23551 constrain_operands_cached (reload_completed);
23552 if (which_alternative != -1)
23553 {
23554 const char *constraints = recog_data.constraints[i];
23555 int alt = which_alternative;
23556
23557 while (*constraints == '=' || *constraints == '+')
23558 constraints++;
23559 while (alt-- > 0)
23560 while (*constraints++ != ',')
23561 ;
23562 /* Skip ignored operands. */
23563 if (*constraints == 'X')
23564 continue;
23565 }
23566 return memory_address_length (XEXP (recog_data.operand[i], 0));
23567 }
23568 return 0;
23569 }
23570
23571 /* Compute default value for "length_vex" attribute. It includes
23572 2 or 3 byte VEX prefix and 1 opcode byte. */
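/* Background, slightly simplified: the 2 byte VEX prefix (0xc5) can only
   express the 0f opcode map and cannot carry VEX.W or the REX.X/REX.B
   register extensions, so anything that needs those must use the 3 byte
   prefix (0xc4).  In 32bit mode the extended registers do not exist, which
   is why the 2 byte form is always usable there.  */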
23573
23574 int
23575 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23576 {
23577 int i;
23578
23579 /* Only the 0f opcode can use the 2 byte VEX prefix; the VEX W bit
23580 requires the 3 byte VEX prefix. */
23581 if (!has_0f_opcode || has_vex_w)
23582 return 3 + 1;
23583
23584 /* We can always use 2 byte VEX prefix in 32bit. */
23585 if (!TARGET_64BIT)
23586 return 2 + 1;
23587
23588 extract_insn_cached (insn);
23589
23590 for (i = recog_data.n_operands - 1; i >= 0; --i)
23591 if (REG_P (recog_data.operand[i]))
23592 {
23593 /* REX.W bit uses 3 byte VEX prefix. */
23594 if (GET_MODE (recog_data.operand[i]) == DImode
23595 && GENERAL_REG_P (recog_data.operand[i]))
23596 return 3 + 1;
23597 }
23598 else
23599 {
23600 /* REX.X or REX.B bits use 3 byte VEX prefix. */
23601 if (MEM_P (recog_data.operand[i])
23602 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23603 return 3 + 1;
23604 }
23605
23606 return 2 + 1;
23607 }
23608 \f
23609 /* Return the maximum number of instructions a cpu can issue. */
23610
23611 static int
23612 ix86_issue_rate (void)
23613 {
23614 switch (ix86_tune)
23615 {
23616 case PROCESSOR_PENTIUM:
23617 case PROCESSOR_ATOM:
23618 case PROCESSOR_K6:
23619 return 2;
23620
23621 case PROCESSOR_PENTIUMPRO:
23622 case PROCESSOR_PENTIUM4:
23623 case PROCESSOR_CORE2_32:
23624 case PROCESSOR_CORE2_64:
23625 case PROCESSOR_COREI7_32:
23626 case PROCESSOR_COREI7_64:
23627 case PROCESSOR_ATHLON:
23628 case PROCESSOR_K8:
23629 case PROCESSOR_AMDFAM10:
23630 case PROCESSOR_NOCONA:
23631 case PROCESSOR_GENERIC32:
23632 case PROCESSOR_GENERIC64:
23633 case PROCESSOR_BDVER1:
23634 case PROCESSOR_BDVER2:
23635 case PROCESSOR_BTVER1:
23636 return 3;
23637
23638 default:
23639 return 1;
23640 }
23641 }
23642
23643 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
23644 set by DEP_INSN and nothing else set by DEP_INSN. */
23645
23646 static bool
23647 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
23648 {
23649 rtx set, set2;
23650
23651 /* Simplify the test for uninteresting insns. */
23652 if (insn_type != TYPE_SETCC
23653 && insn_type != TYPE_ICMOV
23654 && insn_type != TYPE_FCMOV
23655 && insn_type != TYPE_IBR)
23656 return false;
23657
23658 if ((set = single_set (dep_insn)) != 0)
23659 {
23660 set = SET_DEST (set);
23661 set2 = NULL_RTX;
23662 }
23663 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
23664 && XVECLEN (PATTERN (dep_insn), 0) == 2
23665 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
23666 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
23667 {
23668 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
23669 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
23670 }
23671 else
23672 return false;
23673
23674 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
23675 return false;
23676
23677 /* This test is true if the dependent insn reads the flags but
23678 not any other potentially set register. */
23679 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
23680 return false;
23681
23682 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
23683 return false;
23684
23685 return true;
23686 }
23687
23688 /* Return true iff USE_INSN has a memory address with operands set by
23689 SET_INSN. */
23690
23691 bool
23692 ix86_agi_dependent (rtx set_insn, rtx use_insn)
23693 {
23694 int i;
23695 extract_insn_cached (use_insn);
23696 for (i = recog_data.n_operands - 1; i >= 0; --i)
23697 if (MEM_P (recog_data.operand[i]))
23698 {
23699 rtx addr = XEXP (recog_data.operand[i], 0);
23700 return modified_in_p (addr, set_insn) != 0;
23701 }
23702 return false;
23703 }
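/* Informal example: if SET_INSN is "movl %edx, %eax" and USE_INSN is
   "movl (%eax), %ecx", the load address depends on SET_INSN and this
   returns true; ix86_adjust_cost uses that on Pentium to charge an extra
   cycle for the address generation interlock.  */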
23704
23705 static int
23706 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
23707 {
23708 enum attr_type insn_type, dep_insn_type;
23709 enum attr_memory memory;
23710 rtx set, set2;
23711 int dep_insn_code_number;
23712
23713 /* Anti and output dependencies have zero cost on all CPUs. */
23714 if (REG_NOTE_KIND (link) != 0)
23715 return 0;
23716
23717 dep_insn_code_number = recog_memoized (dep_insn);
23718
23719 /* If we can't recognize the insns, we can't really do anything. */
23720 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
23721 return cost;
23722
23723 insn_type = get_attr_type (insn);
23724 dep_insn_type = get_attr_type (dep_insn);
23725
23726 switch (ix86_tune)
23727 {
23728 case PROCESSOR_PENTIUM:
23729 /* Address Generation Interlock adds a cycle of latency. */
23730 if (insn_type == TYPE_LEA)
23731 {
23732 rtx addr = PATTERN (insn);
23733
23734 if (GET_CODE (addr) == PARALLEL)
23735 addr = XVECEXP (addr, 0, 0);
23736
23737 gcc_assert (GET_CODE (addr) == SET);
23738
23739 addr = SET_SRC (addr);
23740 if (modified_in_p (addr, dep_insn))
23741 cost += 1;
23742 }
23743 else if (ix86_agi_dependent (dep_insn, insn))
23744 cost += 1;
23745
23746 /* ??? Compares pair with jump/setcc. */
23747 if (ix86_flags_dependent (insn, dep_insn, insn_type))
23748 cost = 0;
23749
23750 /* Floating point stores require the value to be ready one cycle earlier. */
23751 if (insn_type == TYPE_FMOV
23752 && get_attr_memory (insn) == MEMORY_STORE
23753 && !ix86_agi_dependent (dep_insn, insn))
23754 cost += 1;
23755 break;
23756
23757 case PROCESSOR_PENTIUMPRO:
23758 memory = get_attr_memory (insn);
23759
23760 /* INT->FP conversion is expensive. */
23761 if (get_attr_fp_int_src (dep_insn))
23762 cost += 5;
23763
23764 /* There is one cycle extra latency between an FP op and a store. */
23765 if (insn_type == TYPE_FMOV
23766 && (set = single_set (dep_insn)) != NULL_RTX
23767 && (set2 = single_set (insn)) != NULL_RTX
23768 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
23769 && MEM_P (SET_DEST (set2)))
23770 cost += 1;
23771
23772 /* Show the ability of the reorder buffer to hide the latency of a load
23773 by executing it in parallel with the previous instruction when the
23774 previous instruction is not needed to compute the address. */
23775 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23776 && !ix86_agi_dependent (dep_insn, insn))
23777 {
23778 /* Claim moves to take one cycle, as the core can issue one load
23779 at a time and the next load can start a cycle later. */
23780 if (dep_insn_type == TYPE_IMOV
23781 || dep_insn_type == TYPE_FMOV)
23782 cost = 1;
23783 else if (cost > 1)
23784 cost--;
23785 }
23786 break;
23787
23788 case PROCESSOR_K6:
23789 memory = get_attr_memory (insn);
23790
23791 /* The esp dependency is resolved before the instruction is really
23792 finished. */
23793 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
23794 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
23795 return 1;
23796
23797 /* INT->FP conversion is expensive. */
23798 if (get_attr_fp_int_src (dep_insn))
23799 cost += 5;
23800
23801 /* Show the ability of the reorder buffer to hide the latency of a load
23802 by executing it in parallel with the previous instruction when the
23803 previous instruction is not needed to compute the address. */
23804 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23805 && !ix86_agi_dependent (dep_insn, insn))
23806 {
23807 /* Claim moves to take one cycle, as the core can issue one load
23808 at a time and the next load can start a cycle later. */
23809 if (dep_insn_type == TYPE_IMOV
23810 || dep_insn_type == TYPE_FMOV)
23811 cost = 1;
23812 else if (cost > 2)
23813 cost -= 2;
23814 else
23815 cost = 1;
23816 }
23817 break;
23818
23819 case PROCESSOR_ATHLON:
23820 case PROCESSOR_K8:
23821 case PROCESSOR_AMDFAM10:
23822 case PROCESSOR_BDVER1:
23823 case PROCESSOR_BDVER2:
23824 case PROCESSOR_BTVER1:
23825 case PROCESSOR_ATOM:
23826 case PROCESSOR_GENERIC32:
23827 case PROCESSOR_GENERIC64:
23828 memory = get_attr_memory (insn);
23829
23830 /* Show the ability of the reorder buffer to hide the latency of a load
23831 by executing it in parallel with the previous instruction when the
23832 previous instruction is not needed to compute the address. */
23833 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23834 && !ix86_agi_dependent (dep_insn, insn))
23835 {
23836 enum attr_unit unit = get_attr_unit (insn);
23837 int loadcost = 3;
23838
23839 /* Because of the difference between the length of integer and
23840 floating unit pipeline preparation stages, the memory operands
23841 for floating point are cheaper.
23842
23843 ??? For Athlon the difference is most probably 2. */
23844 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
23845 loadcost = 3;
23846 else
23847 loadcost = TARGET_ATHLON ? 2 : 0;
23848
23849 if (cost >= loadcost)
23850 cost -= loadcost;
23851 else
23852 cost = 0;
23853 }
23854
23855 default:
23856 break;
23857 }
23858
23859 return cost;
23860 }
23861
23862 /* How many alternative schedules to try. This should be as wide as the
23863 scheduling freedom in the DFA, but no wider. Making this value too
23864 large results in extra work for the scheduler. */
23865
23866 static int
23867 ia32_multipass_dfa_lookahead (void)
23868 {
23869 switch (ix86_tune)
23870 {
23871 case PROCESSOR_PENTIUM:
23872 return 2;
23873
23874 case PROCESSOR_PENTIUMPRO:
23875 case PROCESSOR_K6:
23876 return 1;
23877
23878 case PROCESSOR_CORE2_32:
23879 case PROCESSOR_CORE2_64:
23880 case PROCESSOR_COREI7_32:
23881 case PROCESSOR_COREI7_64:
23882 /* Generally, we want haifa-sched:max_issue() to look ahead as far
23883 as the number of instructions that can be executed in a cycle, i.e.,
23884 issue_rate. I wonder why tuning for many CPUs does not do this. */
23885 return ix86_issue_rate ();
23886
23887 default:
23888 return 0;
23889 }
23890 }
23891
23892 \f
23893
23894 /* Model decoder of Core 2/i7.
23895 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
23896 track the instruction fetch block boundaries and make sure that long
23897 (9+ bytes) instructions are assigned to D0. */
23898
23899 /* Maximum length of an insn that can be handled by
23900 a secondary decoder unit. '8' for Core 2/i7. */
23901 static int core2i7_secondary_decoder_max_insn_size;
23902
23903 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
23904 '16' for Core 2/i7. */
23905 static int core2i7_ifetch_block_size;
23906
23907 /* Maximum number of instructions decoder can handle per cycle.
23908 '6' for Core 2/i7. */
23909 static int core2i7_ifetch_block_max_insns;
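/* Taken together, and roughly speaking: each cycle the model lets insns
   issue until either core2i7_ifetch_block_size bytes (16) or
   core2i7_ifetch_block_max_insns insns (6) have been consumed from the
   fetch block; after the first insn of a cycle, anything longer than
   core2i7_secondary_decoder_max_insn_size bytes (8) is also held back,
   since only the first decoder handles long insns.  */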
23910
23911 typedef struct ix86_first_cycle_multipass_data_ *
23912 ix86_first_cycle_multipass_data_t;
23913 typedef const struct ix86_first_cycle_multipass_data_ *
23914 const_ix86_first_cycle_multipass_data_t;
23915
23916 /* A variable to store target state across calls to max_issue within
23917 one cycle. */
23918 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
23919 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
23920
23921 /* Initialize DATA. */
23922 static void
23923 core2i7_first_cycle_multipass_init (void *_data)
23924 {
23925 ix86_first_cycle_multipass_data_t data
23926 = (ix86_first_cycle_multipass_data_t) _data;
23927
23928 data->ifetch_block_len = 0;
23929 data->ifetch_block_n_insns = 0;
23930 data->ready_try_change = NULL;
23931 data->ready_try_change_size = 0;
23932 }
23933
23934 /* Advancing the cycle; reset ifetch block counts. */
23935 static void
23936 core2i7_dfa_post_advance_cycle (void)
23937 {
23938 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
23939
23940 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23941
23942 data->ifetch_block_len = 0;
23943 data->ifetch_block_n_insns = 0;
23944 }
23945
23946 static int min_insn_size (rtx);
23947
23948 /* Filter out insns from ready_try that the core will not be able to issue
23949 on current cycle due to decoder. */
23950 static void
23951 core2i7_first_cycle_multipass_filter_ready_try
23952 (const_ix86_first_cycle_multipass_data_t data,
23953 char *ready_try, int n_ready, bool first_cycle_insn_p)
23954 {
23955 while (n_ready--)
23956 {
23957 rtx insn;
23958 int insn_size;
23959
23960 if (ready_try[n_ready])
23961 continue;
23962
23963 insn = get_ready_element (n_ready);
23964 insn_size = min_insn_size (insn);
23965
23966 if (/* If this is too long an insn for a secondary decoder ... */
23967 (!first_cycle_insn_p
23968 && insn_size > core2i7_secondary_decoder_max_insn_size)
23969 /* ... or it would not fit into the ifetch block ... */
23970 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
23971 /* ... or the decoder is full already ... */
23972 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
23973 /* ... mask the insn out. */
23974 {
23975 ready_try[n_ready] = 1;
23976
23977 if (data->ready_try_change)
23978 SET_BIT (data->ready_try_change, n_ready);
23979 }
23980 }
23981 }
23982
23983 /* Prepare for a new round of multipass lookahead scheduling. */
23984 static void
23985 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
23986 bool first_cycle_insn_p)
23987 {
23988 ix86_first_cycle_multipass_data_t data
23989 = (ix86_first_cycle_multipass_data_t) _data;
23990 const_ix86_first_cycle_multipass_data_t prev_data
23991 = ix86_first_cycle_multipass_data;
23992
23993 /* Restore the state from the end of the previous round. */
23994 data->ifetch_block_len = prev_data->ifetch_block_len;
23995 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
23996
23997 /* Filter instructions that cannot be issued on current cycle due to
23998 decoder restrictions. */
23999 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24000 first_cycle_insn_p);
24001 }
24002
24003 /* INSN is being issued in current solution. Account for its impact on
24004 the decoder model. */
24005 static void
24006 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24007 rtx insn, const void *_prev_data)
24008 {
24009 ix86_first_cycle_multipass_data_t data
24010 = (ix86_first_cycle_multipass_data_t) _data;
24011 const_ix86_first_cycle_multipass_data_t prev_data
24012 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24013
24014 int insn_size = min_insn_size (insn);
24015
24016 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24017 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24018 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24019 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24020
24021 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
24022 if (!data->ready_try_change)
24023 {
24024 data->ready_try_change = sbitmap_alloc (n_ready);
24025 data->ready_try_change_size = n_ready;
24026 }
24027 else if (data->ready_try_change_size < n_ready)
24028 {
24029 data->ready_try_change = sbitmap_resize (data->ready_try_change,
24030 n_ready, 0);
24031 data->ready_try_change_size = n_ready;
24032 }
24033 sbitmap_zero (data->ready_try_change);
24034
24035 /* Filter out insns from ready_try that the core will not be able to issue
24036 on current cycle due to decoder. */
24037 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24038 false);
24039 }
24040
24041 /* Revert the effect on ready_try. */
24042 static void
24043 core2i7_first_cycle_multipass_backtrack (const void *_data,
24044 char *ready_try,
24045 int n_ready ATTRIBUTE_UNUSED)
24046 {
24047 const_ix86_first_cycle_multipass_data_t data
24048 = (const_ix86_first_cycle_multipass_data_t) _data;
24049 unsigned int i = 0;
24050 sbitmap_iterator sbi;
24051
24052 gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
24053 EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
24054 {
24055 ready_try[i] = 0;
24056 }
24057 }
24058
24059 /* Save the result of multipass lookahead scheduling for the next round. */
24060 static void
24061 core2i7_first_cycle_multipass_end (const void *_data)
24062 {
24063 const_ix86_first_cycle_multipass_data_t data
24064 = (const_ix86_first_cycle_multipass_data_t) _data;
24065 ix86_first_cycle_multipass_data_t next_data
24066 = ix86_first_cycle_multipass_data;
24067
24068 if (data != NULL)
24069 {
24070 next_data->ifetch_block_len = data->ifetch_block_len;
24071 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24072 }
24073 }
24074
24075 /* Deallocate target data. */
24076 static void
24077 core2i7_first_cycle_multipass_fini (void *_data)
24078 {
24079 ix86_first_cycle_multipass_data_t data
24080 = (ix86_first_cycle_multipass_data_t) _data;
24081
24082 if (data->ready_try_change)
24083 {
24084 sbitmap_free (data->ready_try_change);
24085 data->ready_try_change = NULL;
24086 data->ready_try_change_size = 0;
24087 }
24088 }
24089
24090 /* Prepare for scheduling pass. */
24091 static void
24092 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24093 int verbose ATTRIBUTE_UNUSED,
24094 int max_uid ATTRIBUTE_UNUSED)
24095 {
24096 /* Install scheduling hooks for current CPU. Some of these hooks are used
24097 in time-critical parts of the scheduler, so we only set them up when
24098 they are actually used. */
24099 switch (ix86_tune)
24100 {
24101 case PROCESSOR_CORE2_32:
24102 case PROCESSOR_CORE2_64:
24103 case PROCESSOR_COREI7_32:
24104 case PROCESSOR_COREI7_64:
24105 targetm.sched.dfa_post_advance_cycle
24106 = core2i7_dfa_post_advance_cycle;
24107 targetm.sched.first_cycle_multipass_init
24108 = core2i7_first_cycle_multipass_init;
24109 targetm.sched.first_cycle_multipass_begin
24110 = core2i7_first_cycle_multipass_begin;
24111 targetm.sched.first_cycle_multipass_issue
24112 = core2i7_first_cycle_multipass_issue;
24113 targetm.sched.first_cycle_multipass_backtrack
24114 = core2i7_first_cycle_multipass_backtrack;
24115 targetm.sched.first_cycle_multipass_end
24116 = core2i7_first_cycle_multipass_end;
24117 targetm.sched.first_cycle_multipass_fini
24118 = core2i7_first_cycle_multipass_fini;
24119
24120 /* Set decoder parameters. */
24121 core2i7_secondary_decoder_max_insn_size = 8;
24122 core2i7_ifetch_block_size = 16;
24123 core2i7_ifetch_block_max_insns = 6;
24124 break;
24125
24126 default:
24127 targetm.sched.dfa_post_advance_cycle = NULL;
24128 targetm.sched.first_cycle_multipass_init = NULL;
24129 targetm.sched.first_cycle_multipass_begin = NULL;
24130 targetm.sched.first_cycle_multipass_issue = NULL;
24131 targetm.sched.first_cycle_multipass_backtrack = NULL;
24132 targetm.sched.first_cycle_multipass_end = NULL;
24133 targetm.sched.first_cycle_multipass_fini = NULL;
24134 break;
24135 }
24136 }
24137
24138 \f
24139 /* Compute the alignment given to a constant that is being placed in memory.
24140 EXP is the constant and ALIGN is the alignment that the object would
24141 ordinarily have.
24142 The value of this function is used instead of that alignment to align
24143 the object. */
24144
24145 int
24146 ix86_constant_alignment (tree exp, int align)
24147 {
24148 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24149 || TREE_CODE (exp) == INTEGER_CST)
24150 {
24151 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24152 return 64;
24153 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24154 return 128;
24155 }
24156 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24157 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24158 return BITS_PER_WORD;
24159
24160 return align;
24161 }
24162
24163 /* Compute the alignment for a static variable.
24164 TYPE is the data type, and ALIGN is the alignment that
24165 the object would ordinarily have. The value of this function is used
24166 instead of that alignment to align the object. */
24167
24168 int
24169 ix86_data_alignment (tree type, int align)
24170 {
24171 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24172
24173 if (AGGREGATE_TYPE_P (type)
24174 && TYPE_SIZE (type)
24175 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24176 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24177 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24178 && align < max_align)
24179 align = max_align;
24180
24181 /* x86-64 ABI requires arrays of 16 bytes or larger to be aligned
24182 to a 16byte boundary. */
24183 if (TARGET_64BIT)
24184 {
24185 if (AGGREGATE_TYPE_P (type)
24186 && TYPE_SIZE (type)
24187 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24188 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24189 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24190 return 128;
24191 }
24192
24193 if (TREE_CODE (type) == ARRAY_TYPE)
24194 {
24195 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24196 return 64;
24197 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24198 return 128;
24199 }
24200 else if (TREE_CODE (type) == COMPLEX_TYPE)
24201 {
24202
24203 if (TYPE_MODE (type) == DCmode && align < 64)
24204 return 64;
24205 if ((TYPE_MODE (type) == XCmode
24206 || TYPE_MODE (type) == TCmode) && align < 128)
24207 return 128;
24208 }
24209 else if ((TREE_CODE (type) == RECORD_TYPE
24210 || TREE_CODE (type) == UNION_TYPE
24211 || TREE_CODE (type) == QUAL_UNION_TYPE)
24212 && TYPE_FIELDS (type))
24213 {
24214 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24215 return 64;
24216 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24217 return 128;
24218 }
24219 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24220 || TREE_CODE (type) == INTEGER_TYPE)
24221 {
24222 if (TYPE_MODE (type) == DFmode && align < 64)
24223 return 64;
24224 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24225 return 128;
24226 }
24227
24228 return align;
24229 }
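/* Illustration, not exhaustive: on x86-64 a 16 byte aggregate such as
   "static double a[2];" matches the 128bit rule above and is aligned to
   128 bits, while a scalar "static double d;" keeps its natural 64bit
   alignment.  */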
24230
24231 /* Compute the alignment for a local variable or a stack slot. EXP is
24232 the data type or decl itself, MODE is the widest mode available and
24233 ALIGN is the alignment that the object would ordinarily have. The
24234 value of this macro is used instead of that alignment to align the
24235 object. */
24236
24237 unsigned int
24238 ix86_local_alignment (tree exp, enum machine_mode mode,
24239 unsigned int align)
24240 {
24241 tree type, decl;
24242
24243 if (exp && DECL_P (exp))
24244 {
24245 type = TREE_TYPE (exp);
24246 decl = exp;
24247 }
24248 else
24249 {
24250 type = exp;
24251 decl = NULL;
24252 }
24253
24254 /* Don't do dynamic stack realignment for long long objects with
24255 -mpreferred-stack-boundary=2. */
24256 if (!TARGET_64BIT
24257 && align == 64
24258 && ix86_preferred_stack_boundary < 64
24259 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
24260 && (!type || !TYPE_USER_ALIGN (type))
24261 && (!decl || !DECL_USER_ALIGN (decl)))
24262 align = 32;
24263
24264 /* If TYPE is NULL, we are allocating a stack slot for caller-save
24265 register in MODE. We will return the largest alignment of XF
24266 and DF. */
24267 if (!type)
24268 {
24269 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
24270 align = GET_MODE_ALIGNMENT (DFmode);
24271 return align;
24272 }
24273
24274 /* x86-64 ABI requires arrays of 16 bytes or larger to be aligned
24275 to a 16byte boundary. Exact wording is:
24276
24277 An array uses the same alignment as its elements, except that a local or
24278 global array variable of length at least 16 bytes or
24279 a C99 variable-length array variable always has alignment of at least 16 bytes.
24280
24281 This was added to allow use of aligned SSE instructions on arrays. The
24282 rule is meant for static storage (where the compiler cannot do the
24283 analysis by itself). We follow it for automatic variables only when it is
24284 convenient. We fully control everything in the function being compiled,
24285 and functions from other units cannot rely on the alignment.
24286
24287 Exclude the va_list type. It is the common case of a local array where
24288 we cannot benefit from the alignment. */
24289 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
24290 && TARGET_SSE)
24291 {
24292 if (AGGREGATE_TYPE_P (type)
24293 && (va_list_type_node == NULL_TREE
24294 || (TYPE_MAIN_VARIANT (type)
24295 != TYPE_MAIN_VARIANT (va_list_type_node)))
24296 && TYPE_SIZE (type)
24297 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24298 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
24299 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24300 return 128;
24301 }
24302 if (TREE_CODE (type) == ARRAY_TYPE)
24303 {
24304 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24305 return 64;
24306 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24307 return 128;
24308 }
24309 else if (TREE_CODE (type) == COMPLEX_TYPE)
24310 {
24311 if (TYPE_MODE (type) == DCmode && align < 64)
24312 return 64;
24313 if ((TYPE_MODE (type) == XCmode
24314 || TYPE_MODE (type) == TCmode) && align < 128)
24315 return 128;
24316 }
24317 else if ((TREE_CODE (type) == RECORD_TYPE
24318 || TREE_CODE (type) == UNION_TYPE
24319 || TREE_CODE (type) == QUAL_UNION_TYPE)
24320 && TYPE_FIELDS (type))
24321 {
24322 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24323 return 64;
24324 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24325 return 128;
24326 }
24327 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24328 || TREE_CODE (type) == INTEGER_TYPE)
24329 {
24330
24331 if (TYPE_MODE (type) == DFmode && align < 64)
24332 return 64;
24333 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24334 return 128;
24335 }
24336 return align;
24337 }
24338
24339 /* Compute the minimum required alignment for dynamic stack realignment
24340 purposes for a local variable, parameter or a stack slot. EXP is
24341 the data type or decl itself, MODE is its mode and ALIGN is the
24342 alignment that the object would ordinarily have. */
24343
24344 unsigned int
24345 ix86_minimum_alignment (tree exp, enum machine_mode mode,
24346 unsigned int align)
24347 {
24348 tree type, decl;
24349
24350 if (exp && DECL_P (exp))
24351 {
24352 type = TREE_TYPE (exp);
24353 decl = exp;
24354 }
24355 else
24356 {
24357 type = exp;
24358 decl = NULL;
24359 }
24360
24361 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
24362 return align;
24363
24364 /* Don't do dynamic stack realignment for long long objects with
24365 -mpreferred-stack-boundary=2. */
24366 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
24367 && (!type || !TYPE_USER_ALIGN (type))
24368 && (!decl || !DECL_USER_ALIGN (decl)))
24369 return 32;
24370
24371 return align;
24372 }
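/* E.g. in 32bit code compiled with -mpreferred-stack-boundary=2, a plain
   "long long" local would normally request 64bit alignment; returning 32
   here avoids triggering dynamic stack realignment just for it.  */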
24373 \f
24374 /* Find a location for the static chain incoming to a nested function.
24375 This is a register, unless all free registers are used by arguments. */
24376
24377 static rtx
24378 ix86_static_chain (const_tree fndecl, bool incoming_p)
24379 {
24380 unsigned regno;
24381
24382 if (!DECL_STATIC_CHAIN (fndecl))
24383 return NULL;
24384
24385 if (TARGET_64BIT)
24386 {
24387 /* We always use R10 in 64-bit mode. */
24388 regno = R10_REG;
24389 }
24390 else
24391 {
24392 tree fntype;
24393 unsigned int ccvt;
24394
24395 /* By default in 32-bit mode we use ECX to pass the static chain. */
24396 regno = CX_REG;
24397
24398 fntype = TREE_TYPE (fndecl);
24399 ccvt = ix86_get_callcvt (fntype);
24400 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
24401 {
24402 /* Fastcall functions use ecx/edx for arguments, which leaves
24403 us with EAX for the static chain.
24404 Thiscall functions use ecx for arguments, which also
24405 leaves us with EAX for the static chain. */
24406 regno = AX_REG;
24407 }
24408 else if (ix86_function_regparm (fntype, fndecl) == 3)
24409 {
24410 /* For regparm 3, we have no free call-clobbered registers in
24411 which to store the static chain. In order to implement this,
24412 we have the trampoline push the static chain to the stack.
24413 However, we can't push a value below the return address when
24414 we call the nested function directly, so we have to use an
24415 alternate entry point. For this we use ESI, and have the
24416 alternate entry point push ESI, so that things appear the
24417 same once we're executing the nested function. */
24418 if (incoming_p)
24419 {
24420 if (fndecl == current_function_decl)
24421 ix86_static_chain_on_stack = true;
24422 return gen_frame_mem (SImode,
24423 plus_constant (Pmode,
24424 arg_pointer_rtx, -8));
24425 }
24426 regno = SI_REG;
24427 }
24428 }
24429
24430 return gen_rtx_REG (Pmode, regno);
24431 }
24432
24433 /* Emit RTL insns to initialize the variable parts of a trampoline.
24434 FNDECL is the decl of the target address; M_TRAMP is a MEM for
24435 the trampoline, and CHAIN_VALUE is an RTX for the static chain
24436 to be passed to the target function. */
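/* For reference, a sketch of the 64bit trampoline emitted below when
   ptr_mode == DImode and the movabs forms are needed (the byte values
   follow from the gen_int_mode calls):
       49 bb <8 byte fnaddr>   movabs $fnaddr, %r11
       49 ba <8 byte chain>    movabs $chain, %r10
       49 ff e3                jmp *%r11
       90                      nop, pads the final 32bit store
   The 32bit variant instead emits a mov immediate (or push) of the static
   chain followed by a relative jmp to the target.  */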
24437
24438 static void
24439 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
24440 {
24441 rtx mem, fnaddr;
24442 int opcode;
24443 int offset = 0;
24444
24445 fnaddr = XEXP (DECL_RTL (fndecl), 0);
24446
24447 if (TARGET_64BIT)
24448 {
24449 int size;
24450
24451 /* Load the function address to r11. Try to load the address using
24452 the shorter movl instead of movabs. We may want to support
24453 movq for kernel mode, but the kernel does not use trampolines at
24454 the moment. FNADDR is a 32bit address and may not be in
24455 DImode when ptr_mode == SImode. Always use movl in this
24456 case. */
24457 if (ptr_mode == SImode
24458 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
24459 {
24460 fnaddr = copy_addr_to_reg (fnaddr);
24461
24462 mem = adjust_address (m_tramp, HImode, offset);
24463 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
24464
24465 mem = adjust_address (m_tramp, SImode, offset + 2);
24466 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
24467 offset += 6;
24468 }
24469 else
24470 {
24471 mem = adjust_address (m_tramp, HImode, offset);
24472 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
24473
24474 mem = adjust_address (m_tramp, DImode, offset + 2);
24475 emit_move_insn (mem, fnaddr);
24476 offset += 10;
24477 }
24478
24479 /* Load static chain using movabs to r10. Use the shorter movl
24480 instead of movabs when ptr_mode == SImode. */
24481 if (ptr_mode == SImode)
24482 {
24483 opcode = 0xba41;
24484 size = 6;
24485 }
24486 else
24487 {
24488 opcode = 0xba49;
24489 size = 10;
24490 }
24491
24492 mem = adjust_address (m_tramp, HImode, offset);
24493 emit_move_insn (mem, gen_int_mode (opcode, HImode));
24494
24495 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
24496 emit_move_insn (mem, chain_value);
24497 offset += size;
24498
24499 /* Jump to r11; the last (unused) byte is a nop, only there to
24500 pad the write out to a single 32-bit store. */
24501 mem = adjust_address (m_tramp, SImode, offset);
24502 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
24503 offset += 4;
24504 }
24505 else
24506 {
24507 rtx disp, chain;
24508
24509 /* Depending on the static chain location, either load a register
24510 with a constant, or push the constant to the stack. All of the
24511 instructions are the same size. */
24512 chain = ix86_static_chain (fndecl, true);
24513 if (REG_P (chain))
24514 {
24515 switch (REGNO (chain))
24516 {
24517 case AX_REG:
24518 opcode = 0xb8; break;
24519 case CX_REG:
24520 opcode = 0xb9; break;
24521 default:
24522 gcc_unreachable ();
24523 }
24524 }
24525 else
24526 opcode = 0x68;
24527
24528 mem = adjust_address (m_tramp, QImode, offset);
24529 emit_move_insn (mem, gen_int_mode (opcode, QImode));
24530
24531 mem = adjust_address (m_tramp, SImode, offset + 1);
24532 emit_move_insn (mem, chain_value);
24533 offset += 5;
24534
24535 mem = adjust_address (m_tramp, QImode, offset);
24536 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
24537
24538 mem = adjust_address (m_tramp, SImode, offset + 1);
24539
24540 /* Compute offset from the end of the jmp to the target function.
24541 In the case in which the trampoline stores the static chain on
24542 the stack, we need to skip the first insn which pushes the
24543 (call-saved) register static chain; this push is 1 byte. */
24544 offset += 5;
24545 disp = expand_binop (SImode, sub_optab, fnaddr,
24546 plus_constant (Pmode, XEXP (m_tramp, 0),
24547 offset - (MEM_P (chain) ? 1 : 0)),
24548 NULL_RTX, 1, OPTAB_DIRECT);
24549 emit_move_insn (mem, disp);
24550 }
24551
24552 gcc_assert (offset <= TRAMPOLINE_SIZE);
24553
24554 #ifdef HAVE_ENABLE_EXECUTE_STACK
24555 #ifdef CHECK_EXECUTE_STACK_ENABLED
24556 if (CHECK_EXECUTE_STACK_ENABLED)
24557 #endif
24558 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
24559 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
24560 #endif
24561 }
24562 \f
24563 /* The following file contains several enumerations and data structures
24564 built from the definitions in i386-builtin-types.def. */
24565
24566 #include "i386-builtin-types.inc"
24567
24568 /* Table for the ix86 builtin non-function types. */
24569 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
24570
24571 /* Retrieve an element from the above table, building some of
24572 the types lazily. */
24573
24574 static tree
24575 ix86_get_builtin_type (enum ix86_builtin_type tcode)
24576 {
24577 unsigned int index;
24578 tree type, itype;
24579
24580 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
24581
24582 type = ix86_builtin_type_tab[(int) tcode];
24583 if (type != NULL)
24584 return type;
24585
24586 gcc_assert (tcode > IX86_BT_LAST_PRIM);
24587 if (tcode <= IX86_BT_LAST_VECT)
24588 {
24589 enum machine_mode mode;
24590
24591 index = tcode - IX86_BT_LAST_PRIM - 1;
24592 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
24593 mode = ix86_builtin_type_vect_mode[index];
24594
24595 type = build_vector_type_for_mode (itype, mode);
24596 }
24597 else
24598 {
24599 int quals;
24600
24601 index = tcode - IX86_BT_LAST_VECT - 1;
24602 if (tcode <= IX86_BT_LAST_PTR)
24603 quals = TYPE_UNQUALIFIED;
24604 else
24605 quals = TYPE_QUAL_CONST;
24606
24607 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
24608 if (quals != TYPE_UNQUALIFIED)
24609 itype = build_qualified_type (itype, quals);
24610
24611 type = build_pointer_type (itype);
24612 }
24613
24614 ix86_builtin_type_tab[(int) tcode] = type;
24615 return type;
24616 }
24617
24618 /* Table for the ix86 builtin function types. */
24619 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
24620
24621 /* Retrieve an element from the above table, building some of
24622 the types lazily. */
24623
24624 static tree
24625 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
24626 {
24627 tree type;
24628
24629 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
24630
24631 type = ix86_builtin_func_type_tab[(int) tcode];
24632 if (type != NULL)
24633 return type;
24634
24635 if (tcode <= IX86_BT_LAST_FUNC)
24636 {
24637 unsigned start = ix86_builtin_func_start[(int) tcode];
24638 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
24639 tree rtype, atype, args = void_list_node;
24640 unsigned i;
24641
24642 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
24643 for (i = after - 1; i > start; --i)
24644 {
24645 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
24646 args = tree_cons (NULL, atype, args);
24647 }
24648
24649 type = build_function_type (rtype, args);
24650 }
24651 else
24652 {
24653 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
24654 enum ix86_builtin_func_type icode;
24655
24656 icode = ix86_builtin_func_alias_base[index];
24657 type = ix86_get_builtin_func_type (icode);
24658 }
24659
24660 ix86_builtin_func_type_tab[(int) tcode] = type;
24661 return type;
24662 }
24663
24664
24665 /* Codes for all the SSE/MMX builtins. */
24666 enum ix86_builtins
24667 {
24668 IX86_BUILTIN_ADDPS,
24669 IX86_BUILTIN_ADDSS,
24670 IX86_BUILTIN_DIVPS,
24671 IX86_BUILTIN_DIVSS,
24672 IX86_BUILTIN_MULPS,
24673 IX86_BUILTIN_MULSS,
24674 IX86_BUILTIN_SUBPS,
24675 IX86_BUILTIN_SUBSS,
24676
24677 IX86_BUILTIN_CMPEQPS,
24678 IX86_BUILTIN_CMPLTPS,
24679 IX86_BUILTIN_CMPLEPS,
24680 IX86_BUILTIN_CMPGTPS,
24681 IX86_BUILTIN_CMPGEPS,
24682 IX86_BUILTIN_CMPNEQPS,
24683 IX86_BUILTIN_CMPNLTPS,
24684 IX86_BUILTIN_CMPNLEPS,
24685 IX86_BUILTIN_CMPNGTPS,
24686 IX86_BUILTIN_CMPNGEPS,
24687 IX86_BUILTIN_CMPORDPS,
24688 IX86_BUILTIN_CMPUNORDPS,
24689 IX86_BUILTIN_CMPEQSS,
24690 IX86_BUILTIN_CMPLTSS,
24691 IX86_BUILTIN_CMPLESS,
24692 IX86_BUILTIN_CMPNEQSS,
24693 IX86_BUILTIN_CMPNLTSS,
24694 IX86_BUILTIN_CMPNLESS,
24695 IX86_BUILTIN_CMPNGTSS,
24696 IX86_BUILTIN_CMPNGESS,
24697 IX86_BUILTIN_CMPORDSS,
24698 IX86_BUILTIN_CMPUNORDSS,
24699
24700 IX86_BUILTIN_COMIEQSS,
24701 IX86_BUILTIN_COMILTSS,
24702 IX86_BUILTIN_COMILESS,
24703 IX86_BUILTIN_COMIGTSS,
24704 IX86_BUILTIN_COMIGESS,
24705 IX86_BUILTIN_COMINEQSS,
24706 IX86_BUILTIN_UCOMIEQSS,
24707 IX86_BUILTIN_UCOMILTSS,
24708 IX86_BUILTIN_UCOMILESS,
24709 IX86_BUILTIN_UCOMIGTSS,
24710 IX86_BUILTIN_UCOMIGESS,
24711 IX86_BUILTIN_UCOMINEQSS,
24712
24713 IX86_BUILTIN_CVTPI2PS,
24714 IX86_BUILTIN_CVTPS2PI,
24715 IX86_BUILTIN_CVTSI2SS,
24716 IX86_BUILTIN_CVTSI642SS,
24717 IX86_BUILTIN_CVTSS2SI,
24718 IX86_BUILTIN_CVTSS2SI64,
24719 IX86_BUILTIN_CVTTPS2PI,
24720 IX86_BUILTIN_CVTTSS2SI,
24721 IX86_BUILTIN_CVTTSS2SI64,
24722
24723 IX86_BUILTIN_MAXPS,
24724 IX86_BUILTIN_MAXSS,
24725 IX86_BUILTIN_MINPS,
24726 IX86_BUILTIN_MINSS,
24727
24728 IX86_BUILTIN_LOADUPS,
24729 IX86_BUILTIN_STOREUPS,
24730 IX86_BUILTIN_MOVSS,
24731
24732 IX86_BUILTIN_MOVHLPS,
24733 IX86_BUILTIN_MOVLHPS,
24734 IX86_BUILTIN_LOADHPS,
24735 IX86_BUILTIN_LOADLPS,
24736 IX86_BUILTIN_STOREHPS,
24737 IX86_BUILTIN_STORELPS,
24738
24739 IX86_BUILTIN_MASKMOVQ,
24740 IX86_BUILTIN_MOVMSKPS,
24741 IX86_BUILTIN_PMOVMSKB,
24742
24743 IX86_BUILTIN_MOVNTPS,
24744 IX86_BUILTIN_MOVNTQ,
24745
24746 IX86_BUILTIN_LOADDQU,
24747 IX86_BUILTIN_STOREDQU,
24748
24749 IX86_BUILTIN_PACKSSWB,
24750 IX86_BUILTIN_PACKSSDW,
24751 IX86_BUILTIN_PACKUSWB,
24752
24753 IX86_BUILTIN_PADDB,
24754 IX86_BUILTIN_PADDW,
24755 IX86_BUILTIN_PADDD,
24756 IX86_BUILTIN_PADDQ,
24757 IX86_BUILTIN_PADDSB,
24758 IX86_BUILTIN_PADDSW,
24759 IX86_BUILTIN_PADDUSB,
24760 IX86_BUILTIN_PADDUSW,
24761 IX86_BUILTIN_PSUBB,
24762 IX86_BUILTIN_PSUBW,
24763 IX86_BUILTIN_PSUBD,
24764 IX86_BUILTIN_PSUBQ,
24765 IX86_BUILTIN_PSUBSB,
24766 IX86_BUILTIN_PSUBSW,
24767 IX86_BUILTIN_PSUBUSB,
24768 IX86_BUILTIN_PSUBUSW,
24769
24770 IX86_BUILTIN_PAND,
24771 IX86_BUILTIN_PANDN,
24772 IX86_BUILTIN_POR,
24773 IX86_BUILTIN_PXOR,
24774
24775 IX86_BUILTIN_PAVGB,
24776 IX86_BUILTIN_PAVGW,
24777
24778 IX86_BUILTIN_PCMPEQB,
24779 IX86_BUILTIN_PCMPEQW,
24780 IX86_BUILTIN_PCMPEQD,
24781 IX86_BUILTIN_PCMPGTB,
24782 IX86_BUILTIN_PCMPGTW,
24783 IX86_BUILTIN_PCMPGTD,
24784
24785 IX86_BUILTIN_PMADDWD,
24786
24787 IX86_BUILTIN_PMAXSW,
24788 IX86_BUILTIN_PMAXUB,
24789 IX86_BUILTIN_PMINSW,
24790 IX86_BUILTIN_PMINUB,
24791
24792 IX86_BUILTIN_PMULHUW,
24793 IX86_BUILTIN_PMULHW,
24794 IX86_BUILTIN_PMULLW,
24795
24796 IX86_BUILTIN_PSADBW,
24797 IX86_BUILTIN_PSHUFW,
24798
24799 IX86_BUILTIN_PSLLW,
24800 IX86_BUILTIN_PSLLD,
24801 IX86_BUILTIN_PSLLQ,
24802 IX86_BUILTIN_PSRAW,
24803 IX86_BUILTIN_PSRAD,
24804 IX86_BUILTIN_PSRLW,
24805 IX86_BUILTIN_PSRLD,
24806 IX86_BUILTIN_PSRLQ,
24807 IX86_BUILTIN_PSLLWI,
24808 IX86_BUILTIN_PSLLDI,
24809 IX86_BUILTIN_PSLLQI,
24810 IX86_BUILTIN_PSRAWI,
24811 IX86_BUILTIN_PSRADI,
24812 IX86_BUILTIN_PSRLWI,
24813 IX86_BUILTIN_PSRLDI,
24814 IX86_BUILTIN_PSRLQI,
24815
24816 IX86_BUILTIN_PUNPCKHBW,
24817 IX86_BUILTIN_PUNPCKHWD,
24818 IX86_BUILTIN_PUNPCKHDQ,
24819 IX86_BUILTIN_PUNPCKLBW,
24820 IX86_BUILTIN_PUNPCKLWD,
24821 IX86_BUILTIN_PUNPCKLDQ,
24822
24823 IX86_BUILTIN_SHUFPS,
24824
24825 IX86_BUILTIN_RCPPS,
24826 IX86_BUILTIN_RCPSS,
24827 IX86_BUILTIN_RSQRTPS,
24828 IX86_BUILTIN_RSQRTPS_NR,
24829 IX86_BUILTIN_RSQRTSS,
24830 IX86_BUILTIN_RSQRTF,
24831 IX86_BUILTIN_SQRTPS,
24832 IX86_BUILTIN_SQRTPS_NR,
24833 IX86_BUILTIN_SQRTSS,
24834
24835 IX86_BUILTIN_UNPCKHPS,
24836 IX86_BUILTIN_UNPCKLPS,
24837
24838 IX86_BUILTIN_ANDPS,
24839 IX86_BUILTIN_ANDNPS,
24840 IX86_BUILTIN_ORPS,
24841 IX86_BUILTIN_XORPS,
24842
24843 IX86_BUILTIN_EMMS,
24844 IX86_BUILTIN_LDMXCSR,
24845 IX86_BUILTIN_STMXCSR,
24846 IX86_BUILTIN_SFENCE,
24847
24848 /* 3DNow! Original */
24849 IX86_BUILTIN_FEMMS,
24850 IX86_BUILTIN_PAVGUSB,
24851 IX86_BUILTIN_PF2ID,
24852 IX86_BUILTIN_PFACC,
24853 IX86_BUILTIN_PFADD,
24854 IX86_BUILTIN_PFCMPEQ,
24855 IX86_BUILTIN_PFCMPGE,
24856 IX86_BUILTIN_PFCMPGT,
24857 IX86_BUILTIN_PFMAX,
24858 IX86_BUILTIN_PFMIN,
24859 IX86_BUILTIN_PFMUL,
24860 IX86_BUILTIN_PFRCP,
24861 IX86_BUILTIN_PFRCPIT1,
24862 IX86_BUILTIN_PFRCPIT2,
24863 IX86_BUILTIN_PFRSQIT1,
24864 IX86_BUILTIN_PFRSQRT,
24865 IX86_BUILTIN_PFSUB,
24866 IX86_BUILTIN_PFSUBR,
24867 IX86_BUILTIN_PI2FD,
24868 IX86_BUILTIN_PMULHRW,
24869
24870 /* 3DNow! Athlon Extensions */
24871 IX86_BUILTIN_PF2IW,
24872 IX86_BUILTIN_PFNACC,
24873 IX86_BUILTIN_PFPNACC,
24874 IX86_BUILTIN_PI2FW,
24875 IX86_BUILTIN_PSWAPDSI,
24876 IX86_BUILTIN_PSWAPDSF,
24877
24878 /* SSE2 */
24879 IX86_BUILTIN_ADDPD,
24880 IX86_BUILTIN_ADDSD,
24881 IX86_BUILTIN_DIVPD,
24882 IX86_BUILTIN_DIVSD,
24883 IX86_BUILTIN_MULPD,
24884 IX86_BUILTIN_MULSD,
24885 IX86_BUILTIN_SUBPD,
24886 IX86_BUILTIN_SUBSD,
24887
24888 IX86_BUILTIN_CMPEQPD,
24889 IX86_BUILTIN_CMPLTPD,
24890 IX86_BUILTIN_CMPLEPD,
24891 IX86_BUILTIN_CMPGTPD,
24892 IX86_BUILTIN_CMPGEPD,
24893 IX86_BUILTIN_CMPNEQPD,
24894 IX86_BUILTIN_CMPNLTPD,
24895 IX86_BUILTIN_CMPNLEPD,
24896 IX86_BUILTIN_CMPNGTPD,
24897 IX86_BUILTIN_CMPNGEPD,
24898 IX86_BUILTIN_CMPORDPD,
24899 IX86_BUILTIN_CMPUNORDPD,
24900 IX86_BUILTIN_CMPEQSD,
24901 IX86_BUILTIN_CMPLTSD,
24902 IX86_BUILTIN_CMPLESD,
24903 IX86_BUILTIN_CMPNEQSD,
24904 IX86_BUILTIN_CMPNLTSD,
24905 IX86_BUILTIN_CMPNLESD,
24906 IX86_BUILTIN_CMPORDSD,
24907 IX86_BUILTIN_CMPUNORDSD,
24908
24909 IX86_BUILTIN_COMIEQSD,
24910 IX86_BUILTIN_COMILTSD,
24911 IX86_BUILTIN_COMILESD,
24912 IX86_BUILTIN_COMIGTSD,
24913 IX86_BUILTIN_COMIGESD,
24914 IX86_BUILTIN_COMINEQSD,
24915 IX86_BUILTIN_UCOMIEQSD,
24916 IX86_BUILTIN_UCOMILTSD,
24917 IX86_BUILTIN_UCOMILESD,
24918 IX86_BUILTIN_UCOMIGTSD,
24919 IX86_BUILTIN_UCOMIGESD,
24920 IX86_BUILTIN_UCOMINEQSD,
24921
24922 IX86_BUILTIN_MAXPD,
24923 IX86_BUILTIN_MAXSD,
24924 IX86_BUILTIN_MINPD,
24925 IX86_BUILTIN_MINSD,
24926
24927 IX86_BUILTIN_ANDPD,
24928 IX86_BUILTIN_ANDNPD,
24929 IX86_BUILTIN_ORPD,
24930 IX86_BUILTIN_XORPD,
24931
24932 IX86_BUILTIN_SQRTPD,
24933 IX86_BUILTIN_SQRTSD,
24934
24935 IX86_BUILTIN_UNPCKHPD,
24936 IX86_BUILTIN_UNPCKLPD,
24937
24938 IX86_BUILTIN_SHUFPD,
24939
24940 IX86_BUILTIN_LOADUPD,
24941 IX86_BUILTIN_STOREUPD,
24942 IX86_BUILTIN_MOVSD,
24943
24944 IX86_BUILTIN_LOADHPD,
24945 IX86_BUILTIN_LOADLPD,
24946
24947 IX86_BUILTIN_CVTDQ2PD,
24948 IX86_BUILTIN_CVTDQ2PS,
24949
24950 IX86_BUILTIN_CVTPD2DQ,
24951 IX86_BUILTIN_CVTPD2PI,
24952 IX86_BUILTIN_CVTPD2PS,
24953 IX86_BUILTIN_CVTTPD2DQ,
24954 IX86_BUILTIN_CVTTPD2PI,
24955
24956 IX86_BUILTIN_CVTPI2PD,
24957 IX86_BUILTIN_CVTSI2SD,
24958 IX86_BUILTIN_CVTSI642SD,
24959
24960 IX86_BUILTIN_CVTSD2SI,
24961 IX86_BUILTIN_CVTSD2SI64,
24962 IX86_BUILTIN_CVTSD2SS,
24963 IX86_BUILTIN_CVTSS2SD,
24964 IX86_BUILTIN_CVTTSD2SI,
24965 IX86_BUILTIN_CVTTSD2SI64,
24966
24967 IX86_BUILTIN_CVTPS2DQ,
24968 IX86_BUILTIN_CVTPS2PD,
24969 IX86_BUILTIN_CVTTPS2DQ,
24970
24971 IX86_BUILTIN_MOVNTI,
24972 IX86_BUILTIN_MOVNTI64,
24973 IX86_BUILTIN_MOVNTPD,
24974 IX86_BUILTIN_MOVNTDQ,
24975
24976 IX86_BUILTIN_MOVQ128,
24977
24978 /* SSE2 MMX */
24979 IX86_BUILTIN_MASKMOVDQU,
24980 IX86_BUILTIN_MOVMSKPD,
24981 IX86_BUILTIN_PMOVMSKB128,
24982
24983 IX86_BUILTIN_PACKSSWB128,
24984 IX86_BUILTIN_PACKSSDW128,
24985 IX86_BUILTIN_PACKUSWB128,
24986
24987 IX86_BUILTIN_PADDB128,
24988 IX86_BUILTIN_PADDW128,
24989 IX86_BUILTIN_PADDD128,
24990 IX86_BUILTIN_PADDQ128,
24991 IX86_BUILTIN_PADDSB128,
24992 IX86_BUILTIN_PADDSW128,
24993 IX86_BUILTIN_PADDUSB128,
24994 IX86_BUILTIN_PADDUSW128,
24995 IX86_BUILTIN_PSUBB128,
24996 IX86_BUILTIN_PSUBW128,
24997 IX86_BUILTIN_PSUBD128,
24998 IX86_BUILTIN_PSUBQ128,
24999 IX86_BUILTIN_PSUBSB128,
25000 IX86_BUILTIN_PSUBSW128,
25001 IX86_BUILTIN_PSUBUSB128,
25002 IX86_BUILTIN_PSUBUSW128,
25003
25004 IX86_BUILTIN_PAND128,
25005 IX86_BUILTIN_PANDN128,
25006 IX86_BUILTIN_POR128,
25007 IX86_BUILTIN_PXOR128,
25008
25009 IX86_BUILTIN_PAVGB128,
25010 IX86_BUILTIN_PAVGW128,
25011
25012 IX86_BUILTIN_PCMPEQB128,
25013 IX86_BUILTIN_PCMPEQW128,
25014 IX86_BUILTIN_PCMPEQD128,
25015 IX86_BUILTIN_PCMPGTB128,
25016 IX86_BUILTIN_PCMPGTW128,
25017 IX86_BUILTIN_PCMPGTD128,
25018
25019 IX86_BUILTIN_PMADDWD128,
25020
25021 IX86_BUILTIN_PMAXSW128,
25022 IX86_BUILTIN_PMAXUB128,
25023 IX86_BUILTIN_PMINSW128,
25024 IX86_BUILTIN_PMINUB128,
25025
25026 IX86_BUILTIN_PMULUDQ,
25027 IX86_BUILTIN_PMULUDQ128,
25028 IX86_BUILTIN_PMULHUW128,
25029 IX86_BUILTIN_PMULHW128,
25030 IX86_BUILTIN_PMULLW128,
25031
25032 IX86_BUILTIN_PSADBW128,
25033 IX86_BUILTIN_PSHUFHW,
25034 IX86_BUILTIN_PSHUFLW,
25035 IX86_BUILTIN_PSHUFD,
25036
25037 IX86_BUILTIN_PSLLDQI128,
25038 IX86_BUILTIN_PSLLWI128,
25039 IX86_BUILTIN_PSLLDI128,
25040 IX86_BUILTIN_PSLLQI128,
25041 IX86_BUILTIN_PSRAWI128,
25042 IX86_BUILTIN_PSRADI128,
25043 IX86_BUILTIN_PSRLDQI128,
25044 IX86_BUILTIN_PSRLWI128,
25045 IX86_BUILTIN_PSRLDI128,
25046 IX86_BUILTIN_PSRLQI128,
25047
25048 IX86_BUILTIN_PSLLDQ128,
25049 IX86_BUILTIN_PSLLW128,
25050 IX86_BUILTIN_PSLLD128,
25051 IX86_BUILTIN_PSLLQ128,
25052 IX86_BUILTIN_PSRAW128,
25053 IX86_BUILTIN_PSRAD128,
25054 IX86_BUILTIN_PSRLW128,
25055 IX86_BUILTIN_PSRLD128,
25056 IX86_BUILTIN_PSRLQ128,
25057
25058 IX86_BUILTIN_PUNPCKHBW128,
25059 IX86_BUILTIN_PUNPCKHWD128,
25060 IX86_BUILTIN_PUNPCKHDQ128,
25061 IX86_BUILTIN_PUNPCKHQDQ128,
25062 IX86_BUILTIN_PUNPCKLBW128,
25063 IX86_BUILTIN_PUNPCKLWD128,
25064 IX86_BUILTIN_PUNPCKLDQ128,
25065 IX86_BUILTIN_PUNPCKLQDQ128,
25066
25067 IX86_BUILTIN_CLFLUSH,
25068 IX86_BUILTIN_MFENCE,
25069 IX86_BUILTIN_LFENCE,
25070 IX86_BUILTIN_PAUSE,
25071
25072 IX86_BUILTIN_BSRSI,
25073 IX86_BUILTIN_BSRDI,
25074 IX86_BUILTIN_RDPMC,
25075 IX86_BUILTIN_RDTSC,
25076 IX86_BUILTIN_RDTSCP,
25077 IX86_BUILTIN_ROLQI,
25078 IX86_BUILTIN_ROLHI,
25079 IX86_BUILTIN_RORQI,
25080 IX86_BUILTIN_RORHI,
25081
25082 /* SSE3. */
25083 IX86_BUILTIN_ADDSUBPS,
25084 IX86_BUILTIN_HADDPS,
25085 IX86_BUILTIN_HSUBPS,
25086 IX86_BUILTIN_MOVSHDUP,
25087 IX86_BUILTIN_MOVSLDUP,
25088 IX86_BUILTIN_ADDSUBPD,
25089 IX86_BUILTIN_HADDPD,
25090 IX86_BUILTIN_HSUBPD,
25091 IX86_BUILTIN_LDDQU,
25092
25093 IX86_BUILTIN_MONITOR,
25094 IX86_BUILTIN_MWAIT,
25095
25096 /* SSSE3. */
25097 IX86_BUILTIN_PHADDW,
25098 IX86_BUILTIN_PHADDD,
25099 IX86_BUILTIN_PHADDSW,
25100 IX86_BUILTIN_PHSUBW,
25101 IX86_BUILTIN_PHSUBD,
25102 IX86_BUILTIN_PHSUBSW,
25103 IX86_BUILTIN_PMADDUBSW,
25104 IX86_BUILTIN_PMULHRSW,
25105 IX86_BUILTIN_PSHUFB,
25106 IX86_BUILTIN_PSIGNB,
25107 IX86_BUILTIN_PSIGNW,
25108 IX86_BUILTIN_PSIGND,
25109 IX86_BUILTIN_PALIGNR,
25110 IX86_BUILTIN_PABSB,
25111 IX86_BUILTIN_PABSW,
25112 IX86_BUILTIN_PABSD,
25113
25114 IX86_BUILTIN_PHADDW128,
25115 IX86_BUILTIN_PHADDD128,
25116 IX86_BUILTIN_PHADDSW128,
25117 IX86_BUILTIN_PHSUBW128,
25118 IX86_BUILTIN_PHSUBD128,
25119 IX86_BUILTIN_PHSUBSW128,
25120 IX86_BUILTIN_PMADDUBSW128,
25121 IX86_BUILTIN_PMULHRSW128,
25122 IX86_BUILTIN_PSHUFB128,
25123 IX86_BUILTIN_PSIGNB128,
25124 IX86_BUILTIN_PSIGNW128,
25125 IX86_BUILTIN_PSIGND128,
25126 IX86_BUILTIN_PALIGNR128,
25127 IX86_BUILTIN_PABSB128,
25128 IX86_BUILTIN_PABSW128,
25129 IX86_BUILTIN_PABSD128,
25130
25131 /* AMDFAM10 - SSE4A New Instructions. */
25132 IX86_BUILTIN_MOVNTSD,
25133 IX86_BUILTIN_MOVNTSS,
25134 IX86_BUILTIN_EXTRQI,
25135 IX86_BUILTIN_EXTRQ,
25136 IX86_BUILTIN_INSERTQI,
25137 IX86_BUILTIN_INSERTQ,
25138
25139 /* SSE4.1. */
25140 IX86_BUILTIN_BLENDPD,
25141 IX86_BUILTIN_BLENDPS,
25142 IX86_BUILTIN_BLENDVPD,
25143 IX86_BUILTIN_BLENDVPS,
25144 IX86_BUILTIN_PBLENDVB128,
25145 IX86_BUILTIN_PBLENDW128,
25146
25147 IX86_BUILTIN_DPPD,
25148 IX86_BUILTIN_DPPS,
25149
25150 IX86_BUILTIN_INSERTPS128,
25151
25152 IX86_BUILTIN_MOVNTDQA,
25153 IX86_BUILTIN_MPSADBW128,
25154 IX86_BUILTIN_PACKUSDW128,
25155 IX86_BUILTIN_PCMPEQQ,
25156 IX86_BUILTIN_PHMINPOSUW128,
25157
25158 IX86_BUILTIN_PMAXSB128,
25159 IX86_BUILTIN_PMAXSD128,
25160 IX86_BUILTIN_PMAXUD128,
25161 IX86_BUILTIN_PMAXUW128,
25162
25163 IX86_BUILTIN_PMINSB128,
25164 IX86_BUILTIN_PMINSD128,
25165 IX86_BUILTIN_PMINUD128,
25166 IX86_BUILTIN_PMINUW128,
25167
25168 IX86_BUILTIN_PMOVSXBW128,
25169 IX86_BUILTIN_PMOVSXBD128,
25170 IX86_BUILTIN_PMOVSXBQ128,
25171 IX86_BUILTIN_PMOVSXWD128,
25172 IX86_BUILTIN_PMOVSXWQ128,
25173 IX86_BUILTIN_PMOVSXDQ128,
25174
25175 IX86_BUILTIN_PMOVZXBW128,
25176 IX86_BUILTIN_PMOVZXBD128,
25177 IX86_BUILTIN_PMOVZXBQ128,
25178 IX86_BUILTIN_PMOVZXWD128,
25179 IX86_BUILTIN_PMOVZXWQ128,
25180 IX86_BUILTIN_PMOVZXDQ128,
25181
25182 IX86_BUILTIN_PMULDQ128,
25183 IX86_BUILTIN_PMULLD128,
25184
25185 IX86_BUILTIN_ROUNDSD,
25186 IX86_BUILTIN_ROUNDSS,
25187
25188 IX86_BUILTIN_ROUNDPD,
25189 IX86_BUILTIN_ROUNDPS,
25190
25191 IX86_BUILTIN_FLOORPD,
25192 IX86_BUILTIN_CEILPD,
25193 IX86_BUILTIN_TRUNCPD,
25194 IX86_BUILTIN_RINTPD,
25195 IX86_BUILTIN_ROUNDPD_AZ,
25196
25197 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
25198 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
25199 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
25200
25201 IX86_BUILTIN_FLOORPS,
25202 IX86_BUILTIN_CEILPS,
25203 IX86_BUILTIN_TRUNCPS,
25204 IX86_BUILTIN_RINTPS,
25205 IX86_BUILTIN_ROUNDPS_AZ,
25206
25207 IX86_BUILTIN_FLOORPS_SFIX,
25208 IX86_BUILTIN_CEILPS_SFIX,
25209 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
25210
25211 IX86_BUILTIN_PTESTZ,
25212 IX86_BUILTIN_PTESTC,
25213 IX86_BUILTIN_PTESTNZC,
25214
25215 IX86_BUILTIN_VEC_INIT_V2SI,
25216 IX86_BUILTIN_VEC_INIT_V4HI,
25217 IX86_BUILTIN_VEC_INIT_V8QI,
25218 IX86_BUILTIN_VEC_EXT_V2DF,
25219 IX86_BUILTIN_VEC_EXT_V2DI,
25220 IX86_BUILTIN_VEC_EXT_V4SF,
25221 IX86_BUILTIN_VEC_EXT_V4SI,
25222 IX86_BUILTIN_VEC_EXT_V8HI,
25223 IX86_BUILTIN_VEC_EXT_V2SI,
25224 IX86_BUILTIN_VEC_EXT_V4HI,
25225 IX86_BUILTIN_VEC_EXT_V16QI,
25226 IX86_BUILTIN_VEC_SET_V2DI,
25227 IX86_BUILTIN_VEC_SET_V4SF,
25228 IX86_BUILTIN_VEC_SET_V4SI,
25229 IX86_BUILTIN_VEC_SET_V8HI,
25230 IX86_BUILTIN_VEC_SET_V4HI,
25231 IX86_BUILTIN_VEC_SET_V16QI,
25232
25233 IX86_BUILTIN_VEC_PACK_SFIX,
25234 IX86_BUILTIN_VEC_PACK_SFIX256,
25235
25236 /* SSE4.2. */
25237 IX86_BUILTIN_CRC32QI,
25238 IX86_BUILTIN_CRC32HI,
25239 IX86_BUILTIN_CRC32SI,
25240 IX86_BUILTIN_CRC32DI,
25241
25242 IX86_BUILTIN_PCMPESTRI128,
25243 IX86_BUILTIN_PCMPESTRM128,
25244 IX86_BUILTIN_PCMPESTRA128,
25245 IX86_BUILTIN_PCMPESTRC128,
25246 IX86_BUILTIN_PCMPESTRO128,
25247 IX86_BUILTIN_PCMPESTRS128,
25248 IX86_BUILTIN_PCMPESTRZ128,
25249 IX86_BUILTIN_PCMPISTRI128,
25250 IX86_BUILTIN_PCMPISTRM128,
25251 IX86_BUILTIN_PCMPISTRA128,
25252 IX86_BUILTIN_PCMPISTRC128,
25253 IX86_BUILTIN_PCMPISTRO128,
25254 IX86_BUILTIN_PCMPISTRS128,
25255 IX86_BUILTIN_PCMPISTRZ128,
25256
25257 IX86_BUILTIN_PCMPGTQ,
25258
25259 /* AES instructions */
25260 IX86_BUILTIN_AESENC128,
25261 IX86_BUILTIN_AESENCLAST128,
25262 IX86_BUILTIN_AESDEC128,
25263 IX86_BUILTIN_AESDECLAST128,
25264 IX86_BUILTIN_AESIMC128,
25265 IX86_BUILTIN_AESKEYGENASSIST128,
25266
25267 /* PCLMUL instruction */
25268 IX86_BUILTIN_PCLMULQDQ128,
25269
25270 /* AVX */
25271 IX86_BUILTIN_ADDPD256,
25272 IX86_BUILTIN_ADDPS256,
25273 IX86_BUILTIN_ADDSUBPD256,
25274 IX86_BUILTIN_ADDSUBPS256,
25275 IX86_BUILTIN_ANDPD256,
25276 IX86_BUILTIN_ANDPS256,
25277 IX86_BUILTIN_ANDNPD256,
25278 IX86_BUILTIN_ANDNPS256,
25279 IX86_BUILTIN_BLENDPD256,
25280 IX86_BUILTIN_BLENDPS256,
25281 IX86_BUILTIN_BLENDVPD256,
25282 IX86_BUILTIN_BLENDVPS256,
25283 IX86_BUILTIN_DIVPD256,
25284 IX86_BUILTIN_DIVPS256,
25285 IX86_BUILTIN_DPPS256,
25286 IX86_BUILTIN_HADDPD256,
25287 IX86_BUILTIN_HADDPS256,
25288 IX86_BUILTIN_HSUBPD256,
25289 IX86_BUILTIN_HSUBPS256,
25290 IX86_BUILTIN_MAXPD256,
25291 IX86_BUILTIN_MAXPS256,
25292 IX86_BUILTIN_MINPD256,
25293 IX86_BUILTIN_MINPS256,
25294 IX86_BUILTIN_MULPD256,
25295 IX86_BUILTIN_MULPS256,
25296 IX86_BUILTIN_ORPD256,
25297 IX86_BUILTIN_ORPS256,
25298 IX86_BUILTIN_SHUFPD256,
25299 IX86_BUILTIN_SHUFPS256,
25300 IX86_BUILTIN_SUBPD256,
25301 IX86_BUILTIN_SUBPS256,
25302 IX86_BUILTIN_XORPD256,
25303 IX86_BUILTIN_XORPS256,
25304 IX86_BUILTIN_CMPSD,
25305 IX86_BUILTIN_CMPSS,
25306 IX86_BUILTIN_CMPPD,
25307 IX86_BUILTIN_CMPPS,
25308 IX86_BUILTIN_CMPPD256,
25309 IX86_BUILTIN_CMPPS256,
25310 IX86_BUILTIN_CVTDQ2PD256,
25311 IX86_BUILTIN_CVTDQ2PS256,
25312 IX86_BUILTIN_CVTPD2PS256,
25313 IX86_BUILTIN_CVTPS2DQ256,
25314 IX86_BUILTIN_CVTPS2PD256,
25315 IX86_BUILTIN_CVTTPD2DQ256,
25316 IX86_BUILTIN_CVTPD2DQ256,
25317 IX86_BUILTIN_CVTTPS2DQ256,
25318 IX86_BUILTIN_EXTRACTF128PD256,
25319 IX86_BUILTIN_EXTRACTF128PS256,
25320 IX86_BUILTIN_EXTRACTF128SI256,
25321 IX86_BUILTIN_VZEROALL,
25322 IX86_BUILTIN_VZEROUPPER,
25323 IX86_BUILTIN_VPERMILVARPD,
25324 IX86_BUILTIN_VPERMILVARPS,
25325 IX86_BUILTIN_VPERMILVARPD256,
25326 IX86_BUILTIN_VPERMILVARPS256,
25327 IX86_BUILTIN_VPERMILPD,
25328 IX86_BUILTIN_VPERMILPS,
25329 IX86_BUILTIN_VPERMILPD256,
25330 IX86_BUILTIN_VPERMILPS256,
25331 IX86_BUILTIN_VPERMIL2PD,
25332 IX86_BUILTIN_VPERMIL2PS,
25333 IX86_BUILTIN_VPERMIL2PD256,
25334 IX86_BUILTIN_VPERMIL2PS256,
25335 IX86_BUILTIN_VPERM2F128PD256,
25336 IX86_BUILTIN_VPERM2F128PS256,
25337 IX86_BUILTIN_VPERM2F128SI256,
25338 IX86_BUILTIN_VBROADCASTSS,
25339 IX86_BUILTIN_VBROADCASTSD256,
25340 IX86_BUILTIN_VBROADCASTSS256,
25341 IX86_BUILTIN_VBROADCASTPD256,
25342 IX86_BUILTIN_VBROADCASTPS256,
25343 IX86_BUILTIN_VINSERTF128PD256,
25344 IX86_BUILTIN_VINSERTF128PS256,
25345 IX86_BUILTIN_VINSERTF128SI256,
25346 IX86_BUILTIN_LOADUPD256,
25347 IX86_BUILTIN_LOADUPS256,
25348 IX86_BUILTIN_STOREUPD256,
25349 IX86_BUILTIN_STOREUPS256,
25350 IX86_BUILTIN_LDDQU256,
25351 IX86_BUILTIN_MOVNTDQ256,
25352 IX86_BUILTIN_MOVNTPD256,
25353 IX86_BUILTIN_MOVNTPS256,
25354 IX86_BUILTIN_LOADDQU256,
25355 IX86_BUILTIN_STOREDQU256,
25356 IX86_BUILTIN_MASKLOADPD,
25357 IX86_BUILTIN_MASKLOADPS,
25358 IX86_BUILTIN_MASKSTOREPD,
25359 IX86_BUILTIN_MASKSTOREPS,
25360 IX86_BUILTIN_MASKLOADPD256,
25361 IX86_BUILTIN_MASKLOADPS256,
25362 IX86_BUILTIN_MASKSTOREPD256,
25363 IX86_BUILTIN_MASKSTOREPS256,
25364 IX86_BUILTIN_MOVSHDUP256,
25365 IX86_BUILTIN_MOVSLDUP256,
25366 IX86_BUILTIN_MOVDDUP256,
25367
25368 IX86_BUILTIN_SQRTPD256,
25369 IX86_BUILTIN_SQRTPS256,
25370 IX86_BUILTIN_SQRTPS_NR256,
25371 IX86_BUILTIN_RSQRTPS256,
25372 IX86_BUILTIN_RSQRTPS_NR256,
25373
25374 IX86_BUILTIN_RCPPS256,
25375
25376 IX86_BUILTIN_ROUNDPD256,
25377 IX86_BUILTIN_ROUNDPS256,
25378
25379 IX86_BUILTIN_FLOORPD256,
25380 IX86_BUILTIN_CEILPD256,
25381 IX86_BUILTIN_TRUNCPD256,
25382 IX86_BUILTIN_RINTPD256,
25383 IX86_BUILTIN_ROUNDPD_AZ256,
25384
25385 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
25386 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
25387 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
25388
25389 IX86_BUILTIN_FLOORPS256,
25390 IX86_BUILTIN_CEILPS256,
25391 IX86_BUILTIN_TRUNCPS256,
25392 IX86_BUILTIN_RINTPS256,
25393 IX86_BUILTIN_ROUNDPS_AZ256,
25394
25395 IX86_BUILTIN_FLOORPS_SFIX256,
25396 IX86_BUILTIN_CEILPS_SFIX256,
25397 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
25398
25399 IX86_BUILTIN_UNPCKHPD256,
25400 IX86_BUILTIN_UNPCKLPD256,
25401 IX86_BUILTIN_UNPCKHPS256,
25402 IX86_BUILTIN_UNPCKLPS256,
25403
25404 IX86_BUILTIN_SI256_SI,
25405 IX86_BUILTIN_PS256_PS,
25406 IX86_BUILTIN_PD256_PD,
25407 IX86_BUILTIN_SI_SI256,
25408 IX86_BUILTIN_PS_PS256,
25409 IX86_BUILTIN_PD_PD256,
25410
25411 IX86_BUILTIN_VTESTZPD,
25412 IX86_BUILTIN_VTESTCPD,
25413 IX86_BUILTIN_VTESTNZCPD,
25414 IX86_BUILTIN_VTESTZPS,
25415 IX86_BUILTIN_VTESTCPS,
25416 IX86_BUILTIN_VTESTNZCPS,
25417 IX86_BUILTIN_VTESTZPD256,
25418 IX86_BUILTIN_VTESTCPD256,
25419 IX86_BUILTIN_VTESTNZCPD256,
25420 IX86_BUILTIN_VTESTZPS256,
25421 IX86_BUILTIN_VTESTCPS256,
25422 IX86_BUILTIN_VTESTNZCPS256,
25423 IX86_BUILTIN_PTESTZ256,
25424 IX86_BUILTIN_PTESTC256,
25425 IX86_BUILTIN_PTESTNZC256,
25426
25427 IX86_BUILTIN_MOVMSKPD256,
25428 IX86_BUILTIN_MOVMSKPS256,
25429
25430 /* AVX2 */
25431 IX86_BUILTIN_MPSADBW256,
25432 IX86_BUILTIN_PABSB256,
25433 IX86_BUILTIN_PABSW256,
25434 IX86_BUILTIN_PABSD256,
25435 IX86_BUILTIN_PACKSSDW256,
25436 IX86_BUILTIN_PACKSSWB256,
25437 IX86_BUILTIN_PACKUSDW256,
25438 IX86_BUILTIN_PACKUSWB256,
25439 IX86_BUILTIN_PADDB256,
25440 IX86_BUILTIN_PADDW256,
25441 IX86_BUILTIN_PADDD256,
25442 IX86_BUILTIN_PADDQ256,
25443 IX86_BUILTIN_PADDSB256,
25444 IX86_BUILTIN_PADDSW256,
25445 IX86_BUILTIN_PADDUSB256,
25446 IX86_BUILTIN_PADDUSW256,
25447 IX86_BUILTIN_PALIGNR256,
25448 IX86_BUILTIN_AND256I,
25449 IX86_BUILTIN_ANDNOT256I,
25450 IX86_BUILTIN_PAVGB256,
25451 IX86_BUILTIN_PAVGW256,
25452 IX86_BUILTIN_PBLENDVB256,
25453 IX86_BUILTIN_PBLENDVW256,
25454 IX86_BUILTIN_PCMPEQB256,
25455 IX86_BUILTIN_PCMPEQW256,
25456 IX86_BUILTIN_PCMPEQD256,
25457 IX86_BUILTIN_PCMPEQQ256,
25458 IX86_BUILTIN_PCMPGTB256,
25459 IX86_BUILTIN_PCMPGTW256,
25460 IX86_BUILTIN_PCMPGTD256,
25461 IX86_BUILTIN_PCMPGTQ256,
25462 IX86_BUILTIN_PHADDW256,
25463 IX86_BUILTIN_PHADDD256,
25464 IX86_BUILTIN_PHADDSW256,
25465 IX86_BUILTIN_PHSUBW256,
25466 IX86_BUILTIN_PHSUBD256,
25467 IX86_BUILTIN_PHSUBSW256,
25468 IX86_BUILTIN_PMADDUBSW256,
25469 IX86_BUILTIN_PMADDWD256,
25470 IX86_BUILTIN_PMAXSB256,
25471 IX86_BUILTIN_PMAXSW256,
25472 IX86_BUILTIN_PMAXSD256,
25473 IX86_BUILTIN_PMAXUB256,
25474 IX86_BUILTIN_PMAXUW256,
25475 IX86_BUILTIN_PMAXUD256,
25476 IX86_BUILTIN_PMINSB256,
25477 IX86_BUILTIN_PMINSW256,
25478 IX86_BUILTIN_PMINSD256,
25479 IX86_BUILTIN_PMINUB256,
25480 IX86_BUILTIN_PMINUW256,
25481 IX86_BUILTIN_PMINUD256,
25482 IX86_BUILTIN_PMOVMSKB256,
25483 IX86_BUILTIN_PMOVSXBW256,
25484 IX86_BUILTIN_PMOVSXBD256,
25485 IX86_BUILTIN_PMOVSXBQ256,
25486 IX86_BUILTIN_PMOVSXWD256,
25487 IX86_BUILTIN_PMOVSXWQ256,
25488 IX86_BUILTIN_PMOVSXDQ256,
25489 IX86_BUILTIN_PMOVZXBW256,
25490 IX86_BUILTIN_PMOVZXBD256,
25491 IX86_BUILTIN_PMOVZXBQ256,
25492 IX86_BUILTIN_PMOVZXWD256,
25493 IX86_BUILTIN_PMOVZXWQ256,
25494 IX86_BUILTIN_PMOVZXDQ256,
25495 IX86_BUILTIN_PMULDQ256,
25496 IX86_BUILTIN_PMULHRSW256,
25497 IX86_BUILTIN_PMULHUW256,
25498 IX86_BUILTIN_PMULHW256,
25499 IX86_BUILTIN_PMULLW256,
25500 IX86_BUILTIN_PMULLD256,
25501 IX86_BUILTIN_PMULUDQ256,
25502 IX86_BUILTIN_POR256,
25503 IX86_BUILTIN_PSADBW256,
25504 IX86_BUILTIN_PSHUFB256,
25505 IX86_BUILTIN_PSHUFD256,
25506 IX86_BUILTIN_PSHUFHW256,
25507 IX86_BUILTIN_PSHUFLW256,
25508 IX86_BUILTIN_PSIGNB256,
25509 IX86_BUILTIN_PSIGNW256,
25510 IX86_BUILTIN_PSIGND256,
25511 IX86_BUILTIN_PSLLDQI256,
25512 IX86_BUILTIN_PSLLWI256,
25513 IX86_BUILTIN_PSLLW256,
25514 IX86_BUILTIN_PSLLDI256,
25515 IX86_BUILTIN_PSLLD256,
25516 IX86_BUILTIN_PSLLQI256,
25517 IX86_BUILTIN_PSLLQ256,
25518 IX86_BUILTIN_PSRAWI256,
25519 IX86_BUILTIN_PSRAW256,
25520 IX86_BUILTIN_PSRADI256,
25521 IX86_BUILTIN_PSRAD256,
25522 IX86_BUILTIN_PSRLDQI256,
25523 IX86_BUILTIN_PSRLWI256,
25524 IX86_BUILTIN_PSRLW256,
25525 IX86_BUILTIN_PSRLDI256,
25526 IX86_BUILTIN_PSRLD256,
25527 IX86_BUILTIN_PSRLQI256,
25528 IX86_BUILTIN_PSRLQ256,
25529 IX86_BUILTIN_PSUBB256,
25530 IX86_BUILTIN_PSUBW256,
25531 IX86_BUILTIN_PSUBD256,
25532 IX86_BUILTIN_PSUBQ256,
25533 IX86_BUILTIN_PSUBSB256,
25534 IX86_BUILTIN_PSUBSW256,
25535 IX86_BUILTIN_PSUBUSB256,
25536 IX86_BUILTIN_PSUBUSW256,
25537 IX86_BUILTIN_PUNPCKHBW256,
25538 IX86_BUILTIN_PUNPCKHWD256,
25539 IX86_BUILTIN_PUNPCKHDQ256,
25540 IX86_BUILTIN_PUNPCKHQDQ256,
25541 IX86_BUILTIN_PUNPCKLBW256,
25542 IX86_BUILTIN_PUNPCKLWD256,
25543 IX86_BUILTIN_PUNPCKLDQ256,
25544 IX86_BUILTIN_PUNPCKLQDQ256,
25545 IX86_BUILTIN_PXOR256,
25546 IX86_BUILTIN_MOVNTDQA256,
25547 IX86_BUILTIN_VBROADCASTSS_PS,
25548 IX86_BUILTIN_VBROADCASTSS_PS256,
25549 IX86_BUILTIN_VBROADCASTSD_PD256,
25550 IX86_BUILTIN_VBROADCASTSI256,
25551 IX86_BUILTIN_PBLENDD256,
25552 IX86_BUILTIN_PBLENDD128,
25553 IX86_BUILTIN_PBROADCASTB256,
25554 IX86_BUILTIN_PBROADCASTW256,
25555 IX86_BUILTIN_PBROADCASTD256,
25556 IX86_BUILTIN_PBROADCASTQ256,
25557 IX86_BUILTIN_PBROADCASTB128,
25558 IX86_BUILTIN_PBROADCASTW128,
25559 IX86_BUILTIN_PBROADCASTD128,
25560 IX86_BUILTIN_PBROADCASTQ128,
25561 IX86_BUILTIN_VPERMVARSI256,
25562 IX86_BUILTIN_VPERMDF256,
25563 IX86_BUILTIN_VPERMVARSF256,
25564 IX86_BUILTIN_VPERMDI256,
25565 IX86_BUILTIN_VPERMTI256,
25566 IX86_BUILTIN_VEXTRACT128I256,
25567 IX86_BUILTIN_VINSERT128I256,
25568 IX86_BUILTIN_MASKLOADD,
25569 IX86_BUILTIN_MASKLOADQ,
25570 IX86_BUILTIN_MASKLOADD256,
25571 IX86_BUILTIN_MASKLOADQ256,
25572 IX86_BUILTIN_MASKSTORED,
25573 IX86_BUILTIN_MASKSTOREQ,
25574 IX86_BUILTIN_MASKSTORED256,
25575 IX86_BUILTIN_MASKSTOREQ256,
25576 IX86_BUILTIN_PSLLVV4DI,
25577 IX86_BUILTIN_PSLLVV2DI,
25578 IX86_BUILTIN_PSLLVV8SI,
25579 IX86_BUILTIN_PSLLVV4SI,
25580 IX86_BUILTIN_PSRAVV8SI,
25581 IX86_BUILTIN_PSRAVV4SI,
25582 IX86_BUILTIN_PSRLVV4DI,
25583 IX86_BUILTIN_PSRLVV2DI,
25584 IX86_BUILTIN_PSRLVV8SI,
25585 IX86_BUILTIN_PSRLVV4SI,
25586
25587 IX86_BUILTIN_GATHERSIV2DF,
25588 IX86_BUILTIN_GATHERSIV4DF,
25589 IX86_BUILTIN_GATHERDIV2DF,
25590 IX86_BUILTIN_GATHERDIV4DF,
25591 IX86_BUILTIN_GATHERSIV4SF,
25592 IX86_BUILTIN_GATHERSIV8SF,
25593 IX86_BUILTIN_GATHERDIV4SF,
25594 IX86_BUILTIN_GATHERDIV8SF,
25595 IX86_BUILTIN_GATHERSIV2DI,
25596 IX86_BUILTIN_GATHERSIV4DI,
25597 IX86_BUILTIN_GATHERDIV2DI,
25598 IX86_BUILTIN_GATHERDIV4DI,
25599 IX86_BUILTIN_GATHERSIV4SI,
25600 IX86_BUILTIN_GATHERSIV8SI,
25601 IX86_BUILTIN_GATHERDIV4SI,
25602 IX86_BUILTIN_GATHERDIV8SI,
25603
25604 /* Alternate 4 element gather for the vectorizer where
25605 all operands are 32-byte wide. */
25606 IX86_BUILTIN_GATHERALTSIV4DF,
25607 IX86_BUILTIN_GATHERALTDIV8SF,
25608 IX86_BUILTIN_GATHERALTSIV4DI,
25609 IX86_BUILTIN_GATHERALTDIV8SI,
25610
25611 /* TFmode support builtins. */
25612 IX86_BUILTIN_INFQ,
25613 IX86_BUILTIN_HUGE_VALQ,
25614 IX86_BUILTIN_FABSQ,
25615 IX86_BUILTIN_COPYSIGNQ,
25616
25617 /* Vectorizer support builtins. */
25618 IX86_BUILTIN_CPYSGNPS,
25619 IX86_BUILTIN_CPYSGNPD,
25620 IX86_BUILTIN_CPYSGNPS256,
25621 IX86_BUILTIN_CPYSGNPD256,
25622
25623 /* FMA4 instructions. */
25624 IX86_BUILTIN_VFMADDSS,
25625 IX86_BUILTIN_VFMADDSD,
25626 IX86_BUILTIN_VFMADDPS,
25627 IX86_BUILTIN_VFMADDPD,
25628 IX86_BUILTIN_VFMADDPS256,
25629 IX86_BUILTIN_VFMADDPD256,
25630 IX86_BUILTIN_VFMADDSUBPS,
25631 IX86_BUILTIN_VFMADDSUBPD,
25632 IX86_BUILTIN_VFMADDSUBPS256,
25633 IX86_BUILTIN_VFMADDSUBPD256,
25634
25635 /* FMA3 instructions. */
25636 IX86_BUILTIN_VFMADDSS3,
25637 IX86_BUILTIN_VFMADDSD3,
25638
25639 /* XOP instructions. */
25640 IX86_BUILTIN_VPCMOV,
25641 IX86_BUILTIN_VPCMOV_V2DI,
25642 IX86_BUILTIN_VPCMOV_V4SI,
25643 IX86_BUILTIN_VPCMOV_V8HI,
25644 IX86_BUILTIN_VPCMOV_V16QI,
25645 IX86_BUILTIN_VPCMOV_V4SF,
25646 IX86_BUILTIN_VPCMOV_V2DF,
25647 IX86_BUILTIN_VPCMOV256,
25648 IX86_BUILTIN_VPCMOV_V4DI256,
25649 IX86_BUILTIN_VPCMOV_V8SI256,
25650 IX86_BUILTIN_VPCMOV_V16HI256,
25651 IX86_BUILTIN_VPCMOV_V32QI256,
25652 IX86_BUILTIN_VPCMOV_V8SF256,
25653 IX86_BUILTIN_VPCMOV_V4DF256,
25654
25655 IX86_BUILTIN_VPPERM,
25656
25657 IX86_BUILTIN_VPMACSSWW,
25658 IX86_BUILTIN_VPMACSWW,
25659 IX86_BUILTIN_VPMACSSWD,
25660 IX86_BUILTIN_VPMACSWD,
25661 IX86_BUILTIN_VPMACSSDD,
25662 IX86_BUILTIN_VPMACSDD,
25663 IX86_BUILTIN_VPMACSSDQL,
25664 IX86_BUILTIN_VPMACSSDQH,
25665 IX86_BUILTIN_VPMACSDQL,
25666 IX86_BUILTIN_VPMACSDQH,
25667 IX86_BUILTIN_VPMADCSSWD,
25668 IX86_BUILTIN_VPMADCSWD,
25669
25670 IX86_BUILTIN_VPHADDBW,
25671 IX86_BUILTIN_VPHADDBD,
25672 IX86_BUILTIN_VPHADDBQ,
25673 IX86_BUILTIN_VPHADDWD,
25674 IX86_BUILTIN_VPHADDWQ,
25675 IX86_BUILTIN_VPHADDDQ,
25676 IX86_BUILTIN_VPHADDUBW,
25677 IX86_BUILTIN_VPHADDUBD,
25678 IX86_BUILTIN_VPHADDUBQ,
25679 IX86_BUILTIN_VPHADDUWD,
25680 IX86_BUILTIN_VPHADDUWQ,
25681 IX86_BUILTIN_VPHADDUDQ,
25682 IX86_BUILTIN_VPHSUBBW,
25683 IX86_BUILTIN_VPHSUBWD,
25684 IX86_BUILTIN_VPHSUBDQ,
25685
25686 IX86_BUILTIN_VPROTB,
25687 IX86_BUILTIN_VPROTW,
25688 IX86_BUILTIN_VPROTD,
25689 IX86_BUILTIN_VPROTQ,
25690 IX86_BUILTIN_VPROTB_IMM,
25691 IX86_BUILTIN_VPROTW_IMM,
25692 IX86_BUILTIN_VPROTD_IMM,
25693 IX86_BUILTIN_VPROTQ_IMM,
25694
25695 IX86_BUILTIN_VPSHLB,
25696 IX86_BUILTIN_VPSHLW,
25697 IX86_BUILTIN_VPSHLD,
25698 IX86_BUILTIN_VPSHLQ,
25699 IX86_BUILTIN_VPSHAB,
25700 IX86_BUILTIN_VPSHAW,
25701 IX86_BUILTIN_VPSHAD,
25702 IX86_BUILTIN_VPSHAQ,
25703
25704 IX86_BUILTIN_VFRCZSS,
25705 IX86_BUILTIN_VFRCZSD,
25706 IX86_BUILTIN_VFRCZPS,
25707 IX86_BUILTIN_VFRCZPD,
25708 IX86_BUILTIN_VFRCZPS256,
25709 IX86_BUILTIN_VFRCZPD256,
25710
25711 IX86_BUILTIN_VPCOMEQUB,
25712 IX86_BUILTIN_VPCOMNEUB,
25713 IX86_BUILTIN_VPCOMLTUB,
25714 IX86_BUILTIN_VPCOMLEUB,
25715 IX86_BUILTIN_VPCOMGTUB,
25716 IX86_BUILTIN_VPCOMGEUB,
25717 IX86_BUILTIN_VPCOMFALSEUB,
25718 IX86_BUILTIN_VPCOMTRUEUB,
25719
25720 IX86_BUILTIN_VPCOMEQUW,
25721 IX86_BUILTIN_VPCOMNEUW,
25722 IX86_BUILTIN_VPCOMLTUW,
25723 IX86_BUILTIN_VPCOMLEUW,
25724 IX86_BUILTIN_VPCOMGTUW,
25725 IX86_BUILTIN_VPCOMGEUW,
25726 IX86_BUILTIN_VPCOMFALSEUW,
25727 IX86_BUILTIN_VPCOMTRUEUW,
25728
25729 IX86_BUILTIN_VPCOMEQUD,
25730 IX86_BUILTIN_VPCOMNEUD,
25731 IX86_BUILTIN_VPCOMLTUD,
25732 IX86_BUILTIN_VPCOMLEUD,
25733 IX86_BUILTIN_VPCOMGTUD,
25734 IX86_BUILTIN_VPCOMGEUD,
25735 IX86_BUILTIN_VPCOMFALSEUD,
25736 IX86_BUILTIN_VPCOMTRUEUD,
25737
25738 IX86_BUILTIN_VPCOMEQUQ,
25739 IX86_BUILTIN_VPCOMNEUQ,
25740 IX86_BUILTIN_VPCOMLTUQ,
25741 IX86_BUILTIN_VPCOMLEUQ,
25742 IX86_BUILTIN_VPCOMGTUQ,
25743 IX86_BUILTIN_VPCOMGEUQ,
25744 IX86_BUILTIN_VPCOMFALSEUQ,
25745 IX86_BUILTIN_VPCOMTRUEUQ,
25746
25747 IX86_BUILTIN_VPCOMEQB,
25748 IX86_BUILTIN_VPCOMNEB,
25749 IX86_BUILTIN_VPCOMLTB,
25750 IX86_BUILTIN_VPCOMLEB,
25751 IX86_BUILTIN_VPCOMGTB,
25752 IX86_BUILTIN_VPCOMGEB,
25753 IX86_BUILTIN_VPCOMFALSEB,
25754 IX86_BUILTIN_VPCOMTRUEB,
25755
25756 IX86_BUILTIN_VPCOMEQW,
25757 IX86_BUILTIN_VPCOMNEW,
25758 IX86_BUILTIN_VPCOMLTW,
25759 IX86_BUILTIN_VPCOMLEW,
25760 IX86_BUILTIN_VPCOMGTW,
25761 IX86_BUILTIN_VPCOMGEW,
25762 IX86_BUILTIN_VPCOMFALSEW,
25763 IX86_BUILTIN_VPCOMTRUEW,
25764
25765 IX86_BUILTIN_VPCOMEQD,
25766 IX86_BUILTIN_VPCOMNED,
25767 IX86_BUILTIN_VPCOMLTD,
25768 IX86_BUILTIN_VPCOMLED,
25769 IX86_BUILTIN_VPCOMGTD,
25770 IX86_BUILTIN_VPCOMGED,
25771 IX86_BUILTIN_VPCOMFALSED,
25772 IX86_BUILTIN_VPCOMTRUED,
25773
25774 IX86_BUILTIN_VPCOMEQQ,
25775 IX86_BUILTIN_VPCOMNEQ,
25776 IX86_BUILTIN_VPCOMLTQ,
25777 IX86_BUILTIN_VPCOMLEQ,
25778 IX86_BUILTIN_VPCOMGTQ,
25779 IX86_BUILTIN_VPCOMGEQ,
25780 IX86_BUILTIN_VPCOMFALSEQ,
25781 IX86_BUILTIN_VPCOMTRUEQ,
25782
25783 /* LWP instructions. */
25784 IX86_BUILTIN_LLWPCB,
25785 IX86_BUILTIN_SLWPCB,
25786 IX86_BUILTIN_LWPVAL32,
25787 IX86_BUILTIN_LWPVAL64,
25788 IX86_BUILTIN_LWPINS32,
25789 IX86_BUILTIN_LWPINS64,
25790
25791 IX86_BUILTIN_CLZS,
25792
25793 /* RTM */
25794 IX86_BUILTIN_XBEGIN,
25795 IX86_BUILTIN_XEND,
25796 IX86_BUILTIN_XABORT,
25797 IX86_BUILTIN_XTEST,
25798
25799 /* BMI instructions. */
25800 IX86_BUILTIN_BEXTR32,
25801 IX86_BUILTIN_BEXTR64,
25802 IX86_BUILTIN_CTZS,
25803
25804 /* TBM instructions. */
25805 IX86_BUILTIN_BEXTRI32,
25806 IX86_BUILTIN_BEXTRI64,
25807
25808 /* BMI2 instructions. */
25809 IX86_BUILTIN_BZHI32,
25810 IX86_BUILTIN_BZHI64,
25811 IX86_BUILTIN_PDEP32,
25812 IX86_BUILTIN_PDEP64,
25813 IX86_BUILTIN_PEXT32,
25814 IX86_BUILTIN_PEXT64,
25815
25816 /* FSGSBASE instructions. */
25817 IX86_BUILTIN_RDFSBASE32,
25818 IX86_BUILTIN_RDFSBASE64,
25819 IX86_BUILTIN_RDGSBASE32,
25820 IX86_BUILTIN_RDGSBASE64,
25821 IX86_BUILTIN_WRFSBASE32,
25822 IX86_BUILTIN_WRFSBASE64,
25823 IX86_BUILTIN_WRGSBASE32,
25824 IX86_BUILTIN_WRGSBASE64,
25825
25826 /* RDRND instructions. */
25827 IX86_BUILTIN_RDRAND16_STEP,
25828 IX86_BUILTIN_RDRAND32_STEP,
25829 IX86_BUILTIN_RDRAND64_STEP,
25830
25831 /* F16C instructions. */
25832 IX86_BUILTIN_CVTPH2PS,
25833 IX86_BUILTIN_CVTPH2PS256,
25834 IX86_BUILTIN_CVTPS2PH,
25835 IX86_BUILTIN_CVTPS2PH256,
25836
25837 /* CFString built-in for Darwin. */
25838 IX86_BUILTIN_CFSTRING,
25839
25840 /* Builtins to get CPU type and supported features. */
25841 IX86_BUILTIN_CPU_INIT,
25842 IX86_BUILTIN_CPU_IS,
25843 IX86_BUILTIN_CPU_SUPPORTS,
25844
25845 IX86_BUILTIN_MAX
25846 };
25847
25848 /* Table for the ix86 builtin decls. */
25849 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
25850
25851 /* Table of all of the builtin functions that are possible with different ISAs
25852 but are waiting to be built until a function is declared to use that
25853 ISA. */
25854 struct builtin_isa {
25855 const char *name; /* function name */
25856 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
25857 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
25858 bool const_p; /* true if the declaration is constant */
25859 bool set_and_not_built_p; /* true if recorded but decl not yet built */
25860 };
25861
25862 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
25863
25864
25865 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
25866 of which isa_flags to use in the ix86_builtins_isa array. Stores the
25867 function decl in the ix86_builtins array. Returns the function decl or
25868 NULL_TREE if the builtin was not added.
25869
25870 If the front end has a special hook for builtin functions, delay adding
25871 builtin functions that aren't in the current ISA until the ISA is changed
25872 with function specific optimization. Doing so can save about 300K for the
25873 default compiler. When the builtin is expanded, check at that time whether
25874 it is valid.
25875
25876 If the front end doesn't have a special hook, record all builtins, even
25877 those whose instruction set isn't in the current ISA, in case the user uses
25878 function specific options for a different ISA; this way we don't get scope
25879 errors if a builtin is added in the middle of a function scope. */
25880
25881 static inline tree
25882 def_builtin (HOST_WIDE_INT mask, const char *name,
25883 enum ix86_builtin_func_type tcode,
25884 enum ix86_builtins code)
25885 {
25886 tree decl = NULL_TREE;
25887
25888 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
25889 {
25890 ix86_builtins_isa[(int) code].isa = mask;
25891
25892 mask &= ~OPTION_MASK_ISA_64BIT;
25893 if (mask == 0
25894 || (mask & ix86_isa_flags) != 0
25895 || (lang_hooks.builtin_function
25896 == lang_hooks.builtin_function_ext_scope))
25898 {
25899 tree type = ix86_get_builtin_func_type (tcode);
25900 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
25901 NULL, NULL_TREE);
25902 ix86_builtins[(int) code] = decl;
25903 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
25904 }
25905 else
25906 {
25907 ix86_builtins[(int) code] = NULL_TREE;
25908 ix86_builtins_isa[(int) code].tcode = tcode;
25909 ix86_builtins_isa[(int) code].name = name;
25910 ix86_builtins_isa[(int) code].const_p = false;
25911 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
25912 }
25913 }
25914
25915 return decl;
25916 }
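
/* Illustrative sketch (hypothetical names): a caller registers a builtin by
   giving def_builtin an ISA mask, a name, a function-type code and its
   IX86_BUILTIN_* enumerator, e.g.

     def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_example",
                  V2DF_FTYPE_V2DF_V2DF, IX86_BUILTIN_EXAMPLE);

   "__builtin_ia32_example" and IX86_BUILTIN_EXAMPLE are made-up names.  If
   SSE2 is not in ix86_isa_flags and the front end has no extended-scope hook,
   the decl is only recorded in ix86_builtins_isa and built later by
   ix86_add_new_builtins.  */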
25917
25918 /* Like def_builtin, but also marks the function decl "const". */
25919
25920 static inline tree
25921 def_builtin_const (HOST_WIDE_INT mask, const char *name,
25922 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
25923 {
25924 tree decl = def_builtin (mask, name, tcode, code);
25925 if (decl)
25926 TREE_READONLY (decl) = 1;
25927 else
25928 ix86_builtins_isa[(int) code].const_p = true;
25929
25930 return decl;
25931 }
25932
25933 /* Add any new builtin functions for a given ISA that may not have been
25934 declared. This saves a bit of space compared to adding all of the
25935 declarations to the tree, even if we didn't use them. */
25936
25937 static void
25938 ix86_add_new_builtins (HOST_WIDE_INT isa)
25939 {
25940 int i;
25941
25942 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
25943 {
25944 if ((ix86_builtins_isa[i].isa & isa) != 0
25945 && ix86_builtins_isa[i].set_and_not_built_p)
25946 {
25947 tree decl, type;
25948
25949 /* Don't define the builtin again. */
25950 ix86_builtins_isa[i].set_and_not_built_p = false;
25951
25952 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
25953 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
25954 type, i, BUILT_IN_MD, NULL,
25955 NULL_TREE);
25956
25957 ix86_builtins[i] = decl;
25958 if (ix86_builtins_isa[i].const_p)
25959 TREE_READONLY (decl) = 1;
25960 }
25961 }
25962 }
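
/* Illustrative sketch (assumes GNU vector extensions): with the delayed
   scheme above, a builtin guarded by OPTION_MASK_ISA_AVX2 gets its decl only
   once the ISA is enabled for a function, e.g. via a target attribute:

     typedef int v8si __attribute__ ((vector_size (32)));

     __attribute__((target ("avx2")))
     v8si add8 (v8si x, v8si y)
     {
       return __builtin_ia32_paddd256 (x, y);
     }

   Enabling the ISA this way is expected to trigger ix86_add_new_builtins for
   the newly enabled ISA bits, which materializes the pending decls.  */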
25963
25964 /* Bits for builtin_description.flag. */
25965
25966 /* Set when we don't support the comparison natively, and should
25967 swap the comparison operands in order to support it. */
25968 #define BUILTIN_DESC_SWAP_OPERANDS 1
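
/* Illustrative sketch (hypothetical entry): a builtin for A > B backed by an
   insn pattern that only implements "<" could be described as

     { OPTION_MASK_ISA_SSE, CODE_FOR_example_lt, "__builtin_ia32_example_gt",
       IX86_BUILTIN_EXAMPLE_GT, LT, BUILTIN_DESC_SWAP_OPERANDS },

   the expander then swaps the two operands so that B < A is emitted.  */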
25969
25970 struct builtin_description
25971 {
25972 const HOST_WIDE_INT mask; /* OPTION_MASK_ISA_* flags this builtin needs */
25973 const enum insn_code icode; /* insn pattern used to expand the builtin */
25974 const char *const name; /* builtin function name */
25975 const enum ix86_builtins code; /* IX86_BUILTIN_* enumerator */
25976 const enum rtx_code comparison; /* comparison code, or UNKNOWN */
25977 const int flag; /* table-specific: BUILTIN_DESC_* flags, function type, or CC mode */
25978 };
25979
25980 static const struct builtin_description bdesc_comi[] =
25981 {
25982 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
25983 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
25984 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
25985 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
25986 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
25987 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
25988 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
25989 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
25990 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
25991 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
25992 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
25993 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
25994 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
25995 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
25996 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
25997 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
25998 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
25999 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26000 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26001 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26002 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26003 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26004 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26005 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
26006 };
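
/* Illustrative usage sketch (assumes GNU vector extensions and SSE enabled):
   the comi/ucomi builtins above take two 4-float vectors and return an int,
   e.g.

     typedef float v4sf __attribute__ ((vector_size (16)));

     int eq_first (v4sf a, v4sf b)
     {
       return __builtin_ia32_comieq (a, b);
     }
*/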
26007
26008 static const struct builtin_description bdesc_pcmpestr[] =
26009 {
26010 /* SSE4.2 */
26011 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26012 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26013 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26014 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26015 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26016 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26017 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
26018 };
26019
26020 static const struct builtin_description bdesc_pcmpistr[] =
26021 {
26022 /* SSE4.2 */
26023 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26024 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26025 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26026 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26027 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26028 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26029 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
26030 };
26031
26032 /* Special builtins with variable number of arguments. */
26033 static const struct builtin_description bdesc_special_args[] =
26034 {
26035 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26036 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26037 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26038
26039 /* MMX */
26040 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26041
26042 /* 3DNow! */
26043 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26044
26045 /* SSE */
26046 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26047 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26048 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26049
26050 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26051 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26052 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26053 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26054
26055 /* SSE or 3DNow!A */
26056 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26057 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26058
26059 /* SSE2 */
26060 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26061 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26062 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26063 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26064 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26065 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26066 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26067 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26068 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26069 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26070
26071 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26072 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26073
26074 /* SSE3 */
26075 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26076
26077 /* SSE4.1 */
26078 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26079
26080 /* SSE4A */
26081 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26082 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26083
26084 /* AVX */
26085 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26086 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26087
26088 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26089 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26090 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26091 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26092 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26093
26094 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26095 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26096 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26097 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26098 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26099 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26100 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26101
26102 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26103 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26104 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26105
26106 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26107 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26108 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26109 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26110 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26111 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26112 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26113 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26114
26115 /* AVX2 */
26116 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26117 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26118 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26119 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26120 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26121 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26122 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26123 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26124 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26125
26126 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26127 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26128 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26129 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26130 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26131 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26132
26133 /* FSGSBASE */
26134 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26135 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26136 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26137 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26138 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26139 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26140 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26141 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26142
26143 /* RTM */
26144 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26145 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26146 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
26147 };
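
/* Illustrative usage sketch for one of the special builtins above (a
   UINT64_FTYPE_VOID entry): reading the time-stamp counter from C code:

     unsigned long long
     read_tsc (void)
     {
       return __builtin_ia32_rdtsc ();
     }
*/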
26148
26149 /* Builtins with variable number of arguments. */
26150 static const struct builtin_description bdesc_args[] =
26151 {
26152 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26153 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26154 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26155 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26156 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26157 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26158 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26159
26160 /* MMX */
26161 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26162 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26163 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26164 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26165 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26166 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26167
26168 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26169 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26170 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26171 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26172 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26173 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26174 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26175 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26176
26177 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26178 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26179
26180 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26181 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26182 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26183 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26184
26185 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26186 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26187 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26188 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26189 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26190 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26191
26192 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26193 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26194 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26195 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26196 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
26197 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
26198
26199 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26200 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
26201 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26202
26203 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
26204
26205 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26206 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26207 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26208 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26209 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26210 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26211
26212 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26213 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26214 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26215 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26216 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26217 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26218
26219 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26220 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26221 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26222 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26223
26224 /* 3DNow! */
26225 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26226 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26227 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26228 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26229
26230 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26231 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26232 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26233 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26234 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26235 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26236 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26237 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26238 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26239 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26240 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26241 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26242 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26243 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26244 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26245
26246 /* 3DNow!A */
26247 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26248 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26249 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26250 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26251 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26252 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26253
26254 /* SSE */
26255 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
26256 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26257 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26258 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26259 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26260 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26261 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26262 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26263 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26264 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26265 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26266 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26267
26268 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26269
26270 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26271 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26272 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26273 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26274 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26275 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26276 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26277 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26278
26279 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26280 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26281 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26282 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26283 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26284 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26285 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26286 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26287 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26288 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26289 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
26290 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26291 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26292 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26293 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26294 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26295 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26296 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26297 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26298 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26299 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26300 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26301
26302 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26303 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26304 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26305 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26306
26307 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26308 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26309 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26310 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26311
26312 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26313
26314 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26315 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26316 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26317 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26318 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26319
26320 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
26321 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
26322 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
26323
26324 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
26325
26326 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26327 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26328 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26329
26330 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
26331 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
26332
26333 /* SSE MMX or 3DNow!A */
26334 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26335 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26336 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26337
26338 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26339 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26340 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26341 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26342
26343 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
26344 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
26345
26346 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
26347
26348 /* SSE2 */
26349 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26350
26351 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
26352 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
26353 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26354 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
26355 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
26356
26357 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26358 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26359 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
26360 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26361 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26362
26363 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
26364
26365 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26366 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26367 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26368 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26369
26370 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26371 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
26372 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26373
26374 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26375 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26376 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26377 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26378 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26379 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26380 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26381 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26382
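/* For the fixed-predicate compare builtins below, the comparison field
   supplies the RTL code used to build the mask: the "greater" forms
   reuse LT/LE with a _SWAP function type (operands exchanged), and the
   negated forms use the inverse unordered codes (UNGE for "not less
   than", UNGT for "not less than or equal").  */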
26383 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26384 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26385 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26386 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26387 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26388 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26389 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26390 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26391 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26392 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26393 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26394 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26395 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26396 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26397 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26398 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26399 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26400 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26401 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26402 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26403
26404 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26405 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26406 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26407 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26408
26409 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26410 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26411 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26412 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26413
26414 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26415
26416 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26417 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26418 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26419
26420 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26421
26422 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26423 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26424 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26425 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26426 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26427 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26428 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26429 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26430
26431 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26432 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26433 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26434 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26435 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26436 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26437 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26438 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26439
26440 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26441 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26442
26443 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26444 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26445 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26446 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26447
26448 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26449 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26450
26451 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26452 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26453 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26454 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26455 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26456 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26457
26458 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26459 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26460 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26461 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26462
26463 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26464 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26465 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26466 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26467 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26468 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26469 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26470 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26471
26472 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26473 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26474 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26475
26476 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26477 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
26478
26479 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
26480 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26481
26482 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
26483
26484 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
26485 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
26486 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
26487 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
26488
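/* Shift builtins come in two flavours: the *i128 forms take an integer
   shift count (..._SI_COUNT), while the others take the count from a
   vector operand (..._COUNT on a vector type).  The pslldqi128 and
   psrldqi128 byte shifts expand through the V1TImode patterns; the
   _INT_CONVERT suffix marks the mode conversion this requires.  */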
26489 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26490 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26491 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26492 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26493 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26494 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26495 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26496
26497 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26498 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26499 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26500 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26501 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26502 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26503 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26504
26505 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26506 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26507 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26508 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26509
26510 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
26511 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26512 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26513
26514 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
26515
26516 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26517
26518 /* SSE2 MMX */
26519 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26520 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26521
26522 /* SSE3 */
26523 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26524 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26525
26526 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26527 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26528 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26529 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26530 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26531 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26532
26533 /* SSSE3 */
26534 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26535 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
26536 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26537 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
26538 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26539 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26540
26541 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26542 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26543 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26544 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26545 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26546 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26547 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26548 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26549 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26550 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26551 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26552 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26553 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
26554 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
26555 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26556 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26557 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26558 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26559 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26560 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26561 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26562 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26563 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26564 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26565
26566 /* SSSE3. */
26567 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
26568 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
26569
26570 /* SSE4.1 */
26571 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26572 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26573 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
26574 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
26575 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26576 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26577 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26578 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
26579 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
26580 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
26581
26582 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26583 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26584 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26585 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26586 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26587 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26588 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26589 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26590 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26591 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26592 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26593 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26594 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26595
26596 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26597 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26598 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26599 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26600 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26601 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26602 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26603 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26604 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26605 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26606 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26607 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26608
26609 /* SSE4.1 rounding and ptest (OPTION_MASK_ISA_ROUND) */
26610 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26611 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26612 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26613 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26614
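/* In the floor/ceil/trunc/rint variants below, the comparison field is
   repurposed to carry the ROUND_* control value (cast to enum
   rtx_code), which selects the rounding mode of the underlying
   roundpd/roundps instruction.  */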
26615 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
26616 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
26617 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
26618 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
26619
26620 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26621 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26622
26623 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26624 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26625
26626 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
26627 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
26628 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
26629 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
26630
26631 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
26632 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
26633
26634 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26635 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26636
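/* The ptest builtins encode the flag being tested in the comparison
   field: EQ for ptestz (ZF), LTU for ptestc (CF) and GTU for
   ptestnzc.  */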
26637 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26638 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26639 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26640
26641 /* SSE4.2 */
26642 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26643 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
26644 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
26645 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26646 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26647
26648 /* SSE4A */
26649 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
26650 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
26651 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
26652 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26653
26654 /* AES */
26655 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
26656 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26657
26658 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26659 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26660 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26661 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26662
26663 /* PCLMUL */
26664 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
26665
26666 /* AVX */
26667 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26668 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26669 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26670 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26671 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26672 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26673 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26674 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26675 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26676 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26677 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26678 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26679 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26680 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26681 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26682 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26683 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26684 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26685 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26686 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26687 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26688 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26689 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26690 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26691 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26692 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26693
26694 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
26695 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
26696 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
26697 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
26698
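/* Unlike the fixed-predicate SSE/SSE2 compare builtins above, the AVX
   cmpss/cmpsd/cmpps/cmppd builtins below take the comparison predicate
   as an immediate operand, so the comparison field is left UNKNOWN.  */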
26699 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26700 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26701 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
26702 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
26703 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26704 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26705 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26706 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26707 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26708 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26709 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26710 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26711 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26712 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
26713 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
26714 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
26715 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
26716 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
26717 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
26718 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26719 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
26720 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26721 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26722 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26723 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26724 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26725 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26726 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26727 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26728 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26729 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26730 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
26731 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
26732 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
26733
26734 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26735 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26736 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26737
26738 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26739 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26740 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26741 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26742 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26743
26744 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26745
26746 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26747 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26748
26749 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
26750 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
26751 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
26752 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
26753
26754 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26755 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26756
26757 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26758 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26759
26760 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
26761 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
26762 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
26763 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
26764
26765 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
26766 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
26767
26768 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26769 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26770
26771 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26772 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26773 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26774 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26775
26776 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26777 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26778 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26779 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
26780 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
26781 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
26782
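/* The vtestz/vtestc/vtestnzc and ptest256 builtins reuse the same
   EQ/LTU/GTU flag encoding as the SSE4.1 ptest builtins above.  */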
26783 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26784 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26785 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26786 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26787 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26788 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26789 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26790 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26791 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26792 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26793 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26794 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26795 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26796 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26797 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26798
26799 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
26800 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
26801
26802 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26803 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26804
26805 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26806
26807 /* AVX2 */
26808 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
26809 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
26810 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
26811 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
26812 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26813 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26814 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26815 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26816 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26817 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26818 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26819 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26820 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26821 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26822 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26823 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26824 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
26825 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26826 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26827 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26828 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26829 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
26830 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
26831 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26832 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26833 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26834 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26835 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26836 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26837 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26838 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26839 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26840 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26841 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26842 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26843 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26844 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26845 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26846 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
26847 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26848 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26849 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26850 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26851 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26852 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26853 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26854 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26855 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26856 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26857 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26858 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26859 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
26860 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26861 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26862 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26863 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26864 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26865 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26866 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26867 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26868 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26869 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26870 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26871 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26872 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3 , "__builtin_ia32_pmuldq256" , IX86_BUILTIN_PMULDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26873 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26874 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26875 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26876 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26877 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26878 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3 , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26879 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26880 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26881 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26882 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
26883 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26884 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26885 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26886 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26887 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26888 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26889 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26890 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26891 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26892 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26893 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26894 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26895 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26896 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26897 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26898 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26899 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26900 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26901 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26902 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26903 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26904 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26905 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26906 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26907 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26908 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26909 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26910 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26911 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26912 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26913 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26914 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26915 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26916 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26917 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26918 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26919 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26920 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26921 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26922 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26923 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26924 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26925 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26926 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26927 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
26928 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26929 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
26930 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
26931 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26932 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26933 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26934 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26935 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26936 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26937 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26938 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
26939 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26940 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
26941 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
26942 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
26943 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
26944 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26945 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26946 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26947 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26948 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26949 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26950 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26951 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26952 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26953 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26954
26955 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26956
26957 /* BMI */
26958 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26959 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26960 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26961
26962 /* TBM */
26963 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26964 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26965
26966 /* F16C */
26967 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
26968 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
26969 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
26970 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
26971
26972 /* BMI2 */
26973 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26974 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26975 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26976 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26977 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26978 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26979 };
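
/* Illustrative note (added; not part of the original source): each row in
   the tables above fills one struct builtin_description.  Reading the last
   BMI2 entry as an example:
     mask       = OPTION_MASK_ISA_BMI2         -- ISA the builtin requires
     icode      = CODE_FOR_bmi2_pext_di3       -- insn pattern used to expand it
     name       = "__builtin_ia32_pext_di"     -- name exposed to the user
     code       = IX86_BUILTIN_PEXT64          -- internal builtin enumerator
     comparison = UNKNOWN                      -- no comparison code involved
     flag       = UINT64_FTYPE_UINT64_UINT64   -- prototype (return and args)  */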
26980
26981 /* FMA4 and XOP. */
26982 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
26983 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
26984 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
26985 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
26986 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
26987 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
26988 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
26989 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
26990 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
26991 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
26992 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
26993 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
26994 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
26995 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
26996 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
26997 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
26998 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
26999 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
27000 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
27001 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
27002 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
27003 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
27004 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
27005 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
27006 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
27007 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
27008 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
27009 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
27010 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
27011 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
27012 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
27013 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
27014 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
27015 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
27016 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
27017 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
27018 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
27019 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
27020 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
27021 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
27022 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
27023 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
27024 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
27025 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
27026 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
27027 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
27028 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
27029 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
27030 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
27031 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
27032 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
27033 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
27034
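/* Illustrative note (added; not part of the original source): the
   MULTI_ARG_* shorthands above encode "<return type>_FTYPE_<arg types>",
   e.g.

     MULTI_ARG_3_SF == V4SF_FTYPE_V4SF_V4SF_V4SF

   i.e. the builtin returns a V4SF and takes three V4SF operands, which is
   the prototype used for __builtin_ia32_vfmaddss in the table below.  */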
27035 static const struct builtin_description bdesc_multi_arg[] =
27036 {
27037 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27038 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27039 UNKNOWN, (int)MULTI_ARG_3_SF },
27040 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27041 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27042 UNKNOWN, (int)MULTI_ARG_3_DF },
27043
27044 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27045 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27046 UNKNOWN, (int)MULTI_ARG_3_SF },
27047 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27048 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27049 UNKNOWN, (int)MULTI_ARG_3_DF },
27050
27051 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27052 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27053 UNKNOWN, (int)MULTI_ARG_3_SF },
27054 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27055 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27056 UNKNOWN, (int)MULTI_ARG_3_DF },
27057 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27058 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27059 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27060 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27061 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27062 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27063
27064 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27065 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27066 UNKNOWN, (int)MULTI_ARG_3_SF },
27067 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27068 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27069 UNKNOWN, (int)MULTI_ARG_3_DF },
27070 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27071 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27072 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27073 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27074 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27075 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27076
27077 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
27078 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
27079 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
27080 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
27081 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
27082 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
27083 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
27084
27085 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27086 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27087 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
27088 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
27089 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
27090 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
27091 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27092
27093 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27094
27095 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27096 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27097 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27098 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27099 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27100 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27101 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27102 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27103 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27104 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27105 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27106 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27107
27108 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27109 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27110 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27111 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27112 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27113 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27114 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27115 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27116 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27117 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27118 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27119 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27120 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27121 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27122 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27123 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27124
27125 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
27126 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
27127 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27128 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27129 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27130 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27131
27132 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27133 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27134 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27135 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27136 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27137 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27138 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27139 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27140 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27141 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27142 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27143 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27144 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27145 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27146 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27147
27148 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27149 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27150 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27151 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27152 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27153 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27154 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
27155
27156 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
27157 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27158 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27159 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27160 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
27161 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
27162 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
27163
27164 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
27165 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27166 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27167 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
27168 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
27169 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
27170 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
27171
27172 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27173 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27174 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27175 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
27176 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
27177 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
27178 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
27179
27180 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
27181 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27182 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27183 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
27184 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
27185 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
27186 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
27187
27188 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
27189 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27190 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27191 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
27192 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
27193 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
27194 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
27195
27196 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
27197 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27198 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27199 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
27200 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
27201 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
27202 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
27203
27204 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27205 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27206 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27207 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
27208 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
27209 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
27210 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
27211
27212 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27213 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27214 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27215 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27216 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27217 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27218 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27219 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27220
27221 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27222 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27223 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27224 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27225 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27226 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27227 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27228 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27229
27230 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
27231 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
27232 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
27233 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
27234
27235 };
27236 \f
27237 /* TM vector builtins. */
27238
27239 /* Reuse the existing x86-specific `struct builtin_description' because
27240 we're lazy. Add casts to make them fit. */
27241 static const struct builtin_description bdesc_tm[] =
27242 {
27243 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27244 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27245 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27246 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27247 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27248 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27249 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27250
27251 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27252 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27253 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27254 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27255 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27256 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27257 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27258
27259 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27260 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27261 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27262 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27263 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27264 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27265 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27266
27267 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
27268 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
27269 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
27270 };
27271
27272 /* TM callbacks. */
27273
27274 /* Return the builtin decl needed to load a vector of TYPE. */
27275
27276 static tree
27277 ix86_builtin_tm_load (tree type)
27278 {
27279 if (TREE_CODE (type) == VECTOR_TYPE)
27280 {
27281 switch (tree_low_cst (TYPE_SIZE (type), 1))
27282 {
27283 case 64:
27284 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
27285 case 128:
27286 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
27287 case 256:
27288 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
27289 }
27290 }
27291 return NULL_TREE;
27292 }
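
/* Illustrative sketch (added; not part of the original source): how a
   caller might use the hook above.  Variable names are hypothetical and a
   4 x float (128-bit) vector type is assumed:

     tree vectype = build_vector_type (float_type_node, 4);
     tree fndecl = ix86_builtin_tm_load (vectype);
     if (fndecl)
       ...build a call to FNDECL (the BUILT_IN_TM_LOAD_M128 decl)...

   For a non-vector type, or a vector whose size is not 64/128/256 bits,
   the hook returns NULL_TREE and the caller is expected to fall back to
   scalar TM loads.  */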
27293
27294 /* Return the builtin decl needed to store a vector of TYPE. */
27295
27296 static tree
27297 ix86_builtin_tm_store (tree type)
27298 {
27299 if (TREE_CODE (type) == VECTOR_TYPE)
27300 {
27301 switch (tree_low_cst (TYPE_SIZE (type), 1))
27302 {
27303 case 64:
27304 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
27305 case 128:
27306 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
27307 case 256:
27308 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
27309 }
27310 }
27311 return NULL_TREE;
27312 }
27313 \f
27314 /* Initialize the transactional memory vector load/store builtins. */
27315
27316 static void
27317 ix86_init_tm_builtins (void)
27318 {
27319 enum ix86_builtin_func_type ftype;
27320 const struct builtin_description *d;
27321 size_t i;
27322 tree decl;
27323 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
27324 tree attrs_log, attrs_type_log;
27325
27326 if (!flag_tm)
27327 return;
27328
27329 /* If there are no builtins defined, we must be compiling in a
27330 language without trans-mem support. */
27331 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
27332 return;
27333
27334 /* Use whatever attributes a normal TM load has. */
27335 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
27336 attrs_load = DECL_ATTRIBUTES (decl);
27337 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27338 /* Use whatever attributes a normal TM store has. */
27339 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
27340 attrs_store = DECL_ATTRIBUTES (decl);
27341 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27342 /* Use whatever attributes a normal TM log has. */
27343 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
27344 attrs_log = DECL_ATTRIBUTES (decl);
27345 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27346
27347 for (i = 0, d = bdesc_tm;
27348 i < ARRAY_SIZE (bdesc_tm);
27349 i++, d++)
27350 {
27351 if ((d->mask & ix86_isa_flags) != 0
27352 || (lang_hooks.builtin_function
27353 == lang_hooks.builtin_function_ext_scope))
27354 {
27355 tree type, attrs, attrs_type;
27356 enum built_in_function code = (enum built_in_function) d->code;
27357
27358 ftype = (enum ix86_builtin_func_type) d->flag;
27359 type = ix86_get_builtin_func_type (ftype);
27360
27361 if (BUILTIN_TM_LOAD_P (code))
27362 {
27363 attrs = attrs_load;
27364 attrs_type = attrs_type_load;
27365 }
27366 else if (BUILTIN_TM_STORE_P (code))
27367 {
27368 attrs = attrs_store;
27369 attrs_type = attrs_type_store;
27370 }
27371 else
27372 {
27373 attrs = attrs_log;
27374 attrs_type = attrs_type_log;
27375 }
27376 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
27377 /* The builtin without the prefix for
27378 calling it directly. */
27379 d->name + strlen ("__builtin_"),
27380 attrs);
27381 /* add_builtin_function() will set the DECL_ATTRIBUTES; now
27382 set the TYPE_ATTRIBUTES. */
27383 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
27384
27385 set_builtin_decl (code, decl, false);
27386 }
27387 }
27388 }
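
/* Illustrative sketch (added; not part of the original source): the effect
   of the registrations above, stated as hypothetical user-level code built
   with -fgnu-tm and SSE enabled:

     __m128 g;
     void f (__m128 v)
     {
       __transaction_atomic { g = v; }
     }

   The TM lowering pass may turn the 128-bit store into a call to
   __builtin__ITM_WM128 (BUILT_IN_TM_STORE_M128) instead of breaking it
   into scalar _ITM_ accesses; whether it does depends on the enabled ISA
   and on the front end's trans-mem support.  */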
27389
27390 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
27391 not in the current target ISA, so that the user can compile particular
27392 modules with target-specific options that differ from the command-line
27393 options. */
27394 static void
27395 ix86_init_mmx_sse_builtins (void)
27396 {
27397 const struct builtin_description * d;
27398 enum ix86_builtin_func_type ftype;
27399 size_t i;
27400
27401 /* Add all special builtins with variable number of operands. */
27402 for (i = 0, d = bdesc_special_args;
27403 i < ARRAY_SIZE (bdesc_special_args);
27404 i++, d++)
27405 {
27406 if (d->name == 0)
27407 continue;
27408
27409 ftype = (enum ix86_builtin_func_type) d->flag;
27410 def_builtin (d->mask, d->name, ftype, d->code);
27411 }
27412
27413 /* Add all builtins with variable number of operands. */
27414 for (i = 0, d = bdesc_args;
27415 i < ARRAY_SIZE (bdesc_args);
27416 i++, d++)
27417 {
27418 if (d->name == 0)
27419 continue;
27420
27421 ftype = (enum ix86_builtin_func_type) d->flag;
27422 def_builtin_const (d->mask, d->name, ftype, d->code);
27423 }
27424
27425 /* pcmpestr[im] insns. */
27426 for (i = 0, d = bdesc_pcmpestr;
27427 i < ARRAY_SIZE (bdesc_pcmpestr);
27428 i++, d++)
27429 {
27430 if (d->code == IX86_BUILTIN_PCMPESTRM128)
27431 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
27432 else
27433 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
27434 def_builtin_const (d->mask, d->name, ftype, d->code);
27435 }
27436
27437 /* pcmpistr[im] insns. */
27438 for (i = 0, d = bdesc_pcmpistr;
27439 i < ARRAY_SIZE (bdesc_pcmpistr);
27440 i++, d++)
27441 {
27442 if (d->code == IX86_BUILTIN_PCMPISTRM128)
27443 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
27444 else
27445 ftype = INT_FTYPE_V16QI_V16QI_INT;
27446 def_builtin_const (d->mask, d->name, ftype, d->code);
27447 }
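
/* Illustrative note (added; not part of the original source): the two loops
   above choose the prototype from the builtin code, e.g. for the string
   compare intrinsics:

     __builtin_ia32_pcmpistrm128 -> V16QI_FTYPE_V16QI_V16QI_INT   (mask result)
     __builtin_ia32_pcmpistri128 -> INT_FTYPE_V16QI_V16QI_INT     (index/flag result)  */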
27448
27449 /* comi/ucomi insns. */
27450 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
27451 {
27452 if (d->mask == OPTION_MASK_ISA_SSE2)
27453 ftype = INT_FTYPE_V2DF_V2DF;
27454 else
27455 ftype = INT_FTYPE_V4SF_V4SF;
27456 def_builtin_const (d->mask, d->name, ftype, d->code);
27457 }
27458
27459 /* SSE */
27460 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
27461 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
27462 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
27463 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
27464
27465 /* SSE or 3DNow!A */
27466 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27467 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
27468 IX86_BUILTIN_MASKMOVQ);
27469
27470 /* SSE2 */
27471 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
27472 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
27473
27474 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
27475 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
27476 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
27477 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
27478
27479 /* SSE3. */
27480 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
27481 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
27482 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
27483 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
27484
27485 /* AES */
27486 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
27487 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
27488 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
27489 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
27490 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
27491 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
27492 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
27493 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
27494 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
27495 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
27496 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
27497 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
27498
27499 /* PCLMUL */
27500 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
27501 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
27502
27503 /* RDRND */
27504 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
27505 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
27506 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
27507 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
27508 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
27509 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
27510 IX86_BUILTIN_RDRAND64_STEP);
27511
27512 /* AVX2 */
27513 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
27514 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
27515 IX86_BUILTIN_GATHERSIV2DF);
27516
27517 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
27518 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
27519 IX86_BUILTIN_GATHERSIV4DF);
27520
27521 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
27522 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
27523 IX86_BUILTIN_GATHERDIV2DF);
27524
27525 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
27526 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
27527 IX86_BUILTIN_GATHERDIV4DF);
27528
27529 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
27530 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
27531 IX86_BUILTIN_GATHERSIV4SF);
27532
27533 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
27534 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
27535 IX86_BUILTIN_GATHERSIV8SF);
27536
27537 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
27538 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
27539 IX86_BUILTIN_GATHERDIV4SF);
27540
27541 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
27542 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
27543 IX86_BUILTIN_GATHERDIV8SF);
27544
27545 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
27546 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
27547 IX86_BUILTIN_GATHERSIV2DI);
27548
27549 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
27550 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
27551 IX86_BUILTIN_GATHERSIV4DI);
27552
27553 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
27554 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
27555 IX86_BUILTIN_GATHERDIV2DI);
27556
27557 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
27558 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
27559 IX86_BUILTIN_GATHERDIV4DI);
27560
27561 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
27562 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
27563 IX86_BUILTIN_GATHERSIV4SI);
27564
27565 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
27566 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
27567 IX86_BUILTIN_GATHERSIV8SI);
27568
27569 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
27570 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
27571 IX86_BUILTIN_GATHERDIV4SI);
27572
27573 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
27574 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
27575 IX86_BUILTIN_GATHERDIV8SI);
27576
27577 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
27578 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
27579 IX86_BUILTIN_GATHERALTSIV4DF);
27580
27581 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
27582 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
27583 IX86_BUILTIN_GATHERALTDIV8SF);
27584
27585 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
27586 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
27587 IX86_BUILTIN_GATHERALTSIV4DI);
27588
27589 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
27590 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
27591 IX86_BUILTIN_GATHERALTDIV8SI);
27592
27593 /* RTM. */
27594 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
27595 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
27596
27597 /* MMX access to the vec_init patterns. */
27598 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
27599 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
27600
27601 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
27602 V4HI_FTYPE_HI_HI_HI_HI,
27603 IX86_BUILTIN_VEC_INIT_V4HI);
27604
27605 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
27606 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
27607 IX86_BUILTIN_VEC_INIT_V8QI);
27608
27609 /* Access to the vec_extract patterns. */
27610 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
27611 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
27612 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
27613 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
27614 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
27615 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
27616 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
27617 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
27618 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
27619 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
27620
27621 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27622 "__builtin_ia32_vec_ext_v4hi",
27623 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
27624
27625 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
27626 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
27627
27628 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
27629 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
27630
27631 /* Access to the vec_set patterns. */
27632 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
27633 "__builtin_ia32_vec_set_v2di",
27634 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
27635
27636 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
27637 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
27638
27639 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
27640 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
27641
27642 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
27643 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
27644
27645 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27646 "__builtin_ia32_vec_set_v4hi",
27647 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
27648
27649 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
27650 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
27651
27652 /* Add FMA4 multi-arg instructions.  */
27653 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
27654 {
27655 if (d->name == 0)
27656 continue;
27657
27658 ftype = (enum ix86_builtin_func_type) d->flag;
27659 def_builtin_const (d->mask, d->name, ftype, d->code);
27660 }
27661 }
27662
27663 /* This builds the processor_model struct type defined in
27664 libgcc/config/i386/cpuinfo.c */
27665
27666 static tree
27667 build_processor_model_struct (void)
27668 {
27669 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
27670 "__cpu_features"};
27671 tree field = NULL_TREE, field_chain = NULL_TREE;
27672 int i;
27673 tree type = make_node (RECORD_TYPE);
27674
27675 /* The first 3 fields are unsigned int. */
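/* Each new field is chained in front of the previous ones;
   finish_builtin_struct is expected to put them back into declaration
   order.  */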
27676 for (i = 0; i < 3; ++i)
27677 {
27678 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
27679 get_identifier (field_name[i]), unsigned_type_node);
27680 if (field_chain != NULL_TREE)
27681 DECL_CHAIN (field) = field_chain;
27682 field_chain = field;
27683 }
27684
27685 /* The last field is an array of unsigned integers of size one. */
27686 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
27687 get_identifier (field_name[3]),
27688 build_array_type (unsigned_type_node,
27689 build_index_type (size_one_node)));
27690 if (field_chain != NULL_TREE)
27691 DECL_CHAIN (field) = field_chain;
27692 field_chain = field;
27693
27694 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
27695 return type;
27696 }
27697
27698 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
27699
27700 static tree
27701 make_var_decl (tree type, const char *name)
27702 {
27703 tree new_decl;
27704
27705 new_decl = build_decl (UNKNOWN_LOCATION,
27706 VAR_DECL,
27707 get_identifier (name),
27708 type);
27709
27710 DECL_EXTERNAL (new_decl) = 1;
27711 TREE_STATIC (new_decl) = 1;
27712 TREE_PUBLIC (new_decl) = 1;
27713 DECL_INITIAL (new_decl) = 0;
27714 DECL_ARTIFICIAL (new_decl) = 0;
27715 DECL_PRESERVE_P (new_decl) = 1;
27716
27717 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
27718 assemble_variable (new_decl, 0, 0, 0);
27719
27720 return new_decl;
27721 }
27722
27723 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is
27724 folded into a check of the __cpu_model variable defined in libgcc/config/i386/cpuinfo.c.  */
27725
27726 static tree
27727 fold_builtin_cpu (tree fndecl, tree *args)
27728 {
27729 unsigned int i;
27730 enum ix86_builtins fn_code = (enum ix86_builtins)
27731 DECL_FUNCTION_CODE (fndecl);
27732 tree param_string_cst = NULL;
27733
27734 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
27735 enum processor_features
27736 {
27737 F_CMOV = 0,
27738 F_MMX,
27739 F_POPCNT,
27740 F_SSE,
27741 F_SSE2,
27742 F_SSE3,
27743 F_SSSE3,
27744 F_SSE4_1,
27745 F_SSE4_2,
27746 F_AVX,
27747 F_AVX2,
27748 F_MAX
27749 };
27750
27751 /* These are the values for vendor types and cpu types and subtypes
27752 in cpuinfo.c.  Cpu types and subtypes must have the corresponding
27753 start value subtracted to obtain the value used in cpuinfo.c.  */
27754 enum processor_model
27755 {
27756 M_INTEL = 1,
27757 M_AMD,
27758 M_CPU_TYPE_START,
27759 M_INTEL_ATOM,
27760 M_INTEL_CORE2,
27761 M_INTEL_COREI7,
27762 M_AMDFAM10H,
27763 M_AMDFAM15H,
27764 M_CPU_SUBTYPE_START,
27765 M_INTEL_COREI7_NEHALEM,
27766 M_INTEL_COREI7_WESTMERE,
27767 M_INTEL_COREI7_SANDYBRIDGE,
27768 M_AMDFAM10H_BARCELONA,
27769 M_AMDFAM10H_SHANGHAI,
27770 M_AMDFAM10H_ISTANBUL,
27771 M_AMDFAM15H_BDVER1,
27772 M_AMDFAM15H_BDVER2
27773 };
27774
27775 static struct _arch_names_table
27776 {
27777 const char *const name;
27778 const enum processor_model model;
27779 }
27780 const arch_names_table[] =
27781 {
27782 {"amd", M_AMD},
27783 {"intel", M_INTEL},
27784 {"atom", M_INTEL_ATOM},
27785 {"core2", M_INTEL_CORE2},
27786 {"corei7", M_INTEL_COREI7},
27787 {"nehalem", M_INTEL_COREI7_NEHALEM},
27788 {"westmere", M_INTEL_COREI7_WESTMERE},
27789 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
27790 {"amdfam10h", M_AMDFAM10H},
27791 {"barcelona", M_AMDFAM10H_BARCELONA},
27792 {"shanghai", M_AMDFAM10H_SHANGHAI},
27793 {"istanbul", M_AMDFAM10H_ISTANBUL},
27794 {"amdfam15h", M_AMDFAM15H},
27795 {"bdver1", M_AMDFAM15H_BDVER1},
27796 {"bdver2", M_AMDFAM15H_BDVER2},
27797 };
27798
27799 static struct _isa_names_table
27800 {
27801 const char *const name;
27802 const enum processor_features feature;
27803 }
27804 const isa_names_table[] =
27805 {
27806 {"cmov", F_CMOV},
27807 {"mmx", F_MMX},
27808 {"popcnt", F_POPCNT},
27809 {"sse", F_SSE},
27810 {"sse2", F_SSE2},
27811 {"sse3", F_SSE3},
27812 {"ssse3", F_SSSE3},
27813 {"sse4.1", F_SSE4_1},
27814 {"sse4.2", F_SSE4_2},
27815 {"avx", F_AVX},
27816 {"avx2", F_AVX2}
27817 };
27818
27819 static tree __processor_model_type = NULL_TREE;
27820 static tree __cpu_model_var = NULL_TREE;
27821
27822 if (__processor_model_type == NULL_TREE)
27823 __processor_model_type = build_processor_model_struct ();
27824
27825 if (__cpu_model_var == NULL_TREE)
27826 __cpu_model_var = make_var_decl (__processor_model_type,
27827 "__cpu_model");
27828
27829 gcc_assert ((args != NULL) && (*args != NULL));
27830
27831 param_string_cst = *args;
27832 while (param_string_cst
27833 && TREE_CODE (param_string_cst) != STRING_CST)
27834 {
27835 /* *args must be an expr that can contain other EXPRs leading to a
27836 STRING_CST.  */
27837 if (!EXPR_P (param_string_cst))
27838 {
27839 error ("Parameter to builtin must be a string constant or literal");
27840 return integer_zero_node;
27841 }
27842 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
27843 }
27844
27845 gcc_assert (param_string_cst);
27846
27847 if (fn_code == IX86_BUILTIN_CPU_IS)
27848 {
27849 tree ref;
27850 tree field;
27851 unsigned int field_val = 0;
27852 unsigned int NUM_ARCH_NAMES
27853 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
27854
27855 for (i = 0; i < NUM_ARCH_NAMES; i++)
27856 if (strcmp (arch_names_table[i].name,
27857 TREE_STRING_POINTER (param_string_cst)) == 0)
27858 break;
27859
27860 if (i == NUM_ARCH_NAMES)
27861 {
27862 error ("Parameter to builtin not valid: %s",
27863 TREE_STRING_POINTER (param_string_cst));
27864 return integer_zero_node;
27865 }
27866
27867 field = TYPE_FIELDS (__processor_model_type);
27868 field_val = arch_names_table[i].model;
27869
27870 /* CPU types are stored in the next field. */
27871 if (field_val > M_CPU_TYPE_START
27872 && field_val < M_CPU_SUBTYPE_START)
27873 {
27874 field = DECL_CHAIN (field);
27875 field_val -= M_CPU_TYPE_START;
27876 }
27877
27878 /* CPU subtypes are stored in the next field. */
27879 if (field_val > M_CPU_SUBTYPE_START)
27880 {
27881 field = DECL_CHAIN (DECL_CHAIN (field));
27882 field_val -= M_CPU_SUBTYPE_START;
27883 }
27884
27885 /* Get the appropriate field in __cpu_model. */
27886 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
27887 field, NULL_TREE);
27888
27889 /* Check the value. */
27890 return build2 (EQ_EXPR, unsigned_type_node, ref,
27891 build_int_cstu (unsigned_type_node, field_val));
27892 }
27893 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
27894 {
27895 tree ref;
27896 tree array_elt;
27897 tree field;
27898 unsigned int field_val = 0;
27899 unsigned int NUM_ISA_NAMES
27900 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
27901
27902 for (i = 0; i < NUM_ISA_NAMES; i++)
27903 if (strcmp (isa_names_table[i].name,
27904 TREE_STRING_POINTER (param_string_cst)) == 0)
27905 break;
27906
27907 if (i == NUM_ISA_NAMES)
27908 {
27909 error ("Parameter to builtin not valid: %s",
27910 TREE_STRING_POINTER (param_string_cst));
27911 return integer_zero_node;
27912 }
27913
27914 field = TYPE_FIELDS (__processor_model_type);
27915 /* Get the last field, which is __cpu_features. */
27916 while (DECL_CHAIN (field))
27917 field = DECL_CHAIN (field);
27918
27919 /* Get the appropriate field: __cpu_model.__cpu_features */
27920 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
27921 field, NULL_TREE);
27922
27923 /* Access the 0th element of __cpu_features array. */
27924 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
27925 integer_zero_node, NULL_TREE, NULL_TREE);
27926
27927 field_val = (1 << isa_names_table[i].feature);
27928 /* Return __cpu_model.__cpu_features[0] & field_val */
27929 return build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
27930 build_int_cstu (unsigned_type_node, field_val));
27931 }
27932 gcc_unreachable ();
27933 }
27934
27935 static tree
27936 ix86_fold_builtin (tree fndecl, int n_args,
27937 tree *args, bool ignore ATTRIBUTE_UNUSED)
27938 {
27939 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
27940 {
27941 enum ix86_builtins fn_code = (enum ix86_builtins)
27942 DECL_FUNCTION_CODE (fndecl);
27943 if (fn_code == IX86_BUILTIN_CPU_IS
27944 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
27945 {
27946 gcc_assert (n_args == 1);
27947 return fold_builtin_cpu (fndecl, args);
27948 }
27949 }
27950
27951 return NULL_TREE;
27952 }
27953
27954 /* Make builtins to detect cpu type and features supported. NAME is
27955 the builtin name, CODE is the builtin code, and FTYPE is the function
27956 type of the builtin. */
27957
27958 static void
27959 make_cpu_type_builtin (const char* name, int code,
27960 enum ix86_builtin_func_type ftype, bool is_const)
27961 {
27962 tree decl;
27963 tree type;
27964
27965 type = ix86_get_builtin_func_type (ftype);
27966 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
27967 NULL, NULL_TREE);
27968 gcc_assert (decl != NULL_TREE);
27969 ix86_builtins[(int) code] = decl;
27970 TREE_READONLY (decl) = is_const;
27971 }
27972
27973 /* Make builtins to get CPU type and features supported.  The created
27974 builtins are:
27975
27976 __builtin_cpu_init (), to detect cpu type and features,
27977 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
27978 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
27979 */
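/* A minimal usage sketch of these builtins (illustrative user code; the
   matmul_* kernel names are hypothetical):

     if (__builtin_cpu_supports ("avx2"))
       matmul_avx2 ();
     else if (__builtin_cpu_is ("corei7"))
       matmul_sse42 ();
     else
       matmul_generic ();

   Both "avx2" and "corei7" are entries in isa_names_table and
   arch_names_table above.  Per the GCC documentation, __builtin_cpu_init ()
   only needs an explicit call from code that runs before the constructors.  */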
27980
27981 static void
27982 ix86_init_platform_type_builtins (void)
27983 {
27984 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
27985 INT_FTYPE_VOID, false);
27986 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
27987 INT_FTYPE_PCCHAR, true);
27988 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
27989 INT_FTYPE_PCCHAR, true);
27990 }
27991
27992 /* Internal method for ix86_init_builtins. */
27993
27994 static void
27995 ix86_init_builtins_va_builtins_abi (void)
27996 {
27997 tree ms_va_ref, sysv_va_ref;
27998 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
27999 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
28000 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
28001 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
28002
28003 if (!TARGET_64BIT)
28004 return;
28005 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
28006 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
28007 ms_va_ref = build_reference_type (ms_va_list_type_node);
28008 sysv_va_ref =
28009 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
28010
28011 fnvoid_va_end_ms =
28012 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
28013 fnvoid_va_start_ms =
28014 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
28015 fnvoid_va_end_sysv =
28016 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
28017 fnvoid_va_start_sysv =
28018 build_varargs_function_type_list (void_type_node, sysv_va_ref,
28019 NULL_TREE);
28020 fnvoid_va_copy_ms =
28021 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
28022 NULL_TREE);
28023 fnvoid_va_copy_sysv =
28024 build_function_type_list (void_type_node, sysv_va_ref,
28025 sysv_va_ref, NULL_TREE);
28026
28027 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
28028 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
28029 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
28030 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
28031 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
28032 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
28033 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
28034 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28035 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
28036 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28037 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
28038 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28039 }
28040
28041 static void
28042 ix86_init_builtin_types (void)
28043 {
28044 tree float128_type_node, float80_type_node;
28045
28046 /* The __float80 type. */
28047 float80_type_node = long_double_type_node;
28048 if (TYPE_MODE (float80_type_node) != XFmode)
28049 {
28050 /* long double is not the 80-bit extended type; build it explicitly.  */
28051 float80_type_node = make_node (REAL_TYPE);
28052
28053 TYPE_PRECISION (float80_type_node) = 80;
28054 layout_type (float80_type_node);
28055 }
28056 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
28057
28058 /* The __float128 type. */
28059 float128_type_node = make_node (REAL_TYPE);
28060 TYPE_PRECISION (float128_type_node) = 128;
28061 layout_type (float128_type_node);
28062 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
28063
28064 /* This macro is built by i386-builtin-types.awk. */
28065 DEFINE_BUILTIN_PRIMITIVE_TYPES;
28066 }
28067
28068 static void
28069 ix86_init_builtins (void)
28070 {
28071 tree t;
28072
28073 ix86_init_builtin_types ();
28074
28075 /* Builtins to get CPU type and features. */
28076 ix86_init_platform_type_builtins ();
28077
28078 /* TFmode support builtins. */
28079 def_builtin_const (0, "__builtin_infq",
28080 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
28081 def_builtin_const (0, "__builtin_huge_valq",
28082 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
28083
28084 /* We will expand them to a normal call if SSE isn't available, since
28085 they are used by libgcc.  */
28086 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
28087 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
28088 BUILT_IN_MD, "__fabstf2", NULL_TREE);
28089 TREE_READONLY (t) = 1;
28090 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
28091
28092 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
28093 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
28094 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
28095 TREE_READONLY (t) = 1;
28096 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
28097
28098 ix86_init_tm_builtins ();
28099 ix86_init_mmx_sse_builtins ();
28100
28101 if (TARGET_LP64)
28102 ix86_init_builtins_va_builtins_abi ();
28103
28104 #ifdef SUBTARGET_INIT_BUILTINS
28105 SUBTARGET_INIT_BUILTINS;
28106 #endif
28107 }
28108
28109 /* Return the ix86 builtin for CODE. */
28110
28111 static tree
28112 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
28113 {
28114 if (code >= IX86_BUILTIN_MAX)
28115 return error_mark_node;
28116
28117 return ix86_builtins[code];
28118 }
28119
28120 /* Errors in the source file can cause expand_expr to return const0_rtx
28121 where we expect a vector. To avoid crashing, use one of the vector
28122 clear instructions. */
28123 static rtx
28124 safe_vector_operand (rtx x, enum machine_mode mode)
28125 {
28126 if (x == const0_rtx)
28127 x = CONST0_RTX (mode);
28128 return x;
28129 }
28130
28131 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
28132
28133 static rtx
28134 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
28135 {
28136 rtx pat;
28137 tree arg0 = CALL_EXPR_ARG (exp, 0);
28138 tree arg1 = CALL_EXPR_ARG (exp, 1);
28139 rtx op0 = expand_normal (arg0);
28140 rtx op1 = expand_normal (arg1);
28141 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28142 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
28143 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
28144
28145 if (VECTOR_MODE_P (mode0))
28146 op0 = safe_vector_operand (op0, mode0);
28147 if (VECTOR_MODE_P (mode1))
28148 op1 = safe_vector_operand (op1, mode1);
28149
28150 if (optimize || !target
28151 || GET_MODE (target) != tmode
28152 || !insn_data[icode].operand[0].predicate (target, tmode))
28153 target = gen_reg_rtx (tmode);
28154
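/* If the insn wants a TImode operand but the builtin passed an int, load
   it into the low element of a V4SI register (zeroing the other elements)
   and view the whole register as TImode.  */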
28155 if (GET_MODE (op1) == SImode && mode1 == TImode)
28156 {
28157 rtx x = gen_reg_rtx (V4SImode);
28158 emit_insn (gen_sse2_loadd (x, op1));
28159 op1 = gen_lowpart (TImode, x);
28160 }
28161
28162 if (!insn_data[icode].operand[1].predicate (op0, mode0))
28163 op0 = copy_to_mode_reg (mode0, op0);
28164 if (!insn_data[icode].operand[2].predicate (op1, mode1))
28165 op1 = copy_to_mode_reg (mode1, op1);
28166
28167 pat = GEN_FCN (icode) (target, op0, op1);
28168 if (! pat)
28169 return 0;
28170
28171 emit_insn (pat);
28172
28173 return target;
28174 }
28175
28176 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
28177
28178 static rtx
28179 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
28180 enum ix86_builtin_func_type m_type,
28181 enum rtx_code sub_code)
28182 {
28183 rtx pat;
28184 int i;
28185 int nargs;
28186 bool comparison_p = false;
28187 bool tf_p = false;
28188 bool last_arg_constant = false;
28189 int num_memory = 0;
28190 struct {
28191 rtx op;
28192 enum machine_mode mode;
28193 } args[4];
28194
28195 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28196
28197 switch (m_type)
28198 {
28199 case MULTI_ARG_4_DF2_DI_I:
28200 case MULTI_ARG_4_DF2_DI_I1:
28201 case MULTI_ARG_4_SF2_SI_I:
28202 case MULTI_ARG_4_SF2_SI_I1:
28203 nargs = 4;
28204 last_arg_constant = true;
28205 break;
28206
28207 case MULTI_ARG_3_SF:
28208 case MULTI_ARG_3_DF:
28209 case MULTI_ARG_3_SF2:
28210 case MULTI_ARG_3_DF2:
28211 case MULTI_ARG_3_DI:
28212 case MULTI_ARG_3_SI:
28213 case MULTI_ARG_3_SI_DI:
28214 case MULTI_ARG_3_HI:
28215 case MULTI_ARG_3_HI_SI:
28216 case MULTI_ARG_3_QI:
28217 case MULTI_ARG_3_DI2:
28218 case MULTI_ARG_3_SI2:
28219 case MULTI_ARG_3_HI2:
28220 case MULTI_ARG_3_QI2:
28221 nargs = 3;
28222 break;
28223
28224 case MULTI_ARG_2_SF:
28225 case MULTI_ARG_2_DF:
28226 case MULTI_ARG_2_DI:
28227 case MULTI_ARG_2_SI:
28228 case MULTI_ARG_2_HI:
28229 case MULTI_ARG_2_QI:
28230 nargs = 2;
28231 break;
28232
28233 case MULTI_ARG_2_DI_IMM:
28234 case MULTI_ARG_2_SI_IMM:
28235 case MULTI_ARG_2_HI_IMM:
28236 case MULTI_ARG_2_QI_IMM:
28237 nargs = 2;
28238 last_arg_constant = true;
28239 break;
28240
28241 case MULTI_ARG_1_SF:
28242 case MULTI_ARG_1_DF:
28243 case MULTI_ARG_1_SF2:
28244 case MULTI_ARG_1_DF2:
28245 case MULTI_ARG_1_DI:
28246 case MULTI_ARG_1_SI:
28247 case MULTI_ARG_1_HI:
28248 case MULTI_ARG_1_QI:
28249 case MULTI_ARG_1_SI_DI:
28250 case MULTI_ARG_1_HI_DI:
28251 case MULTI_ARG_1_HI_SI:
28252 case MULTI_ARG_1_QI_DI:
28253 case MULTI_ARG_1_QI_SI:
28254 case MULTI_ARG_1_QI_HI:
28255 nargs = 1;
28256 break;
28257
28258 case MULTI_ARG_2_DI_CMP:
28259 case MULTI_ARG_2_SI_CMP:
28260 case MULTI_ARG_2_HI_CMP:
28261 case MULTI_ARG_2_QI_CMP:
28262 nargs = 2;
28263 comparison_p = true;
28264 break;
28265
28266 case MULTI_ARG_2_SF_TF:
28267 case MULTI_ARG_2_DF_TF:
28268 case MULTI_ARG_2_DI_TF:
28269 case MULTI_ARG_2_SI_TF:
28270 case MULTI_ARG_2_HI_TF:
28271 case MULTI_ARG_2_QI_TF:
28272 nargs = 2;
28273 tf_p = true;
28274 break;
28275
28276 default:
28277 gcc_unreachable ();
28278 }
28279
28280 if (optimize || !target
28281 || GET_MODE (target) != tmode
28282 || !insn_data[icode].operand[0].predicate (target, tmode))
28283 target = gen_reg_rtx (tmode);
28284
28285 gcc_assert (nargs <= 4);
28286
28287 for (i = 0; i < nargs; i++)
28288 {
28289 tree arg = CALL_EXPR_ARG (exp, i);
28290 rtx op = expand_normal (arg);
28291 int adjust = (comparison_p) ? 1 : 0;
28292 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
28293
28294 if (last_arg_constant && i == nargs - 1)
28295 {
28296 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
28297 {
28298 enum insn_code new_icode = icode;
28299 switch (icode)
28300 {
28301 case CODE_FOR_xop_vpermil2v2df3:
28302 case CODE_FOR_xop_vpermil2v4sf3:
28303 case CODE_FOR_xop_vpermil2v4df3:
28304 case CODE_FOR_xop_vpermil2v8sf3:
28305 error ("the last argument must be a 2-bit immediate");
28306 return gen_reg_rtx (tmode);
28307 case CODE_FOR_xop_rotlv2di3:
28308 new_icode = CODE_FOR_rotlv2di3;
28309 goto xop_rotl;
28310 case CODE_FOR_xop_rotlv4si3:
28311 new_icode = CODE_FOR_rotlv4si3;
28312 goto xop_rotl;
28313 case CODE_FOR_xop_rotlv8hi3:
28314 new_icode = CODE_FOR_rotlv8hi3;
28315 goto xop_rotl;
28316 case CODE_FOR_xop_rotlv16qi3:
28317 new_icode = CODE_FOR_rotlv16qi3;
28318 xop_rotl:
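/* A constant rotate count is reduced modulo the element width; a
   variable count is routed to the generic rotate pattern via the
   checks below.  */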
28319 if (CONST_INT_P (op))
28320 {
28321 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
28322 op = GEN_INT (INTVAL (op) & mask);
28323 gcc_checking_assert
28324 (insn_data[icode].operand[i + 1].predicate (op, mode));
28325 }
28326 else
28327 {
28328 gcc_checking_assert
28329 (nargs == 2
28330 && insn_data[new_icode].operand[0].mode == tmode
28331 && insn_data[new_icode].operand[1].mode == tmode
28332 && insn_data[new_icode].operand[2].mode == mode
28333 && insn_data[new_icode].operand[0].predicate
28334 == insn_data[icode].operand[0].predicate
28335 && insn_data[new_icode].operand[1].predicate
28336 == insn_data[icode].operand[1].predicate);
28337 icode = new_icode;
28338 goto non_constant;
28339 }
28340 break;
28341 default:
28342 gcc_unreachable ();
28343 }
28344 }
28345 }
28346 else
28347 {
28348 non_constant:
28349 if (VECTOR_MODE_P (mode))
28350 op = safe_vector_operand (op, mode);
28351
28352 /* If we aren't optimizing, only allow one memory operand to be
28353 generated. */
28354 if (memory_operand (op, mode))
28355 num_memory++;
28356
28357 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
28358
28359 if (optimize
28360 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
28361 || num_memory > 1)
28362 op = force_reg (mode, op);
28363 }
28364
28365 args[i].op = op;
28366 args[i].mode = mode;
28367 }
28368
28369 switch (nargs)
28370 {
28371 case 1:
28372 pat = GEN_FCN (icode) (target, args[0].op);
28373 break;
28374
28375 case 2:
28376 if (tf_p)
28377 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
28378 GEN_INT ((int)sub_code));
28379 else if (! comparison_p)
28380 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
28381 else
28382 {
28383 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
28384 args[0].op,
28385 args[1].op);
28386
28387 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
28388 }
28389 break;
28390
28391 case 3:
28392 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
28393 break;
28394
28395 case 4:
28396 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
28397 break;
28398
28399 default:
28400 gcc_unreachable ();
28401 }
28402
28403 if (! pat)
28404 return 0;
28405
28406 emit_insn (pat);
28407 return target;
28408 }
28409
28410 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
28411 insns with vec_merge. */
28412
28413 static rtx
28414 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
28415 rtx target)
28416 {
28417 rtx pat;
28418 tree arg0 = CALL_EXPR_ARG (exp, 0);
28419 rtx op1, op0 = expand_normal (arg0);
28420 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28421 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
28422
28423 if (optimize || !target
28424 || GET_MODE (target) != tmode
28425 || !insn_data[icode].operand[0].predicate (target, tmode))
28426 target = gen_reg_rtx (tmode);
28427
28428 if (VECTOR_MODE_P (mode0))
28429 op0 = safe_vector_operand (op0, mode0);
28430
28431 if ((optimize && !register_operand (op0, mode0))
28432 || !insn_data[icode].operand[1].predicate (op0, mode0))
28433 op0 = copy_to_mode_reg (mode0, op0);
28434
28435 op1 = op0;
28436 if (!insn_data[icode].operand[2].predicate (op1, mode0))
28437 op1 = copy_to_mode_reg (mode0, op1);
28438
28439 pat = GEN_FCN (icode) (target, op0, op1);
28440 if (! pat)
28441 return 0;
28442 emit_insn (pat);
28443 return target;
28444 }
28445
28446 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
28447
28448 static rtx
28449 ix86_expand_sse_compare (const struct builtin_description *d,
28450 tree exp, rtx target, bool swap)
28451 {
28452 rtx pat;
28453 tree arg0 = CALL_EXPR_ARG (exp, 0);
28454 tree arg1 = CALL_EXPR_ARG (exp, 1);
28455 rtx op0 = expand_normal (arg0);
28456 rtx op1 = expand_normal (arg1);
28457 rtx op2;
28458 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28459 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28460 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28461 enum rtx_code comparison = d->comparison;
28462
28463 if (VECTOR_MODE_P (mode0))
28464 op0 = safe_vector_operand (op0, mode0);
28465 if (VECTOR_MODE_P (mode1))
28466 op1 = safe_vector_operand (op1, mode1);
28467
28468 /* Swap operands if we have a comparison that isn't available in
28469 hardware. */
28470 if (swap)
28471 {
28472 rtx tmp = gen_reg_rtx (mode1);
28473 emit_move_insn (tmp, op1);
28474 op1 = op0;
28475 op0 = tmp;
28476 }
28477
28478 if (optimize || !target
28479 || GET_MODE (target) != tmode
28480 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28481 target = gen_reg_rtx (tmode);
28482
28483 if ((optimize && !register_operand (op0, mode0))
28484 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
28485 op0 = copy_to_mode_reg (mode0, op0);
28486 if ((optimize && !register_operand (op1, mode1))
28487 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
28488 op1 = copy_to_mode_reg (mode1, op1);
28489
28490 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
28491 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28492 if (! pat)
28493 return 0;
28494 emit_insn (pat);
28495 return target;
28496 }
28497
28498 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
28499
28500 static rtx
28501 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
28502 rtx target)
28503 {
28504 rtx pat;
28505 tree arg0 = CALL_EXPR_ARG (exp, 0);
28506 tree arg1 = CALL_EXPR_ARG (exp, 1);
28507 rtx op0 = expand_normal (arg0);
28508 rtx op1 = expand_normal (arg1);
28509 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28510 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28511 enum rtx_code comparison = d->comparison;
28512
28513 if (VECTOR_MODE_P (mode0))
28514 op0 = safe_vector_operand (op0, mode0);
28515 if (VECTOR_MODE_P (mode1))
28516 op1 = safe_vector_operand (op1, mode1);
28517
28518 /* Swap operands if we have a comparison that isn't available in
28519 hardware. */
28520 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
28521 {
28522 rtx tmp = op1;
28523 op1 = op0;
28524 op0 = tmp;
28525 }
28526
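/* The comi/ucomi insn only sets the flags; materialize the result by
   zeroing an SImode register and writing the flag test into its low
   byte below.  */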
28527 target = gen_reg_rtx (SImode);
28528 emit_move_insn (target, const0_rtx);
28529 target = gen_rtx_SUBREG (QImode, target, 0);
28530
28531 if ((optimize && !register_operand (op0, mode0))
28532 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28533 op0 = copy_to_mode_reg (mode0, op0);
28534 if ((optimize && !register_operand (op1, mode1))
28535 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28536 op1 = copy_to_mode_reg (mode1, op1);
28537
28538 pat = GEN_FCN (d->icode) (op0, op1);
28539 if (! pat)
28540 return 0;
28541 emit_insn (pat);
28542 emit_insn (gen_rtx_SET (VOIDmode,
28543 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28544 gen_rtx_fmt_ee (comparison, QImode,
28545 SET_DEST (pat),
28546 const0_rtx)));
28547
28548 return SUBREG_REG (target);
28549 }
28550
28551 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
28552
28553 static rtx
28554 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
28555 rtx target)
28556 {
28557 rtx pat;
28558 tree arg0 = CALL_EXPR_ARG (exp, 0);
28559 rtx op1, op0 = expand_normal (arg0);
28560 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28561 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28562
28563 if (optimize || target == 0
28564 || GET_MODE (target) != tmode
28565 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28566 target = gen_reg_rtx (tmode);
28567
28568 if (VECTOR_MODE_P (mode0))
28569 op0 = safe_vector_operand (op0, mode0);
28570
28571 if ((optimize && !register_operand (op0, mode0))
28572 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28573 op0 = copy_to_mode_reg (mode0, op0);
28574
28575 op1 = GEN_INT (d->comparison);
28576
28577 pat = GEN_FCN (d->icode) (target, op0, op1);
28578 if (! pat)
28579 return 0;
28580 emit_insn (pat);
28581 return target;
28582 }
28583
28584 static rtx
28585 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
28586 tree exp, rtx target)
28587 {
28588 rtx pat;
28589 tree arg0 = CALL_EXPR_ARG (exp, 0);
28590 tree arg1 = CALL_EXPR_ARG (exp, 1);
28591 rtx op0 = expand_normal (arg0);
28592 rtx op1 = expand_normal (arg1);
28593 rtx op2;
28594 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28595 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28596 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28597
28598 if (optimize || target == 0
28599 || GET_MODE (target) != tmode
28600 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28601 target = gen_reg_rtx (tmode);
28602
28603 op0 = safe_vector_operand (op0, mode0);
28604 op1 = safe_vector_operand (op1, mode1);
28605
28606 if ((optimize && !register_operand (op0, mode0))
28607 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28608 op0 = copy_to_mode_reg (mode0, op0);
28609 if ((optimize && !register_operand (op1, mode1))
28610 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28611 op1 = copy_to_mode_reg (mode1, op1);
28612
28613 op2 = GEN_INT (d->comparison);
28614
28615 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28616 if (! pat)
28617 return 0;
28618 emit_insn (pat);
28619 return target;
28620 }
28621
28622 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
28623
28624 static rtx
28625 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
28626 rtx target)
28627 {
28628 rtx pat;
28629 tree arg0 = CALL_EXPR_ARG (exp, 0);
28630 tree arg1 = CALL_EXPR_ARG (exp, 1);
28631 rtx op0 = expand_normal (arg0);
28632 rtx op1 = expand_normal (arg1);
28633 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28634 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28635 enum rtx_code comparison = d->comparison;
28636
28637 if (VECTOR_MODE_P (mode0))
28638 op0 = safe_vector_operand (op0, mode0);
28639 if (VECTOR_MODE_P (mode1))
28640 op1 = safe_vector_operand (op1, mode1);
28641
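/* Like the comi case above, ptest only sets the flags; the boolean
   result is extracted from them below.  */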
28642 target = gen_reg_rtx (SImode);
28643 emit_move_insn (target, const0_rtx);
28644 target = gen_rtx_SUBREG (QImode, target, 0);
28645
28646 if ((optimize && !register_operand (op0, mode0))
28647 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28648 op0 = copy_to_mode_reg (mode0, op0);
28649 if ((optimize && !register_operand (op1, mode1))
28650 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28651 op1 = copy_to_mode_reg (mode1, op1);
28652
28653 pat = GEN_FCN (d->icode) (op0, op1);
28654 if (! pat)
28655 return 0;
28656 emit_insn (pat);
28657 emit_insn (gen_rtx_SET (VOIDmode,
28658 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28659 gen_rtx_fmt_ee (comparison, QImode,
28660 SET_DEST (pat),
28661 const0_rtx)));
28662
28663 return SUBREG_REG (target);
28664 }
28665
28666 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
28667
28668 static rtx
28669 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
28670 tree exp, rtx target)
28671 {
28672 rtx pat;
28673 tree arg0 = CALL_EXPR_ARG (exp, 0);
28674 tree arg1 = CALL_EXPR_ARG (exp, 1);
28675 tree arg2 = CALL_EXPR_ARG (exp, 2);
28676 tree arg3 = CALL_EXPR_ARG (exp, 3);
28677 tree arg4 = CALL_EXPR_ARG (exp, 4);
28678 rtx scratch0, scratch1;
28679 rtx op0 = expand_normal (arg0);
28680 rtx op1 = expand_normal (arg1);
28681 rtx op2 = expand_normal (arg2);
28682 rtx op3 = expand_normal (arg3);
28683 rtx op4 = expand_normal (arg4);
28684 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
28685
28686 tmode0 = insn_data[d->icode].operand[0].mode;
28687 tmode1 = insn_data[d->icode].operand[1].mode;
28688 modev2 = insn_data[d->icode].operand[2].mode;
28689 modei3 = insn_data[d->icode].operand[3].mode;
28690 modev4 = insn_data[d->icode].operand[4].mode;
28691 modei5 = insn_data[d->icode].operand[5].mode;
28692 modeimm = insn_data[d->icode].operand[6].mode;
28693
28694 if (VECTOR_MODE_P (modev2))
28695 op0 = safe_vector_operand (op0, modev2);
28696 if (VECTOR_MODE_P (modev4))
28697 op2 = safe_vector_operand (op2, modev4);
28698
28699 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28700 op0 = copy_to_mode_reg (modev2, op0);
28701 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
28702 op1 = copy_to_mode_reg (modei3, op1);
28703 if ((optimize && !register_operand (op2, modev4))
28704 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
28705 op2 = copy_to_mode_reg (modev4, op2);
28706 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
28707 op3 = copy_to_mode_reg (modei5, op3);
28708
28709 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
28710 {
28711 error ("the fifth argument must be an 8-bit immediate");
28712 return const0_rtx;
28713 }
28714
28715 if (d->code == IX86_BUILTIN_PCMPESTRI128)
28716 {
28717 if (optimize || !target
28718 || GET_MODE (target) != tmode0
28719 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28720 target = gen_reg_rtx (tmode0);
28721
28722 scratch1 = gen_reg_rtx (tmode1);
28723
28724 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
28725 }
28726 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
28727 {
28728 if (optimize || !target
28729 || GET_MODE (target) != tmode1
28730 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28731 target = gen_reg_rtx (tmode1);
28732
28733 scratch0 = gen_reg_rtx (tmode0);
28734
28735 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
28736 }
28737 else
28738 {
28739 gcc_assert (d->flag);
28740
28741 scratch0 = gen_reg_rtx (tmode0);
28742 scratch1 = gen_reg_rtx (tmode1);
28743
28744 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
28745 }
28746
28747 if (! pat)
28748 return 0;
28749
28750 emit_insn (pat);
28751
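/* For the flag-returning pcmpestr variants, d->flag holds the CC mode of
   the flags result; test it against zero to produce a 0/1 value.  */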
28752 if (d->flag)
28753 {
28754 target = gen_reg_rtx (SImode);
28755 emit_move_insn (target, const0_rtx);
28756 target = gen_rtx_SUBREG (QImode, target, 0);
28757
28758 emit_insn
28759 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28760 gen_rtx_fmt_ee (EQ, QImode,
28761 gen_rtx_REG ((enum machine_mode) d->flag,
28762 FLAGS_REG),
28763 const0_rtx)));
28764 return SUBREG_REG (target);
28765 }
28766 else
28767 return target;
28768 }
28769
28770
28771 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
28772
28773 static rtx
28774 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
28775 tree exp, rtx target)
28776 {
28777 rtx pat;
28778 tree arg0 = CALL_EXPR_ARG (exp, 0);
28779 tree arg1 = CALL_EXPR_ARG (exp, 1);
28780 tree arg2 = CALL_EXPR_ARG (exp, 2);
28781 rtx scratch0, scratch1;
28782 rtx op0 = expand_normal (arg0);
28783 rtx op1 = expand_normal (arg1);
28784 rtx op2 = expand_normal (arg2);
28785 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
28786
28787 tmode0 = insn_data[d->icode].operand[0].mode;
28788 tmode1 = insn_data[d->icode].operand[1].mode;
28789 modev2 = insn_data[d->icode].operand[2].mode;
28790 modev3 = insn_data[d->icode].operand[3].mode;
28791 modeimm = insn_data[d->icode].operand[4].mode;
28792
28793 if (VECTOR_MODE_P (modev2))
28794 op0 = safe_vector_operand (op0, modev2);
28795 if (VECTOR_MODE_P (modev3))
28796 op1 = safe_vector_operand (op1, modev3);
28797
28798 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28799 op0 = copy_to_mode_reg (modev2, op0);
28800 if ((optimize && !register_operand (op1, modev3))
28801 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
28802 op1 = copy_to_mode_reg (modev3, op1);
28803
28804 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
28805 {
28806 error ("the third argument must be an 8-bit immediate");
28807 return const0_rtx;
28808 }
28809
28810 if (d->code == IX86_BUILTIN_PCMPISTRI128)
28811 {
28812 if (optimize || !target
28813 || GET_MODE (target) != tmode0
28814 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28815 target = gen_reg_rtx (tmode0);
28816
28817 scratch1 = gen_reg_rtx (tmode1);
28818
28819 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
28820 }
28821 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
28822 {
28823 if (optimize || !target
28824 || GET_MODE (target) != tmode1
28825 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28826 target = gen_reg_rtx (tmode1);
28827
28828 scratch0 = gen_reg_rtx (tmode0);
28829
28830 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
28831 }
28832 else
28833 {
28834 gcc_assert (d->flag);
28835
28836 scratch0 = gen_reg_rtx (tmode0);
28837 scratch1 = gen_reg_rtx (tmode1);
28838
28839 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
28840 }
28841
28842 if (! pat)
28843 return 0;
28844
28845 emit_insn (pat);
28846
28847 if (d->flag)
28848 {
28849 target = gen_reg_rtx (SImode);
28850 emit_move_insn (target, const0_rtx);
28851 target = gen_rtx_SUBREG (QImode, target, 0);
28852
28853 emit_insn
28854 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28855 gen_rtx_fmt_ee (EQ, QImode,
28856 gen_rtx_REG ((enum machine_mode) d->flag,
28857 FLAGS_REG),
28858 const0_rtx)));
28859 return SUBREG_REG (target);
28860 }
28861 else
28862 return target;
28863 }
28864
28865 /* Subroutine of ix86_expand_builtin to take care of insns with
28866 a variable number of operands.  */
28867
28868 static rtx
28869 ix86_expand_args_builtin (const struct builtin_description *d,
28870 tree exp, rtx target)
28871 {
28872 rtx pat, real_target;
28873 unsigned int i, nargs;
28874 unsigned int nargs_constant = 0;
28875 int num_memory = 0;
28876 struct
28877 {
28878 rtx op;
28879 enum machine_mode mode;
28880 } args[4];
28881 bool last_arg_count = false;
28882 enum insn_code icode = d->icode;
28883 const struct insn_data_d *insn_p = &insn_data[icode];
28884 enum machine_mode tmode = insn_p->operand[0].mode;
28885 enum machine_mode rmode = VOIDmode;
28886 bool swap = false;
28887 enum rtx_code comparison = d->comparison;
28888
28889 switch ((enum ix86_builtin_func_type) d->flag)
28890 {
28891 case V2DF_FTYPE_V2DF_ROUND:
28892 case V4DF_FTYPE_V4DF_ROUND:
28893 case V4SF_FTYPE_V4SF_ROUND:
28894 case V8SF_FTYPE_V8SF_ROUND:
28895 case V4SI_FTYPE_V4SF_ROUND:
28896 case V8SI_FTYPE_V8SF_ROUND:
28897 return ix86_expand_sse_round (d, exp, target);
28898 case V4SI_FTYPE_V2DF_V2DF_ROUND:
28899 case V8SI_FTYPE_V4DF_V4DF_ROUND:
28900 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
28901 case INT_FTYPE_V8SF_V8SF_PTEST:
28902 case INT_FTYPE_V4DI_V4DI_PTEST:
28903 case INT_FTYPE_V4DF_V4DF_PTEST:
28904 case INT_FTYPE_V4SF_V4SF_PTEST:
28905 case INT_FTYPE_V2DI_V2DI_PTEST:
28906 case INT_FTYPE_V2DF_V2DF_PTEST:
28907 return ix86_expand_sse_ptest (d, exp, target);
28908 case FLOAT128_FTYPE_FLOAT128:
28909 case FLOAT_FTYPE_FLOAT:
28910 case INT_FTYPE_INT:
28911 case UINT64_FTYPE_INT:
28912 case UINT16_FTYPE_UINT16:
28913 case INT64_FTYPE_INT64:
28914 case INT64_FTYPE_V4SF:
28915 case INT64_FTYPE_V2DF:
28916 case INT_FTYPE_V16QI:
28917 case INT_FTYPE_V8QI:
28918 case INT_FTYPE_V8SF:
28919 case INT_FTYPE_V4DF:
28920 case INT_FTYPE_V4SF:
28921 case INT_FTYPE_V2DF:
28922 case INT_FTYPE_V32QI:
28923 case V16QI_FTYPE_V16QI:
28924 case V8SI_FTYPE_V8SF:
28925 case V8SI_FTYPE_V4SI:
28926 case V8HI_FTYPE_V8HI:
28927 case V8HI_FTYPE_V16QI:
28928 case V8QI_FTYPE_V8QI:
28929 case V8SF_FTYPE_V8SF:
28930 case V8SF_FTYPE_V8SI:
28931 case V8SF_FTYPE_V4SF:
28932 case V8SF_FTYPE_V8HI:
28933 case V4SI_FTYPE_V4SI:
28934 case V4SI_FTYPE_V16QI:
28935 case V4SI_FTYPE_V4SF:
28936 case V4SI_FTYPE_V8SI:
28937 case V4SI_FTYPE_V8HI:
28938 case V4SI_FTYPE_V4DF:
28939 case V4SI_FTYPE_V2DF:
28940 case V4HI_FTYPE_V4HI:
28941 case V4DF_FTYPE_V4DF:
28942 case V4DF_FTYPE_V4SI:
28943 case V4DF_FTYPE_V4SF:
28944 case V4DF_FTYPE_V2DF:
28945 case V4SF_FTYPE_V4SF:
28946 case V4SF_FTYPE_V4SI:
28947 case V4SF_FTYPE_V8SF:
28948 case V4SF_FTYPE_V4DF:
28949 case V4SF_FTYPE_V8HI:
28950 case V4SF_FTYPE_V2DF:
28951 case V2DI_FTYPE_V2DI:
28952 case V2DI_FTYPE_V16QI:
28953 case V2DI_FTYPE_V8HI:
28954 case V2DI_FTYPE_V4SI:
28955 case V2DF_FTYPE_V2DF:
28956 case V2DF_FTYPE_V4SI:
28957 case V2DF_FTYPE_V4DF:
28958 case V2DF_FTYPE_V4SF:
28959 case V2DF_FTYPE_V2SI:
28960 case V2SI_FTYPE_V2SI:
28961 case V2SI_FTYPE_V4SF:
28962 case V2SI_FTYPE_V2SF:
28963 case V2SI_FTYPE_V2DF:
28964 case V2SF_FTYPE_V2SF:
28965 case V2SF_FTYPE_V2SI:
28966 case V32QI_FTYPE_V32QI:
28967 case V32QI_FTYPE_V16QI:
28968 case V16HI_FTYPE_V16HI:
28969 case V16HI_FTYPE_V8HI:
28970 case V8SI_FTYPE_V8SI:
28971 case V16HI_FTYPE_V16QI:
28972 case V8SI_FTYPE_V16QI:
28973 case V4DI_FTYPE_V16QI:
28974 case V8SI_FTYPE_V8HI:
28975 case V4DI_FTYPE_V8HI:
28976 case V4DI_FTYPE_V4SI:
28977 case V4DI_FTYPE_V2DI:
28978 nargs = 1;
28979 break;
28980 case V4SF_FTYPE_V4SF_VEC_MERGE:
28981 case V2DF_FTYPE_V2DF_VEC_MERGE:
28982 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
28983 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
28984 case V16QI_FTYPE_V16QI_V16QI:
28985 case V16QI_FTYPE_V8HI_V8HI:
28986 case V8QI_FTYPE_V8QI_V8QI:
28987 case V8QI_FTYPE_V4HI_V4HI:
28988 case V8HI_FTYPE_V8HI_V8HI:
28989 case V8HI_FTYPE_V16QI_V16QI:
28990 case V8HI_FTYPE_V4SI_V4SI:
28991 case V8SF_FTYPE_V8SF_V8SF:
28992 case V8SF_FTYPE_V8SF_V8SI:
28993 case V4SI_FTYPE_V4SI_V4SI:
28994 case V4SI_FTYPE_V8HI_V8HI:
28995 case V4SI_FTYPE_V4SF_V4SF:
28996 case V4SI_FTYPE_V2DF_V2DF:
28997 case V4HI_FTYPE_V4HI_V4HI:
28998 case V4HI_FTYPE_V8QI_V8QI:
28999 case V4HI_FTYPE_V2SI_V2SI:
29000 case V4DF_FTYPE_V4DF_V4DF:
29001 case V4DF_FTYPE_V4DF_V4DI:
29002 case V4SF_FTYPE_V4SF_V4SF:
29003 case V4SF_FTYPE_V4SF_V4SI:
29004 case V4SF_FTYPE_V4SF_V2SI:
29005 case V4SF_FTYPE_V4SF_V2DF:
29006 case V4SF_FTYPE_V4SF_DI:
29007 case V4SF_FTYPE_V4SF_SI:
29008 case V2DI_FTYPE_V2DI_V2DI:
29009 case V2DI_FTYPE_V16QI_V16QI:
29010 case V2DI_FTYPE_V4SI_V4SI:
29011 case V2DI_FTYPE_V2DI_V16QI:
29012 case V2DI_FTYPE_V2DF_V2DF:
29013 case V2SI_FTYPE_V2SI_V2SI:
29014 case V2SI_FTYPE_V4HI_V4HI:
29015 case V2SI_FTYPE_V2SF_V2SF:
29016 case V2DF_FTYPE_V2DF_V2DF:
29017 case V2DF_FTYPE_V2DF_V4SF:
29018 case V2DF_FTYPE_V2DF_V2DI:
29019 case V2DF_FTYPE_V2DF_DI:
29020 case V2DF_FTYPE_V2DF_SI:
29021 case V2SF_FTYPE_V2SF_V2SF:
29022 case V1DI_FTYPE_V1DI_V1DI:
29023 case V1DI_FTYPE_V8QI_V8QI:
29024 case V1DI_FTYPE_V2SI_V2SI:
29025 case V32QI_FTYPE_V16HI_V16HI:
29026 case V16HI_FTYPE_V8SI_V8SI:
29027 case V32QI_FTYPE_V32QI_V32QI:
29028 case V16HI_FTYPE_V32QI_V32QI:
29029 case V16HI_FTYPE_V16HI_V16HI:
29030 case V8SI_FTYPE_V4DF_V4DF:
29031 case V8SI_FTYPE_V8SI_V8SI:
29032 case V8SI_FTYPE_V16HI_V16HI:
29033 case V4DI_FTYPE_V4DI_V4DI:
29034 case V4DI_FTYPE_V8SI_V8SI:
29035 if (comparison == UNKNOWN)
29036 return ix86_expand_binop_builtin (icode, exp, target);
29037 nargs = 2;
29038 break;
29039 case V4SF_FTYPE_V4SF_V4SF_SWAP:
29040 case V2DF_FTYPE_V2DF_V2DF_SWAP:
29041 gcc_assert (comparison != UNKNOWN);
29042 nargs = 2;
29043 swap = true;
29044 break;
29045 case V16HI_FTYPE_V16HI_V8HI_COUNT:
29046 case V16HI_FTYPE_V16HI_SI_COUNT:
29047 case V8SI_FTYPE_V8SI_V4SI_COUNT:
29048 case V8SI_FTYPE_V8SI_SI_COUNT:
29049 case V4DI_FTYPE_V4DI_V2DI_COUNT:
29050 case V4DI_FTYPE_V4DI_INT_COUNT:
29051 case V8HI_FTYPE_V8HI_V8HI_COUNT:
29052 case V8HI_FTYPE_V8HI_SI_COUNT:
29053 case V4SI_FTYPE_V4SI_V4SI_COUNT:
29054 case V4SI_FTYPE_V4SI_SI_COUNT:
29055 case V4HI_FTYPE_V4HI_V4HI_COUNT:
29056 case V4HI_FTYPE_V4HI_SI_COUNT:
29057 case V2DI_FTYPE_V2DI_V2DI_COUNT:
29058 case V2DI_FTYPE_V2DI_SI_COUNT:
29059 case V2SI_FTYPE_V2SI_V2SI_COUNT:
29060 case V2SI_FTYPE_V2SI_SI_COUNT:
29061 case V1DI_FTYPE_V1DI_V1DI_COUNT:
29062 case V1DI_FTYPE_V1DI_SI_COUNT:
29063 nargs = 2;
29064 last_arg_count = true;
29065 break;
29066 case UINT64_FTYPE_UINT64_UINT64:
29067 case UINT_FTYPE_UINT_UINT:
29068 case UINT_FTYPE_UINT_USHORT:
29069 case UINT_FTYPE_UINT_UCHAR:
29070 case UINT16_FTYPE_UINT16_INT:
29071 case UINT8_FTYPE_UINT8_INT:
29072 nargs = 2;
29073 break;
29074 case V2DI_FTYPE_V2DI_INT_CONVERT:
29075 nargs = 2;
29076 rmode = V1TImode;
29077 nargs_constant = 1;
29078 break;
29079 case V4DI_FTYPE_V4DI_INT_CONVERT:
29080 nargs = 2;
29081 rmode = V2TImode;
29082 nargs_constant = 1;
29083 break;
29084 case V8HI_FTYPE_V8HI_INT:
29085 case V8HI_FTYPE_V8SF_INT:
29086 case V8HI_FTYPE_V4SF_INT:
29087 case V8SF_FTYPE_V8SF_INT:
29088 case V4SI_FTYPE_V4SI_INT:
29089 case V4SI_FTYPE_V8SI_INT:
29090 case V4HI_FTYPE_V4HI_INT:
29091 case V4DF_FTYPE_V4DF_INT:
29092 case V4SF_FTYPE_V4SF_INT:
29093 case V4SF_FTYPE_V8SF_INT:
29094 case V2DI_FTYPE_V2DI_INT:
29095 case V2DF_FTYPE_V2DF_INT:
29096 case V2DF_FTYPE_V4DF_INT:
29097 case V16HI_FTYPE_V16HI_INT:
29098 case V8SI_FTYPE_V8SI_INT:
29099 case V4DI_FTYPE_V4DI_INT:
29100 case V2DI_FTYPE_V4DI_INT:
29101 nargs = 2;
29102 nargs_constant = 1;
29103 break;
29104 case V16QI_FTYPE_V16QI_V16QI_V16QI:
29105 case V8SF_FTYPE_V8SF_V8SF_V8SF:
29106 case V4DF_FTYPE_V4DF_V4DF_V4DF:
29107 case V4SF_FTYPE_V4SF_V4SF_V4SF:
29108 case V2DF_FTYPE_V2DF_V2DF_V2DF:
29109 case V32QI_FTYPE_V32QI_V32QI_V32QI:
29110 nargs = 3;
29111 break;
29112 case V32QI_FTYPE_V32QI_V32QI_INT:
29113 case V16HI_FTYPE_V16HI_V16HI_INT:
29114 case V16QI_FTYPE_V16QI_V16QI_INT:
29115 case V4DI_FTYPE_V4DI_V4DI_INT:
29116 case V8HI_FTYPE_V8HI_V8HI_INT:
29117 case V8SI_FTYPE_V8SI_V8SI_INT:
29118 case V8SI_FTYPE_V8SI_V4SI_INT:
29119 case V8SF_FTYPE_V8SF_V8SF_INT:
29120 case V8SF_FTYPE_V8SF_V4SF_INT:
29121 case V4SI_FTYPE_V4SI_V4SI_INT:
29122 case V4DF_FTYPE_V4DF_V4DF_INT:
29123 case V4DF_FTYPE_V4DF_V2DF_INT:
29124 case V4SF_FTYPE_V4SF_V4SF_INT:
29125 case V2DI_FTYPE_V2DI_V2DI_INT:
29126 case V4DI_FTYPE_V4DI_V2DI_INT:
29127 case V2DF_FTYPE_V2DF_V2DF_INT:
29128 nargs = 3;
29129 nargs_constant = 1;
29130 break;
29131 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
29132 nargs = 3;
29133 rmode = V4DImode;
29134 nargs_constant = 1;
29135 break;
29136 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
29137 nargs = 3;
29138 rmode = V2DImode;
29139 nargs_constant = 1;
29140 break;
29141 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
29142 nargs = 3;
29143 rmode = DImode;
29144 nargs_constant = 1;
29145 break;
29146 case V2DI_FTYPE_V2DI_UINT_UINT:
29147 nargs = 3;
29148 nargs_constant = 2;
29149 break;
29150 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
29151 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
29152 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
29153 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
29154 nargs = 4;
29155 nargs_constant = 1;
29156 break;
29157 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
29158 nargs = 4;
29159 nargs_constant = 2;
29160 break;
29161 default:
29162 gcc_unreachable ();
29163 }
29164
29165 gcc_assert (nargs <= ARRAY_SIZE (args));
29166
29167 if (comparison != UNKNOWN)
29168 {
29169 gcc_assert (nargs == 2);
29170 return ix86_expand_sse_compare (d, exp, target, swap);
29171 }
29172
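/* For the _CONVERT variants, the builtin returns rmode while the insn
   produces tmode; allocate the result in rmode and hand the insn a
   tmode subreg of it.  */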
29173 if (rmode == VOIDmode || rmode == tmode)
29174 {
29175 if (optimize
29176 || target == 0
29177 || GET_MODE (target) != tmode
29178 || !insn_p->operand[0].predicate (target, tmode))
29179 target = gen_reg_rtx (tmode);
29180 real_target = target;
29181 }
29182 else
29183 {
29184 target = gen_reg_rtx (rmode);
29185 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
29186 }
29187
29188 for (i = 0; i < nargs; i++)
29189 {
29190 tree arg = CALL_EXPR_ARG (exp, i);
29191 rtx op = expand_normal (arg);
29192 enum machine_mode mode = insn_p->operand[i + 1].mode;
29193 bool match = insn_p->operand[i + 1].predicate (op, mode);
29194
29195 if (last_arg_count && (i + 1) == nargs)
29196 {
29197 /* SIMD shift insns take either an 8-bit immediate or a
29198 register as the count.  But builtin functions take int as the
29199 count.  If the count doesn't match, we put it in a register.  */
29200 if (!match)
29201 {
29202 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
29203 if (!insn_p->operand[i + 1].predicate (op, mode))
29204 op = copy_to_reg (op);
29205 }
29206 }
29207 else if ((nargs - i) <= nargs_constant)
29208 {
29209 if (!match)
29210 switch (icode)
29211 {
29212 case CODE_FOR_avx2_inserti128:
29213 case CODE_FOR_avx2_extracti128:
29214 error ("the last argument must be an 1-bit immediate");
29215 return const0_rtx;
29216
29217 case CODE_FOR_sse4_1_roundsd:
29218 case CODE_FOR_sse4_1_roundss:
29219
29220 case CODE_FOR_sse4_1_roundpd:
29221 case CODE_FOR_sse4_1_roundps:
29222 case CODE_FOR_avx_roundpd256:
29223 case CODE_FOR_avx_roundps256:
29224
29225 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
29226 case CODE_FOR_sse4_1_roundps_sfix:
29227 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
29228 case CODE_FOR_avx_roundps_sfix256:
29229
29230 case CODE_FOR_sse4_1_blendps:
29231 case CODE_FOR_avx_blendpd256:
29232 case CODE_FOR_avx_vpermilv4df:
29233 error ("the last argument must be a 4-bit immediate");
29234 return const0_rtx;
29235
29236 case CODE_FOR_sse4_1_blendpd:
29237 case CODE_FOR_avx_vpermilv2df:
29238 case CODE_FOR_xop_vpermil2v2df3:
29239 case CODE_FOR_xop_vpermil2v4sf3:
29240 case CODE_FOR_xop_vpermil2v4df3:
29241 case CODE_FOR_xop_vpermil2v8sf3:
29242 error ("the last argument must be a 2-bit immediate");
29243 return const0_rtx;
29244
29245 case CODE_FOR_avx_vextractf128v4df:
29246 case CODE_FOR_avx_vextractf128v8sf:
29247 case CODE_FOR_avx_vextractf128v8si:
29248 case CODE_FOR_avx_vinsertf128v4df:
29249 case CODE_FOR_avx_vinsertf128v8sf:
29250 case CODE_FOR_avx_vinsertf128v8si:
29251 error ("the last argument must be a 1-bit immediate");
29252 return const0_rtx;
29253
29254 case CODE_FOR_avx_vmcmpv2df3:
29255 case CODE_FOR_avx_vmcmpv4sf3:
29256 case CODE_FOR_avx_cmpv2df3:
29257 case CODE_FOR_avx_cmpv4sf3:
29258 case CODE_FOR_avx_cmpv4df3:
29259 case CODE_FOR_avx_cmpv8sf3:
29260 error ("the last argument must be a 5-bit immediate");
29261 return const0_rtx;
29262
29263 default:
29264 switch (nargs_constant)
29265 {
29266 case 2:
29267 if ((nargs - i) == nargs_constant)
29268 {
29269 error ("the next to last argument must be an 8-bit immediate");
29270 break;
29271 }
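      /* FALLTHRU */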
29272 case 1:
29273 error ("the last argument must be an 8-bit immediate");
29274 break;
29275 default:
29276 gcc_unreachable ();
29277 }
29278 return const0_rtx;
29279 }
29280 }
29281 else
29282 {
29283 if (VECTOR_MODE_P (mode))
29284 op = safe_vector_operand (op, mode);
29285
29286 /* If we aren't optimizing, only allow one memory operand to
29287 be generated. */
29288 if (memory_operand (op, mode))
29289 num_memory++;
29290
29291 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
29292 {
29293 if (optimize || !match || num_memory > 1)
29294 op = copy_to_mode_reg (mode, op);
29295 }
29296 else
29297 {
29298 op = copy_to_reg (op);
29299 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
29300 }
29301 }
29302
29303 args[i].op = op;
29304 args[i].mode = mode;
29305 }
29306
29307 switch (nargs)
29308 {
29309 case 1:
29310 pat = GEN_FCN (icode) (real_target, args[0].op);
29311 break;
29312 case 2:
29313 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
29314 break;
29315 case 3:
29316 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
29317 args[2].op);
29318 break;
29319 case 4:
29320 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
29321 args[2].op, args[3].op);
29322 break;
29323 default:
29324 gcc_unreachable ();
29325 }
29326
29327 if (! pat)
29328 return 0;
29329
29330 emit_insn (pat);
29331 return target;
29332 }
29333
29334 /* Subroutine of ix86_expand_builtin to take care of special insns
29335 with variable number of operands. */
29336
29337 static rtx
29338 ix86_expand_special_args_builtin (const struct builtin_description *d,
29339 tree exp, rtx target)
29340 {
29341 tree arg;
29342 rtx pat, op;
29343 unsigned int i, nargs, arg_adjust, memory;
29344 struct
29345 {
29346 rtx op;
29347 enum machine_mode mode;
29348 } args[3];
29349 enum insn_code icode = d->icode;
29350 bool last_arg_constant = false;
29351 const struct insn_data_d *insn_p = &insn_data[icode];
29352 enum machine_mode tmode = insn_p->operand[0].mode;
29353 enum { load, store } klass;
29354
29355 switch ((enum ix86_builtin_func_type) d->flag)
29356 {
29357 case VOID_FTYPE_VOID:
29358 if (icode == CODE_FOR_avx_vzeroupper)
29359 target = GEN_INT (vzeroupper_intrinsic);
29360 emit_insn (GEN_FCN (icode) (target));
29361 return 0;
29362 case VOID_FTYPE_UINT64:
29363 case VOID_FTYPE_UNSIGNED:
29364 nargs = 0;
29365 klass = store;
29366 memory = 0;
29367 break;
29368
29369 case INT_FTYPE_VOID:
29370 case UINT64_FTYPE_VOID:
29371 case UNSIGNED_FTYPE_VOID:
29372 nargs = 0;
29373 klass = load;
29374 memory = 0;
29375 break;
29376 case UINT64_FTYPE_PUNSIGNED:
29377 case V2DI_FTYPE_PV2DI:
29378 case V4DI_FTYPE_PV4DI:
29379 case V32QI_FTYPE_PCCHAR:
29380 case V16QI_FTYPE_PCCHAR:
29381 case V8SF_FTYPE_PCV4SF:
29382 case V8SF_FTYPE_PCFLOAT:
29383 case V4SF_FTYPE_PCFLOAT:
29384 case V4DF_FTYPE_PCV2DF:
29385 case V4DF_FTYPE_PCDOUBLE:
29386 case V2DF_FTYPE_PCDOUBLE:
29387 case VOID_FTYPE_PVOID:
29388 nargs = 1;
29389 klass = load;
29390 memory = 0;
29391 break;
29392 case VOID_FTYPE_PV2SF_V4SF:
29393 case VOID_FTYPE_PV4DI_V4DI:
29394 case VOID_FTYPE_PV2DI_V2DI:
29395 case VOID_FTYPE_PCHAR_V32QI:
29396 case VOID_FTYPE_PCHAR_V16QI:
29397 case VOID_FTYPE_PFLOAT_V8SF:
29398 case VOID_FTYPE_PFLOAT_V4SF:
29399 case VOID_FTYPE_PDOUBLE_V4DF:
29400 case VOID_FTYPE_PDOUBLE_V2DF:
29401 case VOID_FTYPE_PLONGLONG_LONGLONG:
29402 case VOID_FTYPE_PULONGLONG_ULONGLONG:
29403 case VOID_FTYPE_PINT_INT:
29404 nargs = 1;
29405 klass = store;
29406 /* Reserve memory operand for target. */
29407 memory = ARRAY_SIZE (args);
29408 break;
29409 case V4SF_FTYPE_V4SF_PCV2SF:
29410 case V2DF_FTYPE_V2DF_PCDOUBLE:
29411 nargs = 2;
29412 klass = load;
29413 memory = 1;
29414 break;
29415 case V8SF_FTYPE_PCV8SF_V8SI:
29416 case V4DF_FTYPE_PCV4DF_V4DI:
29417 case V4SF_FTYPE_PCV4SF_V4SI:
29418 case V2DF_FTYPE_PCV2DF_V2DI:
29419 case V8SI_FTYPE_PCV8SI_V8SI:
29420 case V4DI_FTYPE_PCV4DI_V4DI:
29421 case V4SI_FTYPE_PCV4SI_V4SI:
29422 case V2DI_FTYPE_PCV2DI_V2DI:
29423 nargs = 2;
29424 klass = load;
29425 memory = 0;
29426 break;
29427 case VOID_FTYPE_PV8SF_V8SI_V8SF:
29428 case VOID_FTYPE_PV4DF_V4DI_V4DF:
29429 case VOID_FTYPE_PV4SF_V4SI_V4SF:
29430 case VOID_FTYPE_PV2DF_V2DI_V2DF:
29431 case VOID_FTYPE_PV8SI_V8SI_V8SI:
29432 case VOID_FTYPE_PV4DI_V4DI_V4DI:
29433 case VOID_FTYPE_PV4SI_V4SI_V4SI:
29434 case VOID_FTYPE_PV2DI_V2DI_V2DI:
29435 nargs = 2;
29436 klass = store;
29437 /* Reserve memory operand for target. */
29438 memory = ARRAY_SIZE (args);
29439 break;
29440 case VOID_FTYPE_UINT_UINT_UINT:
29441 case VOID_FTYPE_UINT64_UINT_UINT:
29442 case UCHAR_FTYPE_UINT_UINT_UINT:
29443 case UCHAR_FTYPE_UINT64_UINT_UINT:
29444 nargs = 3;
29445 klass = load;
29446 memory = ARRAY_SIZE (args);
29447 last_arg_constant = true;
29448 break;
29449 default:
29450 gcc_unreachable ();
29451 }
29452
29453 gcc_assert (nargs <= ARRAY_SIZE (args));
29454
29455 if (klass == store)
29456 {
29457 arg = CALL_EXPR_ARG (exp, 0);
29458 op = expand_normal (arg);
29459 gcc_assert (target == 0);
29460 if (memory)
29461 {
29462 if (GET_MODE (op) != Pmode)
29463 op = convert_to_mode (Pmode, op, 1);
29464 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
29465 }
29466 else
29467 target = force_reg (tmode, op);
29468 arg_adjust = 1;
29469 }
29470 else
29471 {
29472 arg_adjust = 0;
29473 if (optimize
29474 || target == 0
29475 || GET_MODE (target) != tmode
29476 || !insn_p->operand[0].predicate (target, tmode))
29477 target = gen_reg_rtx (tmode);
29478 }
29479
29480 for (i = 0; i < nargs; i++)
29481 {
29482 enum machine_mode mode = insn_p->operand[i + 1].mode;
29483 bool match;
29484
29485 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
29486 op = expand_normal (arg);
29487 match = insn_p->operand[i + 1].predicate (op, mode);
29488
29489 if (last_arg_constant && (i + 1) == nargs)
29490 {
29491 if (!match)
29492 {
29493 if (icode == CODE_FOR_lwp_lwpvalsi3
29494 || icode == CODE_FOR_lwp_lwpinssi3
29495 || icode == CODE_FOR_lwp_lwpvaldi3
29496 || icode == CODE_FOR_lwp_lwpinsdi3)
29497 error ("the last argument must be a 32-bit immediate");
29498 else
29499 error ("the last argument must be an 8-bit immediate");
29500 return const0_rtx;
29501 }
29502 }
29503 else
29504 {
29505 if (i == memory)
29506 {
29507 /* This must be the memory operand. */
29508 if (GET_MODE (op) != Pmode)
29509 op = convert_to_mode (Pmode, op, 1);
29510 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
29511 gcc_assert (GET_MODE (op) == mode
29512 || GET_MODE (op) == VOIDmode);
29513 }
29514 else
29515 {
29516 /* This must be a register. */
29517 if (VECTOR_MODE_P (mode))
29518 op = safe_vector_operand (op, mode);
29519
29520 gcc_assert (GET_MODE (op) == mode
29521 || GET_MODE (op) == VOIDmode);
29522 op = copy_to_mode_reg (mode, op);
29523 }
29524 }
29525
29526 args[i].op = op;
29527 args[i].mode = mode;
29528 }
29529
29530 switch (nargs)
29531 {
29532 case 0:
29533 pat = GEN_FCN (icode) (target);
29534 break;
29535 case 1:
29536 pat = GEN_FCN (icode) (target, args[0].op);
29537 break;
29538 case 2:
29539 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
29540 break;
29541 case 3:
29542 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
29543 break;
29544 default:
29545 gcc_unreachable ();
29546 }
29547
29548 if (! pat)
29549 return 0;
29550 emit_insn (pat);
29551 return klass == store ? 0 : target;
29552 }
29553
29554 /* Return the integer constant in ARG. Constrain it to be in the range
29555 of the subparts of VEC_TYPE; issue an error if not. */
29556
29557 static int
29558 get_element_number (tree vec_type, tree arg)
29559 {
29560 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
29561
29562 if (!host_integerp (arg, 1)
29563 || (elt = tree_low_cst (arg, 1), elt > max))
29564 {
29565 error ("selector must be an integer constant in the range 0..%wi", max);
29566 return 0;
29567 }
29568
29569 return elt;
29570 }
29571
29572 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29573 ix86_expand_vector_init. We DO have language-level syntax for this, in
29574 the form of (type){ init-list }. Except that since we can't place emms
29575 instructions from inside the compiler, we can't allow the use of MMX
29576 registers unless the user explicitly asks for it. So we do *not* define
29577 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
29578 we have builtins invoked by mmintrin.h that give us license to emit
29579 these sorts of instructions. */
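   /* For example, mmintrin.h implements _mm_set_pi32 in terms of
      __builtin_ia32_vec_init_v2si, which reaches
      ix86_expand_vec_init_builtin below with two SImode arguments.  */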
29580
29581 static rtx
29582 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
29583 {
29584 enum machine_mode tmode = TYPE_MODE (type);
29585 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
29586 int i, n_elt = GET_MODE_NUNITS (tmode);
29587 rtvec v = rtvec_alloc (n_elt);
29588
29589 gcc_assert (VECTOR_MODE_P (tmode));
29590 gcc_assert (call_expr_nargs (exp) == n_elt);
29591
29592 for (i = 0; i < n_elt; ++i)
29593 {
29594 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
29595 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
29596 }
29597
29598 if (!target || !register_operand (target, tmode))
29599 target = gen_reg_rtx (tmode);
29600
29601 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
29602 return target;
29603 }
29604
29605 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29606 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
29607 had a language-level syntax for referencing vector elements. */
29608
29609 static rtx
29610 ix86_expand_vec_ext_builtin (tree exp, rtx target)
29611 {
29612 enum machine_mode tmode, mode0;
29613 tree arg0, arg1;
29614 int elt;
29615 rtx op0;
29616
29617 arg0 = CALL_EXPR_ARG (exp, 0);
29618 arg1 = CALL_EXPR_ARG (exp, 1);
29619
29620 op0 = expand_normal (arg0);
29621 elt = get_element_number (TREE_TYPE (arg0), arg1);
29622
29623 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29624 mode0 = TYPE_MODE (TREE_TYPE (arg0));
29625 gcc_assert (VECTOR_MODE_P (mode0));
29626
29627 op0 = force_reg (mode0, op0);
29628
29629 if (optimize || !target || !register_operand (target, tmode))
29630 target = gen_reg_rtx (tmode);
29631
29632 ix86_expand_vector_extract (true, target, op0, elt);
29633
29634 return target;
29635 }
29636
29637 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29638 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
29639 a language-level syntax for referencing vector elements. */
29640
29641 static rtx
29642 ix86_expand_vec_set_builtin (tree exp)
29643 {
29644 enum machine_mode tmode, mode1;
29645 tree arg0, arg1, arg2;
29646 int elt;
29647 rtx op0, op1, target;
29648
29649 arg0 = CALL_EXPR_ARG (exp, 0);
29650 arg1 = CALL_EXPR_ARG (exp, 1);
29651 arg2 = CALL_EXPR_ARG (exp, 2);
29652
29653 tmode = TYPE_MODE (TREE_TYPE (arg0));
29654 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29655 gcc_assert (VECTOR_MODE_P (tmode));
29656
29657 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
29658 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
29659 elt = get_element_number (TREE_TYPE (arg0), arg2);
29660
29661 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
29662 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
29663
29664 op0 = force_reg (tmode, op0);
29665 op1 = force_reg (mode1, op1);
29666
29667 /* OP0 is the source of these builtin functions and shouldn't be
29668 modified. Create a copy, use it and return it as target. */
29669 target = gen_reg_rtx (tmode);
29670 emit_move_insn (target, op0);
29671 ix86_expand_vector_set (true, target, op1, elt);
29672
29673 return target;
29674 }
29675
29676 /* Expand an expression EXP that calls a built-in function,
29677 with result going to TARGET if that's convenient
29678 (and in mode MODE if that's convenient).
29679 SUBTARGET may be used as the target for computing one of EXP's operands.
29680 IGNORE is nonzero if the value is to be ignored. */
29681
29682 static rtx
29683 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
29684 enum machine_mode mode ATTRIBUTE_UNUSED,
29685 int ignore ATTRIBUTE_UNUSED)
29686 {
29687 const struct builtin_description *d;
29688 size_t i;
29689 enum insn_code icode;
29690 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
29691 tree arg0, arg1, arg2, arg3, arg4;
29692 rtx op0, op1, op2, op3, op4, pat;
29693 enum machine_mode mode0, mode1, mode2, mode3, mode4;
29694 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
29695
29696 /* For CPU builtins that can be folded, fold first and expand the fold. */
29697 switch (fcode)
29698 {
29699 case IX86_BUILTIN_CPU_INIT:
29700 {
29701 /* Make it call __cpu_indicator_init in libgcc. */
29702 tree call_expr, fndecl, type;
29703 type = build_function_type_list (integer_type_node, NULL_TREE);
29704 fndecl = build_fn_decl ("__cpu_indicator_init", type);
29705 call_expr = build_call_expr (fndecl, 0);
29706 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
29707 }
29708 case IX86_BUILTIN_CPU_IS:
29709 case IX86_BUILTIN_CPU_SUPPORTS:
29710 {
29711 tree arg0 = CALL_EXPR_ARG (exp, 0);
29712 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
29713 gcc_assert (fold_expr != NULL_TREE);
29714 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
29715 }
29716 }
29717
29718 /* Determine whether the builtin function is available under the current ISA.
29719 Originally the builtin was not created if it wasn't applicable to the
29720 current ISA based on the command line switches. With function specific
29721 options, we need to check in the context of the function making the call
29722 whether it is supported. */
29723 if (ix86_builtins_isa[fcode].isa
29724 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
29725 {
29726 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
29727 NULL, (enum fpmath_unit) 0, false);
29728
29729 if (!opts)
29730 error ("%qE needs unknown isa option", fndecl);
29731 else
29732 {
29733 gcc_assert (opts != NULL);
29734 error ("%qE needs isa option %s", fndecl, opts);
29735 free (opts);
29736 }
29737 return const0_rtx;
29738 }
29739
29740 switch (fcode)
29741 {
29742 case IX86_BUILTIN_MASKMOVQ:
29743 case IX86_BUILTIN_MASKMOVDQU:
29744 icode = (fcode == IX86_BUILTIN_MASKMOVQ
29745 ? CODE_FOR_mmx_maskmovq
29746 : CODE_FOR_sse2_maskmovdqu);
29747 /* Note the arg order is different from the operand order. */
29748 arg1 = CALL_EXPR_ARG (exp, 0);
29749 arg2 = CALL_EXPR_ARG (exp, 1);
29750 arg0 = CALL_EXPR_ARG (exp, 2);
29751 op0 = expand_normal (arg0);
29752 op1 = expand_normal (arg1);
29753 op2 = expand_normal (arg2);
29754 mode0 = insn_data[icode].operand[0].mode;
29755 mode1 = insn_data[icode].operand[1].mode;
29756 mode2 = insn_data[icode].operand[2].mode;
29757
29758 if (GET_MODE (op0) != Pmode)
29759 op0 = convert_to_mode (Pmode, op0, 1);
29760 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
29761
29762 if (!insn_data[icode].operand[0].predicate (op0, mode0))
29763 op0 = copy_to_mode_reg (mode0, op0);
29764 if (!insn_data[icode].operand[1].predicate (op1, mode1))
29765 op1 = copy_to_mode_reg (mode1, op1);
29766 if (!insn_data[icode].operand[2].predicate (op2, mode2))
29767 op2 = copy_to_mode_reg (mode2, op2);
29768 pat = GEN_FCN (icode) (op0, op1, op2);
29769 if (! pat)
29770 return 0;
29771 emit_insn (pat);
29772 return 0;
29773
29774 case IX86_BUILTIN_LDMXCSR:
29775 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
29776 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29777 emit_move_insn (target, op0);
29778 emit_insn (gen_sse_ldmxcsr (target));
29779 return 0;
29780
29781 case IX86_BUILTIN_STMXCSR:
29782 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29783 emit_insn (gen_sse_stmxcsr (target));
29784 return copy_to_mode_reg (SImode, target);
29785
29786 case IX86_BUILTIN_CLFLUSH:
29787 arg0 = CALL_EXPR_ARG (exp, 0);
29788 op0 = expand_normal (arg0);
29789 icode = CODE_FOR_sse2_clflush;
29790 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29791 {
29792 if (GET_MODE (op0) != Pmode)
29793 op0 = convert_to_mode (Pmode, op0, 1);
29794 op0 = force_reg (Pmode, op0);
29795 }
29796
29797 emit_insn (gen_sse2_clflush (op0));
29798 return 0;
29799
29800 case IX86_BUILTIN_MONITOR:
29801 arg0 = CALL_EXPR_ARG (exp, 0);
29802 arg1 = CALL_EXPR_ARG (exp, 1);
29803 arg2 = CALL_EXPR_ARG (exp, 2);
29804 op0 = expand_normal (arg0);
29805 op1 = expand_normal (arg1);
29806 op2 = expand_normal (arg2);
29807 if (!REG_P (op0))
29808 {
29809 if (GET_MODE (op0) != Pmode)
29810 op0 = convert_to_mode (Pmode, op0, 1);
29811 op0 = force_reg (Pmode, op0);
29812 }
29813 if (!REG_P (op1))
29814 op1 = copy_to_mode_reg (SImode, op1);
29815 if (!REG_P (op2))
29816 op2 = copy_to_mode_reg (SImode, op2);
29817 emit_insn (ix86_gen_monitor (op0, op1, op2));
29818 return 0;
29819
29820 case IX86_BUILTIN_MWAIT:
29821 arg0 = CALL_EXPR_ARG (exp, 0);
29822 arg1 = CALL_EXPR_ARG (exp, 1);
29823 op0 = expand_normal (arg0);
29824 op1 = expand_normal (arg1);
29825 if (!REG_P (op0))
29826 op0 = copy_to_mode_reg (SImode, op0);
29827 if (!REG_P (op1))
29828 op1 = copy_to_mode_reg (SImode, op1);
29829 emit_insn (gen_sse3_mwait (op0, op1));
29830 return 0;
29831
29832 case IX86_BUILTIN_VEC_INIT_V2SI:
29833 case IX86_BUILTIN_VEC_INIT_V4HI:
29834 case IX86_BUILTIN_VEC_INIT_V8QI:
29835 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
29836
29837 case IX86_BUILTIN_VEC_EXT_V2DF:
29838 case IX86_BUILTIN_VEC_EXT_V2DI:
29839 case IX86_BUILTIN_VEC_EXT_V4SF:
29840 case IX86_BUILTIN_VEC_EXT_V4SI:
29841 case IX86_BUILTIN_VEC_EXT_V8HI:
29842 case IX86_BUILTIN_VEC_EXT_V2SI:
29843 case IX86_BUILTIN_VEC_EXT_V4HI:
29844 case IX86_BUILTIN_VEC_EXT_V16QI:
29845 return ix86_expand_vec_ext_builtin (exp, target);
29846
29847 case IX86_BUILTIN_VEC_SET_V2DI:
29848 case IX86_BUILTIN_VEC_SET_V4SF:
29849 case IX86_BUILTIN_VEC_SET_V4SI:
29850 case IX86_BUILTIN_VEC_SET_V8HI:
29851 case IX86_BUILTIN_VEC_SET_V4HI:
29852 case IX86_BUILTIN_VEC_SET_V16QI:
29853 return ix86_expand_vec_set_builtin (exp);
29854
29855 case IX86_BUILTIN_INFQ:
29856 case IX86_BUILTIN_HUGE_VALQ:
29857 {
29858 REAL_VALUE_TYPE inf;
29859 rtx tmp;
29860
29861 real_inf (&inf);
29862 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
29863
29864 tmp = validize_mem (force_const_mem (mode, tmp));
29865
29866 if (target == 0)
29867 target = gen_reg_rtx (mode);
29868
29869 emit_move_insn (target, tmp);
29870 return target;
29871 }
29872
29873 case IX86_BUILTIN_LLWPCB:
29874 arg0 = CALL_EXPR_ARG (exp, 0);
29875 op0 = expand_normal (arg0);
29876 icode = CODE_FOR_lwp_llwpcb;
29877 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29878 {
29879 if (GET_MODE (op0) != Pmode)
29880 op0 = convert_to_mode (Pmode, op0, 1);
29881 op0 = force_reg (Pmode, op0);
29882 }
29883 emit_insn (gen_lwp_llwpcb (op0));
29884 return 0;
29885
29886 case IX86_BUILTIN_SLWPCB:
29887 icode = CODE_FOR_lwp_slwpcb;
29888 if (!target
29889 || !insn_data[icode].operand[0].predicate (target, Pmode))
29890 target = gen_reg_rtx (Pmode);
29891 emit_insn (gen_lwp_slwpcb (target));
29892 return target;
29893
29894 case IX86_BUILTIN_BEXTRI32:
29895 case IX86_BUILTIN_BEXTRI64:
29896 arg0 = CALL_EXPR_ARG (exp, 0);
29897 arg1 = CALL_EXPR_ARG (exp, 1);
29898 op0 = expand_normal (arg0);
29899 op1 = expand_normal (arg1);
29900 icode = (fcode == IX86_BUILTIN_BEXTRI32
29901 ? CODE_FOR_tbm_bextri_si
29902 : CODE_FOR_tbm_bextri_di);
29903 if (!CONST_INT_P (op1))
29904 {
29905 error ("last argument must be an immediate");
29906 return const0_rtx;
29907 }
29908 else
29909 {
29910 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
29911 unsigned char lsb_index = INTVAL (op1) & 0xFF;
29912 op1 = GEN_INT (length);
29913 op2 = GEN_INT (lsb_index);
29914 pat = GEN_FCN (icode) (target, op0, op1, op2);
29915 if (pat)
29916 emit_insn (pat);
29917 return target;
29918 }
29919
29920 case IX86_BUILTIN_RDRAND16_STEP:
29921 icode = CODE_FOR_rdrandhi_1;
29922 mode0 = HImode;
29923 goto rdrand_step;
29924
29925 case IX86_BUILTIN_RDRAND32_STEP:
29926 icode = CODE_FOR_rdrandsi_1;
29927 mode0 = SImode;
29928 goto rdrand_step;
29929
29930 case IX86_BUILTIN_RDRAND64_STEP:
29931 icode = CODE_FOR_rdranddi_1;
29932 mode0 = DImode;
29933
29934 rdrand_step:
29935 op0 = gen_reg_rtx (mode0);
29936 emit_insn (GEN_FCN (icode) (op0));
29937
29938 arg0 = CALL_EXPR_ARG (exp, 0);
29939 op1 = expand_normal (arg0);
29940 if (!address_operand (op1, VOIDmode))
29941 {
29942 op1 = convert_memory_address (Pmode, op1);
29943 op1 = copy_addr_to_reg (op1);
29944 }
29945 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
29946
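  /* The random value has been stored through the pointer argument.
     Now compute the builtin's return value: a conditional move on the
     carry flag left by rdrand selects between constant 1 and the
     destination register.  */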
29947 op1 = gen_reg_rtx (SImode);
29948 emit_move_insn (op1, CONST1_RTX (SImode));
29949
29950 /* Emit SImode conditional move. */
29951 if (mode0 == HImode)
29952 {
29953 op2 = gen_reg_rtx (SImode);
29954 emit_insn (gen_zero_extendhisi2 (op2, op0));
29955 }
29956 else if (mode0 == SImode)
29957 op2 = op0;
29958 else
29959 op2 = gen_rtx_SUBREG (SImode, op0, 0);
29960
29961 if (target == 0)
29962 target = gen_reg_rtx (SImode);
29963
29964 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
29965 const0_rtx);
29966 emit_insn (gen_rtx_SET (VOIDmode, target,
29967 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
29968 return target;
29969
29970 case IX86_BUILTIN_GATHERSIV2DF:
29971 icode = CODE_FOR_avx2_gathersiv2df;
29972 goto gather_gen;
29973 case IX86_BUILTIN_GATHERSIV4DF:
29974 icode = CODE_FOR_avx2_gathersiv4df;
29975 goto gather_gen;
29976 case IX86_BUILTIN_GATHERDIV2DF:
29977 icode = CODE_FOR_avx2_gatherdiv2df;
29978 goto gather_gen;
29979 case IX86_BUILTIN_GATHERDIV4DF:
29980 icode = CODE_FOR_avx2_gatherdiv4df;
29981 goto gather_gen;
29982 case IX86_BUILTIN_GATHERSIV4SF:
29983 icode = CODE_FOR_avx2_gathersiv4sf;
29984 goto gather_gen;
29985 case IX86_BUILTIN_GATHERSIV8SF:
29986 icode = CODE_FOR_avx2_gathersiv8sf;
29987 goto gather_gen;
29988 case IX86_BUILTIN_GATHERDIV4SF:
29989 icode = CODE_FOR_avx2_gatherdiv4sf;
29990 goto gather_gen;
29991 case IX86_BUILTIN_GATHERDIV8SF:
29992 icode = CODE_FOR_avx2_gatherdiv8sf;
29993 goto gather_gen;
29994 case IX86_BUILTIN_GATHERSIV2DI:
29995 icode = CODE_FOR_avx2_gathersiv2di;
29996 goto gather_gen;
29997 case IX86_BUILTIN_GATHERSIV4DI:
29998 icode = CODE_FOR_avx2_gathersiv4di;
29999 goto gather_gen;
30000 case IX86_BUILTIN_GATHERDIV2DI:
30001 icode = CODE_FOR_avx2_gatherdiv2di;
30002 goto gather_gen;
30003 case IX86_BUILTIN_GATHERDIV4DI:
30004 icode = CODE_FOR_avx2_gatherdiv4di;
30005 goto gather_gen;
30006 case IX86_BUILTIN_GATHERSIV4SI:
30007 icode = CODE_FOR_avx2_gathersiv4si;
30008 goto gather_gen;
30009 case IX86_BUILTIN_GATHERSIV8SI:
30010 icode = CODE_FOR_avx2_gathersiv8si;
30011 goto gather_gen;
30012 case IX86_BUILTIN_GATHERDIV4SI:
30013 icode = CODE_FOR_avx2_gatherdiv4si;
30014 goto gather_gen;
30015 case IX86_BUILTIN_GATHERDIV8SI:
30016 icode = CODE_FOR_avx2_gatherdiv8si;
30017 goto gather_gen;
30018 case IX86_BUILTIN_GATHERALTSIV4DF:
30019 icode = CODE_FOR_avx2_gathersiv4df;
30020 goto gather_gen;
30021 case IX86_BUILTIN_GATHERALTDIV8SF:
30022 icode = CODE_FOR_avx2_gatherdiv8sf;
30023 goto gather_gen;
30024 case IX86_BUILTIN_GATHERALTSIV4DI:
30025 icode = CODE_FOR_avx2_gathersiv4di;
30026 goto gather_gen;
30027 case IX86_BUILTIN_GATHERALTDIV8SI:
30028 icode = CODE_FOR_avx2_gatherdiv8si;
30029 goto gather_gen;
30030
30031 gather_gen:
30032 arg0 = CALL_EXPR_ARG (exp, 0);
30033 arg1 = CALL_EXPR_ARG (exp, 1);
30034 arg2 = CALL_EXPR_ARG (exp, 2);
30035 arg3 = CALL_EXPR_ARG (exp, 3);
30036 arg4 = CALL_EXPR_ARG (exp, 4);
30037 op0 = expand_normal (arg0);
30038 op1 = expand_normal (arg1);
30039 op2 = expand_normal (arg2);
30040 op3 = expand_normal (arg3);
30041 op4 = expand_normal (arg4);
30042 /* Note the arg order is different from the operand order. */
30043 mode0 = insn_data[icode].operand[1].mode;
30044 mode2 = insn_data[icode].operand[3].mode;
30045 mode3 = insn_data[icode].operand[4].mode;
30046 mode4 = insn_data[icode].operand[5].mode;
30047
30048 if (target == NULL_RTX
30049 || GET_MODE (target) != insn_data[icode].operand[0].mode)
30050 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
30051 else
30052 subtarget = target;
30053
30054 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
30055 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
30056 {
30057 rtx half = gen_reg_rtx (V4SImode);
30058 if (!nonimmediate_operand (op2, V8SImode))
30059 op2 = copy_to_mode_reg (V8SImode, op2);
30060 emit_insn (gen_vec_extract_lo_v8si (half, op2));
30061 op2 = half;
30062 }
30063 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
30064 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
30065 {
30066 rtx (*gen) (rtx, rtx);
30067 rtx half = gen_reg_rtx (mode0);
30068 if (mode0 == V4SFmode)
30069 gen = gen_vec_extract_lo_v8sf;
30070 else
30071 gen = gen_vec_extract_lo_v8si;
30072 if (!nonimmediate_operand (op0, GET_MODE (op0)))
30073 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
30074 emit_insn (gen (half, op0));
30075 op0 = half;
30076 if (!nonimmediate_operand (op3, GET_MODE (op3)))
30077 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
30078 emit_insn (gen (half, op3));
30079 op3 = half;
30080 }
30081
30082 /* Force the memory operand to use only a base register here. We
30083 don't want to do this to the memory operands of other builtin
30084 functions. */
30085 if (GET_MODE (op1) != Pmode)
30086 op1 = convert_to_mode (Pmode, op1, 1);
30087 op1 = force_reg (Pmode, op1);
30088
30089 if (!insn_data[icode].operand[1].predicate (op0, mode0))
30090 op0 = copy_to_mode_reg (mode0, op0);
30091 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
30092 op1 = copy_to_mode_reg (Pmode, op1);
30093 if (!insn_data[icode].operand[3].predicate (op2, mode2))
30094 op2 = copy_to_mode_reg (mode2, op2);
30095 if (!insn_data[icode].operand[4].predicate (op3, mode3))
30096 op3 = copy_to_mode_reg (mode3, op3);
30097 if (!insn_data[icode].operand[5].predicate (op4, mode4))
30098 {
30099 error ("last argument must be scale 1, 2, 4, 8");
30100 return const0_rtx;
30101 }
30102
30103 /* Optimize. If mask is known to have all high bits set,
30104 replace op0 with pc_rtx to signal that the instruction
30105 overwrites the whole destination and doesn't use its
30106 previous contents. */
30107 if (optimize)
30108 {
30109 if (TREE_CODE (arg3) == VECTOR_CST)
30110 {
30111 unsigned int negative = 0;
30112 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
30113 {
30114 tree cst = VECTOR_CST_ELT (arg3, i);
30115 if (TREE_CODE (cst) == INTEGER_CST
30116 && tree_int_cst_sign_bit (cst))
30117 negative++;
30118 else if (TREE_CODE (cst) == REAL_CST
30119 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
30120 negative++;
30121 }
30122 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
30123 op0 = pc_rtx;
30124 }
30125 else if (TREE_CODE (arg3) == SSA_NAME)
30126 {
30127 /* Recognize also when mask is like:
30128 __v2df src = _mm_setzero_pd ();
30129 __v2df mask = _mm_cmpeq_pd (src, src);
30130 or
30131 __v8sf src = _mm256_setzero_ps ();
30132 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
30133 as that is a cheaper way to load all ones into
30134 a register than having to load a constant from
30135 memory. */
30136 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
30137 if (is_gimple_call (def_stmt))
30138 {
30139 tree fndecl = gimple_call_fndecl (def_stmt);
30140 if (fndecl
30141 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
30142 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
30143 {
30144 case IX86_BUILTIN_CMPPD:
30145 case IX86_BUILTIN_CMPPS:
30146 case IX86_BUILTIN_CMPPD256:
30147 case IX86_BUILTIN_CMPPS256:
30148 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
30149 break;
30150 /* FALLTHRU */
30151 case IX86_BUILTIN_CMPEQPD:
30152 case IX86_BUILTIN_CMPEQPS:
30153 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
30154 && initializer_zerop (gimple_call_arg (def_stmt,
30155 1)))
30156 op0 = pc_rtx;
30157 break;
30158 default:
30159 break;
30160 }
30161 }
30162 }
30163 }
30164
30165 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
30166 if (! pat)
30167 return const0_rtx;
30168 emit_insn (pat);
30169
30170 if (fcode == IX86_BUILTIN_GATHERDIV8SF
30171 || fcode == IX86_BUILTIN_GATHERDIV8SI)
30172 {
30173 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
30174 ? V4SFmode : V4SImode;
30175 if (target == NULL_RTX)
30176 target = gen_reg_rtx (tmode);
30177 if (tmode == V4SFmode)
30178 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
30179 else
30180 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
30181 }
30182 else
30183 target = subtarget;
30184
30185 return target;
30186
30187 case IX86_BUILTIN_XABORT:
30188 icode = CODE_FOR_xabort;
30189 arg0 = CALL_EXPR_ARG (exp, 0);
30190 op0 = expand_normal (arg0);
30191 mode0 = insn_data[icode].operand[0].mode;
30192 if (!insn_data[icode].operand[0].predicate (op0, mode0))
30193 {
30194 error ("the xabort's argument must be an 8-bit immediate");
30195 return const0_rtx;
30196 }
30197 emit_insn (gen_xabort (op0));
30198 return 0;
30199
30200 default:
30201 break;
30202 }
30203
30204 for (i = 0, d = bdesc_special_args;
30205 i < ARRAY_SIZE (bdesc_special_args);
30206 i++, d++)
30207 if (d->code == fcode)
30208 return ix86_expand_special_args_builtin (d, exp, target);
30209
30210 for (i = 0, d = bdesc_args;
30211 i < ARRAY_SIZE (bdesc_args);
30212 i++, d++)
30213 if (d->code == fcode)
30214 switch (fcode)
30215 {
30216 case IX86_BUILTIN_FABSQ:
30217 case IX86_BUILTIN_COPYSIGNQ:
30218 if (!TARGET_SSE)
30219 /* Emit a normal call if SSE isn't available. */
30220 return expand_call (exp, target, ignore);
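	/* FALLTHRU */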
30221 default:
30222 return ix86_expand_args_builtin (d, exp, target);
30223 }
30224
30225 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
30226 if (d->code == fcode)
30227 return ix86_expand_sse_comi (d, exp, target);
30228
30229 for (i = 0, d = bdesc_pcmpestr;
30230 i < ARRAY_SIZE (bdesc_pcmpestr);
30231 i++, d++)
30232 if (d->code == fcode)
30233 return ix86_expand_sse_pcmpestr (d, exp, target);
30234
30235 for (i = 0, d = bdesc_pcmpistr;
30236 i < ARRAY_SIZE (bdesc_pcmpistr);
30237 i++, d++)
30238 if (d->code == fcode)
30239 return ix86_expand_sse_pcmpistr (d, exp, target);
30240
30241 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
30242 if (d->code == fcode)
30243 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
30244 (enum ix86_builtin_func_type)
30245 d->flag, d->comparison);
30246
30247 gcc_unreachable ();
30248 }
30249
30250 /* Returns a function decl for a vectorized version of the builtin function
30251 with builtin function decl FNDECL, result vector type TYPE_OUT and argument
30252 vector type TYPE_IN, or NULL_TREE if it is not available. */
30253
30254 static tree
30255 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
30256 tree type_in)
30257 {
30258 enum machine_mode in_mode, out_mode;
30259 int in_n, out_n;
30260 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
30261
30262 if (TREE_CODE (type_out) != VECTOR_TYPE
30263 || TREE_CODE (type_in) != VECTOR_TYPE
30264 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
30265 return NULL_TREE;
30266
30267 out_mode = TYPE_MODE (TREE_TYPE (type_out));
30268 out_n = TYPE_VECTOR_SUBPARTS (type_out);
30269 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30270 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30271
30272 switch (fn)
30273 {
30274 case BUILT_IN_SQRT:
30275 if (out_mode == DFmode && in_mode == DFmode)
30276 {
30277 if (out_n == 2 && in_n == 2)
30278 return ix86_builtins[IX86_BUILTIN_SQRTPD];
30279 else if (out_n == 4 && in_n == 4)
30280 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
30281 }
30282 break;
30283
30284 case BUILT_IN_SQRTF:
30285 if (out_mode == SFmode && in_mode == SFmode)
30286 {
30287 if (out_n == 4 && in_n == 4)
30288 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
30289 else if (out_n == 8 && in_n == 8)
30290 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
30291 }
30292 break;
30293
30294 case BUILT_IN_IFLOOR:
30295 case BUILT_IN_LFLOOR:
30296 case BUILT_IN_LLFLOOR:
30297 /* The round insn does not trap on denormals. */
30298 if (flag_trapping_math || !TARGET_ROUND)
30299 break;
30300
30301 if (out_mode == SImode && in_mode == DFmode)
30302 {
30303 if (out_n == 4 && in_n == 2)
30304 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
30305 else if (out_n == 8 && in_n == 4)
30306 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
30307 }
30308 break;
30309
30310 case BUILT_IN_IFLOORF:
30311 case BUILT_IN_LFLOORF:
30312 case BUILT_IN_LLFLOORF:
30313 /* The round insn does not trap on denormals. */
30314 if (flag_trapping_math || !TARGET_ROUND)
30315 break;
30316
30317 if (out_mode == SImode && in_mode == SFmode)
30318 {
30319 if (out_n == 4 && in_n == 4)
30320 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
30321 else if (out_n == 8 && in_n == 8)
30322 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
30323 }
30324 break;
30325
30326 case BUILT_IN_ICEIL:
30327 case BUILT_IN_LCEIL:
30328 case BUILT_IN_LLCEIL:
30329 /* The round insn does not trap on denormals. */
30330 if (flag_trapping_math || !TARGET_ROUND)
30331 break;
30332
30333 if (out_mode == SImode && in_mode == DFmode)
30334 {
30335 if (out_n == 4 && in_n == 2)
30336 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
30337 else if (out_n == 8 && in_n == 4)
30338 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
30339 }
30340 break;
30341
30342 case BUILT_IN_ICEILF:
30343 case BUILT_IN_LCEILF:
30344 case BUILT_IN_LLCEILF:
30345 /* The round insn does not trap on denormals. */
30346 if (flag_trapping_math || !TARGET_ROUND)
30347 break;
30348
30349 if (out_mode == SImode && in_mode == SFmode)
30350 {
30351 if (out_n == 4 && in_n == 4)
30352 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
30353 else if (out_n == 8 && in_n == 8)
30354 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
30355 }
30356 break;
30357
30358 case BUILT_IN_IRINT:
30359 case BUILT_IN_LRINT:
30360 case BUILT_IN_LLRINT:
30361 if (out_mode == SImode && in_mode == DFmode)
30362 {
30363 if (out_n == 4 && in_n == 2)
30364 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
30365 else if (out_n == 8 && in_n == 4)
30366 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
30367 }
30368 break;
30369
30370 case BUILT_IN_IRINTF:
30371 case BUILT_IN_LRINTF:
30372 case BUILT_IN_LLRINTF:
30373 if (out_mode == SImode && in_mode == SFmode)
30374 {
30375 if (out_n == 4 && in_n == 4)
30376 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
30377 else if (out_n == 8 && in_n == 8)
30378 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
30379 }
30380 break;
30381
30382 case BUILT_IN_IROUND:
30383 case BUILT_IN_LROUND:
30384 case BUILT_IN_LLROUND:
30385 /* The round insn does not trap on denormals. */
30386 if (flag_trapping_math || !TARGET_ROUND)
30387 break;
30388
30389 if (out_mode == SImode && in_mode == DFmode)
30390 {
30391 if (out_n == 4 && in_n == 2)
30392 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
30393 else if (out_n == 8 && in_n == 4)
30394 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
30395 }
30396 break;
30397
30398 case BUILT_IN_IROUNDF:
30399 case BUILT_IN_LROUNDF:
30400 case BUILT_IN_LLROUNDF:
30401 /* The round insn does not trap on denormals. */
30402 if (flag_trapping_math || !TARGET_ROUND)
30403 break;
30404
30405 if (out_mode == SImode && in_mode == SFmode)
30406 {
30407 if (out_n == 4 && in_n == 4)
30408 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
30409 else if (out_n == 8 && in_n == 8)
30410 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
30411 }
30412 break;
30413
30414 case BUILT_IN_COPYSIGN:
30415 if (out_mode == DFmode && in_mode == DFmode)
30416 {
30417 if (out_n == 2 && in_n == 2)
30418 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
30419 else if (out_n == 4 && in_n == 4)
30420 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
30421 }
30422 break;
30423
30424 case BUILT_IN_COPYSIGNF:
30425 if (out_mode == SFmode && in_mode == SFmode)
30426 {
30427 if (out_n == 4 && in_n == 4)
30428 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
30429 else if (out_n == 8 && in_n == 8)
30430 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
30431 }
30432 break;
30433
30434 case BUILT_IN_FLOOR:
30435 /* The round insn does not trap on denormals. */
30436 if (flag_trapping_math || !TARGET_ROUND)
30437 break;
30438
30439 if (out_mode == DFmode && in_mode == DFmode)
30440 {
30441 if (out_n == 2 && in_n == 2)
30442 return ix86_builtins[IX86_BUILTIN_FLOORPD];
30443 else if (out_n == 4 && in_n == 4)
30444 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
30445 }
30446 break;
30447
30448 case BUILT_IN_FLOORF:
30449 /* The round insn does not trap on denormals. */
30450 if (flag_trapping_math || !TARGET_ROUND)
30451 break;
30452
30453 if (out_mode == SFmode && in_mode == SFmode)
30454 {
30455 if (out_n == 4 && in_n == 4)
30456 return ix86_builtins[IX86_BUILTIN_FLOORPS];
30457 else if (out_n == 8 && in_n == 8)
30458 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
30459 }
30460 break;
30461
30462 case BUILT_IN_CEIL:
30463 /* The round insn does not trap on denormals. */
30464 if (flag_trapping_math || !TARGET_ROUND)
30465 break;
30466
30467 if (out_mode == DFmode && in_mode == DFmode)
30468 {
30469 if (out_n == 2 && in_n == 2)
30470 return ix86_builtins[IX86_BUILTIN_CEILPD];
30471 else if (out_n == 4 && in_n == 4)
30472 return ix86_builtins[IX86_BUILTIN_CEILPD256];
30473 }
30474 break;
30475
30476 case BUILT_IN_CEILF:
30477 /* The round insn does not trap on denormals. */
30478 if (flag_trapping_math || !TARGET_ROUND)
30479 break;
30480
30481 if (out_mode == SFmode && in_mode == SFmode)
30482 {
30483 if (out_n == 4 && in_n == 4)
30484 return ix86_builtins[IX86_BUILTIN_CEILPS];
30485 else if (out_n == 8 && in_n == 8)
30486 return ix86_builtins[IX86_BUILTIN_CEILPS256];
30487 }
30488 break;
30489
30490 case BUILT_IN_TRUNC:
30491 /* The round insn does not trap on denormals. */
30492 if (flag_trapping_math || !TARGET_ROUND)
30493 break;
30494
30495 if (out_mode == DFmode && in_mode == DFmode)
30496 {
30497 if (out_n == 2 && in_n == 2)
30498 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
30499 else if (out_n == 4 && in_n == 4)
30500 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
30501 }
30502 break;
30503
30504 case BUILT_IN_TRUNCF:
30505 /* The round insn does not trap on denormals. */
30506 if (flag_trapping_math || !TARGET_ROUND)
30507 break;
30508
30509 if (out_mode == SFmode && in_mode == SFmode)
30510 {
30511 if (out_n == 4 && in_n == 4)
30512 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
30513 else if (out_n == 8 && in_n == 8)
30514 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
30515 }
30516 break;
30517
30518 case BUILT_IN_RINT:
30519 /* The round insn does not trap on denormals. */
30520 if (flag_trapping_math || !TARGET_ROUND)
30521 break;
30522
30523 if (out_mode == DFmode && in_mode == DFmode)
30524 {
30525 if (out_n == 2 && in_n == 2)
30526 return ix86_builtins[IX86_BUILTIN_RINTPD];
30527 else if (out_n == 4 && in_n == 4)
30528 return ix86_builtins[IX86_BUILTIN_RINTPD256];
30529 }
30530 break;
30531
30532 case BUILT_IN_RINTF:
30533 /* The round insn does not trap on denormals. */
30534 if (flag_trapping_math || !TARGET_ROUND)
30535 break;
30536
30537 if (out_mode == SFmode && in_mode == SFmode)
30538 {
30539 if (out_n == 4 && in_n == 4)
30540 return ix86_builtins[IX86_BUILTIN_RINTPS];
30541 else if (out_n == 8 && in_n == 8)
30542 return ix86_builtins[IX86_BUILTIN_RINTPS256];
30543 }
30544 break;
30545
30546 case BUILT_IN_ROUND:
30547 /* The round insn does not trap on denormals. */
30548 if (flag_trapping_math || !TARGET_ROUND)
30549 break;
30550
30551 if (out_mode == DFmode && in_mode == DFmode)
30552 {
30553 if (out_n == 2 && in_n == 2)
30554 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
30555 else if (out_n == 4 && in_n == 4)
30556 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
30557 }
30558 break;
30559
30560 case BUILT_IN_ROUNDF:
30561 /* The round insn does not trap on denormals. */
30562 if (flag_trapping_math || !TARGET_ROUND)
30563 break;
30564
30565 if (out_mode == SFmode && in_mode == SFmode)
30566 {
30567 if (out_n == 4 && in_n == 4)
30568 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
30569 else if (out_n == 8 && in_n == 8)
30570 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
30571 }
30572 break;
30573
30574 case BUILT_IN_FMA:
30575 if (out_mode == DFmode && in_mode == DFmode)
30576 {
30577 if (out_n == 2 && in_n == 2)
30578 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
30579 if (out_n == 4 && in_n == 4)
30580 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
30581 }
30582 break;
30583
30584 case BUILT_IN_FMAF:
30585 if (out_mode == SFmode && in_mode == SFmode)
30586 {
30587 if (out_n == 4 && in_n == 4)
30588 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
30589 if (out_n == 8 && in_n == 8)
30590 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
30591 }
30592 break;
30593
30594 default:
30595 break;
30596 }
30597
30598 /* Dispatch to a handler for a vectorization library. */
30599 if (ix86_veclib_handler)
30600 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
30601 type_in);
30602
30603 return NULL_TREE;
30604 }
30605
30606 /* Handler for an SVML-style interface to
30607 a library with vectorized intrinsics. */
30608
30609 static tree
30610 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
30611 {
30612 char name[20];
30613 tree fntype, new_fndecl, args;
30614 unsigned arity;
30615 const char *bname;
30616 enum machine_mode el_mode, in_mode;
30617 int n, in_n;
30618
30619 /* The SVML is suitable for unsafe math only. */
30620 if (!flag_unsafe_math_optimizations)
30621 return NULL_TREE;
30622
30623 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30624 n = TYPE_VECTOR_SUBPARTS (type_out);
30625 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30626 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30627 if (el_mode != in_mode
30628 || n != in_n)
30629 return NULL_TREE;
30630
30631 switch (fn)
30632 {
30633 case BUILT_IN_EXP:
30634 case BUILT_IN_LOG:
30635 case BUILT_IN_LOG10:
30636 case BUILT_IN_POW:
30637 case BUILT_IN_TANH:
30638 case BUILT_IN_TAN:
30639 case BUILT_IN_ATAN:
30640 case BUILT_IN_ATAN2:
30641 case BUILT_IN_ATANH:
30642 case BUILT_IN_CBRT:
30643 case BUILT_IN_SINH:
30644 case BUILT_IN_SIN:
30645 case BUILT_IN_ASINH:
30646 case BUILT_IN_ASIN:
30647 case BUILT_IN_COSH:
30648 case BUILT_IN_COS:
30649 case BUILT_IN_ACOSH:
30650 case BUILT_IN_ACOS:
30651 if (el_mode != DFmode || n != 2)
30652 return NULL_TREE;
30653 break;
30654
30655 case BUILT_IN_EXPF:
30656 case BUILT_IN_LOGF:
30657 case BUILT_IN_LOG10F:
30658 case BUILT_IN_POWF:
30659 case BUILT_IN_TANHF:
30660 case BUILT_IN_TANF:
30661 case BUILT_IN_ATANF:
30662 case BUILT_IN_ATAN2F:
30663 case BUILT_IN_ATANHF:
30664 case BUILT_IN_CBRTF:
30665 case BUILT_IN_SINHF:
30666 case BUILT_IN_SINF:
30667 case BUILT_IN_ASINHF:
30668 case BUILT_IN_ASINF:
30669 case BUILT_IN_COSHF:
30670 case BUILT_IN_COSF:
30671 case BUILT_IN_ACOSHF:
30672 case BUILT_IN_ACOSF:
30673 if (el_mode != SFmode || n != 4)
30674 return NULL_TREE;
30675 break;
30676
30677 default:
30678 return NULL_TREE;
30679 }
30680
30681 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30682
30683 if (fn == BUILT_IN_LOGF)
30684 strcpy (name, "vmlsLn4");
30685 else if (fn == BUILT_IN_LOG)
30686 strcpy (name, "vmldLn2");
30687 else if (n == 4)
30688 {
30689 sprintf (name, "vmls%s", bname+10);
30690 name[strlen (name)-1] = '4';
30691 }
30692 else
30693 sprintf (name, "vmld%s2", bname+10);
30694
30695 /* Convert to uppercase. */
30696 name[4] &= ~0x20;
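  /* The resulting names look like vmlsSin4 for the float variants
     and vmldSin2 for the double ones.  */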
30697
30698 arity = 0;
30699 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30700 args;
30701 args = TREE_CHAIN (args))
30702 arity++;
30703
30704 if (arity == 1)
30705 fntype = build_function_type_list (type_out, type_in, NULL);
30706 else
30707 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30708
30709 /* Build a function declaration for the vectorized function. */
30710 new_fndecl = build_decl (BUILTINS_LOCATION,
30711 FUNCTION_DECL, get_identifier (name), fntype);
30712 TREE_PUBLIC (new_fndecl) = 1;
30713 DECL_EXTERNAL (new_fndecl) = 1;
30714 DECL_IS_NOVOPS (new_fndecl) = 1;
30715 TREE_READONLY (new_fndecl) = 1;
30716
30717 return new_fndecl;
30718 }
30719
30720 /* Handler for an ACML-style interface to
30721 a library with vectorized intrinsics. */
30722
30723 static tree
30724 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
30725 {
30726 char name[20] = "__vr.._";
30727 tree fntype, new_fndecl, args;
30728 unsigned arity;
30729 const char *bname;
30730 enum machine_mode el_mode, in_mode;
30731 int n, in_n;
30732
30733 /* The ACML is 64-bit only and suitable for unsafe math only, as
30734 it does not correctly support parts of IEEE with the required
30735 precision, such as denormals. */
30736 if (!TARGET_64BIT
30737 || !flag_unsafe_math_optimizations)
30738 return NULL_TREE;
30739
30740 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30741 n = TYPE_VECTOR_SUBPARTS (type_out);
30742 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30743 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30744 if (el_mode != in_mode
30745 || n != in_n)
30746 return NULL_TREE;
30747
30748 switch (fn)
30749 {
30750 case BUILT_IN_SIN:
30751 case BUILT_IN_COS:
30752 case BUILT_IN_EXP:
30753 case BUILT_IN_LOG:
30754 case BUILT_IN_LOG2:
30755 case BUILT_IN_LOG10:
30756 name[4] = 'd';
30757 name[5] = '2';
30758 if (el_mode != DFmode
30759 || n != 2)
30760 return NULL_TREE;
30761 break;
30762
30763 case BUILT_IN_SINF:
30764 case BUILT_IN_COSF:
30765 case BUILT_IN_EXPF:
30766 case BUILT_IN_POWF:
30767 case BUILT_IN_LOGF:
30768 case BUILT_IN_LOG2F:
30769 case BUILT_IN_LOG10F:
30770 name[4] = 's';
30771 name[5] = '4';
30772 if (el_mode != SFmode
30773 || n != 4)
30774 return NULL_TREE;
30775 break;
30776
30777 default:
30778 return NULL_TREE;
30779 }
30780
30781 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30782 sprintf (name + 7, "%s", bname+10);
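  /* The resulting names look like __vrd2_sin for the double variants
     and __vrs4_sinf for the float ones.  */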
30783
30784 arity = 0;
30785 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30786 args;
30787 args = TREE_CHAIN (args))
30788 arity++;
30789
30790 if (arity == 1)
30791 fntype = build_function_type_list (type_out, type_in, NULL);
30792 else
30793 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30794
30795 /* Build a function declaration for the vectorized function. */
30796 new_fndecl = build_decl (BUILTINS_LOCATION,
30797 FUNCTION_DECL, get_identifier (name), fntype);
30798 TREE_PUBLIC (new_fndecl) = 1;
30799 DECL_EXTERNAL (new_fndecl) = 1;
30800 DECL_IS_NOVOPS (new_fndecl) = 1;
30801 TREE_READONLY (new_fndecl) = 1;
30802
30803 return new_fndecl;
30804 }
30805
30806 /* Returns a decl of a function that implements gather load with
30807 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
30808 Return NULL_TREE if it is not available. */
30809
30810 static tree
30811 ix86_vectorize_builtin_gather (const_tree mem_vectype,
30812 const_tree index_type, int scale)
30813 {
30814 bool si;
30815 enum ix86_builtins code;
30816
30817 if (! TARGET_AVX2)
30818 return NULL_TREE;
30819
30820 if ((TREE_CODE (index_type) != INTEGER_TYPE
30821 && !POINTER_TYPE_P (index_type))
30822 || (TYPE_MODE (index_type) != SImode
30823 && TYPE_MODE (index_type) != DImode))
30824 return NULL_TREE;
30825
30826 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
30827 return NULL_TREE;
30828
30829 /* v*gather* insn sign extends index to pointer mode. */
30830 if (TYPE_PRECISION (index_type) < POINTER_SIZE
30831 && TYPE_UNSIGNED (index_type))
30832 return NULL_TREE;
30833
30834 if (scale <= 0
30835 || scale > 8
30836 || (scale & (scale - 1)) != 0)
30837 return NULL_TREE;
30838
30839 si = TYPE_MODE (index_type) == SImode;
30840 switch (TYPE_MODE (mem_vectype))
30841 {
30842 case V2DFmode:
30843 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
30844 break;
30845 case V4DFmode:
30846 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
30847 break;
30848 case V2DImode:
30849 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
30850 break;
30851 case V4DImode:
30852 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
30853 break;
30854 case V4SFmode:
30855 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
30856 break;
30857 case V8SFmode:
30858 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
30859 break;
30860 case V4SImode:
30861 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
30862 break;
30863 case V8SImode:
30864 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
30865 break;
30866 default:
30867 return NULL_TREE;
30868 }
30869
30870 return ix86_builtins[code];
30871 }
30872
30873 /* Returns a decl of a target-specific builtin function that implements
30874 the reciprocal of the function FN, or NULL_TREE if not available. */
30875
30876 static tree
30877 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
30878 bool sqrt ATTRIBUTE_UNUSED)
30879 {
30880 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
30881 && flag_finite_math_only && !flag_trapping_math
30882 && flag_unsafe_math_optimizations))
30883 return NULL_TREE;
30884
30885 if (md_fn)
30886 /* Machine dependent builtins. */
30887 switch (fn)
30888 {
30889 /* Vectorized version of sqrt to rsqrt conversion. */
30890 case IX86_BUILTIN_SQRTPS_NR:
30891 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
30892
30893 case IX86_BUILTIN_SQRTPS_NR256:
30894 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
30895
30896 default:
30897 return NULL_TREE;
30898 }
30899 else
30900 /* Normal builtins. */
30901 switch (fn)
30902 {
30903 /* Sqrt to rsqrt conversion. */
30904 case BUILT_IN_SQRTF:
30905 return ix86_builtins[IX86_BUILTIN_RSQRTF];
30906
30907 default:
30908 return NULL_TREE;
30909 }
30910 }
30911 \f
30912 /* Helper for avx_vpermilps256_operand et al. This is also used by
30913 the expansion functions to turn the parallel back into a mask.
30914 The return value is 0 for no match and the imm8+1 for a match. */
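/* For example, in V4SFmode the parallel [0 3 2 1] packs two bits per
   element into imm8 0x6c, so the return value is 0x6d.  */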
30915
30916 int
30917 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
30918 {
30919 unsigned i, nelt = GET_MODE_NUNITS (mode);
30920 unsigned mask = 0;
30921 unsigned char ipar[8];
30922
30923 if (XVECLEN (par, 0) != (int) nelt)
30924 return 0;
30925
30926 /* Validate that all of the elements are constants, and not totally
30927 out of range. Copy the data into an integral array to make the
30928 subsequent checks easier. */
30929 for (i = 0; i < nelt; ++i)
30930 {
30931 rtx er = XVECEXP (par, 0, i);
30932 unsigned HOST_WIDE_INT ei;
30933
30934 if (!CONST_INT_P (er))
30935 return 0;
30936 ei = INTVAL (er);
30937 if (ei >= nelt)
30938 return 0;
30939 ipar[i] = ei;
30940 }
30941
30942 switch (mode)
30943 {
30944 case V4DFmode:
30945 /* In the 256-bit DFmode case, we can only move elements within
30946 a 128-bit lane. */
30947 for (i = 0; i < 2; ++i)
30948 {
30949 if (ipar[i] >= 2)
30950 return 0;
30951 mask |= ipar[i] << i;
30952 }
30953 for (i = 2; i < 4; ++i)
30954 {
30955 if (ipar[i] < 2)
30956 return 0;
30957 mask |= (ipar[i] - 2) << i;
30958 }
30959 break;
30960
30961 case V8SFmode:
30962 /* In the 256-bit SFmode case, we have full freedom of movement
30963 within the low 128-bit lane, but the high 128-bit lane must
30964 mirror the exact same pattern. */
30965 for (i = 0; i < 4; ++i)
30966 if (ipar[i] + 4 != ipar[i + 4])
30967 return 0;
30968 nelt = 4;
30969 /* FALLTHRU */
30970
30971 case V2DFmode:
30972 case V4SFmode:
30973 /* In the 128-bit case, we have full freedom in the placement of
30974 the elements from the source operand. */
30975 for (i = 0; i < nelt; ++i)
30976 mask |= ipar[i] << (i * (nelt / 2));
30977 break;
30978
30979 default:
30980 gcc_unreachable ();
30981 }
30982
30983 /* Make sure success has a non-zero value by adding one. */
30984 return mask + 1;
30985 }
30986
30987 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
30988 the expansion functions to turn the parallel back into a mask.
30989 The return value is 0 for no match and the imm8+1 for a match. */
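/* For example, in V8SFmode the parallel [4 5 6 7 12 13 14 15] selects
   the high half of each operand and encodes as imm8 0x31, so the
   return value is 0x32.  */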
30990
30991 int
30992 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
30993 {
30994 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
30995 unsigned mask = 0;
30996 unsigned char ipar[8];
30997
30998 if (XVECLEN (par, 0) != (int) nelt)
30999 return 0;
31000
31001 /* Validate that all of the elements are constants, and not totally
31002 out of range. Copy the data into an integral array to make the
31003 subsequent checks easier. */
31004 for (i = 0; i < nelt; ++i)
31005 {
31006 rtx er = XVECEXP (par, 0, i);
31007 unsigned HOST_WIDE_INT ei;
31008
31009 if (!CONST_INT_P (er))
31010 return 0;
31011 ei = INTVAL (er);
31012 if (ei >= 2 * nelt)
31013 return 0;
31014 ipar[i] = ei;
31015 }
31016
31017 /* Validate that each half of the permute selects consecutive elements. */
31018 for (i = 0; i < nelt2 - 1; ++i)
31019 if (ipar[i] + 1 != ipar[i + 1])
31020 return 0;
31021 for (i = nelt2; i < nelt - 1; ++i)
31022 if (ipar[i] + 1 != ipar[i + 1])
31023 return 0;
31024
31025 /* Reconstruct the mask. */
31026 for (i = 0; i < 2; ++i)
31027 {
31028 unsigned e = ipar[i * nelt2];
31029 if (e % nelt2)
31030 return 0;
31031 e /= nelt2;
31032 mask |= e << (i * 4);
31033 }
31034
31035 /* Make sure success has a non-zero value by adding one. */
31036 return mask + 1;
31037 }
31038 \f
31039 /* Store OPERAND to memory after reload is completed. This means
31040 that we can't easily use assign_stack_local. */
31041 rtx
31042 ix86_force_to_memory (enum machine_mode mode, rtx operand)
31043 {
31044 rtx result;
31045
31046 gcc_assert (reload_completed);
31047 if (ix86_using_red_zone ())
31048 {
31049 result = gen_rtx_MEM (mode,
31050 gen_rtx_PLUS (Pmode,
31051 stack_pointer_rtx,
31052 GEN_INT (-RED_ZONE_SIZE)));
31053 emit_move_insn (result, operand);
31054 }
31055 else if (TARGET_64BIT)
31056 {
31057 switch (mode)
31058 {
31059 case HImode:
31060 case SImode:
31061 operand = gen_lowpart (DImode, operand);
31062 /* FALLTHRU */
31063 case DImode:
31064 emit_insn (
31065 gen_rtx_SET (VOIDmode,
31066 gen_rtx_MEM (DImode,
31067 gen_rtx_PRE_DEC (DImode,
31068 stack_pointer_rtx)),
31069 operand));
31070 break;
31071 default:
31072 gcc_unreachable ();
31073 }
31074 result = gen_rtx_MEM (mode, stack_pointer_rtx);
31075 }
31076 else
31077 {
31078 switch (mode)
31079 {
31080 case DImode:
31081 {
31082 rtx operands[2];
31083 split_double_mode (mode, &operand, 1, operands, operands + 1);
31084 emit_insn (
31085 gen_rtx_SET (VOIDmode,
31086 gen_rtx_MEM (SImode,
31087 gen_rtx_PRE_DEC (Pmode,
31088 stack_pointer_rtx)),
31089 operands[1]));
31090 emit_insn (
31091 gen_rtx_SET (VOIDmode,
31092 gen_rtx_MEM (SImode,
31093 gen_rtx_PRE_DEC (Pmode,
31094 stack_pointer_rtx)),
31095 operands[0]));
31096 }
31097 break;
31098 case HImode:
31099 /* Store HImodes as SImodes. */
31100 operand = gen_lowpart (SImode, operand);
31101 /* FALLTHRU */
31102 case SImode:
31103 emit_insn (
31104 gen_rtx_SET (VOIDmode,
31105 gen_rtx_MEM (GET_MODE (operand),
31106 gen_rtx_PRE_DEC (SImode,
31107 stack_pointer_rtx)),
31108 operand));
31109 break;
31110 default:
31111 gcc_unreachable ();
31112 }
31113 result = gen_rtx_MEM (mode, stack_pointer_rtx);
31114 }
31115 return result;
31116 }
31117
31118 /* Free the stack slot allocated by ix86_force_to_memory. */
31119 void
31120 ix86_free_from_memory (enum machine_mode mode)
31121 {
31122 if (!ix86_using_red_zone ())
31123 {
31124 int size;
31125
31126 if (mode == DImode || TARGET_64BIT)
31127 size = 8;
31128 else
31129 size = 4;
31130 /* Use LEA to deallocate stack space. In peephole2 it will be converted
31131 to pop or add instruction if registers are available. */
31132 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
31133 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
31134 GEN_INT (size))));
31135 }
31136 }
31137
31138 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
31139
31140 Put float CONST_DOUBLE in the constant pool instead of fp regs.
31141 QImode must go into class Q_REGS.
31142 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
31143 movdf to do mem-to-mem moves through integer regs. */
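/* For example: asked to load a nonzero SFmode constant into an SSE class,
   this returns NO_REGS, which forces the constant into memory, since
   there is no SSE instruction that loads a floating-point immediate.  */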
31144
31145 static reg_class_t
31146 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
31147 {
31148 enum machine_mode mode = GET_MODE (x);
31149
31150 /* We're only allowed to return a subclass of CLASS. Many of the
31151 following checks fail for NO_REGS, so eliminate that early. */
31152 if (regclass == NO_REGS)
31153 return NO_REGS;
31154
31155 /* All classes can load zeros. */
31156 if (x == CONST0_RTX (mode))
31157 return regclass;
31158
31159 /* Force constants into memory if we are loading a (nonzero) constant into
31160 an MMX or SSE register. This is because there are no MMX/SSE instructions
31161 to load from a constant. */
31162 if (CONSTANT_P (x)
31163 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
31164 return NO_REGS;
31165
31166 /* Prefer SSE regs only, if we can use them for math. */
31167 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
31168 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
31169
31170 /* Floating-point constants need more complex checks. */
31171 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
31172 {
31173 /* General regs can load everything. */
31174 if (reg_class_subset_p (regclass, GENERAL_REGS))
31175 return regclass;
31176
31177 /* Floats can load 0 and 1 plus some others. Note that we eliminated
31178 zero above. We only want to wind up preferring 80387 registers if
31179 we plan on doing computation with them. */
31180 if (TARGET_80387
31181 && standard_80387_constant_p (x) > 0)
31182 {
31183 /* Limit class to non-sse. */
31184 if (regclass == FLOAT_SSE_REGS)
31185 return FLOAT_REGS;
31186 if (regclass == FP_TOP_SSE_REGS)
31187 return FP_TOP_REG;
31188 if (regclass == FP_SECOND_SSE_REGS)
31189 return FP_SECOND_REG;
31190 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
31191 return regclass;
31192 }
31193
31194 return NO_REGS;
31195 }
31196
31197 /* Generally when we see PLUS here, it's the function invariant
31198 (plus soft-fp const_int), which can only be computed into general
31199 regs. */
31200 if (GET_CODE (x) == PLUS)
31201 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
31202
31203 /* QImode constants are easy to load, but non-constant QImode data
31204 must go into Q_REGS. */
31205 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
31206 {
31207 if (reg_class_subset_p (regclass, Q_REGS))
31208 return regclass;
31209 if (reg_class_subset_p (Q_REGS, regclass))
31210 return Q_REGS;
31211 return NO_REGS;
31212 }
31213
31214 return regclass;
31215 }
31216
31217 /* Discourage putting floating-point values in SSE registers unless
31218 SSE math is being used, and likewise for the 387 registers. */
31219 static reg_class_t
31220 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
31221 {
31222 enum machine_mode mode = GET_MODE (x);
31223
31224 /* Restrict the output reload class to the register bank that we are doing
31225 math on. If we would like not to return a subset of CLASS, reject this
31226 alternative: if reload cannot do this, it will still use its choice. */
31227 mode = GET_MODE (x);
31228 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
31229 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
31230
31231 if (X87_FLOAT_MODE_P (mode))
31232 {
31233 if (regclass == FP_TOP_SSE_REGS)
31234 return FP_TOP_REG;
31235 else if (regclass == FP_SECOND_SSE_REGS)
31236 return FP_SECOND_REG;
31237 else
31238 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
31239 }
31240
31241 return regclass;
31242 }
31243
31244 static reg_class_t
31245 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
31246 enum machine_mode mode, secondary_reload_info *sri)
31247 {
31248 /* Double-word spills from general registers to non-offsettable memory
31249 references (zero-extended addresses) require special handling. */
31250 if (TARGET_64BIT
31251 && MEM_P (x)
31252 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
31253 && rclass == GENERAL_REGS
31254 && !offsettable_memref_p (x))
31255 {
31256 sri->icode = (in_p
31257 ? CODE_FOR_reload_noff_load
31258 : CODE_FOR_reload_noff_store);
31259 /* Add the cost of moving address to a temporary. */
31260 sri->extra_cost = 1;
31261
31262 return NO_REGS;
31263 }
31264
31265 /* QImode spills from non-QI registers require
31266 an intermediate register on 32-bit targets. */
31267 if (!TARGET_64BIT
31268 && !in_p && mode == QImode
31269 && (rclass == GENERAL_REGS
31270 || rclass == LEGACY_REGS
31271 || rclass == INDEX_REGS))
31272 {
31273 int regno;
31274
31275 if (REG_P (x))
31276 regno = REGNO (x);
31277 else
31278 regno = -1;
31279
31280 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
31281 regno = true_regnum (x);
31282
31283 /* Return Q_REGS if the operand is in memory. */
31284 if (regno == -1)
31285 return Q_REGS;
31286 }
31287
31288 /* This condition handles corner case where an expression involving
31289 pointers gets vectorized. We're trying to use the address of a
31290 stack slot as a vector initializer.
31291
31292 (set (reg:V2DI 74 [ vect_cst_.2 ])
31293 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
31294
31295 Eventually frame gets turned into sp+offset like this:
31296
31297 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31298 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
31299 (const_int 392 [0x188]))))
31300
31301 That later gets turned into:
31302
31303 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31304 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
31305 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
31306
31307 We'll have the following reload recorded:
31308
31309 Reload 0: reload_in (DI) =
31310 (plus:DI (reg/f:DI 7 sp)
31311 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
31312 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31313 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
31314 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
31315 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31316 reload_reg_rtx: (reg:V2DI 22 xmm1)
31317
31318 Which isn't going to work since SSE instructions can't handle scalar
31319 additions. Returning GENERAL_REGS forces the addition into integer
31320 register and reload can handle subsequent reloads without problems. */
31321
31322 if (in_p && GET_CODE (x) == PLUS
31323 && SSE_CLASS_P (rclass)
31324 && SCALAR_INT_MODE_P (mode))
31325 return GENERAL_REGS;
31326
31327 return NO_REGS;
31328 }
31329
31330 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
31331
31332 static bool
31333 ix86_class_likely_spilled_p (reg_class_t rclass)
31334 {
31335 switch (rclass)
31336 {
31337 case AREG:
31338 case DREG:
31339 case CREG:
31340 case BREG:
31341 case AD_REGS:
31342 case SIREG:
31343 case DIREG:
31344 case SSE_FIRST_REG:
31345 case FP_TOP_REG:
31346 case FP_SECOND_REG:
31347 return true;
31348
31349 default:
31350 break;
31351 }
31352
31353 return false;
31354 }
31355
31356 /* If we are copying between general and FP registers, we need a memory
31357 location. The same is true for SSE and MMX registers.
31358
31359 To optimize register_move_cost performance, allow inline variant.
31360
31361 The macro can't work reliably when one of the CLASSES is a class containing
31362 registers from multiple units (SSE, MMX, integer). We avoid this by never
31363 combining those units in a single alternative in the machine description.
31364 Ensure that this constraint holds to avoid unexpected surprises.
31365
31366 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
31367 enforce these sanity checks. */
31368
31369 static inline bool
31370 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
31371 enum machine_mode mode, int strict)
31372 {
31373 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
31374 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
31375 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
31376 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
31377 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
31378 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
31379 {
31380 gcc_assert (!strict);
31381 return true;
31382 }
31383
31384 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
31385 return true;
31386
31387 /* ??? This is a lie. We do have moves between mmx/general, and for
31388 mmx/sse2. But by saying we need secondary memory we discourage the
31389 register allocator from using the mmx registers unless needed. */
31390 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
31391 return true;
31392
31393 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31394 {
31395 /* SSE1 doesn't have any direct moves from other classes. */
31396 if (!TARGET_SSE2)
31397 return true;
31398
31399 /* If the target says that inter-unit moves are more expensive
31400 than moving through memory, then don't generate them. */
31401 if (!TARGET_INTER_UNIT_MOVES)
31402 return true;
31403
31404 /* Between SSE and general, we have moves no larger than word size. */
31405 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
31406 return true;
31407 }
31408
31409 return false;
31410 }
31411
31412 bool
31413 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
31414 enum machine_mode mode, int strict)
31415 {
31416 return inline_secondary_memory_needed (class1, class2, mode, strict);
31417 }
31418
31419 /* Implement the TARGET_CLASS_MAX_NREGS hook.
31420
31421 On the 80386, this is the size of MODE in words,
31422 except in the FP regs, where a single reg is always enough. */
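/* For instance, with -m32 an XFmode value needs 3 word-sized registers in
   GENERAL_REGS, but only a single register in FLOAT_REGS.  */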
31423
31424 static unsigned char
31425 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
31426 {
31427 if (MAYBE_INTEGER_CLASS_P (rclass))
31428 {
31429 if (mode == XFmode)
31430 return (TARGET_64BIT ? 2 : 3);
31431 else if (mode == XCmode)
31432 return (TARGET_64BIT ? 4 : 6);
31433 else
31434 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
31435 }
31436 else
31437 {
31438 if (COMPLEX_MODE_P (mode))
31439 return 2;
31440 else
31441 return 1;
31442 }
31443 }
31444
31445 /* Return true if the registers in CLASS cannot represent the change from
31446 modes FROM to TO. */
31447
31448 bool
31449 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
31450 enum reg_class regclass)
31451 {
31452 if (from == to)
31453 return false;
31454
31455 /* x87 registers can't do subreg at all, as all values are reformatted
31456 to extended precision. */
31457 if (MAYBE_FLOAT_CLASS_P (regclass))
31458 return true;
31459
31460 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
31461 {
31462 /* Vector registers do not support QI or HImode loads. If we don't
31463 disallow a change to these modes, reload will assume it's ok to
31464 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
31465 the vec_dupv4hi pattern. */
31466 if (GET_MODE_SIZE (from) < 4)
31467 return true;
31468
31469 /* Vector registers do not support subreg with nonzero offsets, which
31470 are otherwise valid for integer registers. Since we can't see
31471 whether we have a nonzero offset from here, prohibit all
31472 nonparadoxical subregs changing size. */
31473 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
31474 return true;
31475 }
31476
31477 return false;
31478 }
31479
31480 /* Return the cost of moving data of mode M between a
31481 register and memory. A value of 2 is the default; this cost is
31482 relative to those in `REGISTER_MOVE_COST'.
31483
31484 This function is used extensively by register_move_cost that is used to
31485 build tables at startup. Make it inline in this case.
31486 When IN is 2, return maximum of in and out move cost.
31487
31488 If moving between registers and memory is more expensive than
31489 between two registers, you should define this macro to express the
31490 relative cost.
31491
31492 Also model the increased cost of moving QImode registers in non-Q_REGS
31493 classes.
31494 */
31495 static inline int
31496 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
31497 int in)
31498 {
31499 int cost;
31500 if (FLOAT_CLASS_P (regclass))
31501 {
31502 int index;
31503 switch (mode)
31504 {
31505 case SFmode:
31506 index = 0;
31507 break;
31508 case DFmode:
31509 index = 1;
31510 break;
31511 case XFmode:
31512 index = 2;
31513 break;
31514 default:
31515 return 100;
31516 }
31517 if (in == 2)
31518 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
31519 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
31520 }
31521 if (SSE_CLASS_P (regclass))
31522 {
31523 int index;
31524 switch (GET_MODE_SIZE (mode))
31525 {
31526 case 4:
31527 index = 0;
31528 break;
31529 case 8:
31530 index = 1;
31531 break;
31532 case 16:
31533 index = 2;
31534 break;
31535 default:
31536 return 100;
31537 }
31538 if (in == 2)
31539 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
31540 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
31541 }
31542 if (MMX_CLASS_P (regclass))
31543 {
31544 int index;
31545 switch (GET_MODE_SIZE (mode))
31546 {
31547 case 4:
31548 index = 0;
31549 break;
31550 case 8:
31551 index = 1;
31552 break;
31553 default:
31554 return 100;
31555 }
31556 if (in == 2)
31557 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
31558 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
31559 }
31560 switch (GET_MODE_SIZE (mode))
31561 {
31562 case 1:
31563 if (Q_CLASS_P (regclass) || TARGET_64BIT)
31564 {
31565 if (!in)
31566 return ix86_cost->int_store[0];
31567 if (TARGET_PARTIAL_REG_DEPENDENCY
31568 && optimize_function_for_speed_p (cfun))
31569 cost = ix86_cost->movzbl_load;
31570 else
31571 cost = ix86_cost->int_load[0];
31572 if (in == 2)
31573 return MAX (cost, ix86_cost->int_store[0]);
31574 return cost;
31575 }
31576 else
31577 {
31578 if (in == 2)
31579 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
31580 if (in)
31581 return ix86_cost->movzbl_load;
31582 else
31583 return ix86_cost->int_store[0] + 4;
31584 }
31585 break;
31586 case 2:
31587 if (in == 2)
31588 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
31589 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
31590 default:
31591 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
31592 if (mode == TFmode)
31593 mode = XFmode;
31594 if (in == 2)
31595 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
31596 else if (in)
31597 cost = ix86_cost->int_load[2];
31598 else
31599 cost = ix86_cost->int_store[2];
31600 return (cost * (((int) GET_MODE_SIZE (mode)
31601 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
31602 }
31603 }
31604
31605 static int
31606 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
31607 bool in)
31608 {
31609 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
31610 }
31611
31612
31613 /* Return the cost of moving data from a register in class CLASS1 to
31614 one in class CLASS2.
31615
31616 It is not required that the cost always equal 2 when FROM is the same as TO;
31617 on some machines it is expensive to move between registers if they are not
31618 general registers. */
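/* As a rough illustration: an SFmode move between FLOAT_REGS and SSE_REGS
   needs secondary memory, so the cost computed below is
   1 + MAX (fp_load[0], fp_store[0]) + MAX (sse_load[0], sse_store[0])
   from the active cost table, plus the penalties that follow.  */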
31619
31620 static int
31621 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
31622 reg_class_t class2_i)
31623 {
31624 enum reg_class class1 = (enum reg_class) class1_i;
31625 enum reg_class class2 = (enum reg_class) class2_i;
31626
31627 /* In case we require secondary memory, compute cost of the store followed
31628 by load. In order to avoid bad register allocation choices, we need
31629 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
31630
31631 if (inline_secondary_memory_needed (class1, class2, mode, 0))
31632 {
31633 int cost = 1;
31634
31635 cost += inline_memory_move_cost (mode, class1, 2);
31636 cost += inline_memory_move_cost (mode, class2, 2);
31637
31638 /* When copying from a general purpose register we may emit multiple
31639 stores followed by a single load, causing a memory size mismatch stall.
31640 Count this with an arbitrarily high cost of 20. */
31641 if (targetm.class_max_nregs (class1, mode)
31642 > targetm.class_max_nregs (class2, mode))
31643 cost += 20;
31644
31645 /* In the case of FP/MMX moves, the registers actually overlap, and we
31646 have to switch modes in order to treat them differently. */
31647 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
31648 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
31649 cost += 20;
31650
31651 return cost;
31652 }
31653
31654 /* Moves between SSE/MMX and integer unit are expensive. */
31655 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
31656 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31657
31658 /* ??? By keeping returned value relatively high, we limit the number
31659 of moves between integer and MMX/SSE registers for all targets.
31660 Additionally, high value prevents problem with x86_modes_tieable_p(),
31661 where integer modes in MMX/SSE registers are not tieable
31662 because of missing QImode and HImode moves to, from or between
31663 MMX/SSE registers. */
31664 return MAX (8, ix86_cost->mmxsse_to_integer);
31665
31666 if (MAYBE_FLOAT_CLASS_P (class1))
31667 return ix86_cost->fp_move;
31668 if (MAYBE_SSE_CLASS_P (class1))
31669 return ix86_cost->sse_move;
31670 if (MAYBE_MMX_CLASS_P (class1))
31671 return ix86_cost->mmx_move;
31672 return 2;
31673 }
31674
31675 /* Return TRUE if hard register REGNO can hold a value of machine-mode
31676 MODE. */
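/* For example, with -m32 a QImode value is always OK in %eax..%ebx, but is
   allowed in %esi/%edi only when partial register stalls are not a concern,
   or once we can no longer create pseudos (i.e. after register allocation).  */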
31677
31678 bool
31679 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
31680 {
31681 /* Only the flags registers can hold CCmode values, and they can hold nothing else. */
31682 if (CC_REGNO_P (regno))
31683 return GET_MODE_CLASS (mode) == MODE_CC;
31684 if (GET_MODE_CLASS (mode) == MODE_CC
31685 || GET_MODE_CLASS (mode) == MODE_RANDOM
31686 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
31687 return false;
31688 if (FP_REGNO_P (regno))
31689 return VALID_FP_MODE_P (mode);
31690 if (SSE_REGNO_P (regno))
31691 {
31692 /* We implement the move patterns for all vector modes into and
31693 out of SSE registers, even when no operation instructions
31694 are available. OImode move is available only when AVX is
31695 enabled. */
31696 return ((TARGET_AVX && mode == OImode)
31697 || VALID_AVX256_REG_MODE (mode)
31698 || VALID_SSE_REG_MODE (mode)
31699 || VALID_SSE2_REG_MODE (mode)
31700 || VALID_MMX_REG_MODE (mode)
31701 || VALID_MMX_REG_MODE_3DNOW (mode));
31702 }
31703 if (MMX_REGNO_P (regno))
31704 {
31705 /* We implement the move patterns for 3DNOW modes even in MMX mode,
31706 so if the register is available at all, then we can move data of
31707 the given mode into or out of it. */
31708 return (VALID_MMX_REG_MODE (mode)
31709 || VALID_MMX_REG_MODE_3DNOW (mode));
31710 }
31711
31712 if (mode == QImode)
31713 {
31714 /* Take care with QImode values - they can be in non-QI regs,
31715 but then they do cause partial register stalls. */
31716 if (regno <= BX_REG || TARGET_64BIT)
31717 return true;
31718 if (!TARGET_PARTIAL_REG_STALL)
31719 return true;
31720 return !can_create_pseudo_p ();
31721 }
31722 /* We handle both integer and floats in the general purpose registers. */
31723 else if (VALID_INT_MODE_P (mode))
31724 return true;
31725 else if (VALID_FP_MODE_P (mode))
31726 return true;
31727 else if (VALID_DFP_MODE_P (mode))
31728 return true;
31729 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
31730 on to use that value in smaller contexts, this can easily force a
31731 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
31732 supporting DImode, allow it. */
31733 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
31734 return true;
31735
31736 return false;
31737 }
31738
31739 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
31740 tieable integer mode. */
31741
31742 static bool
31743 ix86_tieable_integer_mode_p (enum machine_mode mode)
31744 {
31745 switch (mode)
31746 {
31747 case HImode:
31748 case SImode:
31749 return true;
31750
31751 case QImode:
31752 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
31753
31754 case DImode:
31755 return TARGET_64BIT;
31756
31757 default:
31758 return false;
31759 }
31760 }
31761
31762 /* Return true if MODE1 is accessible in a register that can hold MODE2
31763 without copying. That is, all register classes that can hold MODE2
31764 can also hold MODE1. */
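/* E.g. SImode and HImode are tieable, and SFmode is tieable with DFmode
   (any register class that can hold DFmode can also hold SFmode), but
   SFmode is not tieable with V4SFmode, which only fits in SSE registers.  */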
31765
31766 bool
31767 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
31768 {
31769 if (mode1 == mode2)
31770 return true;
31771
31772 if (ix86_tieable_integer_mode_p (mode1)
31773 && ix86_tieable_integer_mode_p (mode2))
31774 return true;
31775
31776 /* MODE2 being XFmode implies fp stack or general regs, which means we
31777 can tie any smaller floating point modes to it. Note that we do not
31778 tie this with TFmode. */
31779 if (mode2 == XFmode)
31780 return mode1 == SFmode || mode1 == DFmode;
31781
31782 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
31783 that we can tie it with SFmode. */
31784 if (mode2 == DFmode)
31785 return mode1 == SFmode;
31786
31787 /* If MODE2 is only appropriate for an SSE register, then tie with
31788 any other mode acceptable to SSE registers. */
31789 if (GET_MODE_SIZE (mode2) == 32
31790 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
31791 return (GET_MODE_SIZE (mode1) == 32
31792 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
31793 if (GET_MODE_SIZE (mode2) == 16
31794 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
31795 return (GET_MODE_SIZE (mode1) == 16
31796 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
31797
31798 /* If MODE2 is appropriate for an MMX register, then tie
31799 with any other mode acceptable to MMX registers. */
31800 if (GET_MODE_SIZE (mode2) == 8
31801 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
31802 return (GET_MODE_SIZE (mode1) == 8
31803 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
31804
31805 return false;
31806 }
31807
31808 /* Return the cost of moving between two registers of mode MODE. */
31809
31810 static int
31811 ix86_set_reg_reg_cost (enum machine_mode mode)
31812 {
31813 unsigned int units = UNITS_PER_WORD;
31814
31815 switch (GET_MODE_CLASS (mode))
31816 {
31817 default:
31818 break;
31819
31820 case MODE_CC:
31821 units = GET_MODE_SIZE (CCmode);
31822 break;
31823
31824 case MODE_FLOAT:
31825 if ((TARGET_SSE2 && mode == TFmode)
31826 || (TARGET_80387 && mode == XFmode)
31827 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
31828 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
31829 units = GET_MODE_SIZE (mode);
31830 break;
31831
31832 case MODE_COMPLEX_FLOAT:
31833 if ((TARGET_SSE2 && mode == TCmode)
31834 || (TARGET_80387 && mode == XCmode)
31835 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
31836 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
31837 units = GET_MODE_SIZE (mode);
31838 break;
31839
31840 case MODE_VECTOR_INT:
31841 case MODE_VECTOR_FLOAT:
31842 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
31843 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
31844 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
31845 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
31846 units = GET_MODE_SIZE (mode);
31847 }
31848
31849 /* Return the cost of moving between two registers of mode MODE,
31850 assuming that the move will be in pieces of at most UNITS bytes. */
31851 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
31852 }
31853
31854 /* Compute a (partial) cost for rtx X. Return true if the complete
31855 cost has been computed, and false if subexpressions should be
31856 scanned. In either case, *TOTAL contains the cost result. */
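/* For instance, a SET of a register to the constant zero is costed below
   as a single register-width move via ix86_set_reg_reg_cost, and TRUE is
   returned so the operands are not scanned again.  */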
31857
31858 static bool
31859 ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
31860 bool speed)
31861 {
31862 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
31863 enum machine_mode mode = GET_MODE (x);
31864 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
31865
31866 switch (code)
31867 {
31868 case SET:
31869 if (register_operand (SET_DEST (x), VOIDmode)
31870 && reg_or_0_operand (SET_SRC (x), VOIDmode))
31871 {
31872 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
31873 return true;
31874 }
31875 return false;
31876
31877 case CONST_INT:
31878 case CONST:
31879 case LABEL_REF:
31880 case SYMBOL_REF:
31881 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
31882 *total = 3;
31883 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
31884 *total = 2;
31885 else if (flag_pic && SYMBOLIC_CONST (x)
31886 && (!TARGET_64BIT
31887 || (GET_CODE (x) != LABEL_REF
31888 && (GET_CODE (x) != SYMBOL_REF
31889 || !SYMBOL_REF_LOCAL_P (x)))))
31890 *total = 1;
31891 else
31892 *total = 0;
31893 return true;
31894
31895 case CONST_DOUBLE:
31896 if (mode == VOIDmode)
31897 *total = 0;
31898 else
31899 switch (standard_80387_constant_p (x))
31900 {
31901 case 1: /* 0.0 */
31902 *total = 1;
31903 break;
31904 default: /* Other constants */
31905 *total = 2;
31906 break;
31907 case 0:
31908 case -1:
31909 /* Start with (MEM (SYMBOL_REF)), since that's where
31910 it'll probably end up. Add a penalty for size. */
31911 *total = (COSTS_N_INSNS (1)
31912 + (flag_pic != 0 && !TARGET_64BIT)
31913 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
31914 break;
31915 }
31916 return true;
31917
31918 case ZERO_EXTEND:
31919 /* Zero extension is often completely free on x86_64, so make
31920 it as cheap as possible. */
31921 if (TARGET_64BIT && mode == DImode
31922 && GET_MODE (XEXP (x, 0)) == SImode)
31923 *total = 1;
31924 else if (TARGET_ZERO_EXTEND_WITH_AND)
31925 *total = cost->add;
31926 else
31927 *total = cost->movzx;
31928 return false;
31929
31930 case SIGN_EXTEND:
31931 *total = cost->movsx;
31932 return false;
31933
31934 case ASHIFT:
31935 if (CONST_INT_P (XEXP (x, 1))
31936 && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
31937 {
31938 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31939 if (value == 1)
31940 {
31941 *total = cost->add;
31942 return false;
31943 }
31944 if ((value == 2 || value == 3)
31945 && cost->lea <= cost->shift_const)
31946 {
31947 *total = cost->lea;
31948 return false;
31949 }
31950 }
31951 /* FALLTHRU */
31952
31953 case ROTATE:
31954 case ASHIFTRT:
31955 case LSHIFTRT:
31956 case ROTATERT:
31957 if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
31958 {
31959 if (CONST_INT_P (XEXP (x, 1)))
31960 {
31961 if (INTVAL (XEXP (x, 1)) > 32)
31962 *total = cost->shift_const + COSTS_N_INSNS (2);
31963 else
31964 *total = cost->shift_const * 2;
31965 }
31966 else
31967 {
31968 if (GET_CODE (XEXP (x, 1)) == AND)
31969 *total = cost->shift_var * 2;
31970 else
31971 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
31972 }
31973 }
31974 else
31975 {
31976 if (CONST_INT_P (XEXP (x, 1)))
31977 *total = cost->shift_const;
31978 else
31979 *total = cost->shift_var;
31980 }
31981 return false;
31982
31983 case FMA:
31984 {
31985 rtx sub;
31986
31987 gcc_assert (FLOAT_MODE_P (mode));
31988 gcc_assert (TARGET_FMA || TARGET_FMA4);
31989
31990 /* ??? SSE scalar/vector cost should be used here. */
31991 /* ??? Bald assumption that fma has the same cost as fmul. */
31992 *total = cost->fmul;
31993 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
31994
31995 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
31996 sub = XEXP (x, 0);
31997 if (GET_CODE (sub) == NEG)
31998 sub = XEXP (sub, 0);
31999 *total += rtx_cost (sub, FMA, 0, speed);
32000
32001 sub = XEXP (x, 2);
32002 if (GET_CODE (sub) == NEG)
32003 sub = XEXP (sub, 0);
32004 *total += rtx_cost (sub, FMA, 2, speed);
32005 return true;
32006 }
32007
32008 case MULT:
32009 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32010 {
32011 /* ??? SSE scalar cost should be used here. */
32012 *total = cost->fmul;
32013 return false;
32014 }
32015 else if (X87_FLOAT_MODE_P (mode))
32016 {
32017 *total = cost->fmul;
32018 return false;
32019 }
32020 else if (FLOAT_MODE_P (mode))
32021 {
32022 /* ??? SSE vector cost should be used here. */
32023 *total = cost->fmul;
32024 return false;
32025 }
32026 else
32027 {
32028 rtx op0 = XEXP (x, 0);
32029 rtx op1 = XEXP (x, 1);
32030 int nbits;
32031 if (CONST_INT_P (XEXP (x, 1)))
32032 {
32033 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
32034 for (nbits = 0; value != 0; value &= value - 1)
32035 nbits++;
32036 }
32037 else
32038 /* This is arbitrary. */
32039 nbits = 7;
32040
32041 /* Compute costs correctly for widening multiplication. */
32042 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
32043 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
32044 == GET_MODE_SIZE (mode))
32045 {
32046 int is_mulwiden = 0;
32047 enum machine_mode inner_mode = GET_MODE (op0);
32048
32049 if (GET_CODE (op0) == GET_CODE (op1))
32050 is_mulwiden = 1, op1 = XEXP (op1, 0);
32051 else if (CONST_INT_P (op1))
32052 {
32053 if (GET_CODE (op0) == SIGN_EXTEND)
32054 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
32055 == INTVAL (op1);
32056 else
32057 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
32058 }
32059
32060 if (is_mulwiden)
32061 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
32062 }
32063
32064 *total = (cost->mult_init[MODE_INDEX (mode)]
32065 + nbits * cost->mult_bit
32066 + rtx_cost (op0, outer_code, opno, speed)
32067 + rtx_cost (op1, outer_code, opno, speed));
32068
32069 return true;
32070 }
32071
32072 case DIV:
32073 case UDIV:
32074 case MOD:
32075 case UMOD:
32076 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32077 /* ??? SSE cost should be used here. */
32078 *total = cost->fdiv;
32079 else if (X87_FLOAT_MODE_P (mode))
32080 *total = cost->fdiv;
32081 else if (FLOAT_MODE_P (mode))
32082 /* ??? SSE vector cost should be used here. */
32083 *total = cost->fdiv;
32084 else
32085 *total = cost->divide[MODE_INDEX (mode)];
32086 return false;
32087
32088 case PLUS:
32089 if (GET_MODE_CLASS (mode) == MODE_INT
32090 && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
32091 {
32092 if (GET_CODE (XEXP (x, 0)) == PLUS
32093 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
32094 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
32095 && CONSTANT_P (XEXP (x, 1)))
32096 {
32097 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
32098 if (val == 2 || val == 4 || val == 8)
32099 {
32100 *total = cost->lea;
32101 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
32102 outer_code, opno, speed);
32103 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
32104 outer_code, opno, speed);
32105 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
32106 return true;
32107 }
32108 }
32109 else if (GET_CODE (XEXP (x, 0)) == MULT
32110 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
32111 {
32112 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
32113 if (val == 2 || val == 4 || val == 8)
32114 {
32115 *total = cost->lea;
32116 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
32117 outer_code, opno, speed);
32118 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
32119 return true;
32120 }
32121 }
32122 else if (GET_CODE (XEXP (x, 0)) == PLUS)
32123 {
32124 *total = cost->lea;
32125 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
32126 outer_code, opno, speed);
32127 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
32128 outer_code, opno, speed);
32129 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
32130 return true;
32131 }
32132 }
32133 /* FALLTHRU */
32134
32135 case MINUS:
32136 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32137 {
32138 /* ??? SSE cost should be used here. */
32139 *total = cost->fadd;
32140 return false;
32141 }
32142 else if (X87_FLOAT_MODE_P (mode))
32143 {
32144 *total = cost->fadd;
32145 return false;
32146 }
32147 else if (FLOAT_MODE_P (mode))
32148 {
32149 /* ??? SSE vector cost should be used here. */
32150 *total = cost->fadd;
32151 return false;
32152 }
32153 /* FALLTHRU */
32154
32155 case AND:
32156 case IOR:
32157 case XOR:
32158 if (!TARGET_64BIT && mode == DImode)
32159 {
32160 *total = (cost->add * 2
32161 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
32162 << (GET_MODE (XEXP (x, 0)) != DImode))
32163 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
32164 << (GET_MODE (XEXP (x, 1)) != DImode)));
32165 return true;
32166 }
32167 /* FALLTHRU */
32168
32169 case NEG:
32170 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32171 {
32172 /* ??? SSE cost should be used here. */
32173 *total = cost->fchs;
32174 return false;
32175 }
32176 else if (X87_FLOAT_MODE_P (mode))
32177 {
32178 *total = cost->fchs;
32179 return false;
32180 }
32181 else if (FLOAT_MODE_P (mode))
32182 {
32183 /* ??? SSE vector cost should be used here. */
32184 *total = cost->fchs;
32185 return false;
32186 }
32187 /* FALLTHRU */
32188
32189 case NOT:
32190 if (!TARGET_64BIT && mode == DImode)
32191 *total = cost->add * 2;
32192 else
32193 *total = cost->add;
32194 return false;
32195
32196 case COMPARE:
32197 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
32198 && XEXP (XEXP (x, 0), 1) == const1_rtx
32199 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
32200 && XEXP (x, 1) == const0_rtx)
32201 {
32202 /* This kind of construct is implemented using test[bwl].
32203 Treat it as if we had an AND. */
32204 *total = (cost->add
32205 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
32206 + rtx_cost (const1_rtx, outer_code, opno, speed));
32207 return true;
32208 }
32209 return false;
32210
32211 case FLOAT_EXTEND:
32212 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
32213 *total = 0;
32214 return false;
32215
32216 case ABS:
32217 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32218 /* ??? SSE cost should be used here. */
32219 *total = cost->fabs;
32220 else if (X87_FLOAT_MODE_P (mode))
32221 *total = cost->fabs;
32222 else if (FLOAT_MODE_P (mode))
32223 /* ??? SSE vector cost should be used here. */
32224 *total = cost->fabs;
32225 return false;
32226
32227 case SQRT:
32228 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32229 /* ??? SSE cost should be used here. */
32230 *total = cost->fsqrt;
32231 else if (X87_FLOAT_MODE_P (mode))
32232 *total = cost->fsqrt;
32233 else if (FLOAT_MODE_P (mode))
32234 /* ??? SSE vector cost should be used here. */
32235 *total = cost->fsqrt;
32236 return false;
32237
32238 case UNSPEC:
32239 if (XINT (x, 1) == UNSPEC_TP)
32240 *total = 0;
32241 return false;
32242
32243 case VEC_SELECT:
32244 case VEC_CONCAT:
32245 case VEC_MERGE:
32246 case VEC_DUPLICATE:
32247 /* ??? Assume all of these vector manipulation patterns are
32248 recognizable. In which case they all pretty much have the
32249 same cost. */
32250 *total = COSTS_N_INSNS (1);
32251 return true;
32252
32253 default:
32254 return false;
32255 }
32256 }
32257
32258 #if TARGET_MACHO
32259
32260 static int current_machopic_label_num;
32261
32262 /* Given a symbol name and its associated stub, write out the
32263 definition of the stub. */
32264
32265 void
32266 machopic_output_stub (FILE *file, const char *symb, const char *stub)
32267 {
32268 unsigned int length;
32269 char *binder_name, *symbol_name, lazy_ptr_name[32];
32270 int label = ++current_machopic_label_num;
32271
32272 /* For 64-bit we shouldn't get here. */
32273 gcc_assert (!TARGET_64BIT);
32274
32275 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
32276 symb = targetm.strip_name_encoding (symb);
32277
32278 length = strlen (stub);
32279 binder_name = XALLOCAVEC (char, length + 32);
32280 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
32281
32282 length = strlen (symb);
32283 symbol_name = XALLOCAVEC (char, length + 32);
32284 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
32285
32286 sprintf (lazy_ptr_name, "L%d$lz", label);
32287
32288 if (MACHOPIC_ATT_STUB)
32289 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
32290 else if (MACHOPIC_PURE)
32291 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
32292 else
32293 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
32294
32295 fprintf (file, "%s:\n", stub);
32296 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
32297
32298 if (MACHOPIC_ATT_STUB)
32299 {
32300 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
32301 }
32302 else if (MACHOPIC_PURE)
32303 {
32304 /* PIC stub. */
32305 /* 25-byte PIC stub using "CALL get_pc_thunk". */
32306 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
32307 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
32308 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
32309 label, lazy_ptr_name, label);
32310 fprintf (file, "\tjmp\t*%%ecx\n");
32311 }
32312 else
32313 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
32314
32315 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
32316 it needs no stub-binding-helper. */
32317 if (MACHOPIC_ATT_STUB)
32318 return;
32319
32320 fprintf (file, "%s:\n", binder_name);
32321
32322 if (MACHOPIC_PURE)
32323 {
32324 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
32325 fprintf (file, "\tpushl\t%%ecx\n");
32326 }
32327 else
32328 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
32329
32330 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
32331
32332 /* N.B. Keep the correspondence of these
32333 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
32334 old-pic/new-pic/non-pic stubs; altering this will break
32335 compatibility with existing dylibs. */
32336 if (MACHOPIC_PURE)
32337 {
32338 /* 25-byte PIC stub using "CALL get_pc_thunk". */
32339 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
32340 }
32341 else
32342 /* 16-byte -mdynamic-no-pic stub. */
32343 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
32344
32345 fprintf (file, "%s:\n", lazy_ptr_name);
32346 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
32347 fprintf (file, ASM_LONG "%s\n", binder_name);
32348 }
32349 #endif /* TARGET_MACHO */
32350
32351 /* Order the registers for register allocator. */
32352
32353 void
32354 x86_order_regs_for_local_alloc (void)
32355 {
32356 int pos = 0;
32357 int i;
32358
32359 /* First allocate the local general purpose registers. */
32360 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
32361 if (GENERAL_REGNO_P (i) && call_used_regs[i])
32362 reg_alloc_order [pos++] = i;
32363
32364 /* Global general purpose registers. */
32365 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
32366 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
32367 reg_alloc_order [pos++] = i;
32368
32369 /* x87 registers come first in case we are doing FP math
32370 using them. */
32371 if (!TARGET_SSE_MATH)
32372 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
32373 reg_alloc_order [pos++] = i;
32374
32375 /* SSE registers. */
32376 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
32377 reg_alloc_order [pos++] = i;
32378 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
32379 reg_alloc_order [pos++] = i;
32380
32381 /* x87 registers. */
32382 if (TARGET_SSE_MATH)
32383 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
32384 reg_alloc_order [pos++] = i;
32385
32386 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
32387 reg_alloc_order [pos++] = i;
32388
32389 /* Initialize the rest of the array, as we do not allocate some registers
32390 at all. */
32391 while (pos < FIRST_PSEUDO_REGISTER)
32392 reg_alloc_order [pos++] = 0;
32393 }
32394
32395 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
32396 in struct attribute_spec handler. */
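/* Illustrative use (accepted on 32-bit targets only, see below):

     struct S { int a, b, c; };
     __attribute__ ((callee_pop_aggregate_return (1)))
     struct S f (void);

   where an argument of 1 means the callee pops the hidden
   aggregate-return pointer and 0 means the caller does.  */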
32397 static tree
32398 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
32399 tree args,
32400 int flags ATTRIBUTE_UNUSED,
32401 bool *no_add_attrs)
32402 {
32403 if (TREE_CODE (*node) != FUNCTION_TYPE
32404 && TREE_CODE (*node) != METHOD_TYPE
32405 && TREE_CODE (*node) != FIELD_DECL
32406 && TREE_CODE (*node) != TYPE_DECL)
32407 {
32408 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32409 name);
32410 *no_add_attrs = true;
32411 return NULL_TREE;
32412 }
32413 if (TARGET_64BIT)
32414 {
32415 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
32416 name);
32417 *no_add_attrs = true;
32418 return NULL_TREE;
32419 }
32420 if (is_attribute_p ("callee_pop_aggregate_return", name))
32421 {
32422 tree cst;
32423
32424 cst = TREE_VALUE (args);
32425 if (TREE_CODE (cst) != INTEGER_CST)
32426 {
32427 warning (OPT_Wattributes,
32428 "%qE attribute requires an integer constant argument",
32429 name);
32430 *no_add_attrs = true;
32431 }
32432 else if (compare_tree_int (cst, 0) != 0
32433 && compare_tree_int (cst, 1) != 0)
32434 {
32435 warning (OPT_Wattributes,
32436 "argument to %qE attribute is neither zero, nor one",
32437 name);
32438 *no_add_attrs = true;
32439 }
32440
32441 return NULL_TREE;
32442 }
32443
32444 return NULL_TREE;
32445 }
32446
32447 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
32448 struct attribute_spec.handler. */
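/* Illustrative use:

     int f (int, int) __attribute__ ((ms_abi));
     int g (int, int) __attribute__ ((sysv_abi));

   which select the Microsoft or the System V calling convention for the
   function type; combining both on one type is rejected below.  */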
32449 static tree
32450 ix86_handle_abi_attribute (tree *node, tree name,
32451 tree args ATTRIBUTE_UNUSED,
32452 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32453 {
32454 if (TREE_CODE (*node) != FUNCTION_TYPE
32455 && TREE_CODE (*node) != METHOD_TYPE
32456 && TREE_CODE (*node) != FIELD_DECL
32457 && TREE_CODE (*node) != TYPE_DECL)
32458 {
32459 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32460 name);
32461 *no_add_attrs = true;
32462 return NULL_TREE;
32463 }
32464
32465 /* Can combine regparm with all attributes but fastcall. */
32466 if (is_attribute_p ("ms_abi", name))
32467 {
32468 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
32469 {
32470 error ("ms_abi and sysv_abi attributes are not compatible");
32471 }
32472
32473 return NULL_TREE;
32474 }
32475 else if (is_attribute_p ("sysv_abi", name))
32476 {
32477 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
32478 {
32479 error ("ms_abi and sysv_abi attributes are not compatible");
32480 }
32481
32482 return NULL_TREE;
32483 }
32484
32485 return NULL_TREE;
32486 }
32487
32488 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
32489 struct attribute_spec.handler. */
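/* Illustrative use:

     struct __attribute__ ((ms_struct)) S { char c; int i : 5; };

   ms_struct requests the Microsoft-compatible record layout (as with
   -mms-bitfields), while gcc_struct requests the traditional GCC layout;
   combining the two on one type is diagnosed below.  */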
32490 static tree
32491 ix86_handle_struct_attribute (tree *node, tree name,
32492 tree args ATTRIBUTE_UNUSED,
32493 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32494 {
32495 tree *type = NULL;
32496 if (DECL_P (*node))
32497 {
32498 if (TREE_CODE (*node) == TYPE_DECL)
32499 type = &TREE_TYPE (*node);
32500 }
32501 else
32502 type = node;
32503
32504 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
32505 {
32506 warning (OPT_Wattributes, "%qE attribute ignored",
32507 name);
32508 *no_add_attrs = true;
32509 }
32510
32511 else if ((is_attribute_p ("ms_struct", name)
32512 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
32513 || ((is_attribute_p ("gcc_struct", name)
32514 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
32515 {
32516 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
32517 name);
32518 *no_add_attrs = true;
32519 }
32520
32521 return NULL_TREE;
32522 }
32523
32524 static tree
32525 ix86_handle_fndecl_attribute (tree *node, tree name,
32526 tree args ATTRIBUTE_UNUSED,
32527 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32528 {
32529 if (TREE_CODE (*node) != FUNCTION_DECL)
32530 {
32531 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32532 name);
32533 *no_add_attrs = true;
32534 }
32535 return NULL_TREE;
32536 }
32537
32538 static bool
32539 ix86_ms_bitfield_layout_p (const_tree record_type)
32540 {
32541 return ((TARGET_MS_BITFIELD_LAYOUT
32542 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
32543 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
32544 }
32545
32546 /* Returns an expression indicating where the this parameter is
32547 located on entry to the FUNCTION. */
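/* For example, for a 32-bit fastcall method THIS arrives in %ecx (%edx when
   the function returns an aggregate in memory), while with the default
   stack-passing conventions it is found at 4(%esp) on entry, or 8(%esp)
   when a hidden aggregate-return pointer comes first.  */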
32548
32549 static rtx
32550 x86_this_parameter (tree function)
32551 {
32552 tree type = TREE_TYPE (function);
32553 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
32554 int nregs;
32555
32556 if (TARGET_64BIT)
32557 {
32558 const int *parm_regs;
32559
32560 if (ix86_function_type_abi (type) == MS_ABI)
32561 parm_regs = x86_64_ms_abi_int_parameter_registers;
32562 else
32563 parm_regs = x86_64_int_parameter_registers;
32564 return gen_rtx_REG (Pmode, parm_regs[aggr]);
32565 }
32566
32567 nregs = ix86_function_regparm (type, function);
32568
32569 if (nregs > 0 && !stdarg_p (type))
32570 {
32571 int regno;
32572 unsigned int ccvt = ix86_get_callcvt (type);
32573
32574 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
32575 regno = aggr ? DX_REG : CX_REG;
32576 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
32577 {
32578 regno = CX_REG;
32579 if (aggr)
32580 return gen_rtx_MEM (SImode,
32581 plus_constant (Pmode, stack_pointer_rtx, 4));
32582 }
32583 else
32584 {
32585 regno = AX_REG;
32586 if (aggr)
32587 {
32588 regno = DX_REG;
32589 if (nregs == 1)
32590 return gen_rtx_MEM (SImode,
32591 plus_constant (Pmode,
32592 stack_pointer_rtx, 4));
32593 }
32594 }
32595 return gen_rtx_REG (SImode, regno);
32596 }
32597
32598 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
32599 aggr ? 8 : 4));
32600 }
32601
32602 /* Determine whether x86_output_mi_thunk can succeed. */
32603
32604 static bool
32605 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
32606 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
32607 HOST_WIDE_INT vcall_offset, const_tree function)
32608 {
32609 /* 64-bit can handle anything. */
32610 if (TARGET_64BIT)
32611 return true;
32612
32613 /* For 32-bit, everything's fine if we have one free register. */
32614 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
32615 return true;
32616
32617 /* Need a free register for vcall_offset. */
32618 if (vcall_offset)
32619 return false;
32620
32621 /* Need a free register for GOT references. */
32622 if (flag_pic && !targetm.binds_local_p (function))
32623 return false;
32624
32625 /* Otherwise ok. */
32626 return true;
32627 }
32628
32629 /* Output the assembler code for a thunk function. THUNK_DECL is the
32630 declaration for the thunk function itself, FUNCTION is the decl for
32631 the target function. DELTA is an immediate constant offset to be
32632 added to THIS. If VCALL_OFFSET is nonzero, the word at
32633 *(*this + vcall_offset) should be added to THIS. */
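/* As a rough sketch of the output: a 32-bit thunk with DELTA == -4,
   VCALL_OFFSET == 0 and a locally bound target `f' that takes THIS on the
   stack comes out essentially as

	addl	$-4, 4(%esp)
	jmp	f
*/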
32634
32635 static void
32636 x86_output_mi_thunk (FILE *file,
32637 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
32638 HOST_WIDE_INT vcall_offset, tree function)
32639 {
32640 rtx this_param = x86_this_parameter (function);
32641 rtx this_reg, tmp, fnaddr;
32642
32643 emit_note (NOTE_INSN_PROLOGUE_END);
32644
32645 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
32646 pull it in now and let DELTA benefit. */
32647 if (REG_P (this_param))
32648 this_reg = this_param;
32649 else if (vcall_offset)
32650 {
32651 /* Put the this parameter into %eax. */
32652 this_reg = gen_rtx_REG (Pmode, AX_REG);
32653 emit_move_insn (this_reg, this_param);
32654 }
32655 else
32656 this_reg = NULL_RTX;
32657
32658 /* Adjust the this parameter by a fixed constant. */
32659 if (delta)
32660 {
32661 rtx delta_rtx = GEN_INT (delta);
32662 rtx delta_dst = this_reg ? this_reg : this_param;
32663
32664 if (TARGET_64BIT)
32665 {
32666 if (!x86_64_general_operand (delta_rtx, Pmode))
32667 {
32668 tmp = gen_rtx_REG (Pmode, R10_REG);
32669 emit_move_insn (tmp, delta_rtx);
32670 delta_rtx = tmp;
32671 }
32672 }
32673
32674 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
32675 }
32676
32677 /* Adjust the this parameter by a value stored in the vtable. */
32678 if (vcall_offset)
32679 {
32680 rtx vcall_addr, vcall_mem, this_mem;
32681 unsigned int tmp_regno;
32682
32683 if (TARGET_64BIT)
32684 tmp_regno = R10_REG;
32685 else
32686 {
32687 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
32688 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
32689 tmp_regno = AX_REG;
32690 else
32691 tmp_regno = CX_REG;
32692 }
32693 tmp = gen_rtx_REG (Pmode, tmp_regno);
32694
32695 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
32696 if (Pmode != ptr_mode)
32697 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
32698 emit_move_insn (tmp, this_mem);
32699
32700 /* Adjust the this parameter. */
32701 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
32702 if (TARGET_64BIT
32703 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
32704 {
32705 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
32706 emit_move_insn (tmp2, GEN_INT (vcall_offset));
32707 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
32708 }
32709
32710 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
32711 if (Pmode != ptr_mode)
32712 emit_insn (gen_addsi_1_zext (this_reg,
32713 gen_rtx_REG (ptr_mode,
32714 REGNO (this_reg)),
32715 vcall_mem));
32716 else
32717 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
32718 }
32719
32720 /* If necessary, drop THIS back to its stack slot. */
32721 if (this_reg && this_reg != this_param)
32722 emit_move_insn (this_param, this_reg);
32723
32724 fnaddr = XEXP (DECL_RTL (function), 0);
32725 if (TARGET_64BIT)
32726 {
32727 if (!flag_pic || targetm.binds_local_p (function)
32728 || cfun->machine->call_abi == MS_ABI)
32729 ;
32730 else
32731 {
32732 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
32733 tmp = gen_rtx_CONST (Pmode, tmp);
32734 fnaddr = gen_rtx_MEM (Pmode, tmp);
32735 }
32736 }
32737 else
32738 {
32739 if (!flag_pic || targetm.binds_local_p (function))
32740 ;
32741 #if TARGET_MACHO
32742 else if (TARGET_MACHO)
32743 {
32744 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
32745 fnaddr = XEXP (fnaddr, 0);
32746 }
32747 #endif /* TARGET_MACHO */
32748 else
32749 {
32750 tmp = gen_rtx_REG (Pmode, CX_REG);
32751 output_set_got (tmp, NULL_RTX);
32752
32753 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
32754 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
32755 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
32756 }
32757 }
32758
32759 /* Our sibling call patterns do not allow memories, because we have no
32760 predicate that can distinguish between frame and non-frame memory.
32761 For our purposes here, we can get away with (ab)using a jump pattern,
32762 because we're going to do no optimization. */
32763 if (MEM_P (fnaddr))
32764 emit_jump_insn (gen_indirect_jump (fnaddr));
32765 else
32766 {
32767 tmp = gen_rtx_MEM (QImode, fnaddr);
32768 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
32769 tmp = emit_call_insn (tmp);
32770 SIBLING_CALL_P (tmp) = 1;
32771 }
32772 emit_barrier ();
32773
32774 /* Emit just enough of rest_of_compilation to get the insns emitted.
32775 Note that use_thunk calls assemble_start_function et al. */
32776 tmp = get_insns ();
32777 insn_locators_alloc ();
32778 shorten_branches (tmp);
32779 final_start_function (tmp, file, 1);
32780 final (tmp, file, 1);
32781 final_end_function ();
32782 }
32783
32784 static void
32785 x86_file_start (void)
32786 {
32787 default_file_start ();
32788 #if TARGET_MACHO
32789 darwin_file_start ();
32790 #endif
32791 if (X86_FILE_START_VERSION_DIRECTIVE)
32792 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
32793 if (X86_FILE_START_FLTUSED)
32794 fputs ("\t.global\t__fltused\n", asm_out_file);
32795 if (ix86_asm_dialect == ASM_INTEL)
32796 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
32797 }
32798
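/* Cap the alignment of certain struct fields on ia32.  With -m32 and
   without -malign-double, DImode- and DFmode-class members are limited to
   32-bit alignment, so e.g. a "double" or "long long" field of a struct is
   4-byte aligned, matching the traditional ia32 layout; see the code below
   for the exact conditions.  */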
32799 int
32800 x86_field_alignment (tree field, int computed)
32801 {
32802 enum machine_mode mode;
32803 tree type = TREE_TYPE (field);
32804
32805 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
32806 return computed;
32807 mode = TYPE_MODE (strip_array_types (type));
32808 if (mode == DFmode || mode == DCmode
32809 || GET_MODE_CLASS (mode) == MODE_INT
32810 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
32811 return MIN (32, computed);
32812 return computed;
32813 }
32814
32815 /* Output assembler code to FILE to increment profiler label # LABELNO
32816 for profiling a function entry. */
32817 void
32818 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
32819 {
32820 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
32821 : MCOUNT_NAME);
32822
32823 if (TARGET_64BIT)
32824 {
32825 #ifndef NO_PROFILE_COUNTERS
32826 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
32827 #endif
32828
32829 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
32830 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
32831 else
32832 fprintf (file, "\tcall\t%s\n", mcount_name);
32833 }
32834 else if (flag_pic)
32835 {
32836 #ifndef NO_PROFILE_COUNTERS
32837 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
32838 LPREFIX, labelno);
32839 #endif
32840 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
32841 }
32842 else
32843 {
32844 #ifndef NO_PROFILE_COUNTERS
32845 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
32846 LPREFIX, labelno);
32847 #endif
32848 fprintf (file, "\tcall\t%s\n", mcount_name);
32849 }
32850 }
32851
32852 /* We don't have exact information about the insn sizes, but we may assume
32853 quite safely that we know about all 1-byte insns and memory
32854 address sizes. This is enough to eliminate unnecessary padding in
32855 99% of cases. */
32856
32857 static int
32858 min_insn_size (rtx insn)
32859 {
32860 int l = 0, len;
32861
32862 if (!INSN_P (insn) || !active_insn_p (insn))
32863 return 0;
32864
32865 /* Discard alignment padding we have emitted ourselves, and jump table data. */
32866 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
32867 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
32868 return 0;
32869 if (JUMP_TABLE_DATA_P (insn))
32870 return 0;
32871
32872 /* Important case - calls are always 5 bytes.
32873 It is common to have many calls in a row. */
32874 if (CALL_P (insn)
32875 && symbolic_reference_mentioned_p (PATTERN (insn))
32876 && !SIBLING_CALL_P (insn))
32877 return 5;
32878 len = get_attr_length (insn);
32879 if (len <= 1)
32880 return 1;
32881
32882 /* For normal instructions we rely on get_attr_length being exact,
32883 with a few exceptions. */
32884 if (!JUMP_P (insn))
32885 {
32886 enum attr_type type = get_attr_type (insn);
32887
32888 switch (type)
32889 {
32890 case TYPE_MULTI:
32891 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
32892 || asm_noperands (PATTERN (insn)) >= 0)
32893 return 0;
32894 break;
32895 case TYPE_OTHER:
32896 case TYPE_FCMP:
32897 break;
32898 default:
32899 /* Otherwise trust get_attr_length. */
32900 return len;
32901 }
32902
32903 l = get_attr_length_address (insn);
32904 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
32905 l = 4;
32906 }
32907 if (l)
32908 return 1+l;
32909 else
32910 return 2;
32911 }
32912
32913 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32914
32915 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
32916 window. */
32917
32918 static void
32919 ix86_avoid_jump_mispredicts (void)
32920 {
32921 rtx insn, start = get_insns ();
32922 int nbytes = 0, njumps = 0;
32923 int isjump = 0;
32924
32925 /* Look for all minimal intervals of instructions containing 4 jumps.
32926 The intervals are bounded by START and INSN. NBYTES is the total
32927 size of the instructions in the interval, including INSN but not
32928 including START. When NBYTES is smaller than 16 bytes, it is
32929 possible that the ends of START and INSN land in the same 16 byte page.
32930
32931 The smallest offset in the page at which INSN can start is the case
32932 where START ends at offset 0. The offset of INSN is then
32933 NBYTES - sizeof (INSN), so we emit a p2align to the 16 byte window
32934 with max skip 15 - NBYTES + sizeof (INSN). */
32935 for (insn = start; insn; insn = NEXT_INSN (insn))
32936 {
32937 int min_size;
32938
32939 if (LABEL_P (insn))
32940 {
32941 int align = label_to_alignment (insn);
32942 int max_skip = label_to_max_skip (insn);
32943
32944 if (max_skip > 15)
32945 max_skip = 15;
32946 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
32947 already in the current 16 byte page, because otherwise
32948 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
32949 bytes to reach 16 byte boundary. */
32950 if (align <= 0
32951 || (align <= 3 && max_skip != (1 << align) - 1))
32952 max_skip = 0;
32953 if (dump_file)
32954 fprintf (dump_file, "Label %i with max_skip %i\n",
32955 INSN_UID (insn), max_skip);
32956 if (max_skip)
32957 {
32958 while (nbytes + max_skip >= 16)
32959 {
32960 start = NEXT_INSN (start);
32961 if ((JUMP_P (start)
32962 && GET_CODE (PATTERN (start)) != ADDR_VEC
32963 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32964 || CALL_P (start))
32965 njumps--, isjump = 1;
32966 else
32967 isjump = 0;
32968 nbytes -= min_insn_size (start);
32969 }
32970 }
32971 continue;
32972 }
32973
32974 min_size = min_insn_size (insn);
32975 nbytes += min_size;
32976 if (dump_file)
32977 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
32978 INSN_UID (insn), min_size);
32979 if ((JUMP_P (insn)
32980 && GET_CODE (PATTERN (insn)) != ADDR_VEC
32981 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
32982 || CALL_P (insn))
32983 njumps++;
32984 else
32985 continue;
32986
32987 while (njumps > 3)
32988 {
32989 start = NEXT_INSN (start);
32990 if ((JUMP_P (start)
32991 && GET_CODE (PATTERN (start)) != ADDR_VEC
32992 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32993 || CALL_P (start))
32994 njumps--, isjump = 1;
32995 else
32996 isjump = 0;
32997 nbytes -= min_insn_size (start);
32998 }
32999 gcc_assert (njumps >= 0);
33000 if (dump_file)
33001 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
33002 INSN_UID (start), INSN_UID (insn), nbytes);
33003
33004 if (njumps == 3 && isjump && nbytes < 16)
33005 {
33006 int padsize = 15 - nbytes + min_insn_size (insn);
33007
33008 if (dump_file)
33009 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
33010 INSN_UID (insn), padsize);
33011 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
33012 }
33013 }
33014 }
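
/* A worked example of the padding computed above (illustrative only):
   if the current interval is NBYTES = 12 bytes, counting INSN whose
   estimated size is 2 bytes, then padsize = 15 - 12 + 2 = 5, and a
   p2align of up to 5 bytes before INSN guarantees that it cannot land
   in the same 16 byte window as the three preceding jumps.  */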
33015 #endif
33016
33017 /* AMD Athlon works faster
33018 when RET is not the destination of a conditional jump and is not directly
33019 preceded by another jump instruction. We avoid the penalty by replacing
33020 such returns with the longer form of the return instruction. */
33021 static void
33022 ix86_pad_returns (void)
33023 {
33024 edge e;
33025 edge_iterator ei;
33026
33027 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
33028 {
33029 basic_block bb = e->src;
33030 rtx ret = BB_END (bb);
33031 rtx prev;
33032 bool replace = false;
33033
33034 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
33035 || optimize_bb_for_size_p (bb))
33036 continue;
33037 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
33038 if (active_insn_p (prev) || LABEL_P (prev))
33039 break;
33040 if (prev && LABEL_P (prev))
33041 {
33042 edge e;
33043 edge_iterator ei;
33044
33045 FOR_EACH_EDGE (e, ei, bb->preds)
33046 if (EDGE_FREQUENCY (e) && e->src->index >= 0
33047 && !(e->flags & EDGE_FALLTHRU))
33048 replace = true;
33049 }
33050 if (!replace)
33051 {
33052 prev = prev_active_insn (ret);
33053 if (prev
33054 && ((JUMP_P (prev) && any_condjump_p (prev))
33055 || CALL_P (prev)))
33056 replace = true;
33057 /* Empty functions get a branch mispredict even when
33058 the jump destination is not visible to us. */
33059 if (!prev && !optimize_function_for_size_p (cfun))
33060 replace = true;
33061 }
33062 if (replace)
33063 {
33064 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
33065 delete_insn (ret);
33066 }
33067 }
33068 }
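
/* For illustration: given a block ending in

       jne    .L3
       ret

   the pass above deletes the plain return and re-emits it via
   gen_simple_return_internal_long, i.e. the padded form of the return
   instruction, so the one byte return no longer directly follows
   another jump.  The same is done for empty functions, whose callers'
   jumps we cannot see.  */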
33069
33070 /* Count the minimum number of instructions in BB. Return 4 if the
33071 number of instructions >= 4. */
33072
33073 static int
33074 ix86_count_insn_bb (basic_block bb)
33075 {
33076 rtx insn;
33077 int insn_count = 0;
33078
33079 /* Count number of instructions in this block. Return 4 if the number
33080 of instructions >= 4. */
33081 FOR_BB_INSNS (bb, insn)
33082 {
33083 /* This only happens in exit blocks. */
33084 if (JUMP_P (insn)
33085 && ANY_RETURN_P (PATTERN (insn)))
33086 break;
33087
33088 if (NONDEBUG_INSN_P (insn)
33089 && GET_CODE (PATTERN (insn)) != USE
33090 && GET_CODE (PATTERN (insn)) != CLOBBER)
33091 {
33092 insn_count++;
33093 if (insn_count >= 4)
33094 return insn_count;
33095 }
33096 }
33097
33098 return insn_count;
33099 }
33100
33101
33102 /* Count the minimum number of instructions in a code path through BB.
33103 Return 4 if the number of instructions >= 4. */
33104
33105 static int
33106 ix86_count_insn (basic_block bb)
33107 {
33108 edge e;
33109 edge_iterator ei;
33110 int min_prev_count;
33111
33112 /* Only bother counting instructions along paths with no
33113 more than 2 basic blocks between entry and exit. Given
33114 that BB has an edge to exit, determine if a predecessor
33115 of BB has an edge from entry. If so, compute the number
33116 of instructions in the predecessor block. If there
33117 happen to be multiple such blocks, compute the minimum. */
33118 min_prev_count = 4;
33119 FOR_EACH_EDGE (e, ei, bb->preds)
33120 {
33121 edge prev_e;
33122 edge_iterator prev_ei;
33123
33124 if (e->src == ENTRY_BLOCK_PTR)
33125 {
33126 min_prev_count = 0;
33127 break;
33128 }
33129 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
33130 {
33131 if (prev_e->src == ENTRY_BLOCK_PTR)
33132 {
33133 int count = ix86_count_insn_bb (e->src);
33134 if (count < min_prev_count)
33135 min_prev_count = count;
33136 break;
33137 }
33138 }
33139 }
33140
33141 if (min_prev_count < 4)
33142 min_prev_count += ix86_count_insn_bb (bb);
33143
33144 return min_prev_count;
33145 }
33146
33147 /* Pad short functions to 4 instructions. */
33148
33149 static void
33150 ix86_pad_short_function (void)
33151 {
33152 edge e;
33153 edge_iterator ei;
33154
33155 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
33156 {
33157 rtx ret = BB_END (e->src);
33158 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
33159 {
33160 int insn_count = ix86_count_insn (e->src);
33161
33162 /* Pad short function. */
33163 if (insn_count < 4)
33164 {
33165 rtx insn = ret;
33166
33167 /* Find epilogue. */
33168 while (insn
33169 && (!NOTE_P (insn)
33170 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
33171 insn = PREV_INSN (insn);
33172
33173 if (!insn)
33174 insn = ret;
33175
33176 /* Two NOPs count as one instruction. */
33177 insn_count = 2 * (4 - insn_count);
33178 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
33179 }
33180 }
33181 }
33182 }
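
/* A worked example of the padding above (illustrative only): for a
   function whose shortest counted path from entry to this return holds
   only 2 instructions, insn_count becomes 2 * (4 - 2) = 4 and gen_nops
   emits 4 NOPs just before the epilogue note, bringing the path up to
   the 4 instruction minimum (two NOPs counting as one instruction).  */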
33183
33184 /* Implement machine specific optimizations. We implement padding of returns
33185 for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window. */
33186 static void
33187 ix86_reorg (void)
33188 {
33189 /* We are freeing block_for_insn in the toplev to keep compatibility
33190 with old MDEP_REORGS that are not CFG based. Recompute it now. */
33191 compute_bb_for_insn ();
33192
33193 /* Run the vzeroupper optimization if needed. */
33194 if (TARGET_VZEROUPPER)
33195 move_or_delete_vzeroupper ();
33196
33197 if (optimize && optimize_function_for_speed_p (cfun))
33198 {
33199 if (TARGET_PAD_SHORT_FUNCTION)
33200 ix86_pad_short_function ();
33201 else if (TARGET_PAD_RETURNS)
33202 ix86_pad_returns ();
33203 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
33204 if (TARGET_FOUR_JUMP_LIMIT)
33205 ix86_avoid_jump_mispredicts ();
33206 #endif
33207 }
33208 }
33209
33210 /* Return nonzero when a QImode register that must be represented via a REX
33211 prefix is used. */
33212 bool
33213 x86_extended_QIreg_mentioned_p (rtx insn)
33214 {
33215 int i;
33216 extract_insn_cached (insn);
33217 for (i = 0; i < recog_data.n_operands; i++)
33218 if (REG_P (recog_data.operand[i])
33219 && REGNO (recog_data.operand[i]) > BX_REG)
33220 return true;
33221 return false;
33222 }
33223
33224 /* Return nonzero when P points to a register encoded via a REX prefix.
33225 Called via for_each_rtx. */
33226 static int
33227 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
33228 {
33229 unsigned int regno;
33230 if (!REG_P (*p))
33231 return 0;
33232 regno = REGNO (*p);
33233 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
33234 }
33235
33236 /* Return true when INSN mentions a register that must be encoded using a REX
33237 prefix. */
33238 bool
33239 x86_extended_reg_mentioned_p (rtx insn)
33240 {
33241 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
33242 extended_reg_mentioned_1, NULL);
33243 }
33244
33245 /* If profitable, negate (without causing overflow) integer constant
33246 of mode MODE at location LOC. Return true in this case. */
33247 bool
33248 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
33249 {
33250 HOST_WIDE_INT val;
33251
33252 if (!CONST_INT_P (*loc))
33253 return false;
33254
33255 switch (mode)
33256 {
33257 case DImode:
33258 /* DImode x86_64 constants must fit in 32 bits. */
33259 gcc_assert (x86_64_immediate_operand (*loc, mode));
33260
33261 mode = SImode;
33262 break;
33263
33264 case SImode:
33265 case HImode:
33266 case QImode:
33267 break;
33268
33269 default:
33270 gcc_unreachable ();
33271 }
33272
33273 /* Avoid overflows. */
33274 if (mode_signbit_p (mode, *loc))
33275 return false;
33276
33277 val = INTVAL (*loc);
33278
33279 /* Make things pretty: use `subl $4,%eax' rather than `addl $-4,%eax'.
33280 Exception: -128 encodes in fewer bytes than +128, so negate +128 and keep -128. */
33281 if ((val < 0 && val != -128)
33282 || val == 128)
33283 {
33284 *loc = GEN_INT (-val);
33285 return true;
33286 }
33287
33288 return false;
33289 }
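
/* Two worked examples of the rewrite above (illustrative only):

     addl   $-4,%eax    becomes   subl   $4,%eax
       (val == -4 is negative and not -128, so it is negated)

     addl   $128,%eax   becomes   subl   $-128,%eax
       (+128 does not fit in a sign-extended 8 bit immediate but -128
        does, so the negated form encodes in fewer bytes)  */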
33290
33291 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
33292 optabs would emit if we didn't have TFmode patterns. */
33293
33294 void
33295 x86_emit_floatuns (rtx operands[2])
33296 {
33297 rtx neglab, donelab, i0, i1, f0, in, out;
33298 enum machine_mode mode, inmode;
33299
33300 inmode = GET_MODE (operands[1]);
33301 gcc_assert (inmode == SImode || inmode == DImode);
33302
33303 out = operands[0];
33304 in = force_reg (inmode, operands[1]);
33305 mode = GET_MODE (out);
33306 neglab = gen_label_rtx ();
33307 donelab = gen_label_rtx ();
33308 f0 = gen_reg_rtx (mode);
33309
33310 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
33311
33312 expand_float (out, in, 0);
33313
33314 emit_jump_insn (gen_jump (donelab));
33315 emit_barrier ();
33316
33317 emit_label (neglab);
33318
33319 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
33320 1, OPTAB_DIRECT);
33321 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
33322 1, OPTAB_DIRECT);
33323 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
33324
33325 expand_float (f0, i0, 0);
33326
33327 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
33328
33329 emit_label (donelab);
33330 }
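
/* In C terms, the expansion above is roughly equivalent to the sketch
   below (illustrative only; the real code works on rtl and on whatever
   SImode/DImode input and FP output modes the caller passed in):

     double
     floatuns (unsigned long in)
     {
       if ((long) in >= 0)
         return (double) (long) in;
       unsigned long half = (in >> 1) | (in & 1);
       double f = (double) (long) half;
       return f + f;
     }

   The low bit is folded back into the halved value so that the final
   addition rounds the same way the exact conversion would.  */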
33331 \f
33332 /* AVX2 does support 32-byte integer vector operations,
33333 thus the longest vector we are faced with is V32QImode. */
33334 #define MAX_VECT_LEN 32
33335
33336 struct expand_vec_perm_d
33337 {
33338 rtx target, op0, op1;
33339 unsigned char perm[MAX_VECT_LEN];
33340 enum machine_mode vmode;
33341 unsigned char nelt;
33342 bool one_operand_p;
33343 bool testing_p;
33344 };
33345
33346 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
33347 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
33348
33349 /* Get a vector mode of the same size as the original but with elements
33350 twice as wide. This is only guaranteed to apply to integral vectors. */
33351
33352 static inline enum machine_mode
33353 get_mode_wider_vector (enum machine_mode o)
33354 {
33355 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
33356 enum machine_mode n = GET_MODE_WIDER_MODE (o);
33357 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
33358 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
33359 return n;
33360 }
33361
33362 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33363 with all elements equal to VAR. Return true if successful. */
33364
33365 static bool
33366 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
33367 rtx target, rtx val)
33368 {
33369 bool ok;
33370
33371 switch (mode)
33372 {
33373 case V2SImode:
33374 case V2SFmode:
33375 if (!mmx_ok)
33376 return false;
33377 /* FALLTHRU */
33378
33379 case V4DFmode:
33380 case V4DImode:
33381 case V8SFmode:
33382 case V8SImode:
33383 case V2DFmode:
33384 case V2DImode:
33385 case V4SFmode:
33386 case V4SImode:
33387 {
33388 rtx insn, dup;
33389
33390 /* First attempt to recognize VAL as-is. */
33391 dup = gen_rtx_VEC_DUPLICATE (mode, val);
33392 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
33393 if (recog_memoized (insn) < 0)
33394 {
33395 rtx seq;
33396 /* If that fails, force VAL into a register. */
33397
33398 start_sequence ();
33399 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
33400 seq = get_insns ();
33401 end_sequence ();
33402 if (seq)
33403 emit_insn_before (seq, insn);
33404
33405 ok = recog_memoized (insn) >= 0;
33406 gcc_assert (ok);
33407 }
33408 }
33409 return true;
33410
33411 case V4HImode:
33412 if (!mmx_ok)
33413 return false;
33414 if (TARGET_SSE || TARGET_3DNOW_A)
33415 {
33416 rtx x;
33417
33418 val = gen_lowpart (SImode, val);
33419 x = gen_rtx_TRUNCATE (HImode, val);
33420 x = gen_rtx_VEC_DUPLICATE (mode, x);
33421 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33422 return true;
33423 }
33424 goto widen;
33425
33426 case V8QImode:
33427 if (!mmx_ok)
33428 return false;
33429 goto widen;
33430
33431 case V8HImode:
33432 if (TARGET_SSE2)
33433 {
33434 struct expand_vec_perm_d dperm;
33435 rtx tmp1, tmp2;
33436
33437 permute:
33438 memset (&dperm, 0, sizeof (dperm));
33439 dperm.target = target;
33440 dperm.vmode = mode;
33441 dperm.nelt = GET_MODE_NUNITS (mode);
33442 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
33443 dperm.one_operand_p = true;
33444
33445 /* Extend to SImode using a paradoxical SUBREG. */
33446 tmp1 = gen_reg_rtx (SImode);
33447 emit_move_insn (tmp1, gen_lowpart (SImode, val));
33448
33449 /* Insert the SImode value as low element of a V4SImode vector. */
33450 tmp2 = gen_lowpart (V4SImode, dperm.op0);
33451 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
33452
33453 ok = (expand_vec_perm_1 (&dperm)
33454 || expand_vec_perm_broadcast_1 (&dperm));
33455 gcc_assert (ok);
33456 return ok;
33457 }
33458 goto widen;
33459
33460 case V16QImode:
33461 if (TARGET_SSE2)
33462 goto permute;
33463 goto widen;
33464
33465 widen:
33466 /* Replicate the value once into the next wider mode and recurse. */
33467 {
33468 enum machine_mode smode, wsmode, wvmode;
33469 rtx x;
33470
33471 smode = GET_MODE_INNER (mode);
33472 wvmode = get_mode_wider_vector (mode);
33473 wsmode = GET_MODE_INNER (wvmode);
33474
33475 val = convert_modes (wsmode, smode, val, true);
33476 x = expand_simple_binop (wsmode, ASHIFT, val,
33477 GEN_INT (GET_MODE_BITSIZE (smode)),
33478 NULL_RTX, 1, OPTAB_LIB_WIDEN);
33479 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
33480
33481 x = gen_lowpart (wvmode, target);
33482 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
33483 gcc_assert (ok);
33484 return ok;
33485 }
33486
33487 case V16HImode:
33488 case V32QImode:
33489 {
33490 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
33491 rtx x = gen_reg_rtx (hvmode);
33492
33493 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
33494 gcc_assert (ok);
33495
33496 x = gen_rtx_VEC_CONCAT (mode, x, x);
33497 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33498 }
33499 return true;
33500
33501 default:
33502 return false;
33503 }
33504 }
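
/* A worked example of the "widen" strategy above (illustrative only):
   to broadcast a QImode value X when no direct duplication pattern
   applies, the scalar is first replicated into the next wider scalar
   mode as (X << 8) | X and the function recurses on the vector mode
   with half as many, twice as wide elements.  The V16HImode and
   V32QImode cases instead broadcast into the 128 bit half and glue two
   copies together with VEC_CONCAT.  */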
33505
33506 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33507 whose ONE_VAR element is VAR, and other elements are zero. Return true
33508 if successful. */
33509
33510 static bool
33511 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
33512 rtx target, rtx var, int one_var)
33513 {
33514 enum machine_mode vsimode;
33515 rtx new_target;
33516 rtx x, tmp;
33517 bool use_vector_set = false;
33518
33519 switch (mode)
33520 {
33521 case V2DImode:
33522 /* For SSE4.1, we normally use vector set. But if the second
33523 element is zero and inter-unit moves are OK, we use movq
33524 instead. */
33525 use_vector_set = (TARGET_64BIT
33526 && TARGET_SSE4_1
33527 && !(TARGET_INTER_UNIT_MOVES
33528 && one_var == 0));
33529 break;
33530 case V16QImode:
33531 case V4SImode:
33532 case V4SFmode:
33533 use_vector_set = TARGET_SSE4_1;
33534 break;
33535 case V8HImode:
33536 use_vector_set = TARGET_SSE2;
33537 break;
33538 case V4HImode:
33539 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
33540 break;
33541 case V32QImode:
33542 case V16HImode:
33543 case V8SImode:
33544 case V8SFmode:
33545 case V4DFmode:
33546 use_vector_set = TARGET_AVX;
33547 break;
33548 case V4DImode:
33549 /* Use ix86_expand_vector_set in 64bit mode only. */
33550 use_vector_set = TARGET_AVX && TARGET_64BIT;
33551 break;
33552 default:
33553 break;
33554 }
33555
33556 if (use_vector_set)
33557 {
33558 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
33559 var = force_reg (GET_MODE_INNER (mode), var);
33560 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33561 return true;
33562 }
33563
33564 switch (mode)
33565 {
33566 case V2SFmode:
33567 case V2SImode:
33568 if (!mmx_ok)
33569 return false;
33570 /* FALLTHRU */
33571
33572 case V2DFmode:
33573 case V2DImode:
33574 if (one_var != 0)
33575 return false;
33576 var = force_reg (GET_MODE_INNER (mode), var);
33577 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
33578 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33579 return true;
33580
33581 case V4SFmode:
33582 case V4SImode:
33583 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
33584 new_target = gen_reg_rtx (mode);
33585 else
33586 new_target = target;
33587 var = force_reg (GET_MODE_INNER (mode), var);
33588 x = gen_rtx_VEC_DUPLICATE (mode, var);
33589 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
33590 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
33591 if (one_var != 0)
33592 {
33593 /* We need to shuffle the value to the correct position, so
33594 create a new pseudo to store the intermediate result. */
33595
33596 /* With SSE2, we can use the integer shuffle insns. */
33597 if (mode != V4SFmode && TARGET_SSE2)
33598 {
33599 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
33600 const1_rtx,
33601 GEN_INT (one_var == 1 ? 0 : 1),
33602 GEN_INT (one_var == 2 ? 0 : 1),
33603 GEN_INT (one_var == 3 ? 0 : 1)));
33604 if (target != new_target)
33605 emit_move_insn (target, new_target);
33606 return true;
33607 }
33608
33609 /* Otherwise convert the intermediate result to V4SFmode and
33610 use the SSE1 shuffle instructions. */
33611 if (mode != V4SFmode)
33612 {
33613 tmp = gen_reg_rtx (V4SFmode);
33614 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
33615 }
33616 else
33617 tmp = new_target;
33618
33619 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
33620 const1_rtx,
33621 GEN_INT (one_var == 1 ? 0 : 1),
33622 GEN_INT (one_var == 2 ? 0+4 : 1+4),
33623 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
33624
33625 if (mode != V4SFmode)
33626 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
33627 else if (tmp != target)
33628 emit_move_insn (target, tmp);
33629 }
33630 else if (target != new_target)
33631 emit_move_insn (target, new_target);
33632 return true;
33633
33634 case V8HImode:
33635 case V16QImode:
33636 vsimode = V4SImode;
33637 goto widen;
33638 case V4HImode:
33639 case V8QImode:
33640 if (!mmx_ok)
33641 return false;
33642 vsimode = V2SImode;
33643 goto widen;
33644 widen:
33645 if (one_var != 0)
33646 return false;
33647
33648 /* Zero extend the variable element to SImode and recurse. */
33649 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
33650
33651 x = gen_reg_rtx (vsimode);
33652 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
33653 var, one_var))
33654 gcc_unreachable ();
33655
33656 emit_move_insn (target, gen_lowpart (mode, x));
33657 return true;
33658
33659 default:
33660 return false;
33661 }
33662 }
33663
33664 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33665 consisting of the values in VALS. It is known that all elements
33666 except ONE_VAR are constants. Return true if successful. */
33667
33668 static bool
33669 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
33670 rtx target, rtx vals, int one_var)
33671 {
33672 rtx var = XVECEXP (vals, 0, one_var);
33673 enum machine_mode wmode;
33674 rtx const_vec, x;
33675
33676 const_vec = copy_rtx (vals);
33677 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
33678 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
33679
33680 switch (mode)
33681 {
33682 case V2DFmode:
33683 case V2DImode:
33684 case V2SFmode:
33685 case V2SImode:
33686 /* For the two element vectors, it's just as easy to use
33687 the general case. */
33688 return false;
33689
33690 case V4DImode:
33691 /* Use ix86_expand_vector_set in 64bit mode only. */
33692 if (!TARGET_64BIT)
33693 return false;
33694 case V4DFmode:
33695 case V8SFmode:
33696 case V8SImode:
33697 case V16HImode:
33698 case V32QImode:
33699 case V4SFmode:
33700 case V4SImode:
33701 case V8HImode:
33702 case V4HImode:
33703 break;
33704
33705 case V16QImode:
33706 if (TARGET_SSE4_1)
33707 break;
33708 wmode = V8HImode;
33709 goto widen;
33710 case V8QImode:
33711 wmode = V4HImode;
33712 goto widen;
33713 widen:
33714 /* There's no way to set one QImode entry easily. Combine
33715 the variable value with its adjacent constant value, and
33716 promote to an HImode set. */
33717 x = XVECEXP (vals, 0, one_var ^ 1);
33718 if (one_var & 1)
33719 {
33720 var = convert_modes (HImode, QImode, var, true);
33721 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
33722 NULL_RTX, 1, OPTAB_LIB_WIDEN);
33723 x = GEN_INT (INTVAL (x) & 0xff);
33724 }
33725 else
33726 {
33727 var = convert_modes (HImode, QImode, var, true);
33728 x = gen_int_mode (INTVAL (x) << 8, HImode);
33729 }
33730 if (x != const0_rtx)
33731 var = expand_simple_binop (HImode, IOR, var, x, var,
33732 1, OPTAB_LIB_WIDEN);
33733
33734 x = gen_reg_rtx (wmode);
33735 emit_move_insn (x, gen_lowpart (wmode, const_vec));
33736 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
33737
33738 emit_move_insn (target, gen_lowpart (mode, x));
33739 return true;
33740
33741 default:
33742 return false;
33743 }
33744
33745 emit_move_insn (target, const_vec);
33746 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33747 return true;
33748 }
33749
33750 /* A subroutine of ix86_expand_vector_init_general. Use vector
33751 concatenate to handle the most general case: all values variable,
33752 and none identical. */
33753
33754 static void
33755 ix86_expand_vector_init_concat (enum machine_mode mode,
33756 rtx target, rtx *ops, int n)
33757 {
33758 enum machine_mode cmode, hmode = VOIDmode;
33759 rtx first[8], second[4];
33760 rtvec v;
33761 int i, j;
33762
33763 switch (n)
33764 {
33765 case 2:
33766 switch (mode)
33767 {
33768 case V8SImode:
33769 cmode = V4SImode;
33770 break;
33771 case V8SFmode:
33772 cmode = V4SFmode;
33773 break;
33774 case V4DImode:
33775 cmode = V2DImode;
33776 break;
33777 case V4DFmode:
33778 cmode = V2DFmode;
33779 break;
33780 case V4SImode:
33781 cmode = V2SImode;
33782 break;
33783 case V4SFmode:
33784 cmode = V2SFmode;
33785 break;
33786 case V2DImode:
33787 cmode = DImode;
33788 break;
33789 case V2SImode:
33790 cmode = SImode;
33791 break;
33792 case V2DFmode:
33793 cmode = DFmode;
33794 break;
33795 case V2SFmode:
33796 cmode = SFmode;
33797 break;
33798 default:
33799 gcc_unreachable ();
33800 }
33801
33802 if (!register_operand (ops[1], cmode))
33803 ops[1] = force_reg (cmode, ops[1]);
33804 if (!register_operand (ops[0], cmode))
33805 ops[0] = force_reg (cmode, ops[0]);
33806 emit_insn (gen_rtx_SET (VOIDmode, target,
33807 gen_rtx_VEC_CONCAT (mode, ops[0],
33808 ops[1])));
33809 break;
33810
33811 case 4:
33812 switch (mode)
33813 {
33814 case V4DImode:
33815 cmode = V2DImode;
33816 break;
33817 case V4DFmode:
33818 cmode = V2DFmode;
33819 break;
33820 case V4SImode:
33821 cmode = V2SImode;
33822 break;
33823 case V4SFmode:
33824 cmode = V2SFmode;
33825 break;
33826 default:
33827 gcc_unreachable ();
33828 }
33829 goto half;
33830
33831 case 8:
33832 switch (mode)
33833 {
33834 case V8SImode:
33835 cmode = V2SImode;
33836 hmode = V4SImode;
33837 break;
33838 case V8SFmode:
33839 cmode = V2SFmode;
33840 hmode = V4SFmode;
33841 break;
33842 default:
33843 gcc_unreachable ();
33844 }
33845 goto half;
33846
33847 half:
33848 /* FIXME: We process inputs backward to help RA. PR 36222. */
33849 i = n - 1;
33850 j = (n >> 1) - 1;
33851 for (; i > 0; i -= 2, j--)
33852 {
33853 first[j] = gen_reg_rtx (cmode);
33854 v = gen_rtvec (2, ops[i - 1], ops[i]);
33855 ix86_expand_vector_init (false, first[j],
33856 gen_rtx_PARALLEL (cmode, v));
33857 }
33858
33859 n >>= 1;
33860 if (n > 2)
33861 {
33862 gcc_assert (hmode != VOIDmode);
33863 for (i = j = 0; i < n; i += 2, j++)
33864 {
33865 second[j] = gen_reg_rtx (hmode);
33866 ix86_expand_vector_init_concat (hmode, second [j],
33867 &first [i], 2);
33868 }
33869 n >>= 1;
33870 ix86_expand_vector_init_concat (mode, target, second, n);
33871 }
33872 else
33873 ix86_expand_vector_init_concat (mode, target, first, n);
33874 break;
33875
33876 default:
33877 gcc_unreachable ();
33878 }
33879 }
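
/* A worked example of the recursion above (illustrative only): building
   a V8SFmode vector from 8 scalar operands first concatenates the
   operands pairwise into four V2SFmode registers, then concatenates
   those into two V4SFmode registers, and a final VEC_CONCAT produces
   the V8SFmode result.  The inputs are processed backwards purely to
   help the register allocator (see PR 36222 above).  */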
33880
33881 /* A subroutine of ix86_expand_vector_init_general. Use vector
33882 interleave to handle the most general case: all values variable,
33883 and none identical. */
33884
33885 static void
33886 ix86_expand_vector_init_interleave (enum machine_mode mode,
33887 rtx target, rtx *ops, int n)
33888 {
33889 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
33890 int i, j;
33891 rtx op0, op1;
33892 rtx (*gen_load_even) (rtx, rtx, rtx);
33893 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
33894 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
33895
33896 switch (mode)
33897 {
33898 case V8HImode:
33899 gen_load_even = gen_vec_setv8hi;
33900 gen_interleave_first_low = gen_vec_interleave_lowv4si;
33901 gen_interleave_second_low = gen_vec_interleave_lowv2di;
33902 inner_mode = HImode;
33903 first_imode = V4SImode;
33904 second_imode = V2DImode;
33905 third_imode = VOIDmode;
33906 break;
33907 case V16QImode:
33908 gen_load_even = gen_vec_setv16qi;
33909 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
33910 gen_interleave_second_low = gen_vec_interleave_lowv4si;
33911 inner_mode = QImode;
33912 first_imode = V8HImode;
33913 second_imode = V4SImode;
33914 third_imode = V2DImode;
33915 break;
33916 default:
33917 gcc_unreachable ();
33918 }
33919
33920 for (i = 0; i < n; i++)
33921 {
33922 /* Extend the odd element to SImode using a paradoxical SUBREG. */
33923 op0 = gen_reg_rtx (SImode);
33924 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
33925
33926 /* Insert the SImode value as low element of V4SImode vector. */
33927 op1 = gen_reg_rtx (V4SImode);
33928 op0 = gen_rtx_VEC_MERGE (V4SImode,
33929 gen_rtx_VEC_DUPLICATE (V4SImode,
33930 op0),
33931 CONST0_RTX (V4SImode),
33932 const1_rtx);
33933 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
33934
33935 /* Cast the V4SImode vector back to a vector in the original mode. */
33936 op0 = gen_reg_rtx (mode);
33937 emit_move_insn (op0, gen_lowpart (mode, op1));
33938
33939 /* Load the even elements into the second position. */
33940 emit_insn (gen_load_even (op0,
33941 force_reg (inner_mode,
33942 ops [i + i + 1]),
33943 const1_rtx));
33944
33945 /* Cast vector to FIRST_IMODE vector. */
33946 ops[i] = gen_reg_rtx (first_imode);
33947 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
33948 }
33949
33950 /* Interleave low FIRST_IMODE vectors. */
33951 for (i = j = 0; i < n; i += 2, j++)
33952 {
33953 op0 = gen_reg_rtx (first_imode);
33954 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
33955
33956 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
33957 ops[j] = gen_reg_rtx (second_imode);
33958 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
33959 }
33960
33961 /* Interleave low SECOND_IMODE vectors. */
33962 switch (second_imode)
33963 {
33964 case V4SImode:
33965 for (i = j = 0; i < n / 2; i += 2, j++)
33966 {
33967 op0 = gen_reg_rtx (second_imode);
33968 emit_insn (gen_interleave_second_low (op0, ops[i],
33969 ops[i + 1]));
33970
33971 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
33972 vector. */
33973 ops[j] = gen_reg_rtx (third_imode);
33974 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
33975 }
33976 second_imode = V2DImode;
33977 gen_interleave_second_low = gen_vec_interleave_lowv2di;
33978 /* FALLTHRU */
33979
33980 case V2DImode:
33981 op0 = gen_reg_rtx (second_imode);
33982 emit_insn (gen_interleave_second_low (op0, ops[0],
33983 ops[1]));
33984
33985 /* Cast the SECOND_IMODE vector back to a vector in the original
33986 mode. */
33987 emit_insn (gen_rtx_SET (VOIDmode, target,
33988 gen_lowpart (mode, op0)));
33989 break;
33990
33991 default:
33992 gcc_unreachable ();
33993 }
33994 }
33995
33996 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
33997 all values variable, and none identical. */
33998
33999 static void
34000 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
34001 rtx target, rtx vals)
34002 {
34003 rtx ops[32], op0, op1;
34004 enum machine_mode half_mode = VOIDmode;
34005 int n, i;
34006
34007 switch (mode)
34008 {
34009 case V2SFmode:
34010 case V2SImode:
34011 if (!mmx_ok && !TARGET_SSE)
34012 break;
34013 /* FALLTHRU */
34014
34015 case V8SFmode:
34016 case V8SImode:
34017 case V4DFmode:
34018 case V4DImode:
34019 case V4SFmode:
34020 case V4SImode:
34021 case V2DFmode:
34022 case V2DImode:
34023 n = GET_MODE_NUNITS (mode);
34024 for (i = 0; i < n; i++)
34025 ops[i] = XVECEXP (vals, 0, i);
34026 ix86_expand_vector_init_concat (mode, target, ops, n);
34027 return;
34028
34029 case V32QImode:
34030 half_mode = V16QImode;
34031 goto half;
34032
34033 case V16HImode:
34034 half_mode = V8HImode;
34035 goto half;
34036
34037 half:
34038 n = GET_MODE_NUNITS (mode);
34039 for (i = 0; i < n; i++)
34040 ops[i] = XVECEXP (vals, 0, i);
34041 op0 = gen_reg_rtx (half_mode);
34042 op1 = gen_reg_rtx (half_mode);
34043 ix86_expand_vector_init_interleave (half_mode, op0, ops,
34044 n >> 2);
34045 ix86_expand_vector_init_interleave (half_mode, op1,
34046 &ops [n >> 1], n >> 2);
34047 emit_insn (gen_rtx_SET (VOIDmode, target,
34048 gen_rtx_VEC_CONCAT (mode, op0, op1)));
34049 return;
34050
34051 case V16QImode:
34052 if (!TARGET_SSE4_1)
34053 break;
34054 /* FALLTHRU */
34055
34056 case V8HImode:
34057 if (!TARGET_SSE2)
34058 break;
34059
34060 /* Don't use ix86_expand_vector_init_interleave if we can't
34061 move from GPR to SSE register directly. */
34062 if (!TARGET_INTER_UNIT_MOVES)
34063 break;
34064
34065 n = GET_MODE_NUNITS (mode);
34066 for (i = 0; i < n; i++)
34067 ops[i] = XVECEXP (vals, 0, i);
34068 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
34069 return;
34070
34071 case V4HImode:
34072 case V8QImode:
34073 break;
34074
34075 default:
34076 gcc_unreachable ();
34077 }
34078
34079 {
34080 int i, j, n_elts, n_words, n_elt_per_word;
34081 enum machine_mode inner_mode;
34082 rtx words[4], shift;
34083
34084 inner_mode = GET_MODE_INNER (mode);
34085 n_elts = GET_MODE_NUNITS (mode);
34086 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
34087 n_elt_per_word = n_elts / n_words;
34088 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
34089
34090 for (i = 0; i < n_words; ++i)
34091 {
34092 rtx word = NULL_RTX;
34093
34094 for (j = 0; j < n_elt_per_word; ++j)
34095 {
34096 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
34097 elt = convert_modes (word_mode, inner_mode, elt, true);
34098
34099 if (j == 0)
34100 word = elt;
34101 else
34102 {
34103 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
34104 word, 1, OPTAB_LIB_WIDEN);
34105 word = expand_simple_binop (word_mode, IOR, word, elt,
34106 word, 1, OPTAB_LIB_WIDEN);
34107 }
34108 }
34109
34110 words[i] = word;
34111 }
34112
34113 if (n_words == 1)
34114 emit_move_insn (target, gen_lowpart (mode, words[0]));
34115 else if (n_words == 2)
34116 {
34117 rtx tmp = gen_reg_rtx (mode);
34118 emit_clobber (tmp);
34119 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
34120 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
34121 emit_move_insn (target, tmp);
34122 }
34123 else if (n_words == 4)
34124 {
34125 rtx tmp = gen_reg_rtx (V4SImode);
34126 gcc_assert (word_mode == SImode);
34127 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
34128 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
34129 emit_move_insn (target, gen_lowpart (mode, tmp));
34130 }
34131 else
34132 gcc_unreachable ();
34133 }
34134 }
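
/* A worked example of the word-building fallback above (illustrative
   only): initializing a V4HImode vector {a, b, c, d} on a 32 bit
   target packs two 16 bit elements per word,

     word0 = (b << 16) | a        word1 = (d << 16) | c

   using ASHIFT and IOR, and then moves the words into the low and high
   parts of a clobbered temporary of the vector mode.  Only the one,
   two and four word shapes are handled here.  */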
34135
34136 /* Initialize vector TARGET via VALS. Suppress the use of MMX
34137 instructions unless MMX_OK is true. */
34138
34139 void
34140 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
34141 {
34142 enum machine_mode mode = GET_MODE (target);
34143 enum machine_mode inner_mode = GET_MODE_INNER (mode);
34144 int n_elts = GET_MODE_NUNITS (mode);
34145 int n_var = 0, one_var = -1;
34146 bool all_same = true, all_const_zero = true;
34147 int i;
34148 rtx x;
34149
34150 for (i = 0; i < n_elts; ++i)
34151 {
34152 x = XVECEXP (vals, 0, i);
34153 if (!(CONST_INT_P (x)
34154 || GET_CODE (x) == CONST_DOUBLE
34155 || GET_CODE (x) == CONST_FIXED))
34156 n_var++, one_var = i;
34157 else if (x != CONST0_RTX (inner_mode))
34158 all_const_zero = false;
34159 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
34160 all_same = false;
34161 }
34162
34163 /* Constants are best loaded from the constant pool. */
34164 if (n_var == 0)
34165 {
34166 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
34167 return;
34168 }
34169
34170 /* If all values are identical, broadcast the value. */
34171 if (all_same
34172 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
34173 XVECEXP (vals, 0, 0)))
34174 return;
34175
34176 /* Values where only one field is non-constant are best loaded from
34177 the pool and overwritten via move later. */
34178 if (n_var == 1)
34179 {
34180 if (all_const_zero
34181 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
34182 XVECEXP (vals, 0, one_var),
34183 one_var))
34184 return;
34185
34186 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
34187 return;
34188 }
34189
34190 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
34191 }
34192
34193 void
34194 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
34195 {
34196 enum machine_mode mode = GET_MODE (target);
34197 enum machine_mode inner_mode = GET_MODE_INNER (mode);
34198 enum machine_mode half_mode;
34199 bool use_vec_merge = false;
34200 rtx tmp;
34201 static rtx (*gen_extract[6][2]) (rtx, rtx)
34202 = {
34203 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
34204 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
34205 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
34206 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
34207 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
34208 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
34209 };
34210 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
34211 = {
34212 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
34213 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
34214 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
34215 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
34216 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
34217 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
34218 };
34219 int i, j, n;
34220
34221 switch (mode)
34222 {
34223 case V2SFmode:
34224 case V2SImode:
34225 if (mmx_ok)
34226 {
34227 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
34228 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
34229 if (elt == 0)
34230 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
34231 else
34232 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
34233 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34234 return;
34235 }
34236 break;
34237
34238 case V2DImode:
34239 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
34240 if (use_vec_merge)
34241 break;
34242
34243 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
34244 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
34245 if (elt == 0)
34246 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
34247 else
34248 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
34249 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34250 return;
34251
34252 case V2DFmode:
34253 {
34254 rtx op0, op1;
34255
34256 /* For the two element vectors, we implement a VEC_CONCAT with
34257 the extraction of the other element. */
34258
34259 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
34260 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
34261
34262 if (elt == 0)
34263 op0 = val, op1 = tmp;
34264 else
34265 op0 = tmp, op1 = val;
34266
34267 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
34268 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34269 }
34270 return;
34271
34272 case V4SFmode:
34273 use_vec_merge = TARGET_SSE4_1;
34274 if (use_vec_merge)
34275 break;
34276
34277 switch (elt)
34278 {
34279 case 0:
34280 use_vec_merge = true;
34281 break;
34282
34283 case 1:
34284 /* tmp = target = A B C D */
34285 tmp = copy_to_reg (target);
34286 /* target = A A B B */
34287 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
34288 /* target = X A B B */
34289 ix86_expand_vector_set (false, target, val, 0);
34290 /* target = A X C D */
34291 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
34292 const1_rtx, const0_rtx,
34293 GEN_INT (2+4), GEN_INT (3+4)));
34294 return;
34295
34296 case 2:
34297 /* tmp = target = A B C D */
34298 tmp = copy_to_reg (target);
34299 /* tmp = X B C D */
34300 ix86_expand_vector_set (false, tmp, val, 0);
34301 /* target = A B X D */
34302 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
34303 const0_rtx, const1_rtx,
34304 GEN_INT (0+4), GEN_INT (3+4)));
34305 return;
34306
34307 case 3:
34308 /* tmp = target = A B C D */
34309 tmp = copy_to_reg (target);
34310 /* tmp = X B C D */
34311 ix86_expand_vector_set (false, tmp, val, 0);
34312 /* target = A B X D */
34313 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
34314 const0_rtx, const1_rtx,
34315 GEN_INT (2+4), GEN_INT (0+4)));
34316 return;
34317
34318 default:
34319 gcc_unreachable ();
34320 }
34321 break;
34322
34323 case V4SImode:
34324 use_vec_merge = TARGET_SSE4_1;
34325 if (use_vec_merge)
34326 break;
34327
34328 /* Element 0 handled by vec_merge below. */
34329 if (elt == 0)
34330 {
34331 use_vec_merge = true;
34332 break;
34333 }
34334
34335 if (TARGET_SSE2)
34336 {
34337 /* With SSE2, use integer shuffles to swap element 0 and ELT,
34338 store into element 0, then shuffle them back. */
34339
34340 rtx order[4];
34341
34342 order[0] = GEN_INT (elt);
34343 order[1] = const1_rtx;
34344 order[2] = const2_rtx;
34345 order[3] = GEN_INT (3);
34346 order[elt] = const0_rtx;
34347
34348 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
34349 order[1], order[2], order[3]));
34350
34351 ix86_expand_vector_set (false, target, val, 0);
34352
34353 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
34354 order[1], order[2], order[3]));
34355 }
34356 else
34357 {
34358 /* For SSE1, we have to reuse the V4SF code. */
34359 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
34360 gen_lowpart (SFmode, val), elt);
34361 }
34362 return;
34363
34364 case V8HImode:
34365 use_vec_merge = TARGET_SSE2;
34366 break;
34367 case V4HImode:
34368 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
34369 break;
34370
34371 case V16QImode:
34372 use_vec_merge = TARGET_SSE4_1;
34373 break;
34374
34375 case V8QImode:
34376 break;
34377
34378 case V32QImode:
34379 half_mode = V16QImode;
34380 j = 0;
34381 n = 16;
34382 goto half;
34383
34384 case V16HImode:
34385 half_mode = V8HImode;
34386 j = 1;
34387 n = 8;
34388 goto half;
34389
34390 case V8SImode:
34391 half_mode = V4SImode;
34392 j = 2;
34393 n = 4;
34394 goto half;
34395
34396 case V4DImode:
34397 half_mode = V2DImode;
34398 j = 3;
34399 n = 2;
34400 goto half;
34401
34402 case V8SFmode:
34403 half_mode = V4SFmode;
34404 j = 4;
34405 n = 4;
34406 goto half;
34407
34408 case V4DFmode:
34409 half_mode = V2DFmode;
34410 j = 5;
34411 n = 2;
34412 goto half;
34413
34414 half:
34415 /* Compute offset. */
34416 i = elt / n;
34417 elt %= n;
34418
34419 gcc_assert (i <= 1);
34420
34421 /* Extract the half. */
34422 tmp = gen_reg_rtx (half_mode);
34423 emit_insn (gen_extract[j][i] (tmp, target));
34424
34425 /* Put val in tmp at elt. */
34426 ix86_expand_vector_set (false, tmp, val, elt);
34427
34428 /* Put it back. */
34429 emit_insn (gen_insert[j][i] (target, target, tmp));
34430 return;
34431
34432 default:
34433 break;
34434 }
34435
34436 if (use_vec_merge)
34437 {
34438 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
34439 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
34440 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34441 }
34442 else
34443 {
34444 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
34445
34446 emit_move_insn (mem, target);
34447
34448 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
34449 emit_move_insn (tmp, val);
34450
34451 emit_move_insn (target, mem);
34452 }
34453 }
34454
34455 void
34456 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
34457 {
34458 enum machine_mode mode = GET_MODE (vec);
34459 enum machine_mode inner_mode = GET_MODE_INNER (mode);
34460 bool use_vec_extr = false;
34461 rtx tmp;
34462
34463 switch (mode)
34464 {
34465 case V2SImode:
34466 case V2SFmode:
34467 if (!mmx_ok)
34468 break;
34469 /* FALLTHRU */
34470
34471 case V2DFmode:
34472 case V2DImode:
34473 use_vec_extr = true;
34474 break;
34475
34476 case V4SFmode:
34477 use_vec_extr = TARGET_SSE4_1;
34478 if (use_vec_extr)
34479 break;
34480
34481 switch (elt)
34482 {
34483 case 0:
34484 tmp = vec;
34485 break;
34486
34487 case 1:
34488 case 3:
34489 tmp = gen_reg_rtx (mode);
34490 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
34491 GEN_INT (elt), GEN_INT (elt),
34492 GEN_INT (elt+4), GEN_INT (elt+4)));
34493 break;
34494
34495 case 2:
34496 tmp = gen_reg_rtx (mode);
34497 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
34498 break;
34499
34500 default:
34501 gcc_unreachable ();
34502 }
34503 vec = tmp;
34504 use_vec_extr = true;
34505 elt = 0;
34506 break;
34507
34508 case V4SImode:
34509 use_vec_extr = TARGET_SSE4_1;
34510 if (use_vec_extr)
34511 break;
34512
34513 if (TARGET_SSE2)
34514 {
34515 switch (elt)
34516 {
34517 case 0:
34518 tmp = vec;
34519 break;
34520
34521 case 1:
34522 case 3:
34523 tmp = gen_reg_rtx (mode);
34524 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
34525 GEN_INT (elt), GEN_INT (elt),
34526 GEN_INT (elt), GEN_INT (elt)));
34527 break;
34528
34529 case 2:
34530 tmp = gen_reg_rtx (mode);
34531 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
34532 break;
34533
34534 default:
34535 gcc_unreachable ();
34536 }
34537 vec = tmp;
34538 use_vec_extr = true;
34539 elt = 0;
34540 }
34541 else
34542 {
34543 /* For SSE1, we have to reuse the V4SF code. */
34544 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
34545 gen_lowpart (V4SFmode, vec), elt);
34546 return;
34547 }
34548 break;
34549
34550 case V8HImode:
34551 use_vec_extr = TARGET_SSE2;
34552 break;
34553 case V4HImode:
34554 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
34555 break;
34556
34557 case V16QImode:
34558 use_vec_extr = TARGET_SSE4_1;
34559 break;
34560
34561 case V8SFmode:
34562 if (TARGET_AVX)
34563 {
34564 tmp = gen_reg_rtx (V4SFmode);
34565 if (elt < 4)
34566 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
34567 else
34568 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
34569 ix86_expand_vector_extract (false, target, tmp, elt & 3);
34570 return;
34571 }
34572 break;
34573
34574 case V4DFmode:
34575 if (TARGET_AVX)
34576 {
34577 tmp = gen_reg_rtx (V2DFmode);
34578 if (elt < 2)
34579 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
34580 else
34581 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
34582 ix86_expand_vector_extract (false, target, tmp, elt & 1);
34583 return;
34584 }
34585 break;
34586
34587 case V32QImode:
34588 if (TARGET_AVX)
34589 {
34590 tmp = gen_reg_rtx (V16QImode);
34591 if (elt < 16)
34592 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
34593 else
34594 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
34595 ix86_expand_vector_extract (false, target, tmp, elt & 15);
34596 return;
34597 }
34598 break;
34599
34600 case V16HImode:
34601 if (TARGET_AVX)
34602 {
34603 tmp = gen_reg_rtx (V8HImode);
34604 if (elt < 8)
34605 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
34606 else
34607 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
34608 ix86_expand_vector_extract (false, target, tmp, elt & 7);
34609 return;
34610 }
34611 break;
34612
34613 case V8SImode:
34614 if (TARGET_AVX)
34615 {
34616 tmp = gen_reg_rtx (V4SImode);
34617 if (elt < 4)
34618 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
34619 else
34620 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
34621 ix86_expand_vector_extract (false, target, tmp, elt & 3);
34622 return;
34623 }
34624 break;
34625
34626 case V4DImode:
34627 if (TARGET_AVX)
34628 {
34629 tmp = gen_reg_rtx (V2DImode);
34630 if (elt < 2)
34631 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
34632 else
34633 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
34634 ix86_expand_vector_extract (false, target, tmp, elt & 1);
34635 return;
34636 }
34637 break;
34638
34639 case V8QImode:
34640 /* ??? Could extract the appropriate HImode element and shift. */
34641 default:
34642 break;
34643 }
34644
34645 if (use_vec_extr)
34646 {
34647 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
34648 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
34649
34650 /* Let the rtl optimizers know about the zero extension performed. */
34651 if (inner_mode == QImode || inner_mode == HImode)
34652 {
34653 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
34654 target = gen_lowpart (SImode, target);
34655 }
34656
34657 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34658 }
34659 else
34660 {
34661 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
34662
34663 emit_move_insn (mem, vec);
34664
34665 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
34666 emit_move_insn (target, tmp);
34667 }
34668 }
34669
34670 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
34671 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
34672 The upper bits of DEST are undefined, though they shouldn't cause
34673 exceptions (some bits from src or all zeros are ok). */
34674
34675 static void
34676 emit_reduc_half (rtx dest, rtx src, int i)
34677 {
34678 rtx tem;
34679 switch (GET_MODE (src))
34680 {
34681 case V4SFmode:
34682 if (i == 128)
34683 tem = gen_sse_movhlps (dest, src, src);
34684 else
34685 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
34686 GEN_INT (1 + 4), GEN_INT (1 + 4));
34687 break;
34688 case V2DFmode:
34689 tem = gen_vec_interleave_highv2df (dest, src, src);
34690 break;
34691 case V16QImode:
34692 case V8HImode:
34693 case V4SImode:
34694 case V2DImode:
34695 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
34696 gen_lowpart (V1TImode, src),
34697 GEN_INT (i / 2));
34698 break;
34699 case V8SFmode:
34700 if (i == 256)
34701 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
34702 else
34703 tem = gen_avx_shufps256 (dest, src, src,
34704 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
34705 break;
34706 case V4DFmode:
34707 if (i == 256)
34708 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
34709 else
34710 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
34711 break;
34712 case V32QImode:
34713 case V16HImode:
34714 case V8SImode:
34715 case V4DImode:
34716 if (i == 256)
34717 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
34718 gen_lowpart (V4DImode, src),
34719 gen_lowpart (V4DImode, src),
34720 const1_rtx);
34721 else
34722 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
34723 gen_lowpart (V2TImode, src),
34724 GEN_INT (i / 2));
34725 break;
34726 default:
34727 gcc_unreachable ();
34728 }
34729 emit_insn (tem);
34730 }
34731
34732 /* Expand a vector reduction. FN is the binary pattern to reduce;
34733 DEST is the destination; IN is the input vector. */
34734
34735 void
34736 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
34737 {
34738 rtx half, dst, vec = in;
34739 enum machine_mode mode = GET_MODE (in);
34740 int i;
34741
34742 /* SSE4.1 has a special instruction (phminposuw) for V8HImode UMIN reduction. */
34743 if (TARGET_SSE4_1
34744 && mode == V8HImode
34745 && fn == gen_uminv8hi3)
34746 {
34747 emit_insn (gen_sse4_1_phminposuw (dest, in));
34748 return;
34749 }
34750
34751 for (i = GET_MODE_BITSIZE (mode);
34752 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
34753 i >>= 1)
34754 {
34755 half = gen_reg_rtx (mode);
34756 emit_reduc_half (half, vec, i);
34757 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
34758 dst = dest;
34759 else
34760 dst = gen_reg_rtx (mode);
34761 emit_insn (fn (dst, half, vec));
34762 vec = dst;
34763 }
34764 }
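
/* A worked example of the reduction loop above (illustrative only):
   for a V4SImode maximum, the first iteration (i == 128) shifts the
   upper 64 bits of the input down and combines them elementwise with
   the original vector, and the second iteration (i == 64) repeats this
   with 32 bits directly into DEST, so after log2(4) = 2 steps the low
   element of DEST holds the maximum of all four input elements.  */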
34765 \f
34766 /* Target hook for scalar_mode_supported_p. */
34767 static bool
34768 ix86_scalar_mode_supported_p (enum machine_mode mode)
34769 {
34770 if (DECIMAL_FLOAT_MODE_P (mode))
34771 return default_decimal_float_supported_p ();
34772 else if (mode == TFmode)
34773 return true;
34774 else
34775 return default_scalar_mode_supported_p (mode);
34776 }
34777
34778 /* Implements target hook vector_mode_supported_p. */
34779 static bool
34780 ix86_vector_mode_supported_p (enum machine_mode mode)
34781 {
34782 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34783 return true;
34784 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34785 return true;
34786 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34787 return true;
34788 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
34789 return true;
34790 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
34791 return true;
34792 return false;
34793 }
34794
34795 /* Target hook for c_mode_for_suffix. */
34796 static enum machine_mode
34797 ix86_c_mode_for_suffix (char suffix)
34798 {
34799 if (suffix == 'q')
34800 return TFmode;
34801 if (suffix == 'w')
34802 return XFmode;
34803
34804 return VOIDmode;
34805 }
34806
34807 /* Worker function for TARGET_MD_ASM_CLOBBERS.
34808
34809 We do this in the new i386 backend to maintain source compatibility
34810 with the old cc0-based compiler. */
34811
34812 static tree
34813 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
34814 tree inputs ATTRIBUTE_UNUSED,
34815 tree clobbers)
34816 {
34817 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
34818 clobbers);
34819 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
34820 clobbers);
34821 return clobbers;
34822 }
34823
34824 /* Implements the targetm.asm.encode_section_info target hook. */
34825
34826 static void ATTRIBUTE_UNUSED
34827 ix86_encode_section_info (tree decl, rtx rtl, int first)
34828 {
34829 default_encode_section_info (decl, rtl, first);
34830
34831 if (TREE_CODE (decl) == VAR_DECL
34832 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
34833 && ix86_in_large_data_p (decl))
34834 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
34835 }
34836
34837 /* Worker function for REVERSE_CONDITION. */
34838
34839 enum rtx_code
34840 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
34841 {
34842 return (mode != CCFPmode && mode != CCFPUmode
34843 ? reverse_condition (code)
34844 : reverse_condition_maybe_unordered (code));
34845 }
34846
34847 /* Output code to perform an x87 FP register move, from OPERANDS[1]
34848 to OPERANDS[0]. */
34849
34850 const char *
34851 output_387_reg_move (rtx insn, rtx *operands)
34852 {
34853 if (REG_P (operands[0]))
34854 {
34855 if (REG_P (operands[1])
34856 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34857 {
34858 if (REGNO (operands[0]) == FIRST_STACK_REG)
34859 return output_387_ffreep (operands, 0);
34860 return "fstp\t%y0";
34861 }
34862 if (STACK_TOP_P (operands[0]))
34863 return "fld%Z1\t%y1";
34864 return "fst\t%y0";
34865 }
34866 else if (MEM_P (operands[0]))
34867 {
34868 gcc_assert (REG_P (operands[1]));
34869 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34870 return "fstp%Z0\t%y0";
34871 else
34872 {
34873 /* There is no non-popping store to memory for XFmode.
34874 So if we need one, follow the store with a load. */
34875 if (GET_MODE (operands[0]) == XFmode)
34876 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
34877 else
34878 return "fst%Z0\t%y0";
34879 }
34880 }
34881 else
34882 gcc_unreachable();
34883 }
34884
34885 /* Output code to perform a conditional jump to LABEL, if the C2 flag in
34886 the FP status register is set. */
34887
34888 void
34889 ix86_emit_fp_unordered_jump (rtx label)
34890 {
34891 rtx reg = gen_reg_rtx (HImode);
34892 rtx temp;
34893
34894 emit_insn (gen_x86_fnstsw_1 (reg));
34895
34896 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
34897 {
34898 emit_insn (gen_x86_sahf_1 (reg));
34899
34900 temp = gen_rtx_REG (CCmode, FLAGS_REG);
34901 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
34902 }
34903 else
34904 {
34905 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
34906
34907 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
34908 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
34909 }
34910
34911 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
34912 gen_rtx_LABEL_REF (VOIDmode, label),
34913 pc_rtx);
34914 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
34915
34916 emit_jump_insn (temp);
34917 predict_jump (REG_BR_PROB_BASE * 10 / 100);
34918 }
34919
34920 /* Output code to perform a log1p XFmode calculation. */
34921
34922 void ix86_emit_i387_log1p (rtx op0, rtx op1)
34923 {
34924 rtx label1 = gen_label_rtx ();
34925 rtx label2 = gen_label_rtx ();
34926
34927 rtx tmp = gen_reg_rtx (XFmode);
34928 rtx tmp2 = gen_reg_rtx (XFmode);
34929 rtx test;
34930
34931 emit_insn (gen_absxf2 (tmp, op1));
34932 test = gen_rtx_GE (VOIDmode, tmp,
34933 CONST_DOUBLE_FROM_REAL_VALUE (
34934 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
34935 XFmode));
34936 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
34937
34938 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34939 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
34940 emit_jump (label2);
34941
34942 emit_label (label1);
34943 emit_move_insn (tmp, CONST1_RTX (XFmode));
34944 emit_insn (gen_addxf3 (tmp, op1, tmp));
34945 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34946 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
34947
34948 emit_label (label2);
34949 }
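
/* The threshold used above, 0.2928... = 1 - sqrt(2)/2, is the range
   for which the x87 fyl2xp1 instruction is specified to be accurate:
   for |op1| below it we compute log1p directly as
   ln(2) * log2(1 + op1) via fyl2xp1, otherwise we fall back to
   ln(2) * log2(1 + op1) computed with an explicit addition and fyl2x,
   where forming 1 + op1 no longer loses significant precision.  */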
34950
34951 /* Emit code for round calculation. */
34952 void ix86_emit_i387_round (rtx op0, rtx op1)
34953 {
34954 enum machine_mode inmode = GET_MODE (op1);
34955 enum machine_mode outmode = GET_MODE (op0);
34956 rtx e1, e2, res, tmp, tmp1, half;
34957 rtx scratch = gen_reg_rtx (HImode);
34958 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
34959 rtx jump_label = gen_label_rtx ();
34960 rtx insn;
34961 rtx (*gen_abs) (rtx, rtx);
34962 rtx (*gen_neg) (rtx, rtx);
34963
34964 switch (inmode)
34965 {
34966 case SFmode:
34967 gen_abs = gen_abssf2;
34968 break;
34969 case DFmode:
34970 gen_abs = gen_absdf2;
34971 break;
34972 case XFmode:
34973 gen_abs = gen_absxf2;
34974 break;
34975 default:
34976 gcc_unreachable ();
34977 }
34978
34979 switch (outmode)
34980 {
34981 case SFmode:
34982 gen_neg = gen_negsf2;
34983 break;
34984 case DFmode:
34985 gen_neg = gen_negdf2;
34986 break;
34987 case XFmode:
34988 gen_neg = gen_negxf2;
34989 break;
34990 case HImode:
34991 gen_neg = gen_neghi2;
34992 break;
34993 case SImode:
34994 gen_neg = gen_negsi2;
34995 break;
34996 case DImode:
34997 gen_neg = gen_negdi2;
34998 break;
34999 default:
35000 gcc_unreachable ();
35001 }
35002
35003 e1 = gen_reg_rtx (inmode);
35004 e2 = gen_reg_rtx (inmode);
35005 res = gen_reg_rtx (outmode);
35006
35007 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
35008
35009 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
35010
35011 /* scratch = fxam(op1) */
35012 emit_insn (gen_rtx_SET (VOIDmode, scratch,
35013 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
35014 UNSPEC_FXAM)));
35015 /* e1 = fabs(op1) */
35016 emit_insn (gen_abs (e1, op1));
35017
35018 /* e2 = e1 + 0.5 */
35019 half = force_reg (inmode, half);
35020 emit_insn (gen_rtx_SET (VOIDmode, e2,
35021 gen_rtx_PLUS (inmode, e1, half)));
35022
35023 /* res = floor(e2) */
35024 if (inmode != XFmode)
35025 {
35026 tmp1 = gen_reg_rtx (XFmode);
35027
35028 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
35029 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
35030 }
35031 else
35032 tmp1 = e2;
35033
35034 switch (outmode)
35035 {
35036 case SFmode:
35037 case DFmode:
35038 {
35039 rtx tmp0 = gen_reg_rtx (XFmode);
35040
35041 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
35042
35043 emit_insn (gen_rtx_SET (VOIDmode, res,
35044 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
35045 UNSPEC_TRUNC_NOOP)));
35046 }
35047 break;
35048 case XFmode:
35049 emit_insn (gen_frndintxf2_floor (res, tmp1));
35050 break;
35051 case HImode:
35052 emit_insn (gen_lfloorxfhi2 (res, tmp1));
35053 break;
35054 case SImode:
35055 emit_insn (gen_lfloorxfsi2 (res, tmp1));
35056 break;
35057 case DImode:
35058 emit_insn (gen_lfloorxfdi2 (res, tmp1));
35059 break;
35060 default:
35061 gcc_unreachable ();
35062 }
35063
35064 /* flags = signbit(a); fxam reports the sign in C1, the 0x02 bit of the FSW high byte */
35065 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
35066
35067 /* if (flags) then res = -res */
35068 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
35069 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
35070 gen_rtx_LABEL_REF (VOIDmode, jump_label),
35071 pc_rtx);
35072 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
35073 predict_jump (REG_BR_PROB_BASE * 50 / 100);
35074 JUMP_LABEL (insn) = jump_label;
35075
35076 emit_insn (gen_neg (res, res));
35077
35078 emit_label (jump_label);
35079 LABEL_NUSES (jump_label) = 1;
35080
35081 emit_move_insn (op0, res);
35082 }
35083
35084 /* Output code to perform a Newton-Raphson approximation of a single precision
35085 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
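/* A minimal scalar sketch of the sequence emitted below, where rcp_est
   stands for the ~12-bit rcpss/rcpps estimate (the name is for exposition
   only):

     float swdiv (float a, float b)
     {
       float x0 = rcp_est (b);			// x0 ~= 1/b
       float x1 = (x0 + x0) - b * x0 * x0;	// one Newton-Raphson step
       return a * x1;
     }
*/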
35086
35087 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
35088 {
35089 rtx x0, x1, e0, e1;
35090
35091 x0 = gen_reg_rtx (mode);
35092 e0 = gen_reg_rtx (mode);
35093 e1 = gen_reg_rtx (mode);
35094 x1 = gen_reg_rtx (mode);
35095
35096 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
35097
35098 b = force_reg (mode, b);
35099
35100 /* x0 = rcp(b) estimate */
35101 emit_insn (gen_rtx_SET (VOIDmode, x0,
35102 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
35103 UNSPEC_RCP)));
35104 /* e0 = x0 * b */
35105 emit_insn (gen_rtx_SET (VOIDmode, e0,
35106 gen_rtx_MULT (mode, x0, b)));
35107
35108 /* e0 = x0 * e0 */
35109 emit_insn (gen_rtx_SET (VOIDmode, e0,
35110 gen_rtx_MULT (mode, x0, e0)));
35111
35112 /* e1 = x0 + x0 */
35113 emit_insn (gen_rtx_SET (VOIDmode, e1,
35114 gen_rtx_PLUS (mode, x0, x0)));
35115
35116 /* x1 = e1 - e0 */
35117 emit_insn (gen_rtx_SET (VOIDmode, x1,
35118 gen_rtx_MINUS (mode, e1, e0)));
35119
35120 /* res = a * x1 */
35121 emit_insn (gen_rtx_SET (VOIDmode, res,
35122 gen_rtx_MULT (mode, a, x1)));
35123 }
35124
35125 /* Output code to perform a Newton-Raphson approximation of a
35126 single precision floating point [reciprocal] square root. */
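/* A minimal scalar sketch of the sequence emitted below, where rsqrt_est
   stands for the rsqrtss/rsqrtps estimate (the name is for exposition only):

     float swrsqrt (float a)
     {
       float x0 = rsqrt_est (a);			// x0 ~= 1/sqrt(a)
       return -0.5f * x0 * (a * x0 * x0 - 3.0f);	// one Newton-Raphson step
     }

   and swsqrt (a) multiplies the refined estimate by a, i.e.
   -0.5f * (a * x0) * (a * x0 * x0 - 3.0f).  */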
35127
35128 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
35129 bool recip)
35130 {
35131 rtx x0, e0, e1, e2, e3, mthree, mhalf;
35132 REAL_VALUE_TYPE r;
35133
35134 x0 = gen_reg_rtx (mode);
35135 e0 = gen_reg_rtx (mode);
35136 e1 = gen_reg_rtx (mode);
35137 e2 = gen_reg_rtx (mode);
35138 e3 = gen_reg_rtx (mode);
35139
35140 real_from_integer (&r, VOIDmode, -3, -1, 0);
35141 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
35142
35143 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
35144 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
35145
35146 if (VECTOR_MODE_P (mode))
35147 {
35148 mthree = ix86_build_const_vector (mode, true, mthree);
35149 mhalf = ix86_build_const_vector (mode, true, mhalf);
35150 }
35151
35152 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
35153 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
35154
35155 a = force_reg (mode, a);
35156
35157 /* x0 = rsqrt(a) estimate */
35158 emit_insn (gen_rtx_SET (VOIDmode, x0,
35159 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
35160 UNSPEC_RSQRT)));
35161
35162 /* If a == 0.0, zero the rsqrt estimate (+Inf) to avoid 0 * Inf = NaN in sqrt (0.0). */
35163 if (!recip)
35164 {
35165 rtx zero, mask;
35166
35167 zero = gen_reg_rtx (mode);
35168 mask = gen_reg_rtx (mode);
35169
35170 zero = force_reg (mode, CONST0_RTX(mode));
35171 emit_insn (gen_rtx_SET (VOIDmode, mask,
35172 gen_rtx_NE (mode, zero, a)));
35173
35174 emit_insn (gen_rtx_SET (VOIDmode, x0,
35175 gen_rtx_AND (mode, x0, mask)));
35176 }
35177
35178 /* e0 = x0 * a */
35179 emit_insn (gen_rtx_SET (VOIDmode, e0,
35180 gen_rtx_MULT (mode, x0, a)));
35181 /* e1 = e0 * x0 */
35182 emit_insn (gen_rtx_SET (VOIDmode, e1,
35183 gen_rtx_MULT (mode, e0, x0)));
35184
35185 /* e2 = e1 - 3. */
35186 mthree = force_reg (mode, mthree);
35187 emit_insn (gen_rtx_SET (VOIDmode, e2,
35188 gen_rtx_PLUS (mode, e1, mthree)));
35189
35190 mhalf = force_reg (mode, mhalf);
35191 if (recip)
35192 /* e3 = -.5 * x0 */
35193 emit_insn (gen_rtx_SET (VOIDmode, e3,
35194 gen_rtx_MULT (mode, x0, mhalf)));
35195 else
35196 /* e3 = -.5 * e0 */
35197 emit_insn (gen_rtx_SET (VOIDmode, e3,
35198 gen_rtx_MULT (mode, e0, mhalf)));
35199 /* ret = e2 * e3 */
35200 emit_insn (gen_rtx_SET (VOIDmode, res,
35201 gen_rtx_MULT (mode, e2, e3)));
35202 }
35203
35204 #ifdef TARGET_SOLARIS
35205 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
35206
35207 static void
35208 i386_solaris_elf_named_section (const char *name, unsigned int flags,
35209 tree decl)
35210 {
35211 /* With Binutils 2.15, the "@unwind" marker must be specified on
35212 every occurrence of the ".eh_frame" section, not just the first
35213 one. */
35214 if (TARGET_64BIT
35215 && strcmp (name, ".eh_frame") == 0)
35216 {
35217 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
35218 flags & SECTION_WRITE ? "aw" : "a");
35219 return;
35220 }
35221
35222 #ifndef USE_GAS
35223 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
35224 {
35225 solaris_elf_asm_comdat_section (name, flags, decl);
35226 return;
35227 }
35228 #endif
35229
35230 default_elf_asm_named_section (name, flags, decl);
35231 }
35232 #endif /* TARGET_SOLARIS */
35233
35234 /* Return the mangling of TYPE if it is an extended fundamental type. */
35235
35236 static const char *
35237 ix86_mangle_type (const_tree type)
35238 {
35239 type = TYPE_MAIN_VARIANT (type);
35240
35241 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
35242 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
35243 return NULL;
35244
35245 switch (TYPE_MODE (type))
35246 {
35247 case TFmode:
35248 /* __float128 is "g". */
35249 return "g";
35250 case XFmode:
35251 /* "long double" or __float80 is "e". */
35252 return "e";
35253 default:
35254 return NULL;
35255 }
35256 }
35257
35258 /* For 32-bit code we can save PIC register setup by using
35259 __stack_chk_fail_local hidden function instead of calling
35260 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
35261 register, so it is better to call __stack_chk_fail directly. */
35262
35263 static tree ATTRIBUTE_UNUSED
35264 ix86_stack_protect_fail (void)
35265 {
35266 return TARGET_64BIT
35267 ? default_external_stack_protect_fail ()
35268 : default_hidden_stack_protect_fail ();
35269 }
35270
35271 /* Select a format to encode pointers in exception handling data. CODE
35272 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
35273 true if the symbol may be affected by dynamic relocations.
35274
35275 ??? All x86 object file formats are capable of representing this.
35276 After all, the relocation needed is the same as for the call insn.
35277 Whether or not a particular assembler allows us to enter such, I
35278 guess we'll have to see. */
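/* For example, with -fPIC on x86-64 and the (default) small code model
   this returns DW_EH_PE_pcrel | DW_EH_PE_sdata4, with DW_EH_PE_indirect
   added for symbols that may be affected by dynamic relocations.  */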
35279 int
35280 asm_preferred_eh_data_format (int code, int global)
35281 {
35282 if (flag_pic)
35283 {
35284 int type = DW_EH_PE_sdata8;
35285 if (!TARGET_64BIT
35286 || ix86_cmodel == CM_SMALL_PIC
35287 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
35288 type = DW_EH_PE_sdata4;
35289 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
35290 }
35291 if (ix86_cmodel == CM_SMALL
35292 || (ix86_cmodel == CM_MEDIUM && code))
35293 return DW_EH_PE_udata4;
35294 return DW_EH_PE_absptr;
35295 }
35296 \f
35297 /* Expand copysign: combine the sign of SIGN with the positive value
35298 ABS_VALUE, storing the result in RESULT. If MASK is non-null, it must be
35299 the mask used to clear the sign bit (as built by ix86_expand_sse_fabs). */
35300 static void
35301 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
35302 {
35303 enum machine_mode mode = GET_MODE (sign);
35304 rtx sgn = gen_reg_rtx (mode);
35305 if (mask == NULL_RTX)
35306 {
35307 enum machine_mode vmode;
35308
35309 if (mode == SFmode)
35310 vmode = V4SFmode;
35311 else if (mode == DFmode)
35312 vmode = V2DFmode;
35313 else
35314 vmode = mode;
35315
35316 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
35317 if (!VECTOR_MODE_P (mode))
35318 {
35319 /* We need to generate a scalar mode mask in this case. */
35320 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
35321 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
35322 mask = gen_reg_rtx (mode);
35323 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
35324 }
35325 }
35326 else
35327 mask = gen_rtx_NOT (mode, mask);
35328 emit_insn (gen_rtx_SET (VOIDmode, sgn,
35329 gen_rtx_AND (mode, mask, sign)));
35330 emit_insn (gen_rtx_SET (VOIDmode, result,
35331 gen_rtx_IOR (mode, abs_value, sgn)));
35332 }
35333
35334 /* Expand fabs (OP0) and return a new rtx that holds the result. The
35335 mask for masking out the sign-bit is stored in *SMASK, if that is
35336 non-null. */
35337 static rtx
35338 ix86_expand_sse_fabs (rtx op0, rtx *smask)
35339 {
35340 enum machine_mode vmode, mode = GET_MODE (op0);
35341 rtx xa, mask;
35342
35343 xa = gen_reg_rtx (mode);
35344 if (mode == SFmode)
35345 vmode = V4SFmode;
35346 else if (mode == DFmode)
35347 vmode = V2DFmode;
35348 else
35349 vmode = mode;
35350 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
35351 if (!VECTOR_MODE_P (mode))
35352 {
35353 /* We need to generate a scalar mode mask in this case. */
35354 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
35355 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
35356 mask = gen_reg_rtx (mode);
35357 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
35358 }
35359 emit_insn (gen_rtx_SET (VOIDmode, xa,
35360 gen_rtx_AND (mode, op0, mask)));
35361
35362 if (smask)
35363 *smask = mask;
35364
35365 return xa;
35366 }
35367
35368 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
35369 swapping the operands if SWAP_OPERANDS is true. The expanded
35370 code is a forward jump to a newly created label in case the
35371 comparison is true. The generated label rtx is returned. */
35372 static rtx
35373 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
35374 bool swap_operands)
35375 {
35376 rtx label, tmp;
35377
35378 if (swap_operands)
35379 {
35380 tmp = op0;
35381 op0 = op1;
35382 op1 = tmp;
35383 }
35384
35385 label = gen_label_rtx ();
35386 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
35387 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35388 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
35389 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
35390 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
35391 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
35392 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
35393 JUMP_LABEL (tmp) = label;
35394
35395 return label;
35396 }
35397
35398 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
35399 using comparison code CODE. Operands are swapped for the comparison if
35400 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
35401 static rtx
35402 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
35403 bool swap_operands)
35404 {
35405 rtx (*insn)(rtx, rtx, rtx, rtx);
35406 enum machine_mode mode = GET_MODE (op0);
35407 rtx mask = gen_reg_rtx (mode);
35408
35409 if (swap_operands)
35410 {
35411 rtx tmp = op0;
35412 op0 = op1;
35413 op1 = tmp;
35414 }
35415
35416 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
35417
35418 emit_insn (insn (mask, op0, op1,
35419 gen_rtx_fmt_ee (code, mode, op0, op1)));
35420 return mask;
35421 }
35422
35423 /* Generate and return a rtx of mode MODE for 2**n where n is the number
35424 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
35425 static rtx
35426 ix86_gen_TWO52 (enum machine_mode mode)
35427 {
35428 REAL_VALUE_TYPE TWO52r;
35429 rtx TWO52;
35430
35431 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
35432 TWO52 = const_double_from_real_value (TWO52r, mode);
35433 TWO52 = force_reg (mode, TWO52);
35434
35435 return TWO52;
35436 }
35437
35438 /* Expand SSE sequence for computing lround from OP1 storing
35439 into OP0. */
35440 void
35441 ix86_expand_lround (rtx op0, rtx op1)
35442 {
35443 /* C code for the stuff we're doing below:
35444 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
35445 return (long)tmp;
35446 */
35447 enum machine_mode mode = GET_MODE (op1);
35448 const struct real_format *fmt;
35449 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35450 rtx adj;
35451
35452 /* load nextafter (0.5, 0.0) */
35453 fmt = REAL_MODE_FORMAT (mode);
35454 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35455 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
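  /* pred_half = 0.5 - 2**(-p-1) is the largest value below 0.5; adding it
     instead of 0.5 helps keep op1 + adj from rounding up to the next
     integer when op1 sits just below a halfway point.  */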
35456
35457 /* adj = copysign (0.5, op1) */
35458 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
35459 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
35460
35461 /* adj = op1 + adj */
35462 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
35463
35464 /* op0 = (imode)adj */
35465 expand_fix (op0, adj, 0);
35466 }
35467
35468 /* Expand SSE2 sequence for computing lfloor or lceil (depending on
35469 DO_FLOOR) from OP1, storing the result into OP0. */
35470 void
35471 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
35472 {
35473 /* C code for the stuff we're doing below (for do_floor):
35474 xi = (long)op1;
35475 xi -= (double)xi > op1 ? 1 : 0;
35476 return xi;
35477 */
35478 enum machine_mode fmode = GET_MODE (op1);
35479 enum machine_mode imode = GET_MODE (op0);
35480 rtx ireg, freg, label, tmp;
35481
35482 /* reg = (long)op1 */
35483 ireg = gen_reg_rtx (imode);
35484 expand_fix (ireg, op1, 0);
35485
35486 /* freg = (double)reg */
35487 freg = gen_reg_rtx (fmode);
35488 expand_float (freg, ireg, 0);
35489
35490 /* ireg = (freg > op1) ? ireg - 1 : ireg */
35491 label = ix86_expand_sse_compare_and_jump (UNLE,
35492 freg, op1, !do_floor);
35493 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
35494 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
35495 emit_move_insn (ireg, tmp);
35496
35497 emit_label (label);
35498 LABEL_NUSES (label) = 1;
35499
35500 emit_move_insn (op0, ireg);
35501 }
35502
35503 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
35504 result in OPERAND0. */
35505 void
35506 ix86_expand_rint (rtx operand0, rtx operand1)
35507 {
35508 /* C code for the stuff we're doing below:
35509 xa = fabs (operand1);
35510 if (!isless (xa, 2**52))
35511 return operand1;
35512 xa = xa + 2**52 - 2**52;
35513 return copysign (xa, operand1);
35514 */
35515 enum machine_mode mode = GET_MODE (operand0);
35516 rtx res, xa, label, TWO52, mask;
35517
35518 res = gen_reg_rtx (mode);
35519 emit_move_insn (res, operand1);
35520
35521 /* xa = abs (operand1) */
35522 xa = ix86_expand_sse_fabs (res, &mask);
35523
35524 /* if (!isless (xa, TWO52)) goto label; */
35525 TWO52 = ix86_gen_TWO52 (mode);
35526 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35527
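  /* Since xa < TWO52 here, xa + TWO52 leaves no fraction bits in the
     significand, so subtracting TWO52 again yields xa rounded to an
     integer in the current rounding mode (round-to-nearest-even by
     default), which is exactly rint.  */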
35528 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35529 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35530
35531 ix86_sse_copysign_to_positive (res, xa, res, mask);
35532
35533 emit_label (label);
35534 LABEL_NUSES (label) = 1;
35535
35536 emit_move_insn (operand0, res);
35537 }
35538
35539 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35540 into OPERAND0. */
35541 void
35542 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
35543 {
35544 /* C code for the stuff we expand below.
35545 double xa = fabs (x), x2;
35546 if (!isless (xa, TWO52))
35547 return x;
35548 xa = xa + TWO52 - TWO52;
35549 x2 = copysign (xa, x);
35550 Compensate. Floor:
35551 if (x2 > x)
35552 x2 -= 1;
35553 Compensate. Ceil:
35554 if (x2 < x)
35555 x2 -= -1;
35556 return x2;
35557 */
35558 enum machine_mode mode = GET_MODE (operand0);
35559 rtx xa, TWO52, tmp, label, one, res, mask;
35560
35561 TWO52 = ix86_gen_TWO52 (mode);
35562
35563 /* Temporary for holding the result, initialized to the input
35564 operand to ease control flow. */
35565 res = gen_reg_rtx (mode);
35566 emit_move_insn (res, operand1);
35567
35568 /* xa = abs (operand1) */
35569 xa = ix86_expand_sse_fabs (res, &mask);
35570
35571 /* if (!isless (xa, TWO52)) goto label; */
35572 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35573
35574 /* xa = xa + TWO52 - TWO52; */
35575 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35576 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35577
35578 /* xa = copysign (xa, operand1) */
35579 ix86_sse_copysign_to_positive (xa, xa, res, mask);
35580
35581 /* generate 1.0 or -1.0 */
35582 one = force_reg (mode,
35583 const_double_from_real_value (do_floor
35584 ? dconst1 : dconstm1, mode));
35585
35586 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35587 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35588 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35589 gen_rtx_AND (mode, one, tmp)));
35590 /* We always need to subtract here to preserve signed zero. */
35591 tmp = expand_simple_binop (mode, MINUS,
35592 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35593 emit_move_insn (res, tmp);
35594
35595 emit_label (label);
35596 LABEL_NUSES (label) = 1;
35597
35598 emit_move_insn (operand0, res);
35599 }
35600
35601 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35602 into OPERAND0. */
35603 void
35604 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
35605 {
35606 /* C code for the stuff we expand below.
35607 double xa = fabs (x), x2;
35608 if (!isless (xa, TWO52))
35609 return x;
35610 x2 = (double)(long)x;
35611 Compensate. Floor:
35612 if (x2 > x)
35613 x2 -= 1;
35614 Compensate. Ceil:
35615 if (x2 < x)
35616 x2 += 1;
35617 if (HONOR_SIGNED_ZEROS (mode))
35618 return copysign (x2, x);
35619 return x2;
35620 */
35621 enum machine_mode mode = GET_MODE (operand0);
35622 rtx xa, xi, TWO52, tmp, label, one, res, mask;
35623
35624 TWO52 = ix86_gen_TWO52 (mode);
35625
35626 /* Temporary for holding the result, initialized to the input
35627 operand to ease control flow. */
35628 res = gen_reg_rtx (mode);
35629 emit_move_insn (res, operand1);
35630
35631 /* xa = abs (operand1) */
35632 xa = ix86_expand_sse_fabs (res, &mask);
35633
35634 /* if (!isless (xa, TWO52)) goto label; */
35635 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35636
35637 /* xa = (double)(long)x */
35638 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35639 expand_fix (xi, res, 0);
35640 expand_float (xa, xi, 0);
35641
35642 /* generate 1.0 */
35643 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35644
35645 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35646 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35647 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35648 gen_rtx_AND (mode, one, tmp)));
35649 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
35650 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35651 emit_move_insn (res, tmp);
35652
35653 if (HONOR_SIGNED_ZEROS (mode))
35654 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35655
35656 emit_label (label);
35657 LABEL_NUSES (label) = 1;
35658
35659 emit_move_insn (operand0, res);
35660 }
35661
35662 /* Expand SSE sequence for computing round from OPERAND1, storing the result
35663 into OPERAND0. This sequence works without relying on DImode truncation
35664 via cvttsd2siq, which is only available on 64-bit targets. */
35665 void
35666 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
35667 {
35668 /* C code for the stuff we expand below.
35669 double xa = fabs (x), xa2, x2;
35670 if (!isless (xa, TWO52))
35671 return x;
35672 Using the absolute value and copying back sign makes
35673 -0.0 -> -0.0 correct.
35674 xa2 = xa + TWO52 - TWO52;
35675 Compensate.
35676 dxa = xa2 - xa;
35677 if (dxa <= -0.5)
35678 xa2 += 1;
35679 else if (dxa > 0.5)
35680 xa2 -= 1;
35681 x2 = copysign (xa2, x);
35682 return x2;
35683 */
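  /* E.g. for x = 2.5 under round-to-nearest-even: xa2 = 2.5 + 2**52 - 2**52
     = 2.0 and dxa = -0.5, so the dxa <= -0.5 compensation bumps xa2 to 3.0,
     giving round (2.5) = 3 (halfway away from zero) as required.  */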
35684 enum machine_mode mode = GET_MODE (operand0);
35685 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
35686
35687 TWO52 = ix86_gen_TWO52 (mode);
35688
35689 /* Temporary for holding the result, initialized to the input
35690 operand to ease control flow. */
35691 res = gen_reg_rtx (mode);
35692 emit_move_insn (res, operand1);
35693
35694 /* xa = abs (operand1) */
35695 xa = ix86_expand_sse_fabs (res, &mask);
35696
35697 /* if (!isless (xa, TWO52)) goto label; */
35698 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35699
35700 /* xa2 = xa + TWO52 - TWO52; */
35701 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35702 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
35703
35704 /* dxa = xa2 - xa; */
35705 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
35706
35707 /* generate 0.5, 1.0 and -0.5 */
35708 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
35709 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
35710 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
35711 0, OPTAB_DIRECT);
35712
35713 /* Compensate. */
35714 tmp = gen_reg_rtx (mode);
35715 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
35716 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
35717 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35718 gen_rtx_AND (mode, one, tmp)));
35719 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35720 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
35721 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
35722 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35723 gen_rtx_AND (mode, one, tmp)));
35724 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35725
35726 /* res = copysign (xa2, operand1) */
35727 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
35728
35729 emit_label (label);
35730 LABEL_NUSES (label) = 1;
35731
35732 emit_move_insn (operand0, res);
35733 }
35734
35735 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35736 into OPERAND0. */
35737 void
35738 ix86_expand_trunc (rtx operand0, rtx operand1)
35739 {
35740 /* C code for SSE variant we expand below.
35741 double xa = fabs (x), x2;
35742 if (!isless (xa, TWO52))
35743 return x;
35744 x2 = (double)(long)x;
35745 if (HONOR_SIGNED_ZEROS (mode))
35746 return copysign (x2, x);
35747 return x2;
35748 */
35749 enum machine_mode mode = GET_MODE (operand0);
35750 rtx xa, xi, TWO52, label, res, mask;
35751
35752 TWO52 = ix86_gen_TWO52 (mode);
35753
35754 /* Temporary for holding the result, initialized to the input
35755 operand to ease control flow. */
35756 res = gen_reg_rtx (mode);
35757 emit_move_insn (res, operand1);
35758
35759 /* xa = abs (operand1) */
35760 xa = ix86_expand_sse_fabs (res, &mask);
35761
35762 /* if (!isless (xa, TWO52)) goto label; */
35763 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35764
35765 /* x = (double)(long)x */
35766 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35767 expand_fix (xi, res, 0);
35768 expand_float (res, xi, 0);
35769
35770 if (HONOR_SIGNED_ZEROS (mode))
35771 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35772
35773 emit_label (label);
35774 LABEL_NUSES (label) = 1;
35775
35776 emit_move_insn (operand0, res);
35777 }
35778
35779 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35780 into OPERAND0. */
35781 void
35782 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
35783 {
35784 enum machine_mode mode = GET_MODE (operand0);
35785 rtx xa, mask, TWO52, label, one, res, smask, tmp;
35786
35787 /* C code for SSE variant we expand below.
35788 double xa = fabs (x), x2, xa2;
35789 if (!isless (xa, TWO52))
35790 return x;
35791 xa2 = xa + TWO52 - TWO52;
35792 Compensate:
35793 if (xa2 > xa)
35794 xa2 -= 1.0;
35795 x2 = copysign (xa2, x);
35796 return x2;
35797 */
35798
35799 TWO52 = ix86_gen_TWO52 (mode);
35800
35801 /* Temporary for holding the result, initialized to the input
35802 operand to ease control flow. */
35803 res = gen_reg_rtx (mode);
35804 emit_move_insn (res, operand1);
35805
35806 /* xa = abs (operand1) */
35807 xa = ix86_expand_sse_fabs (res, &smask);
35808
35809 /* if (!isless (xa, TWO52)) goto label; */
35810 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35811
35812 /* res = xa + TWO52 - TWO52; */
35813 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35814 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
35815 emit_move_insn (res, tmp);
35816
35817 /* generate 1.0 */
35818 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35819
35820 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
35821 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
35822 emit_insn (gen_rtx_SET (VOIDmode, mask,
35823 gen_rtx_AND (mode, mask, one)));
35824 tmp = expand_simple_binop (mode, MINUS,
35825 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
35826 emit_move_insn (res, tmp);
35827
35828 /* res = copysign (res, operand1) */
35829 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
35830
35831 emit_label (label);
35832 LABEL_NUSES (label) = 1;
35833
35834 emit_move_insn (operand0, res);
35835 }
35836
35837 /* Expand SSE sequence for computing round from OPERAND1 storing
35838 into OPERAND0. */
35839 void
35840 ix86_expand_round (rtx operand0, rtx operand1)
35841 {
35842 /* C code for the stuff we're doing below:
35843 double xa = fabs (x);
35844 if (!isless (xa, TWO52))
35845 return x;
35846 xa = (double)(long)(xa + nextafter (0.5, 0.0));
35847 return copysign (xa, x);
35848 */
35849 enum machine_mode mode = GET_MODE (operand0);
35850 rtx res, TWO52, xa, label, xi, half, mask;
35851 const struct real_format *fmt;
35852 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35853
35854 /* Temporary for holding the result, initialized to the input
35855 operand to ease control flow. */
35856 res = gen_reg_rtx (mode);
35857 emit_move_insn (res, operand1);
35858
35859 TWO52 = ix86_gen_TWO52 (mode);
35860 xa = ix86_expand_sse_fabs (res, &mask);
35861 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35862
35863 /* load nextafter (0.5, 0.0) */
35864 fmt = REAL_MODE_FORMAT (mode);
35865 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35866 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35867
35868 /* xa = xa + 0.5 */
35869 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
35870 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
35871
35872 /* xa = (double)(int64_t)xa */
35873 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35874 expand_fix (xi, xa, 0);
35875 expand_float (xa, xi, 0);
35876
35877 /* res = copysign (xa, operand1) */
35878 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
35879
35880 emit_label (label);
35881 LABEL_NUSES (label) = 1;
35882
35883 emit_move_insn (operand0, res);
35884 }
35885
35886 /* Expand SSE sequence for computing round from OP1, storing the result
35887 into OP0, using the SSE4.1 round instruction. */
35888 void
35889 ix86_expand_round_sse4 (rtx op0, rtx op1)
35890 {
35891 enum machine_mode mode = GET_MODE (op0);
35892 rtx e1, e2, res, half;
35893 const struct real_format *fmt;
35894 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35895 rtx (*gen_copysign) (rtx, rtx, rtx);
35896 rtx (*gen_round) (rtx, rtx, rtx);
35897
35898 switch (mode)
35899 {
35900 case SFmode:
35901 gen_copysign = gen_copysignsf3;
35902 gen_round = gen_sse4_1_roundsf2;
35903 break;
35904 case DFmode:
35905 gen_copysign = gen_copysigndf3;
35906 gen_round = gen_sse4_1_rounddf2;
35907 break;
35908 default:
35909 gcc_unreachable ();
35910 }
35911
35912 /* round (a) = trunc (a + copysign (0.5, a)) */
35913
35914 /* load nextafter (0.5, 0.0) */
35915 fmt = REAL_MODE_FORMAT (mode);
35916 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35917 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35918 half = const_double_from_real_value (pred_half, mode);
35919
35920 /* e1 = copysign (0.5, op1) */
35921 e1 = gen_reg_rtx (mode);
35922 emit_insn (gen_copysign (e1, half, op1));
35923
35924 /* e2 = op1 + e1 */
35925 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
35926
35927 /* res = trunc (e2) */
35928 res = gen_reg_rtx (mode);
35929 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
35930
35931 emit_move_insn (op0, res);
35932 }
35933 \f
35934
35935 /* Table of valid machine attributes. */
35936 static const struct attribute_spec ix86_attribute_table[] =
35937 {
35938 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
35939 affects_type_identity } */
35940 /* Stdcall attribute says callee is responsible for popping arguments
35941 if they are not variable. */
35942 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35943 true },
35944 /* Fastcall attribute says callee is responsible for popping arguments
35945 if they are not variable. */
35946 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35947 true },
35948 /* Thiscall attribute says callee is responsible for popping arguments
35949 if they are not variable. */
35950 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35951 true },
35952 /* Cdecl attribute says the callee is a normal C declaration */
35953 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35954 true },
35955 /* Regparm attribute specifies how many integer arguments are to be
35956 passed in registers. */
35957 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
35958 true },
35959 /* Sseregparm attribute says we are using x86_64 calling conventions
35960 for FP arguments. */
35961 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35962 true },
35963 /* The transactional memory builtins are implicitly regparm or fastcall
35964 depending on the ABI. Override the generic do-nothing attribute that
35965 these builtins were declared with. */
35966 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
35967 true },
35968 /* force_align_arg_pointer says this function realigns the stack at entry. */
35969 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
35970 false, true, true, ix86_handle_cconv_attribute, false },
35971 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
35972 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
35973 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
35974 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
35975 false },
35976 #endif
35977 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
35978 false },
35979 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
35980 false },
35981 #ifdef SUBTARGET_ATTRIBUTE_TABLE
35982 SUBTARGET_ATTRIBUTE_TABLE,
35983 #endif
35984 /* ms_abi and sysv_abi calling convention function attributes. */
35985 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35986 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35987 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
35988 false },
35989 { "callee_pop_aggregate_return", 1, 1, false, true, true,
35990 ix86_handle_callee_pop_aggregate_return, true },
35991 /* End element. */
35992 { NULL, 0, 0, false, false, false, NULL, false }
35993 };
35994
35995 /* Implement targetm.vectorize.builtin_vectorization_cost. */
35996 static int
35997 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
35998 tree vectype ATTRIBUTE_UNUSED,
35999 int misalign ATTRIBUTE_UNUSED)
36000 {
36001 switch (type_of_cost)
36002 {
36003 case scalar_stmt:
36004 return ix86_cost->scalar_stmt_cost;
36005
36006 case scalar_load:
36007 return ix86_cost->scalar_load_cost;
36008
36009 case scalar_store:
36010 return ix86_cost->scalar_store_cost;
36011
36012 case vector_stmt:
36013 return ix86_cost->vec_stmt_cost;
36014
36015 case vector_load:
36016 return ix86_cost->vec_align_load_cost;
36017
36018 case vector_store:
36019 return ix86_cost->vec_store_cost;
36020
36021 case vec_to_scalar:
36022 return ix86_cost->vec_to_scalar_cost;
36023
36024 case scalar_to_vec:
36025 return ix86_cost->scalar_to_vec_cost;
36026
36027 case unaligned_load:
36028 case unaligned_store:
36029 return ix86_cost->vec_unalign_load_cost;
36030
36031 case cond_branch_taken:
36032 return ix86_cost->cond_taken_branch_cost;
36033
36034 case cond_branch_not_taken:
36035 return ix86_cost->cond_not_taken_branch_cost;
36036
36037 case vec_perm:
36038 case vec_promote_demote:
36039 return ix86_cost->vec_stmt_cost;
36040
36041 default:
36042 gcc_unreachable ();
36043 }
36044 }
36045
36046 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
36047 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
36048 insn every time. */
36049
36050 static GTY(()) rtx vselect_insn;
36051
36052 /* Initialize vselect_insn. */
36053
36054 static void
36055 init_vselect_insn (void)
36056 {
36057 unsigned i;
36058 rtx x;
36059
36060 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
36061 for (i = 0; i < MAX_VECT_LEN; ++i)
36062 XVECEXP (x, 0, i) = const0_rtx;
36063 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
36064 const0_rtx), x);
36065 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
36066 start_sequence ();
36067 vselect_insn = emit_insn (x);
36068 end_sequence ();
36069 }
36070
36071 /* Construct (set target (vec_select op0 (parallel perm))) and
36072 return true if that's a valid instruction in the active ISA. */
36073
36074 static bool
36075 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
36076 unsigned nelt, bool testing_p)
36077 {
36078 unsigned int i;
36079 rtx x, save_vconcat;
36080 int icode;
36081
36082 if (vselect_insn == NULL_RTX)
36083 init_vselect_insn ();
36084
36085 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
36086 PUT_NUM_ELEM (XVEC (x, 0), nelt);
36087 for (i = 0; i < nelt; ++i)
36088 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
36089 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
36090 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
36091 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
36092 SET_DEST (PATTERN (vselect_insn)) = target;
36093 icode = recog_memoized (vselect_insn);
36094
36095 if (icode >= 0 && !testing_p)
36096 emit_insn (copy_rtx (PATTERN (vselect_insn)));
36097
36098 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
36099 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
36100 INSN_CODE (vselect_insn) = -1;
36101
36102 return icode >= 0;
36103 }
36104
36105 /* Similar, but generate a vec_concat from op0 and op1 as well. */
36106
36107 static bool
36108 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
36109 const unsigned char *perm, unsigned nelt,
36110 bool testing_p)
36111 {
36112 enum machine_mode v2mode;
36113 rtx x;
36114 bool ok;
36115
36116 if (vselect_insn == NULL_RTX)
36117 init_vselect_insn ();
36118
36119 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
36120 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
36121 PUT_MODE (x, v2mode);
36122 XEXP (x, 0) = op0;
36123 XEXP (x, 1) = op1;
36124 ok = expand_vselect (target, x, perm, nelt, testing_p);
36125 XEXP (x, 0) = const0_rtx;
36126 XEXP (x, 1) = const0_rtx;
36127 return ok;
36128 }
36129
36130 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36131 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
36132
36133 static bool
36134 expand_vec_perm_blend (struct expand_vec_perm_d *d)
36135 {
36136 enum machine_mode vmode = d->vmode;
36137 unsigned i, mask, nelt = d->nelt;
36138 rtx target, op0, op1, x;
36139 rtx rperm[32], vperm;
36140
36141 if (d->one_operand_p)
36142 return false;
36143 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
36144 ;
36145 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
36146 ;
36147 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
36148 ;
36149 else
36150 return false;
36151
36152 /* This is a blend, not a permute. Elements must stay in their
36153 respective lanes. */
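  /* E.g. for V4SFmode, { 0, 5, 2, 7 } is a blend (element I comes from
     position I of one of the two operands), while { 1, 5, 2, 7 } is not.  */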
36154 for (i = 0; i < nelt; ++i)
36155 {
36156 unsigned e = d->perm[i];
36157 if (!(e == i || e == i + nelt))
36158 return false;
36159 }
36160
36161 if (d->testing_p)
36162 return true;
36163
36164 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
36165 decision should be extracted elsewhere, so that we only try that
36166 sequence once all budget==3 options have been tried. */
36167 target = d->target;
36168 op0 = d->op0;
36169 op1 = d->op1;
36170 mask = 0;
36171
36172 switch (vmode)
36173 {
36174 case V4DFmode:
36175 case V8SFmode:
36176 case V2DFmode:
36177 case V4SFmode:
36178 case V8HImode:
36179 case V8SImode:
36180 for (i = 0; i < nelt; ++i)
36181 mask |= (d->perm[i] >= nelt) << i;
36182 break;
36183
36184 case V2DImode:
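      /* Each DImode element corresponds to four HImode elements of the
         V8HImode blend mask built below, hence four mask bits (0xf) per
         element; likewise, the V4SImode case uses two bits per element.  */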
36185 for (i = 0; i < 2; ++i)
36186 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
36187 vmode = V8HImode;
36188 goto do_subreg;
36189
36190 case V4SImode:
36191 for (i = 0; i < 4; ++i)
36192 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
36193 vmode = V8HImode;
36194 goto do_subreg;
36195
36196 case V16QImode:
36197 /* See if bytes move in pairs so we can use pblendw with
36198 an immediate argument, rather than pblendvb with a vector
36199 argument. */
36200 for (i = 0; i < 16; i += 2)
36201 if (d->perm[i] + 1 != d->perm[i + 1])
36202 {
36203 use_pblendvb:
36204 for (i = 0; i < nelt; ++i)
36205 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
36206
36207 finish_pblendvb:
36208 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
36209 vperm = force_reg (vmode, vperm);
36210
36211 if (GET_MODE_SIZE (vmode) == 16)
36212 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
36213 else
36214 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
36215 return true;
36216 }
36217
36218 for (i = 0; i < 8; ++i)
36219 mask |= (d->perm[i * 2] >= 16) << i;
36220 vmode = V8HImode;
36221 /* FALLTHRU */
36222
36223 do_subreg:
36224 target = gen_lowpart (vmode, target);
36225 op0 = gen_lowpart (vmode, op0);
36226 op1 = gen_lowpart (vmode, op1);
36227 break;
36228
36229 case V32QImode:
36230 /* See if bytes move in pairs. If not, vpblendvb must be used. */
36231 for (i = 0; i < 32; i += 2)
36232 if (d->perm[i] + 1 != d->perm[i + 1])
36233 goto use_pblendvb;
36234 /* See if bytes move in quadruplets. If yes, vpblendd
36235 with immediate can be used. */
36236 for (i = 0; i < 32; i += 4)
36237 if (d->perm[i] + 2 != d->perm[i + 2])
36238 break;
36239 if (i < 32)
36240 {
36241 /* See if bytes move the same in both lanes. If yes,
36242 vpblendw with immediate can be used. */
36243 for (i = 0; i < 16; i += 2)
36244 if (d->perm[i] + 16 != d->perm[i + 16])
36245 goto use_pblendvb;
36246
36247 /* Use vpblendw. */
36248 for (i = 0; i < 16; ++i)
36249 mask |= (d->perm[i * 2] >= 32) << i;
36250 vmode = V16HImode;
36251 goto do_subreg;
36252 }
36253
36254 /* Use vpblendd. */
36255 for (i = 0; i < 8; ++i)
36256 mask |= (d->perm[i * 4] >= 32) << i;
36257 vmode = V8SImode;
36258 goto do_subreg;
36259
36260 case V16HImode:
36261 /* See if words move in pairs. If yes, vpblendd can be used. */
36262 for (i = 0; i < 16; i += 2)
36263 if (d->perm[i] + 1 != d->perm[i + 1])
36264 break;
36265 if (i < 16)
36266 {
36267 /* See if words move the same in both lanes. If not,
36268 vpblendvb must be used. */
36269 for (i = 0; i < 8; i++)
36270 if (d->perm[i] + 8 != d->perm[i + 8])
36271 {
36272 /* Use vpblendvb. */
36273 for (i = 0; i < 32; ++i)
36274 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
36275
36276 vmode = V32QImode;
36277 nelt = 32;
36278 target = gen_lowpart (vmode, target);
36279 op0 = gen_lowpart (vmode, op0);
36280 op1 = gen_lowpart (vmode, op1);
36281 goto finish_pblendvb;
36282 }
36283
36284 /* Use vpblendw. */
36285 for (i = 0; i < 16; ++i)
36286 mask |= (d->perm[i] >= 16) << i;
36287 break;
36288 }
36289
36290 /* Use vpblendd. */
36291 for (i = 0; i < 8; ++i)
36292 mask |= (d->perm[i * 2] >= 16) << i;
36293 vmode = V8SImode;
36294 goto do_subreg;
36295
36296 case V4DImode:
36297 /* Use vpblendd. */
36298 for (i = 0; i < 4; ++i)
36299 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
36300 vmode = V8SImode;
36301 goto do_subreg;
36302
36303 default:
36304 gcc_unreachable ();
36305 }
36306
36307 /* This matches five different patterns with the different modes. */
36308 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
36309 x = gen_rtx_SET (VOIDmode, target, x);
36310 emit_insn (x);
36311
36312 return true;
36313 }
36314
36315 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36316 in terms of the variable form of vpermilps.
36317
36318 Note that we will have already failed the immediate input vpermilps,
36319 which requires that the high and low part shuffle be identical; the
36320 variable form doesn't require that. */
36321
36322 static bool
36323 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
36324 {
36325 rtx rperm[8], vperm;
36326 unsigned i;
36327
36328 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
36329 return false;
36330
36331 /* We can only permute within the 128-bit lane. */
36332 for (i = 0; i < 8; ++i)
36333 {
36334 unsigned e = d->perm[i];
36335 if (i < 4 ? e >= 4 : e < 4)
36336 return false;
36337 }
36338
36339 if (d->testing_p)
36340 return true;
36341
36342 for (i = 0; i < 8; ++i)
36343 {
36344 unsigned e = d->perm[i];
36345
36346 /* Within each 128-bit lane, the elements of op0 are numbered
36347 from 0 and the elements of op1 are numbered from 4. */
36348 if (e >= 8 + 4)
36349 e -= 8;
36350 else if (e >= 4)
36351 e -= 4;
36352
36353 rperm[i] = GEN_INT (e);
36354 }
36355
36356 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
36357 vperm = force_reg (V8SImode, vperm);
36358 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
36359
36360 return true;
36361 }
36362
36363 /* Return true if permutation D can be performed as VMODE permutation
36364 instead. */
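/* For example, a two-operand V32QImode permutation that moves whole
   16-byte halves around is also a valid V2TImode permutation, which is
   what expand_vec_perm_pshufb checks for before using vperm2i128.  */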
36365
36366 static bool
36367 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
36368 {
36369 unsigned int i, j, chunk;
36370
36371 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
36372 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
36373 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
36374 return false;
36375
36376 if (GET_MODE_NUNITS (vmode) >= d->nelt)
36377 return true;
36378
36379 chunk = d->nelt / GET_MODE_NUNITS (vmode);
36380 for (i = 0; i < d->nelt; i += chunk)
36381 if (d->perm[i] & (chunk - 1))
36382 return false;
36383 else
36384 for (j = 1; j < chunk; ++j)
36385 if (d->perm[i] + j != d->perm[i + j])
36386 return false;
36387
36388 return true;
36389 }
36390
36391 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36392 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
36393
36394 static bool
36395 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
36396 {
36397 unsigned i, nelt, eltsz, mask;
36398 unsigned char perm[32];
36399 enum machine_mode vmode = V16QImode;
36400 rtx rperm[32], vperm, target, op0, op1;
36401
36402 nelt = d->nelt;
36403
36404 if (!d->one_operand_p)
36405 {
36406 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
36407 {
36408 if (TARGET_AVX2
36409 && valid_perm_using_mode_p (V2TImode, d))
36410 {
36411 if (d->testing_p)
36412 return true;
36413
36414 /* Use vperm2i128 insn. The pattern uses
36415 V4DImode instead of V2TImode. */
36416 target = gen_lowpart (V4DImode, d->target);
36417 op0 = gen_lowpart (V4DImode, d->op0);
36418 op1 = gen_lowpart (V4DImode, d->op1);
36419 rperm[0]
36420 = GEN_INT ((d->perm[0] / (nelt / 2))
36421 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
36422 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
36423 return true;
36424 }
36425 return false;
36426 }
36427 }
36428 else
36429 {
36430 if (GET_MODE_SIZE (d->vmode) == 16)
36431 {
36432 if (!TARGET_SSSE3)
36433 return false;
36434 }
36435 else if (GET_MODE_SIZE (d->vmode) == 32)
36436 {
36437 if (!TARGET_AVX2)
36438 return false;
36439
36440 /* V4DImode should be already handled through
36441 expand_vselect by vpermq instruction. */
36442 gcc_assert (d->vmode != V4DImode);
36443
36444 vmode = V32QImode;
36445 if (d->vmode == V8SImode
36446 || d->vmode == V16HImode
36447 || d->vmode == V32QImode)
36448 {
36449 /* First see if vpermq can be used for
36450 V8SImode/V16HImode/V32QImode. */
36451 if (valid_perm_using_mode_p (V4DImode, d))
36452 {
36453 for (i = 0; i < 4; i++)
36454 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
36455 if (d->testing_p)
36456 return true;
36457 return expand_vselect (gen_lowpart (V4DImode, d->target),
36458 gen_lowpart (V4DImode, d->op0),
36459 perm, 4, false);
36460 }
36461
36462 /* Next see if vpermd can be used. */
36463 if (valid_perm_using_mode_p (V8SImode, d))
36464 vmode = V8SImode;
36465 }
36466 /* Or if vpermps can be used. */
36467 else if (d->vmode == V8SFmode)
36468 vmode = V8SImode;
36469
36470 if (vmode == V32QImode)
36471 {
36472 /* vpshufb only works within 128-bit lanes; it cannot
36473 shuffle bytes across the lanes. */
36474 for (i = 0; i < nelt; ++i)
36475 if ((d->perm[i] ^ i) & (nelt / 2))
36476 return false;
36477 }
36478 }
36479 else
36480 return false;
36481 }
36482
36483 if (d->testing_p)
36484 return true;
36485
36486 if (vmode == V8SImode)
36487 for (i = 0; i < 8; ++i)
36488 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
36489 else
36490 {
36491 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36492 if (!d->one_operand_p)
36493 mask = 2 * nelt - 1;
36494 else if (vmode == V16QImode)
36495 mask = nelt - 1;
36496 else
36497 mask = nelt / 2 - 1;
36498
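      /* Expand the element permutation into a byte permutation: result
         element I is assembled from the ELTSZ consecutive bytes of source
         element D->PERM[I] & MASK, where MASK keeps the index in range for
         the instruction being used.  */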
36499 for (i = 0; i < nelt; ++i)
36500 {
36501 unsigned j, e = d->perm[i] & mask;
36502 for (j = 0; j < eltsz; ++j)
36503 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
36504 }
36505 }
36506
36507 vperm = gen_rtx_CONST_VECTOR (vmode,
36508 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
36509 vperm = force_reg (vmode, vperm);
36510
36511 if (vmode == V8SImode && d->vmode == V8SFmode)
36512 {
36513 vmode = V8SFmode;
36514 vperm = gen_lowpart (vmode, vperm);
36515 }
36516
36517 target = gen_lowpart (vmode, d->target);
36518 op0 = gen_lowpart (vmode, d->op0);
36519 if (d->one_operand_p)
36520 {
36521 if (vmode == V16QImode)
36522 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
36523 else if (vmode == V32QImode)
36524 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
36525 else if (vmode == V8SFmode)
36526 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
36527 else
36528 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
36529 }
36530 else
36531 {
36532 op1 = gen_lowpart (vmode, d->op1);
36533 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
36534 }
36535
36536 return true;
36537 }
36538
36539 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
36540 in a single instruction. */
36541
36542 static bool
36543 expand_vec_perm_1 (struct expand_vec_perm_d *d)
36544 {
36545 unsigned i, nelt = d->nelt;
36546 unsigned char perm2[MAX_VECT_LEN];
36547
36548 /* Check plain VEC_SELECT first, because AVX has instructions that could
36549 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
36550 input where SEL+CONCAT may not. */
36551 if (d->one_operand_p)
36552 {
36553 int mask = nelt - 1;
36554 bool identity_perm = true;
36555 bool broadcast_perm = true;
36556
36557 for (i = 0; i < nelt; i++)
36558 {
36559 perm2[i] = d->perm[i] & mask;
36560 if (perm2[i] != i)
36561 identity_perm = false;
36562 if (perm2[i])
36563 broadcast_perm = false;
36564 }
36565
36566 if (identity_perm)
36567 {
36568 if (!d->testing_p)
36569 emit_move_insn (d->target, d->op0);
36570 return true;
36571 }
36572 else if (broadcast_perm && TARGET_AVX2)
36573 {
36574 /* Use vpbroadcast{b,w,d}. */
36575 rtx (*gen) (rtx, rtx) = NULL;
36576 switch (d->vmode)
36577 {
36578 case V32QImode:
36579 gen = gen_avx2_pbroadcastv32qi_1;
36580 break;
36581 case V16HImode:
36582 gen = gen_avx2_pbroadcastv16hi_1;
36583 break;
36584 case V8SImode:
36585 gen = gen_avx2_pbroadcastv8si_1;
36586 break;
36587 case V16QImode:
36588 gen = gen_avx2_pbroadcastv16qi;
36589 break;
36590 case V8HImode:
36591 gen = gen_avx2_pbroadcastv8hi;
36592 break;
36593 case V8SFmode:
36594 gen = gen_avx2_vec_dupv8sf_1;
36595 break;
36596 /* For other modes, prefer the other shuffles this function creates. */
36597 default: break;
36598 }
36599 if (gen != NULL)
36600 {
36601 if (!d->testing_p)
36602 emit_insn (gen (d->target, d->op0));
36603 return true;
36604 }
36605 }
36606
36607 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
36608 return true;
36609
36610 /* There are plenty of patterns in sse.md that are written for
36611 SEL+CONCAT and are not replicated for a single op. Perhaps
36612 that should be changed, to avoid the nastiness here. */
36613
36614 /* Recognize interleave style patterns, which means incrementing
36615 every other permutation operand. */
36616 for (i = 0; i < nelt; i += 2)
36617 {
36618 perm2[i] = d->perm[i] & mask;
36619 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
36620 }
36621 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
36622 d->testing_p))
36623 return true;
36624
36625 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
36626 if (nelt >= 4)
36627 {
36628 for (i = 0; i < nelt; i += 4)
36629 {
36630 perm2[i + 0] = d->perm[i + 0] & mask;
36631 perm2[i + 1] = d->perm[i + 1] & mask;
36632 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
36633 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
36634 }
36635
36636 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
36637 d->testing_p))
36638 return true;
36639 }
36640 }
36641
36642 /* Finally, try the fully general two operand permute. */
36643 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
36644 d->testing_p))
36645 return true;
36646
36647 /* Recognize interleave style patterns with reversed operands. */
36648 if (!d->one_operand_p)
36649 {
36650 for (i = 0; i < nelt; ++i)
36651 {
36652 unsigned e = d->perm[i];
36653 if (e >= nelt)
36654 e -= nelt;
36655 else
36656 e += nelt;
36657 perm2[i] = e;
36658 }
36659
36660 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
36661 d->testing_p))
36662 return true;
36663 }
36664
36665 /* Try the SSE4.1 blend variable merge instructions. */
36666 if (expand_vec_perm_blend (d))
36667 return true;
36668
36669 /* Try one of the AVX vpermil variable permutations. */
36670 if (expand_vec_perm_vpermil (d))
36671 return true;
36672
36673 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
36674 vpshufb, vpermd, vpermps or vpermq variable permutation. */
36675 if (expand_vec_perm_pshufb (d))
36676 return true;
36677
36678 return false;
36679 }
36680
36681 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36682 in terms of a pair of pshuflw + pshufhw instructions. */
36683
36684 static bool
36685 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
36686 {
36687 unsigned char perm2[MAX_VECT_LEN];
36688 unsigned i;
36689 bool ok;
36690
36691 if (d->vmode != V8HImode || !d->one_operand_p)
36692 return false;
36693
36694 /* The two permutations only operate in 64-bit lanes. */
36695 for (i = 0; i < 4; ++i)
36696 if (d->perm[i] >= 4)
36697 return false;
36698 for (i = 4; i < 8; ++i)
36699 if (d->perm[i] < 4)
36700 return false;
36701
36702 if (d->testing_p)
36703 return true;
36704
36705 /* Emit the pshuflw. */
36706 memcpy (perm2, d->perm, 4);
36707 for (i = 4; i < 8; ++i)
36708 perm2[i] = i;
36709 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
36710 gcc_assert (ok);
36711
36712 /* Emit the pshufhw. */
36713 memcpy (perm2 + 4, d->perm + 4, 4);
36714 for (i = 0; i < 4; ++i)
36715 perm2[i] = i;
36716 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
36717 gcc_assert (ok);
36718
36719 return true;
36720 }
36721
36722 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36723 the permutation using the SSSE3 palignr instruction. This succeeds
36724 when all of the elements in PERM fit within one vector and we merely
36725 need to shift them down so that a single vector permutation has a
36726 chance to succeed. */
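/* For example, for V16QImode with perm = { 5, 6, ..., 20 } the elements
   span both operands but fit in one 16-byte window; aligning by 5 bytes
   already yields the desired order (the degenerate in-order case below),
   while other orderings within that window are finished by a
   single-operand pshufb.  */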
36727
36728 static bool
36729 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
36730 {
36731 unsigned i, nelt = d->nelt;
36732 unsigned min, max;
36733 bool in_order, ok;
36734 rtx shift;
36735
36736 /* Even with AVX, palignr only operates on 128-bit vectors. */
36737 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36738 return false;
36739
36740 min = nelt, max = 0;
36741 for (i = 0; i < nelt; ++i)
36742 {
36743 unsigned e = d->perm[i];
36744 if (e < min)
36745 min = e;
36746 if (e > max)
36747 max = e;
36748 }
36749 if (min == 0 || max - min >= nelt)
36750 return false;
36751
36752 /* Given that we have SSSE3, we know we'll be able to implement the
36753 single operand permutation after the palignr with pshufb. */
36754 if (d->testing_p)
36755 return true;
36756
36757 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
36758 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
36759 gen_lowpart (TImode, d->op1),
36760 gen_lowpart (TImode, d->op0), shift));
36761
36762 d->op0 = d->op1 = d->target;
36763 d->one_operand_p = true;
36764
36765 in_order = true;
36766 for (i = 0; i < nelt; ++i)
36767 {
36768 unsigned e = d->perm[i] - min;
36769 if (e != i)
36770 in_order = false;
36771 d->perm[i] = e;
36772 }
36773
36774 /* Test for the degenerate case where the alignment by itself
36775 produces the desired permutation. */
36776 if (in_order)
36777 return true;
36778
36779 ok = expand_vec_perm_1 (d);
36780 gcc_assert (ok);
36781
36782 return ok;
36783 }
36784
36785 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
36786
36787 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36788 a two vector permutation into a single vector permutation by using
36789 an interleave operation to merge the vectors. */
36790
36791 static bool
36792 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
36793 {
36794 struct expand_vec_perm_d dremap, dfinal;
36795 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
36796 unsigned HOST_WIDE_INT contents;
36797 unsigned char remap[2 * MAX_VECT_LEN];
36798 rtx seq;
36799 bool ok, same_halves = false;
36800
36801 if (GET_MODE_SIZE (d->vmode) == 16)
36802 {
36803 if (d->one_operand_p)
36804 return false;
36805 }
36806 else if (GET_MODE_SIZE (d->vmode) == 32)
36807 {
36808 if (!TARGET_AVX)
36809 return false;
36810 /* For 32-byte modes allow even d->one_operand_p.
36811 The lack of cross-lane shuffling in some instructions
36812 might prevent a single insn shuffle. */
36813 dfinal = *d;
36814 dfinal.testing_p = true;
36815 /* If expand_vec_perm_interleave3 can expand this into
36816 a 3-insn sequence, give up and let it be expanded
36817 that way. While that is one insn longer, it doesn't
36818 need a memory operand, and in the common case where
36819 both the interleave-low and interleave-high permutations
36820 with the same operands are adjacent, it needs only
36821 4 insns for both after CSE. */
36822 if (expand_vec_perm_interleave3 (&dfinal))
36823 return false;
36824 }
36825 else
36826 return false;
36827
36828 /* Examine from whence the elements come. */
36829 contents = 0;
36830 for (i = 0; i < nelt; ++i)
36831 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
36832
36833 memset (remap, 0xff, sizeof (remap));
36834 dremap = *d;
36835
36836 if (GET_MODE_SIZE (d->vmode) == 16)
36837 {
36838 unsigned HOST_WIDE_INT h1, h2, h3, h4;
36839
36840 /* Split the two input vectors into 4 halves. */
36841 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
36842 h2 = h1 << nelt2;
36843 h3 = h2 << nelt2;
36844 h4 = h3 << nelt2;
36845
36846 /* If the elements are all from the low halves, use interleave low, and
36847 similarly interleave high for the high halves. If the elements are from
36848 mis-matched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
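/* For example, with V4SImode and the selector { 4 0 5 1 } every element
   comes from h1 | h3, so an interleave low producing { 0 4 1 5 } is
   used and the remaining dfinal permutation is the one-operand shuffle
   { 1 0 3 2 }.  */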
36849 if ((contents & (h1 | h3)) == contents)
36850 {
36851 /* punpckl* */
36852 for (i = 0; i < nelt2; ++i)
36853 {
36854 remap[i] = i * 2;
36855 remap[i + nelt] = i * 2 + 1;
36856 dremap.perm[i * 2] = i;
36857 dremap.perm[i * 2 + 1] = i + nelt;
36858 }
36859 if (!TARGET_SSE2 && d->vmode == V4SImode)
36860 dremap.vmode = V4SFmode;
36861 }
36862 else if ((contents & (h2 | h4)) == contents)
36863 {
36864 /* punpckh* */
36865 for (i = 0; i < nelt2; ++i)
36866 {
36867 remap[i + nelt2] = i * 2;
36868 remap[i + nelt + nelt2] = i * 2 + 1;
36869 dremap.perm[i * 2] = i + nelt2;
36870 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
36871 }
36872 if (!TARGET_SSE2 && d->vmode == V4SImode)
36873 dremap.vmode = V4SFmode;
36874 }
36875 else if ((contents & (h1 | h4)) == contents)
36876 {
36877 /* shufps */
36878 for (i = 0; i < nelt2; ++i)
36879 {
36880 remap[i] = i;
36881 remap[i + nelt + nelt2] = i + nelt2;
36882 dremap.perm[i] = i;
36883 dremap.perm[i + nelt2] = i + nelt + nelt2;
36884 }
36885 if (nelt != 4)
36886 {
36887 /* shufpd */
36888 dremap.vmode = V2DImode;
36889 dremap.nelt = 2;
36890 dremap.perm[0] = 0;
36891 dremap.perm[1] = 3;
36892 }
36893 }
36894 else if ((contents & (h2 | h3)) == contents)
36895 {
36896 /* shufps */
36897 for (i = 0; i < nelt2; ++i)
36898 {
36899 remap[i + nelt2] = i;
36900 remap[i + nelt] = i + nelt2;
36901 dremap.perm[i] = i + nelt2;
36902 dremap.perm[i + nelt2] = i + nelt;
36903 }
36904 if (nelt != 4)
36905 {
36906 /* shufpd */
36907 dremap.vmode = V2DImode;
36908 dremap.nelt = 2;
36909 dremap.perm[0] = 1;
36910 dremap.perm[1] = 2;
36911 }
36912 }
36913 else
36914 return false;
36915 }
36916 else
36917 {
36918 unsigned int nelt4 = nelt / 4, nzcnt = 0;
36919 unsigned HOST_WIDE_INT q[8];
36920 unsigned int nonzero_halves[4];
36921
36922 /* Split the two input vectors into 8 quarters. */
36923 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
36924 for (i = 1; i < 8; ++i)
36925 q[i] = q[0] << (nelt4 * i);
36926 for (i = 0; i < 4; ++i)
36927 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
36928 {
36929 nonzero_halves[nzcnt] = i;
36930 ++nzcnt;
36931 }
36932
36933 if (nzcnt == 1)
36934 {
36935 gcc_assert (d->one_operand_p);
36936 nonzero_halves[1] = nonzero_halves[0];
36937 same_halves = true;
36938 }
36939 else if (d->one_operand_p)
36940 {
36941 gcc_assert (nonzero_halves[0] == 0);
36942 gcc_assert (nonzero_halves[1] == 1);
36943 }
36944
36945 if (nzcnt <= 2)
36946 {
36947 if (d->perm[0] / nelt2 == nonzero_halves[1])
36948 {
36949 /* Attempt to increase the likelihood that dfinal
36950 shuffle will be intra-lane. */
36951 char tmph = nonzero_halves[0];
36952 nonzero_halves[0] = nonzero_halves[1];
36953 nonzero_halves[1] = tmph;
36954 }
36955
36956 /* vperm2f128 or vperm2i128. */
36957 for (i = 0; i < nelt2; ++i)
36958 {
36959 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
36960 remap[i + nonzero_halves[0] * nelt2] = i;
36961 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
36962 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
36963 }
36964
36965 if (d->vmode != V8SFmode
36966 && d->vmode != V4DFmode
36967 && d->vmode != V8SImode)
36968 {
36969 dremap.vmode = V8SImode;
36970 dremap.nelt = 8;
36971 for (i = 0; i < 4; ++i)
36972 {
36973 dremap.perm[i] = i + nonzero_halves[0] * 4;
36974 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
36975 }
36976 }
36977 }
36978 else if (d->one_operand_p)
36979 return false;
36980 else if (TARGET_AVX2
36981 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
36982 {
36983 /* vpunpckl* */
36984 for (i = 0; i < nelt4; ++i)
36985 {
36986 remap[i] = i * 2;
36987 remap[i + nelt] = i * 2 + 1;
36988 remap[i + nelt2] = i * 2 + nelt2;
36989 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
36990 dremap.perm[i * 2] = i;
36991 dremap.perm[i * 2 + 1] = i + nelt;
36992 dremap.perm[i * 2 + nelt2] = i + nelt2;
36993 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
36994 }
36995 }
36996 else if (TARGET_AVX2
36997 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
36998 {
36999 /* vpunpckh* */
37000 for (i = 0; i < nelt4; ++i)
37001 {
37002 remap[i + nelt4] = i * 2;
37003 remap[i + nelt + nelt4] = i * 2 + 1;
37004 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
37005 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
37006 dremap.perm[i * 2] = i + nelt4;
37007 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
37008 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
37009 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
37010 }
37011 }
37012 else
37013 return false;
37014 }
37015
37016 /* Use the remapping array set up above to move the elements from their
37017 swizzled locations into their final destinations. */
37018 dfinal = *d;
37019 for (i = 0; i < nelt; ++i)
37020 {
37021 unsigned e = remap[d->perm[i]];
37022 gcc_assert (e < nelt);
37023 /* If same_halves is true, both halves of the remapped vector are the
37024 same. Avoid cross-lane accesses if possible. */
37025 if (same_halves && i >= nelt2)
37026 {
37027 gcc_assert (e < nelt2);
37028 dfinal.perm[i] = e + nelt2;
37029 }
37030 else
37031 dfinal.perm[i] = e;
37032 }
37033 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
37034 dfinal.op1 = dfinal.op0;
37035 dfinal.one_operand_p = true;
37036 dremap.target = dfinal.op0;
37037
37038 /* Test if the final remap can be done with a single insn. For V4SFmode or
37039 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
37040 start_sequence ();
37041 ok = expand_vec_perm_1 (&dfinal);
37042 seq = get_insns ();
37043 end_sequence ();
37044
37045 if (!ok)
37046 return false;
37047
37048 if (d->testing_p)
37049 return true;
37050
37051 if (dremap.vmode != dfinal.vmode)
37052 {
37053 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
37054 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
37055 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
37056 }
37057
37058 ok = expand_vec_perm_1 (&dremap);
37059 gcc_assert (ok);
37060
37061 emit_insn (seq);
37062 return true;
37063 }
37064
37065 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
37066 a single vector cross-lane permutation into vpermq followed
37067 by any of the single insn permutations. */
37068
37069 static bool
37070 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
37071 {
37072 struct expand_vec_perm_d dremap, dfinal;
37073 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
37074 unsigned contents[2];
37075 bool ok;
37076
37077 if (!(TARGET_AVX2
37078 && (d->vmode == V32QImode || d->vmode == V16HImode)
37079 && d->one_operand_p))
37080 return false;
37081
37082 contents[0] = 0;
37083 contents[1] = 0;
37084 for (i = 0; i < nelt2; ++i)
37085 {
37086 contents[0] |= 1u << (d->perm[i] / nelt4);
37087 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
37088 }
37089
37090 for (i = 0; i < 2; ++i)
37091 {
37092 unsigned int cnt = 0;
37093 for (j = 0; j < 4; ++j)
37094 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
37095 return false;
37096 }
37097
37098 if (d->testing_p)
37099 return true;
37100
37101 dremap = *d;
37102 dremap.vmode = V4DImode;
37103 dremap.nelt = 4;
37104 dremap.target = gen_reg_rtx (V4DImode);
37105 dremap.op0 = gen_lowpart (V4DImode, d->op0);
37106 dremap.op1 = dremap.op0;
37107 dremap.one_operand_p = true;
37108 for (i = 0; i < 2; ++i)
37109 {
37110 unsigned int cnt = 0;
37111 for (j = 0; j < 4; ++j)
37112 if ((contents[i] & (1u << j)) != 0)
37113 dremap.perm[2 * i + cnt++] = j;
37114 for (; cnt < 2; ++cnt)
37115 dremap.perm[2 * i + cnt] = 0;
37116 }
37117
37118 dfinal = *d;
37119 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
37120 dfinal.op1 = dfinal.op0;
37121 dfinal.one_operand_p = true;
37122 for (i = 0, j = 0; i < nelt; ++i)
37123 {
37124 if (i == nelt2)
37125 j = 2;
37126 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
37127 if ((d->perm[i] / nelt4) == dremap.perm[j])
37128 ;
37129 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
37130 dfinal.perm[i] |= nelt4;
37131 else
37132 gcc_unreachable ();
37133 }
37134
37135 ok = expand_vec_perm_1 (&dremap);
37136 gcc_assert (ok);
37137
37138 ok = expand_vec_perm_1 (&dfinal);
37139 gcc_assert (ok);
37140
37141 return true;
37142 }
37143
37144 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
37145 a vector permutation using two instructions, vperm2f128 resp.
37146 vperm2i128 followed by any single in-lane permutation. */
37147
37148 static bool
37149 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
37150 {
37151 struct expand_vec_perm_d dfirst, dsecond;
37152 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
37153 bool ok;
37154
37155 if (!TARGET_AVX
37156 || GET_MODE_SIZE (d->vmode) != 32
37157 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
37158 return false;
37159
37160 dsecond = *d;
37161 dsecond.one_operand_p = false;
37162 dsecond.testing_p = true;
37163
37164 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
37165 immediate. For perm < 16 the second permutation uses
37166 d->op0 as first operand, for perm >= 16 it uses d->op1
37167 as first operand. The second operand is the result of
37168 vperm2[fi]128. */
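/* E.g. for perm == 6 (binary 0110) the immediate is
   ((6 << 2) | 6) & 0x33 == 0x12: the low lane of the result comes from
   the low lane of d->op1 and the high lane from the high lane of d->op0,
   so for V4DFmode operands 0123 and 4567 the vperm2f128 result is 4523.  */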
37169 for (perm = 0; perm < 32; perm++)
37170 {
37171 /* Ignore permutations which do not move anything cross-lane. */
37172 if (perm < 16)
37173 {
37174 /* The second shuffle for e.g. V4DFmode has
37175 0123 and ABCD operands.
37176 Ignore AB23, as 23 is already in the second lane
37177 of the first operand. */
37178 if ((perm & 0xc) == (1 << 2)) continue;
37179 /* And 01CD, as 01 is in the first lane of the first
37180 operand. */
37181 if ((perm & 3) == 0) continue;
37182 /* And 4567, as then the vperm2[fi]128 doesn't change
37183 anything on the original 4567 second operand. */
37184 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
37185 }
37186 else
37187 {
37188 /* The second shuffle for e.g. V4DFmode has
37189 4567 and ABCD operands.
37190 Ignore AB67, as 67 is already in the second lane
37191 of the first operand. */
37192 if ((perm & 0xc) == (3 << 2)) continue;
37193 /* And 45CD, as 45 is in the first lane of the first
37194 operand. */
37195 if ((perm & 3) == 2) continue;
37196 /* And 0123, as then the vperm2[fi]128 doesn't change
37197 anything on the original 0123 first operand. */
37198 if ((perm & 0xf) == (1 << 2)) continue;
37199 }
37200
37201 for (i = 0; i < nelt; i++)
37202 {
37203 j = d->perm[i] / nelt2;
37204 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
37205 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
37206 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
37207 dsecond.perm[i] = d->perm[i] & (nelt - 1);
37208 else
37209 break;
37210 }
37211
37212 if (i == nelt)
37213 {
37214 start_sequence ();
37215 ok = expand_vec_perm_1 (&dsecond);
37216 end_sequence ();
37217 }
37218 else
37219 ok = false;
37220
37221 if (ok)
37222 {
37223 if (d->testing_p)
37224 return true;
37225
37226 /* Found a usable second shuffle. dfirst will be
37227 vperm2f128 on d->op0 and d->op1. */
37228 dsecond.testing_p = false;
37229 dfirst = *d;
37230 dfirst.target = gen_reg_rtx (d->vmode);
37231 for (i = 0; i < nelt; i++)
37232 dfirst.perm[i] = (i & (nelt2 - 1))
37233 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
37234
37235 ok = expand_vec_perm_1 (&dfirst);
37236 gcc_assert (ok);
37237
37238 /* And dsecond is some single insn shuffle, taking
37239 d->op0 and result of vperm2f128 (if perm < 16) or
37240 d->op1 and result of vperm2f128 (otherwise). */
37241 dsecond.op1 = dfirst.target;
37242 if (perm >= 16)
37243 dsecond.op0 = dfirst.op1;
37244
37245 ok = expand_vec_perm_1 (&dsecond);
37246 gcc_assert (ok);
37247
37248 return true;
37249 }
37250
37251 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
37252 if (d->one_operand_p)
37253 return false;
37254 }
37255
37256 return false;
37257 }
37258
37259 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
37260 a two vector permutation using 2 intra-lane interleave insns
37261 and cross-lane shuffle for 32-byte vectors. */
37262
37263 static bool
37264 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
37265 {
37266 unsigned i, nelt;
37267 rtx (*gen) (rtx, rtx, rtx);
37268
37269 if (d->one_operand_p)
37270 return false;
37271 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
37272 ;
37273 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
37274 ;
37275 else
37276 return false;
37277
37278 nelt = d->nelt;
37279 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
37280 return false;
37281 for (i = 0; i < nelt; i += 2)
37282 if (d->perm[i] != d->perm[0] + i / 2
37283 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
37284 return false;
37285
37286 if (d->testing_p)
37287 return true;
37288
37289 switch (d->vmode)
37290 {
37291 case V32QImode:
37292 if (d->perm[0])
37293 gen = gen_vec_interleave_highv32qi;
37294 else
37295 gen = gen_vec_interleave_lowv32qi;
37296 break;
37297 case V16HImode:
37298 if (d->perm[0])
37299 gen = gen_vec_interleave_highv16hi;
37300 else
37301 gen = gen_vec_interleave_lowv16hi;
37302 break;
37303 case V8SImode:
37304 if (d->perm[0])
37305 gen = gen_vec_interleave_highv8si;
37306 else
37307 gen = gen_vec_interleave_lowv8si;
37308 break;
37309 case V4DImode:
37310 if (d->perm[0])
37311 gen = gen_vec_interleave_highv4di;
37312 else
37313 gen = gen_vec_interleave_lowv4di;
37314 break;
37315 case V8SFmode:
37316 if (d->perm[0])
37317 gen = gen_vec_interleave_highv8sf;
37318 else
37319 gen = gen_vec_interleave_lowv8sf;
37320 break;
37321 case V4DFmode:
37322 if (d->perm[0])
37323 gen = gen_vec_interleave_highv4df;
37324 else
37325 gen = gen_vec_interleave_lowv4df;
37326 break;
37327 default:
37328 gcc_unreachable ();
37329 }
37330
37331 emit_insn (gen (d->target, d->op0, d->op1));
37332 return true;
37333 }
37334
37335 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
37336 a single vector permutation using a single intra-lane vector
37337 permutation, vperm2f128 swapping the lanes and vblend* insn blending
37338 the non-swapped and swapped vectors together. */
37339
37340 static bool
37341 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
37342 {
37343 struct expand_vec_perm_d dfirst, dsecond;
37344 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
37345 rtx seq;
37346 bool ok;
37347 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
37348
37349 if (!TARGET_AVX
37350 || TARGET_AVX2
37351 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
37352 || !d->one_operand_p)
37353 return false;
37354
37355 dfirst = *d;
37356 for (i = 0; i < nelt; i++)
37357 dfirst.perm[i] = 0xff;
37358 for (i = 0, msk = 0; i < nelt; i++)
37359 {
37360 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
37361 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
37362 return false;
37363 dfirst.perm[j] = d->perm[i];
37364 if (j != i)
37365 msk |= (1 << i);
37366 }
37367 for (i = 0; i < nelt; i++)
37368 if (dfirst.perm[i] == 0xff)
37369 dfirst.perm[i] = i;
37370
37371 if (!d->testing_p)
37372 dfirst.target = gen_reg_rtx (dfirst.vmode);
37373
37374 start_sequence ();
37375 ok = expand_vec_perm_1 (&dfirst);
37376 seq = get_insns ();
37377 end_sequence ();
37378
37379 if (!ok)
37380 return false;
37381
37382 if (d->testing_p)
37383 return true;
37384
37385 emit_insn (seq);
37386
37387 dsecond = *d;
37388 dsecond.op0 = dfirst.target;
37389 dsecond.op1 = dfirst.target;
37390 dsecond.one_operand_p = true;
37391 dsecond.target = gen_reg_rtx (dsecond.vmode);
37392 for (i = 0; i < nelt; i++)
37393 dsecond.perm[i] = i ^ nelt2;
37394
37395 ok = expand_vec_perm_1 (&dsecond);
37396 gcc_assert (ok);
37397
37398 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
37399 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
37400 return true;
37401 }
37402
37403 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
37404 permutation with two pshufb insns and an ior. We should have already
37405 failed all two instruction sequences. */
37406
37407 static bool
37408 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
37409 {
37410 rtx rperm[2][16], vperm, l, h, op, m128;
37411 unsigned int i, nelt, eltsz;
37412
37413 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
37414 return false;
37415 gcc_assert (!d->one_operand_p);
37416
37417 nelt = d->nelt;
37418 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37419
37420 /* Generate two permutation masks. If the required element is within
37421 the given vector it is shuffled into the proper lane. If the required
37422 element is in the other vector, force a zero into the lane by setting
37423 bit 7 in the permutation mask. */
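/* For example, for V8HImode with d->perm[0] == 9 (element 1 of op1),
   eltsz is 2, so bytes 0 and 1 of the op1 mask become 2 and 3 while
   bytes 0 and 1 of the op0 mask become -128; the ior of the two pshufb
   results then places element 1 of op1 into lane 0 of the target.  */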
37424 m128 = GEN_INT (-128);
37425 for (i = 0; i < nelt; ++i)
37426 {
37427 unsigned j, e = d->perm[i];
37428 unsigned which = (e >= nelt);
37429 if (e >= nelt)
37430 e -= nelt;
37431
37432 for (j = 0; j < eltsz; ++j)
37433 {
37434 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
37435 rperm[1-which][i*eltsz + j] = m128;
37436 }
37437 }
37438
37439 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
37440 vperm = force_reg (V16QImode, vperm);
37441
37442 l = gen_reg_rtx (V16QImode);
37443 op = gen_lowpart (V16QImode, d->op0);
37444 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
37445
37446 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
37447 vperm = force_reg (V16QImode, vperm);
37448
37449 h = gen_reg_rtx (V16QImode);
37450 op = gen_lowpart (V16QImode, d->op1);
37451 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
37452
37453 op = gen_lowpart (V16QImode, d->target);
37454 emit_insn (gen_iorv16qi3 (op, l, h));
37455
37456 return true;
37457 }
37458
37459 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
37460 with two vpshufb insns, vpermq and vpor. We should have already failed
37461 all two or three instruction sequences. */
37462
37463 static bool
37464 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
37465 {
37466 rtx rperm[2][32], vperm, l, h, hp, op, m128;
37467 unsigned int i, nelt, eltsz;
37468
37469 if (!TARGET_AVX2
37470 || !d->one_operand_p
37471 || (d->vmode != V32QImode && d->vmode != V16HImode))
37472 return false;
37473
37474 if (d->testing_p)
37475 return true;
37476
37477 nelt = d->nelt;
37478 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37479
37480 /* Generate two permutation masks. If the required element is within
37481 the same lane, it is shuffled in. If the required element is from the
37482 other lane, force a zero by setting bit 7 in the permutation mask.
37483 The other mask has non-negative entries when the element is requested
37484 from the other lane; those entries are also moved to the other lane,
37485 so that the result of vpshufb can have its two V2TImode halves
37486 swapped. */
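/* For example, in V32QImode with d->perm[0] == 20 the required byte is
   in the other (high) lane, so the cross-lane mask gets entry 4 at byte
   position 16 and the in-lane mask gets -128 at byte position 0; once
   the vpermq below swaps the two halves, that byte lands in position 0
   and the final ior merges it with the in-lane result.  */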
37487 m128 = GEN_INT (-128);
37488 for (i = 0; i < nelt; ++i)
37489 {
37490 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37491 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
37492
37493 for (j = 0; j < eltsz; ++j)
37494 {
37495 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
37496 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
37497 }
37498 }
37499
37500 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
37501 vperm = force_reg (V32QImode, vperm);
37502
37503 h = gen_reg_rtx (V32QImode);
37504 op = gen_lowpart (V32QImode, d->op0);
37505 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
37506
37507 /* Swap the 128-bit lanes of h into hp. */
37508 hp = gen_reg_rtx (V4DImode);
37509 op = gen_lowpart (V4DImode, h);
37510 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
37511 const1_rtx));
37512
37513 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
37514 vperm = force_reg (V32QImode, vperm);
37515
37516 l = gen_reg_rtx (V32QImode);
37517 op = gen_lowpart (V32QImode, d->op0);
37518 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
37519
37520 op = gen_lowpart (V32QImode, d->target);
37521 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
37522
37523 return true;
37524 }
37525
37526 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
37527 and extract-odd permutations of two V32QImode or V16HImode operands
37528 with two vpshufb insns, vpor and vpermq. We should have already
37529 failed all two or three instruction sequences. */
37530
37531 static bool
37532 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
37533 {
37534 rtx rperm[2][32], vperm, l, h, ior, op, m128;
37535 unsigned int i, nelt, eltsz;
37536
37537 if (!TARGET_AVX2
37538 || d->one_operand_p
37539 || (d->vmode != V32QImode && d->vmode != V16HImode))
37540 return false;
37541
37542 for (i = 0; i < d->nelt; ++i)
37543 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
37544 return false;
37545
37546 if (d->testing_p)
37547 return true;
37548
37549 nelt = d->nelt;
37550 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37551
37552 /* Generate two permutation masks. In the first permutation mask
37553 the first quarter will contain indexes for the first half
37554 of the op0, the second quarter will contain bit 7 set, third quarter
37555 will contain indexes for the second half of the op0 and the
37556 last quarter bit 7 set. In the second permutation mask
37557 the first quarter will contain bit 7 set, the second quarter
37558 indexes for the first half of the op1, the third quarter bit 7 set
37559 and last quarter indexes for the second half of the op1.
37560 I.e. the first mask e.g. for V32QImode extract even will be:
37561 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
37562 (all values masked with 0xf except for -128) and second mask
37563 for extract even will be
37564 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
37565 m128 = GEN_INT (-128);
37566 for (i = 0; i < nelt; ++i)
37567 {
37568 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37569 unsigned which = d->perm[i] >= nelt;
37570 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
37571
37572 for (j = 0; j < eltsz; ++j)
37573 {
37574 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
37575 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
37576 }
37577 }
37578
37579 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
37580 vperm = force_reg (V32QImode, vperm);
37581
37582 l = gen_reg_rtx (V32QImode);
37583 op = gen_lowpart (V32QImode, d->op0);
37584 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
37585
37586 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
37587 vperm = force_reg (V32QImode, vperm);
37588
37589 h = gen_reg_rtx (V32QImode);
37590 op = gen_lowpart (V32QImode, d->op1);
37591 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
37592
37593 ior = gen_reg_rtx (V32QImode);
37594 emit_insn (gen_iorv32qi3 (ior, l, h));
37595
37596 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
37597 op = gen_lowpart (V4DImode, d->target);
37598 ior = gen_lowpart (V4DImode, ior);
37599 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
37600 const1_rtx, GEN_INT (3)));
37601
37602 return true;
37603 }
37604
37605 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
37606 and extract-odd permutations. */
37607
37608 static bool
37609 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
37610 {
37611 rtx t1, t2, t3;
37612
37613 switch (d->vmode)
37614 {
37615 case V4DFmode:
37616 t1 = gen_reg_rtx (V4DFmode);
37617 t2 = gen_reg_rtx (V4DFmode);
37618
37619 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
37620 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
37621 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
37622
37623 /* Now an unpck[lh]pd will produce the result required. */
37624 if (odd)
37625 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
37626 else
37627 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
37628 emit_insn (t3);
37629 break;
37630
37631 case V8SFmode:
37632 {
37633 int mask = odd ? 0xdd : 0x88;
37634
37635 t1 = gen_reg_rtx (V8SFmode);
37636 t2 = gen_reg_rtx (V8SFmode);
37637 t3 = gen_reg_rtx (V8SFmode);
37638
37639 /* Shuffle within the 128-bit lanes to produce:
37640 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
37641 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
37642 GEN_INT (mask)));
37643
37644 /* Shuffle the lanes around to produce:
37645 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
37646 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
37647 GEN_INT (0x3)));
37648
37649 /* Shuffle within the 128-bit lanes to produce:
37650 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
37651 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
37652
37653 /* Shuffle within the 128-bit lanes to produce:
37654 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
37655 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
37656
37657 /* Shuffle the lanes around to produce:
37658 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
37659 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
37660 GEN_INT (0x20)));
37661 }
37662 break;
37663
37664 case V2DFmode:
37665 case V4SFmode:
37666 case V2DImode:
37667 case V4SImode:
37668 /* These are always directly implementable by expand_vec_perm_1. */
37669 gcc_unreachable ();
37670
37671 case V8HImode:
37672 if (TARGET_SSSE3)
37673 return expand_vec_perm_pshufb2 (d);
37674 else
37675 {
37676 /* We need 2*log2(N)-1 operations to achieve odd/even
37677 with interleave. */
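/* E.g. with op0 = { 0 1 2 3 4 5 6 7 } and op1 = { 8 9 a b c d e f }
   the sequence below computes
   t1 = { 4 c 5 d 6 e 7 f }, target = { 0 8 1 9 2 a 3 b },
   t2 = { 2 6 a e 3 7 b f }, target = { 0 4 8 c 1 5 9 d },
   and the final interleave yields { 0 2 4 6 8 a c e } for even
   or { 1 3 5 7 9 b d f } for odd.  */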
37678 t1 = gen_reg_rtx (V8HImode);
37679 t2 = gen_reg_rtx (V8HImode);
37680 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
37681 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
37682 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
37683 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
37684 if (odd)
37685 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
37686 else
37687 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
37688 emit_insn (t3);
37689 }
37690 break;
37691
37692 case V16QImode:
37693 if (TARGET_SSSE3)
37694 return expand_vec_perm_pshufb2 (d);
37695 else
37696 {
37697 t1 = gen_reg_rtx (V16QImode);
37698 t2 = gen_reg_rtx (V16QImode);
37699 t3 = gen_reg_rtx (V16QImode);
37700 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
37701 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
37702 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
37703 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
37704 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
37705 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
37706 if (odd)
37707 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
37708 else
37709 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
37710 emit_insn (t3);
37711 }
37712 break;
37713
37714 case V16HImode:
37715 case V32QImode:
37716 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
37717
37718 case V4DImode:
37719 if (!TARGET_AVX2)
37720 {
37721 struct expand_vec_perm_d d_copy = *d;
37722 d_copy.vmode = V4DFmode;
37723 d_copy.target = gen_lowpart (V4DFmode, d->target);
37724 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
37725 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
37726 return expand_vec_perm_even_odd_1 (&d_copy, odd);
37727 }
37728
37729 t1 = gen_reg_rtx (V4DImode);
37730 t2 = gen_reg_rtx (V4DImode);
37731
37732 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
37733 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
37734 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
37735
37736 /* Now a vpunpck[lh]qdq will produce the result required. */
37737 if (odd)
37738 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
37739 else
37740 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
37741 emit_insn (t3);
37742 break;
37743
37744 case V8SImode:
37745 if (!TARGET_AVX2)
37746 {
37747 struct expand_vec_perm_d d_copy = *d;
37748 d_copy.vmode = V8SFmode;
37749 d_copy.target = gen_lowpart (V8SFmode, d->target);
37750 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
37751 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
37752 return expand_vec_perm_even_odd_1 (&d_copy, odd);
37753 }
37754
37755 t1 = gen_reg_rtx (V8SImode);
37756 t2 = gen_reg_rtx (V8SImode);
37757
37758 /* Shuffle the lanes around into
37759 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
37760 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
37761 gen_lowpart (V4DImode, d->op0),
37762 gen_lowpart (V4DImode, d->op1),
37763 GEN_INT (0x20)));
37764 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
37765 gen_lowpart (V4DImode, d->op0),
37766 gen_lowpart (V4DImode, d->op1),
37767 GEN_INT (0x31)));
37768
37769 /* Swap the 2nd and 3rd position in each lane into
37770 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
37771 emit_insn (gen_avx2_pshufdv3 (t1, t1,
37772 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
37773 emit_insn (gen_avx2_pshufdv3 (t2, t2,
37774 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
37775
37776 /* Now a vpunpck[lh]qdq will produce
37777 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
37778 if (odd)
37779 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
37780 gen_lowpart (V4DImode, t1),
37781 gen_lowpart (V4DImode, t2));
37782 else
37783 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
37784 gen_lowpart (V4DImode, t1),
37785 gen_lowpart (V4DImode, t2));
37786 emit_insn (t3);
37787 break;
37788
37789 default:
37790 gcc_unreachable ();
37791 }
37792
37793 return true;
37794 }
37795
37796 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
37797 extract-even and extract-odd permutations. */
37798
37799 static bool
37800 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
37801 {
37802 unsigned i, odd, nelt = d->nelt;
37803
37804 odd = d->perm[0];
37805 if (odd != 0 && odd != 1)
37806 return false;
37807
37808 for (i = 1; i < nelt; ++i)
37809 if (d->perm[i] != 2 * i + odd)
37810 return false;
37811
37812 return expand_vec_perm_even_odd_1 (d, odd);
37813 }
37814
37815 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
37816 permutations. We assume that expand_vec_perm_1 has already failed. */
37817
37818 static bool
37819 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
37820 {
37821 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
37822 enum machine_mode vmode = d->vmode;
37823 unsigned char perm2[4];
37824 rtx op0 = d->op0;
37825 bool ok;
37826
37827 switch (vmode)
37828 {
37829 case V4DFmode:
37830 case V8SFmode:
37831 /* These are special-cased in sse.md so that we can optionally
37832 use the vbroadcast instruction. They expand to two insns
37833 if the input happens to be in a register. */
37834 gcc_unreachable ();
37835
37836 case V2DFmode:
37837 case V2DImode:
37838 case V4SFmode:
37839 case V4SImode:
37840 /* These are always implementable using standard shuffle patterns. */
37841 gcc_unreachable ();
37842
37843 case V8HImode:
37844 case V16QImode:
37845 /* These can be implemented via interleave. We save one insn by
37846 stopping once we have promoted to V4SImode and then use pshufd. */
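/* E.g. broadcasting element 5 of a V8HImode vector takes one
   interleave high (giving { 4 4 5 5 6 6 7 7 }), after which the value
   occupies V4SImode element 1, and a single pshufd with { 1 1 1 1 }
   finishes the broadcast.  */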
37847 do
37848 {
37849 rtx dest;
37850 rtx (*gen) (rtx, rtx, rtx)
37851 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
37852 : gen_vec_interleave_lowv8hi;
37853
37854 if (elt >= nelt2)
37855 {
37856 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
37857 : gen_vec_interleave_highv8hi;
37858 elt -= nelt2;
37859 }
37860 nelt2 /= 2;
37861
37862 dest = gen_reg_rtx (vmode);
37863 emit_insn (gen (dest, op0, op0));
37864 vmode = get_mode_wider_vector (vmode);
37865 op0 = gen_lowpart (vmode, dest);
37866 }
37867 while (vmode != V4SImode);
37868
37869 memset (perm2, elt, 4);
37870 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
37871 d->testing_p);
37872 gcc_assert (ok);
37873 return true;
37874
37875 case V32QImode:
37876 case V16HImode:
37877 case V8SImode:
37878 case V4DImode:
37879 /* For AVX2 broadcasts of the first element vpbroadcast* or
37880 vpermq should be used by expand_vec_perm_1. */
37881 gcc_assert (!TARGET_AVX2 || d->perm[0]);
37882 return false;
37883
37884 default:
37885 gcc_unreachable ();
37886 }
37887 }
37888
37889 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
37890 broadcast permutations. */
37891
37892 static bool
37893 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
37894 {
37895 unsigned i, elt, nelt = d->nelt;
37896
37897 if (!d->one_operand_p)
37898 return false;
37899
37900 elt = d->perm[0];
37901 for (i = 1; i < nelt; ++i)
37902 if (d->perm[i] != elt)
37903 return false;
37904
37905 return expand_vec_perm_broadcast_1 (d);
37906 }
37907
37908 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
37909 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
37910 all the shorter instruction sequences. */
37911
37912 static bool
37913 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
37914 {
37915 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
37916 unsigned int i, nelt, eltsz;
37917 bool used[4];
37918
37919 if (!TARGET_AVX2
37920 || d->one_operand_p
37921 || (d->vmode != V32QImode && d->vmode != V16HImode))
37922 return false;
37923
37924 if (d->testing_p)
37925 return true;
37926
37927 nelt = d->nelt;
37928 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37929
37930 /* Generate 4 permutation masks. If the required element is within
37931 the same lane, it is shuffled in. If the required element is from the
37932 other lane, force a zero by setting bit 7 in the permutation mask.
37933 The cross-lane masks have non-negative entries when the element is
37934 requested from the other lane; those entries are also moved to the
37935 other lane, so that the result of vpshufb can have its two V2TImode
37936 halves swapped. */
37937 m128 = GEN_INT (-128);
37938 for (i = 0; i < 32; ++i)
37939 {
37940 rperm[0][i] = m128;
37941 rperm[1][i] = m128;
37942 rperm[2][i] = m128;
37943 rperm[3][i] = m128;
37944 }
37945 used[0] = false;
37946 used[1] = false;
37947 used[2] = false;
37948 used[3] = false;
37949 for (i = 0; i < nelt; ++i)
37950 {
37951 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37952 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
37953 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
37954
37955 for (j = 0; j < eltsz; ++j)
37956 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
37957 used[which] = true;
37958 }
37959
37960 for (i = 0; i < 2; ++i)
37961 {
37962 if (!used[2 * i + 1])
37963 {
37964 h[i] = NULL_RTX;
37965 continue;
37966 }
37967 vperm = gen_rtx_CONST_VECTOR (V32QImode,
37968 gen_rtvec_v (32, rperm[2 * i + 1]));
37969 vperm = force_reg (V32QImode, vperm);
37970 h[i] = gen_reg_rtx (V32QImode);
37971 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37972 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
37973 }
37974
37975 /* Swap the 128-bit lanes of h[X]. */
37976 for (i = 0; i < 2; ++i)
37977 {
37978 if (h[i] == NULL_RTX)
37979 continue;
37980 op = gen_reg_rtx (V4DImode);
37981 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
37982 const2_rtx, GEN_INT (3), const0_rtx,
37983 const1_rtx));
37984 h[i] = gen_lowpart (V32QImode, op);
37985 }
37986
37987 for (i = 0; i < 2; ++i)
37988 {
37989 if (!used[2 * i])
37990 {
37991 l[i] = NULL_RTX;
37992 continue;
37993 }
37994 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
37995 vperm = force_reg (V32QImode, vperm);
37996 l[i] = gen_reg_rtx (V32QImode);
37997 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37998 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
37999 }
38000
38001 for (i = 0; i < 2; ++i)
38002 {
38003 if (h[i] && l[i])
38004 {
38005 op = gen_reg_rtx (V32QImode);
38006 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
38007 l[i] = op;
38008 }
38009 else if (h[i])
38010 l[i] = h[i];
38011 }
38012
38013 gcc_assert (l[0] && l[1]);
38014 op = gen_lowpart (V32QImode, d->target);
38015 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
38016 return true;
38017 }
38018
38019 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
38020 With all of the interface bits taken care of, perform the expansion
38021 in D and return true on success. */
38022
38023 static bool
38024 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
38025 {
38026 /* Try a single instruction expansion. */
38027 if (expand_vec_perm_1 (d))
38028 return true;
38029
38030 /* Try sequences of two instructions. */
38031
38032 if (expand_vec_perm_pshuflw_pshufhw (d))
38033 return true;
38034
38035 if (expand_vec_perm_palignr (d))
38036 return true;
38037
38038 if (expand_vec_perm_interleave2 (d))
38039 return true;
38040
38041 if (expand_vec_perm_broadcast (d))
38042 return true;
38043
38044 if (expand_vec_perm_vpermq_perm_1 (d))
38045 return true;
38046
38047 if (expand_vec_perm_vperm2f128 (d))
38048 return true;
38049
38050 /* Try sequences of three instructions. */
38051
38052 if (expand_vec_perm_pshufb2 (d))
38053 return true;
38054
38055 if (expand_vec_perm_interleave3 (d))
38056 return true;
38057
38058 if (expand_vec_perm_vperm2f128_vblend (d))
38059 return true;
38060
38061 /* Try sequences of four instructions. */
38062
38063 if (expand_vec_perm_vpshufb2_vpermq (d))
38064 return true;
38065
38066 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
38067 return true;
38068
38069 /* ??? Look for narrow permutations whose element orderings would
38070 allow the promotion to a wider mode. */
38071
38072 /* ??? Look for sequences of interleave or a wider permute that place
38073 the data into the correct lanes for a half-vector shuffle like
38074 pshuf[lh]w or vpermilps. */
38075
38076 /* ??? Look for sequences of interleave that produce the desired results.
38077 The combinatorics of punpck[lh] get pretty ugly... */
38078
38079 if (expand_vec_perm_even_odd (d))
38080 return true;
38081
38082 /* Even longer sequences. */
38083 if (expand_vec_perm_vpshufb4_vpermq2 (d))
38084 return true;
38085
38086 return false;
38087 }
38088
38089 bool
38090 ix86_expand_vec_perm_const (rtx operands[4])
38091 {
38092 struct expand_vec_perm_d d;
38093 unsigned char perm[MAX_VECT_LEN];
38094 int i, nelt, which;
38095 rtx sel;
38096
38097 d.target = operands[0];
38098 d.op0 = operands[1];
38099 d.op1 = operands[2];
38100 sel = operands[3];
38101
38102 d.vmode = GET_MODE (d.target);
38103 gcc_assert (VECTOR_MODE_P (d.vmode));
38104 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
38105 d.testing_p = false;
38106
38107 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
38108 gcc_assert (XVECLEN (sel, 0) == nelt);
38109 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
38110
38111 for (i = which = 0; i < nelt; ++i)
38112 {
38113 rtx e = XVECEXP (sel, 0, i);
38114 int ei = INTVAL (e) & (2 * nelt - 1);
38115
38116 which |= (ei < nelt ? 1 : 2);
38117 d.perm[i] = ei;
38118 perm[i] = ei;
38119 }
38120
38121 d.one_operand_p = true;
38122 switch (which)
38123 {
38124 default:
38125 gcc_unreachable ();
38126
38127 case 3:
38128 if (!rtx_equal_p (d.op0, d.op1))
38129 {
38130 d.one_operand_p = false;
38131 break;
38132 }
38133 /* The elements of PERM do not suggest that only the first operand
38134 is used, but both operands are identical. Allow easier matching
38135 of the permutation by folding the permutation into the single
38136 input vector. */
38137 /* FALLTHRU */
38138
38139 case 2:
38140 for (i = 0; i < nelt; ++i)
38141 d.perm[i] &= nelt - 1;
38142 d.op0 = d.op1;
38143 break;
38144
38145 case 1:
38146 d.op1 = d.op0;
38147 break;
38148 }
38149
38150 if (ix86_expand_vec_perm_const_1 (&d))
38151 return true;
38152
38153 /* If the selector says both arguments are needed, but the operands are the
38154 same, the above tried to expand with one_operand_p and flattened selector.
38155 If that didn't work, retry without one_operand_p; we succeeded with that
38156 during testing. */
38157 if (which == 3 && d.one_operand_p)
38158 {
38159 d.one_operand_p = false;
38160 memcpy (d.perm, perm, sizeof (perm));
38161 return ix86_expand_vec_perm_const_1 (&d);
38162 }
38163
38164 return false;
38165 }
38166
38167 /* Implement targetm.vectorize.vec_perm_const_ok. */
38168
38169 static bool
38170 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
38171 const unsigned char *sel)
38172 {
38173 struct expand_vec_perm_d d;
38174 unsigned int i, nelt, which;
38175 bool ret;
38176
38177 d.vmode = vmode;
38178 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
38179 d.testing_p = true;
38180
38181 /* Given sufficient ISA support we can just return true here
38182 for selected vector modes. */
38183 if (GET_MODE_SIZE (d.vmode) == 16)
38184 {
38185 /* All implementable with a single vpperm insn. */
38186 if (TARGET_XOP)
38187 return true;
38188 /* All implementable with 2 pshufb + 1 ior. */
38189 if (TARGET_SSSE3)
38190 return true;
38191 /* All implementable with shufpd or unpck[lh]pd. */
38192 if (d.nelt == 2)
38193 return true;
38194 }
38195
38196 /* Extract the values from the vector CST into the permutation
38197 array in D. */
38198 memcpy (d.perm, sel, nelt);
38199 for (i = which = 0; i < nelt; ++i)
38200 {
38201 unsigned char e = d.perm[i];
38202 gcc_assert (e < 2 * nelt);
38203 which |= (e < nelt ? 1 : 2);
38204 }
38205
38206 /* For all elements from second vector, fold the elements to first. */
38207 if (which == 2)
38208 for (i = 0; i < nelt; ++i)
38209 d.perm[i] -= nelt;
38210
38211 /* Check whether the mask can be applied to the vector type. */
38212 d.one_operand_p = (which != 3);
38213
38214 /* Implementable with shufps or pshufd. */
38215 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
38216 return true;
38217
38218 /* Otherwise we have to go through the motions and see if we can
38219 figure out how to generate the requested permutation. */
38220 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
38221 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
38222 if (!d.one_operand_p)
38223 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
38224
38225 start_sequence ();
38226 ret = ix86_expand_vec_perm_const_1 (&d);
38227 end_sequence ();
38228
38229 return ret;
38230 }
38231
38232 void
38233 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
38234 {
38235 struct expand_vec_perm_d d;
38236 unsigned i, nelt;
38237
38238 d.target = targ;
38239 d.op0 = op0;
38240 d.op1 = op1;
38241 d.vmode = GET_MODE (targ);
38242 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
38243 d.one_operand_p = false;
38244 d.testing_p = false;
38245
38246 for (i = 0; i < nelt; ++i)
38247 d.perm[i] = i * 2 + odd;
38248
38249 /* We'll either be able to implement the permutation directly... */
38250 if (expand_vec_perm_1 (&d))
38251 return;
38252
38253 /* ... or we use the special-case patterns. */
38254 expand_vec_perm_even_odd_1 (&d, odd);
38255 }
38256
38257 /* Expand an insert into a vector register through pinsr insn.
38258 Return true if successful. */
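/* E.g. inserting a HImode value at bit offset 32 of a V8HImode
   destination gives pos / size == 2 below and emits gen_sse2_pinsrw
   with the selector GEN_INT (1 << 2).  */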
38259
38260 bool
38261 ix86_expand_pinsr (rtx *operands)
38262 {
38263 rtx dst = operands[0];
38264 rtx src = operands[3];
38265
38266 unsigned int size = INTVAL (operands[1]);
38267 unsigned int pos = INTVAL (operands[2]);
38268
38269 if (GET_CODE (dst) == SUBREG)
38270 {
38271 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
38272 dst = SUBREG_REG (dst);
38273 }
38274
38275 if (GET_CODE (src) == SUBREG)
38276 src = SUBREG_REG (src);
38277
38278 switch (GET_MODE (dst))
38279 {
38280 case V16QImode:
38281 case V8HImode:
38282 case V4SImode:
38283 case V2DImode:
38284 {
38285 enum machine_mode srcmode, dstmode;
38286 rtx (*pinsr)(rtx, rtx, rtx, rtx);
38287
38288 srcmode = mode_for_size (size, MODE_INT, 0);
38289
38290 switch (srcmode)
38291 {
38292 case QImode:
38293 if (!TARGET_SSE4_1)
38294 return false;
38295 dstmode = V16QImode;
38296 pinsr = gen_sse4_1_pinsrb;
38297 break;
38298
38299 case HImode:
38300 if (!TARGET_SSE2)
38301 return false;
38302 dstmode = V8HImode;
38303 pinsr = gen_sse2_pinsrw;
38304 break;
38305
38306 case SImode:
38307 if (!TARGET_SSE4_1)
38308 return false;
38309 dstmode = V4SImode;
38310 pinsr = gen_sse4_1_pinsrd;
38311 break;
38312
38313 case DImode:
38314 gcc_assert (TARGET_64BIT);
38315 if (!TARGET_SSE4_1)
38316 return false;
38317 dstmode = V2DImode;
38318 pinsr = gen_sse4_1_pinsrq;
38319 break;
38320
38321 default:
38322 return false;
38323 }
38324
38325 dst = gen_lowpart (dstmode, dst);
38326 src = gen_lowpart (srcmode, src);
38327
38328 pos /= size;
38329
38330 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
38331 return true;
38332 }
38333
38334 default:
38335 return false;
38336 }
38337 }
38338 \f
38339 /* This function returns the calling ABI specific va_list type node.
38340 It returns the FNDECL specific va_list type. */
38341
38342 static tree
38343 ix86_fn_abi_va_list (tree fndecl)
38344 {
38345 if (!TARGET_64BIT)
38346 return va_list_type_node;
38347 gcc_assert (fndecl != NULL_TREE);
38348
38349 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
38350 return ms_va_list_type_node;
38351 else
38352 return sysv_va_list_type_node;
38353 }
38354
38355 /* Returns the canonical va_list type specified by TYPE. If there
38356 is no valid TYPE provided, it returns NULL_TREE. */
38357
38358 static tree
38359 ix86_canonical_va_list_type (tree type)
38360 {
38361 tree wtype, htype;
38362
38363 /* Resolve references and pointers to va_list type. */
38364 if (TREE_CODE (type) == MEM_REF)
38365 type = TREE_TYPE (type);
38366 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
38367 type = TREE_TYPE (type);
38368 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
38369 type = TREE_TYPE (type);
38370
38371 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
38372 {
38373 wtype = va_list_type_node;
38374 gcc_assert (wtype != NULL_TREE);
38375 htype = type;
38376 if (TREE_CODE (wtype) == ARRAY_TYPE)
38377 {
38378 /* If va_list is an array type, the argument may have decayed
38379 to a pointer type, e.g. by being passed to another function.
38380 In that case, unwrap both types so that we can compare the
38381 underlying records. */
38382 if (TREE_CODE (htype) == ARRAY_TYPE
38383 || POINTER_TYPE_P (htype))
38384 {
38385 wtype = TREE_TYPE (wtype);
38386 htype = TREE_TYPE (htype);
38387 }
38388 }
38389 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
38390 return va_list_type_node;
38391 wtype = sysv_va_list_type_node;
38392 gcc_assert (wtype != NULL_TREE);
38393 htype = type;
38394 if (TREE_CODE (wtype) == ARRAY_TYPE)
38395 {
38396 /* If va_list is an array type, the argument may have decayed
38397 to a pointer type, e.g. by being passed to another function.
38398 In that case, unwrap both types so that we can compare the
38399 underlying records. */
38400 if (TREE_CODE (htype) == ARRAY_TYPE
38401 || POINTER_TYPE_P (htype))
38402 {
38403 wtype = TREE_TYPE (wtype);
38404 htype = TREE_TYPE (htype);
38405 }
38406 }
38407 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
38408 return sysv_va_list_type_node;
38409 wtype = ms_va_list_type_node;
38410 gcc_assert (wtype != NULL_TREE);
38411 htype = type;
38412 if (TREE_CODE (wtype) == ARRAY_TYPE)
38413 {
38414 /* If va_list is an array type, the argument may have decayed
38415 to a pointer type, e.g. by being passed to another function.
38416 In that case, unwrap both types so that we can compare the
38417 underlying records. */
38418 if (TREE_CODE (htype) == ARRAY_TYPE
38419 || POINTER_TYPE_P (htype))
38420 {
38421 wtype = TREE_TYPE (wtype);
38422 htype = TREE_TYPE (htype);
38423 }
38424 }
38425 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
38426 return ms_va_list_type_node;
38427 return NULL_TREE;
38428 }
38429 return std_canonical_va_list_type (type);
38430 }
38431
38432 /* Iterate through the target-specific builtin types for va_list.
38433 IDX denotes the iterator, *PTREE is set to the result type of
38434 the va_list builtin, and *PNAME to its internal type.
38435 Returns zero if there is no element for this index, otherwise
38436 IDX should be increased upon the next call.
38437 Note, do not iterate a base builtin's name like __builtin_va_list.
38438 Used from c_common_nodes_and_builtins. */
38439
38440 static int
38441 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
38442 {
38443 if (TARGET_64BIT)
38444 {
38445 switch (idx)
38446 {
38447 default:
38448 break;
38449
38450 case 0:
38451 *ptree = ms_va_list_type_node;
38452 *pname = "__builtin_ms_va_list";
38453 return 1;
38454
38455 case 1:
38456 *ptree = sysv_va_list_type_node;
38457 *pname = "__builtin_sysv_va_list";
38458 return 1;
38459 }
38460 }
38461
38462 return 0;
38463 }
38464
38465 #undef TARGET_SCHED_DISPATCH
38466 #define TARGET_SCHED_DISPATCH has_dispatch
38467 #undef TARGET_SCHED_DISPATCH_DO
38468 #define TARGET_SCHED_DISPATCH_DO do_dispatch
38469 #undef TARGET_SCHED_REASSOCIATION_WIDTH
38470 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
38471
38472 /* The size of the dispatch window is the total number of bytes of
38473 object code allowed in a window. */
38474 #define DISPATCH_WINDOW_SIZE 16
38475
38476 /* Number of dispatch windows considered for scheduling. */
38477 #define MAX_DISPATCH_WINDOWS 3
38478
38479 /* Maximum number of instructions in a window. */
38480 #define MAX_INSN 4
38481
38482 /* Maximum number of immediate operands in a window. */
38483 #define MAX_IMM 4
38484
38485 /* Maximum number of immediate bits allowed in a window. */
38486 #define MAX_IMM_SIZE 128
38487
38488 /* Maximum number of 32 bit immediates allowed in a window. */
38489 #define MAX_IMM_32 4
38490
38491 /* Maximum number of 64 bit immediates allowed in a window. */
38492 #define MAX_IMM_64 2
38493
38494 /* Maximum total of loads or prefetches allowed in a window. */
38495 #define MAX_LOAD 2
38496
38497 /* Maximum total of stores allowed in a window. */
38498 #define MAX_STORE 1
38499
38500 #undef BIG
38501 #define BIG 100
38502
38503
38504 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
38505 enum dispatch_group {
38506 disp_no_group = 0,
38507 disp_load,
38508 disp_store,
38509 disp_load_store,
38510 disp_prefetch,
38511 disp_imm,
38512 disp_imm_32,
38513 disp_imm_64,
38514 disp_branch,
38515 disp_cmp,
38516 disp_jcc,
38517 disp_last
38518 };
38519
38520 /* Number of allowable groups in a dispatch window. It is an array
38521 indexed by dispatch_group enum. 100 is used as a big number,
38522 because the number of such operations does not have any effect on
38523 the dispatch window, but we still need entries for them in the
38524 table. */
38525 static unsigned int num_allowable_groups[disp_last] = {
38526 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
38527 };
38528
38529 char group_name[disp_last + 1][16] = {
38530 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
38531 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
38532 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
38533 };
38534
38535 /* Instruction path. */
38536 enum insn_path {
38537 no_path = 0,
38538 path_single, /* Single micro op. */
38539 path_double, /* Double micro op. */
38540 path_multi, /* Instructions with more than 2 micro ops. */
38541 last_path
38542 };
38543
38544 /* sched_insn_info defines a window to the instructions scheduled in
38545 the basic block. It contains a pointer to the insn_info table and
38546 the instruction scheduled.
38547
38548 Windows are allocated for each basic block and are linked
38549 together. */
38550 typedef struct sched_insn_info_s {
38551 rtx insn;
38552 enum dispatch_group group;
38553 enum insn_path path;
38554 int byte_len;
38555 int imm_bytes;
38556 } sched_insn_info;
38557
38558 /* Linked list of dispatch windows. This is a two-way list of
38559 dispatch windows of a basic block. It contains information about
38560 the number of uops in the window and the total number of
38561 instructions and of bytes in the object code for this dispatch
38562 window. */
38563 typedef struct dispatch_windows_s {
38564 int num_insn; /* Number of insn in the window. */
38565 int num_uops; /* Number of uops in the window. */
38566 int window_size; /* Number of bytes in the window. */
38567 int window_num; /* Window number, either 0 or 1. */
38568 int num_imm; /* Number of immediates in an insn. */
38569 int num_imm_32; /* Number of 32 bit immediates in an insn. */
38570 int num_imm_64; /* Number of 64 bit immediates in an insn. */
38571 int imm_size; /* Total immediates in the window. */
38572 int num_loads; /* Total memory loads in the window. */
38573 int num_stores; /* Total memory stores in the window. */
38574 int violation; /* Violation exists in window. */
38575 sched_insn_info *window; /* Pointer to the window. */
38576 struct dispatch_windows_s *next;
38577 struct dispatch_windows_s *prev;
38578 } dispatch_windows;
38579
38580 /* Immediate values used in an insn. */
38581 typedef struct imm_info_s
38582 {
38583 int imm;
38584 int imm32;
38585 int imm64;
38586 } imm_info;
38587
38588 static dispatch_windows *dispatch_window_list;
38589 static dispatch_windows *dispatch_window_list1;
38590
38591 /* Get dispatch group of insn. */
38592
38593 static enum dispatch_group
38594 get_mem_group (rtx insn)
38595 {
38596 enum attr_memory memory;
38597
38598 if (INSN_CODE (insn) < 0)
38599 return disp_no_group;
38600 memory = get_attr_memory (insn);
38601 if (memory == MEMORY_STORE)
38602 return disp_store;
38603
38604 if (memory == MEMORY_LOAD)
38605 return disp_load;
38606
38607 if (memory == MEMORY_BOTH)
38608 return disp_load_store;
38609
38610 return disp_no_group;
38611 }
38612
38613 /* Return true if insn is a compare instruction. */
38614
38615 static bool
38616 is_cmp (rtx insn)
38617 {
38618 enum attr_type type;
38619
38620 type = get_attr_type (insn);
38621 return (type == TYPE_TEST
38622 || type == TYPE_ICMP
38623 || type == TYPE_FCMP
38624 || GET_CODE (PATTERN (insn)) == COMPARE);
38625 }
38626
38627 /* Return true if a dispatch violation was encountered. */
38628
38629 static bool
38630 dispatch_violation (void)
38631 {
38632 if (dispatch_window_list->next)
38633 return dispatch_window_list->next->violation;
38634 return dispatch_window_list->violation;
38635 }
38636
38637 /* Return true if insn is a branch instruction. */
38638
38639 static bool
38640 is_branch (rtx insn)
38641 {
38642 return (CALL_P (insn) || JUMP_P (insn));
38643 }
38644
38645 /* Return true if insn is a prefetch instruction. */
38646
38647 static bool
38648 is_prefetch (rtx insn)
38649 {
38650 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
38651 }
38652
38653 /* This function initializes a dispatch window and the list container holding a
38654 pointer to the window. */
38655
38656 static void
38657 init_window (int window_num)
38658 {
38659 int i;
38660 dispatch_windows *new_list;
38661
38662 if (window_num == 0)
38663 new_list = dispatch_window_list;
38664 else
38665 new_list = dispatch_window_list1;
38666
38667 new_list->num_insn = 0;
38668 new_list->num_uops = 0;
38669 new_list->window_size = 0;
38670 new_list->next = NULL;
38671 new_list->prev = NULL;
38672 new_list->window_num = window_num;
38673 new_list->num_imm = 0;
38674 new_list->num_imm_32 = 0;
38675 new_list->num_imm_64 = 0;
38676 new_list->imm_size = 0;
38677 new_list->num_loads = 0;
38678 new_list->num_stores = 0;
38679 new_list->violation = false;
38680
38681 for (i = 0; i < MAX_INSN; i++)
38682 {
38683 new_list->window[i].insn = NULL;
38684 new_list->window[i].group = disp_no_group;
38685 new_list->window[i].path = no_path;
38686 new_list->window[i].byte_len = 0;
38687 new_list->window[i].imm_bytes = 0;
38688 }
38689 return;
38690 }
38691
38692 /* This function allocates and initializes a dispatch window and the
38693 list container holding a pointer to the window. */
38694
38695 static dispatch_windows *
38696 allocate_window (void)
38697 {
38698 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
38699 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
38700
38701 return new_list;
38702 }
38703
38704 /* This routine initializes the dispatch scheduling information. It
38705 initiates building dispatch scheduler tables and constructs the
38706 first dispatch window. */
38707
38708 static void
38709 init_dispatch_sched (void)
38710 {
38711 /* Allocate a dispatch list and a window. */
38712 dispatch_window_list = allocate_window ();
38713 dispatch_window_list1 = allocate_window ();
38714 init_window (0);
38715 init_window (1);
38716 }
38717
38718 /* Return true if GROUP ends a basic block.  The end of a basic block
38719    does not have to be a branch, but here we assume only branches end a
38720    window. */
38721
38722 static bool
38723 is_end_basic_block (enum dispatch_group group)
38724 {
38725 return group == disp_branch;
38726 }
38727
38728 /* This function is called when the end of a window's processing is reached. */
38729
38730 static void
38731 process_end_window (void)
38732 {
38733 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
38734 if (dispatch_window_list->next)
38735 {
38736 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
38737 gcc_assert (dispatch_window_list->window_size
38738 + dispatch_window_list1->window_size <= 48);
38739 init_window (1);
38740 }
38741 init_window (0);
38742 }
38743
38744 /* Make the dispatch window with number WINDOW_NUM the current one and
38745    return it.  WINDOW_NUM is either 0 or 1.  A maximum of two windows
38746    are used for 48 bytes of instructions.  Note that these windows are
38747    not dispatch windows whose sizes are DISPATCH_WINDOW_SIZE. */
38748
38749 static dispatch_windows *
38750 allocate_next_window (int window_num)
38751 {
38752 if (window_num == 0)
38753 {
38754 if (dispatch_window_list->next)
38755 init_window (1);
38756 init_window (0);
38757 return dispatch_window_list;
38758 }
38759
38760 dispatch_window_list->next = dispatch_window_list1;
38761 dispatch_window_list1->prev = dispatch_window_list;
38762
38763 return dispatch_window_list1;
38764 }
38765
38766 /* Count an immediate operand found in *IN_RTX; for_each_rtx callback used by find_constant. */
38767
38768 static int
38769 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
38770 {
38771 if (*in_rtx == 0)
38772 return 0;
38773
38774   switch (GET_CODE (*in_rtx))
38775 {
38776 case CONST:
38777 case SYMBOL_REF:
38778 case CONST_INT:
38779 (imm_values->imm)++;
38780 if (x86_64_immediate_operand (*in_rtx, SImode))
38781 (imm_values->imm32)++;
38782 else
38783 (imm_values->imm64)++;
38784 break;
38785
38786 case CONST_DOUBLE:
38787 (imm_values->imm)++;
38788 (imm_values->imm64)++;
38789 break;
38790
38791 case CODE_LABEL:
38792 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
38793 {
38794 (imm_values->imm)++;
38795 (imm_values->imm32)++;
38796 }
38797 break;
38798
38799 default:
38800 break;
38801 }
38802
38803 return 0;
38804 }
38805
38806 /* Compute number of immediate operands of an instruction. */
38807
38808 static void
38809 find_constant (rtx in_rtx, imm_info *imm_values)
38810 {
38811 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
38812 (rtx_function) find_constant_1, (void *) imm_values);
38813 }
38814
38815 /* Return the total size of the immediate operands of an instruction
38816    along with the number of corresponding immediate operands.  The
38817    counters are initialized to zero before calling FIND_CONSTANT.
38818    INSN is the input instruction.  IMM is the total number of immediates.
38819    IMM32 is the number of 32 bit immediates.  IMM64 is the number of 64
38820    bit immediates. */
38821
38822 static int
38823 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
38824 {
38825 imm_info imm_values = {0, 0, 0};
38826
38827 find_constant (insn, &imm_values);
38828 *imm = imm_values.imm;
38829 *imm32 = imm_values.imm32;
38830 *imm64 = imm_values.imm64;
38831 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
38832 }
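
/* Worked example (for illustration only, not used by the scheduler):
   for an instruction such as

       movabsq $0x123456789, %rax

   the CONST_INT does not fit a sign-extended 32-bit immediate, so
   find_constant records imm = 1, imm32 = 0, imm64 = 1 and
   get_num_immediates returns 0 * 4 + 1 * 8 = 8 bytes, whereas

       addl $42, %eax

   yields imm = 1, imm32 = 1, imm64 = 0 and a size of 1 * 4 = 4 bytes.  */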
38833
38834 /* Return true if INSN has an immediate operand. */
38836
38837 static bool
38838 has_immediate (rtx insn)
38839 {
38840 int num_imm_operand;
38841 int num_imm32_operand;
38842 int num_imm64_operand;
38843
38844 if (insn)
38845 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38846 &num_imm64_operand);
38847 return false;
38848 }
38849
38850 /* Return the decode path (single, double or multi) for INSN. */
38851
38852 static enum insn_path
38853 get_insn_path (rtx insn)
38854 {
38855 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
38856
38857 if ((int)path == 0)
38858 return path_single;
38859
38860 if ((int)path == 1)
38861 return path_double;
38862
38863 return path_multi;
38864 }
38865
38866 /* Return insn dispatch group. */
38867
38868 static enum dispatch_group
38869 get_insn_group (rtx insn)
38870 {
38871 enum dispatch_group group = get_mem_group (insn);
38872 if (group)
38873 return group;
38874
38875 if (is_branch (insn))
38876 return disp_branch;
38877
38878 if (is_cmp (insn))
38879 return disp_cmp;
38880
38881 if (has_immediate (insn))
38882 return disp_imm;
38883
38884 if (is_prefetch (insn))
38885 return disp_prefetch;
38886
38887 return disp_no_group;
38888 }
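
/* For illustration: the checks above are ordered, so a compare that
   touches memory, e.g.

       cmpl $3, (%rdi)

   is typically classified as disp_load by get_mem_group before is_cmp
   is ever consulted, while a register compare such as "cmpl $3, %eax"
   falls through to disp_cmp.  */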
38889
38890 /* Count the GROUP-restricted slots INSN would occupy in dispatch window
38891    WINDOW_LIST; return BIG if the window's restrictions would be exceeded. */
38892
38893 static int
38894 count_num_restricted (rtx insn, dispatch_windows *window_list)
38895 {
38896 enum dispatch_group group = get_insn_group (insn);
38897 int imm_size;
38898 int num_imm_operand;
38899 int num_imm32_operand;
38900 int num_imm64_operand;
38901
38902 if (group == disp_no_group)
38903 return 0;
38904
38905 if (group == disp_imm)
38906 {
38907 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38908 &num_imm64_operand);
38909 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
38910 || num_imm_operand + window_list->num_imm > MAX_IMM
38911 || (num_imm32_operand > 0
38912 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
38913 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
38914 || (num_imm64_operand > 0
38915 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
38916 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
38917 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
38918 && num_imm64_operand > 0
38919 && ((window_list->num_imm_64 > 0
38920 && window_list->num_insn >= 2)
38921 || window_list->num_insn >= 3)))
38922 return BIG;
38923
38924 return 1;
38925 }
38926
38927 if ((group == disp_load_store
38928 && (window_list->num_loads >= MAX_LOAD
38929 || window_list->num_stores >= MAX_STORE))
38930 || ((group == disp_load
38931 || group == disp_prefetch)
38932 && window_list->num_loads >= MAX_LOAD)
38933 || (group == disp_store
38934 && window_list->num_stores >= MAX_STORE))
38935 return BIG;
38936
38937 return 1;
38938 }
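
/* Sketch of how the value above is consumed: a return of 1 means INSN
   occupies one restricted slot and is compared against
   num_allowable_groups[group] by fits_dispatch_window, while BIG is
   intended to exceed any allowable count, so e.g. another disp_store
   once the window already holds MAX_STORE stores is rejected
   unconditionally.  */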
38939
38940 /* This function returns true if insn satisfies dispatch rules on the
38941 last window scheduled. */
38942
38943 static bool
38944 fits_dispatch_window (rtx insn)
38945 {
38946 dispatch_windows *window_list = dispatch_window_list;
38947 dispatch_windows *window_list_next = dispatch_window_list->next;
38948 unsigned int num_restrict;
38949 enum dispatch_group group = get_insn_group (insn);
38950 enum insn_path path = get_insn_path (insn);
38951 int sum;
38952
38953   /* Make disp_cmp and disp_jcc get scheduled at the latest.  These
38954      instructions should be given the lowest priority in the
38955      Haifa scheduler so that they are scheduled in the same dispatch
38956      window as the instructions that reference them. */
38957 if (group == disp_jcc || group == disp_cmp)
38958 return false;
38959
38960 /* Check nonrestricted. */
38961 if (group == disp_no_group || group == disp_branch)
38962 return true;
38963
38964 /* Get last dispatch window. */
38965 if (window_list_next)
38966 window_list = window_list_next;
38967
38968 if (window_list->window_num == 1)
38969 {
38970 sum = window_list->prev->window_size + window_list->window_size;
38971
38972 if (sum == 32
38973 || (min_insn_size (insn) + sum) >= 48)
38974 /* Window 1 is full. Go for next window. */
38975 return true;
38976 }
38977
38978 num_restrict = count_num_restricted (insn, window_list);
38979
38980 if (num_restrict > num_allowable_groups[group])
38981 return false;
38982
38983 /* See if it fits in the first window. */
38984 if (window_list->window_num == 0)
38985 {
38986       /* The first window should have only single- and double-path
38987 	 uops. */
38988 if (path == path_double
38989 && (window_list->num_uops + 2) > MAX_INSN)
38990 return false;
38991 else if (path != path_single)
38992 return false;
38993 }
38994 return true;
38995 }
38996
38997 /* Add an instruction INSN with NUM_UOPS micro-operations to the
38998 dispatch window WINDOW_LIST. */
38999
39000 static void
39001 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
39002 {
39003 int byte_len = min_insn_size (insn);
39004 int num_insn = window_list->num_insn;
39005 int imm_size;
39006 sched_insn_info *window = window_list->window;
39007 enum dispatch_group group = get_insn_group (insn);
39008 enum insn_path path = get_insn_path (insn);
39009 int num_imm_operand;
39010 int num_imm32_operand;
39011 int num_imm64_operand;
39012
39013 if (!window_list->violation && group != disp_cmp
39014 && !fits_dispatch_window (insn))
39015 window_list->violation = true;
39016
39017 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
39018 &num_imm64_operand);
39019
39020 /* Initialize window with new instruction. */
39021 window[num_insn].insn = insn;
39022 window[num_insn].byte_len = byte_len;
39023 window[num_insn].group = group;
39024 window[num_insn].path = path;
39025 window[num_insn].imm_bytes = imm_size;
39026
39027 window_list->window_size += byte_len;
39028 window_list->num_insn = num_insn + 1;
39029 window_list->num_uops = window_list->num_uops + num_uops;
39030 window_list->imm_size += imm_size;
39031 window_list->num_imm += num_imm_operand;
39032 window_list->num_imm_32 += num_imm32_operand;
39033 window_list->num_imm_64 += num_imm64_operand;
39034
39035 if (group == disp_store)
39036 window_list->num_stores += 1;
39037 else if (group == disp_load
39038 || group == disp_prefetch)
39039 window_list->num_loads += 1;
39040 else if (group == disp_load_store)
39041 {
39042 window_list->num_stores += 1;
39043 window_list->num_loads += 1;
39044 }
39045 }
39046
39047 /* Add a scheduled instruction, INSN, to the current dispatch window.
39048    If the total bytes of instructions or the number of instructions in
39049    the window exceed the allowable limits, it moves on to a new window. */
39050
39051 static void
39052 add_to_dispatch_window (rtx insn)
39053 {
39054 int byte_len;
39055 dispatch_windows *window_list;
39056 dispatch_windows *next_list;
39057 dispatch_windows *window0_list;
39058 enum insn_path path;
39059 enum dispatch_group insn_group;
39060 bool insn_fits;
39061 int num_insn;
39062 int num_uops;
39063 int window_num;
39064 int insn_num_uops;
39065 int sum;
39066
39067 if (INSN_CODE (insn) < 0)
39068 return;
39069
39070 byte_len = min_insn_size (insn);
39071 window_list = dispatch_window_list;
39072 next_list = window_list->next;
39073 path = get_insn_path (insn);
39074 insn_group = get_insn_group (insn);
39075
39076 /* Get the last dispatch window. */
39077 if (next_list)
39078 window_list = dispatch_window_list->next;
39079
39080 if (path == path_single)
39081 insn_num_uops = 1;
39082 else if (path == path_double)
39083 insn_num_uops = 2;
39084 else
39085 insn_num_uops = (int) path;
39086
39087   /* If the current window is full, get a new window.
39088      Window number zero is full if MAX_INSN uops are scheduled in it.
39089      Window number one is full if window zero's bytes plus window
39090      one's bytes reach 32, if adding the bytes of the new instruction
39091      would bring the total to 48 or more, or if it already has MAX_INSN
39092      instructions in it. */
39093 num_insn = window_list->num_insn;
39094 num_uops = window_list->num_uops;
39095 window_num = window_list->window_num;
39096 insn_fits = fits_dispatch_window (insn);
39097
39098 if (num_insn >= MAX_INSN
39099 || num_uops + insn_num_uops > MAX_INSN
39100 || !(insn_fits))
39101 {
39102 window_num = ~window_num & 1;
39103 window_list = allocate_next_window (window_num);
39104 }
39105
39106 if (window_num == 0)
39107 {
39108 add_insn_window (insn, window_list, insn_num_uops);
39109 if (window_list->num_insn >= MAX_INSN
39110 && insn_group == disp_branch)
39111 {
39112 process_end_window ();
39113 return;
39114 }
39115 }
39116 else if (window_num == 1)
39117 {
39118 window0_list = window_list->prev;
39119 sum = window0_list->window_size + window_list->window_size;
39120 if (sum == 32
39121 || (byte_len + sum) >= 48)
39122 {
39123 process_end_window ();
39124 window_list = dispatch_window_list;
39125 }
39126
39127 add_insn_window (insn, window_list, insn_num_uops);
39128 }
39129 else
39130 gcc_unreachable ();
39131
39132 if (is_end_basic_block (insn_group))
39133 {
39134       /* The end of the basic block has been reached; do the
	 end-of-basic-block processing. */
39135 process_end_window ();
39136 return;
39137 }
39138 }
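
/* Numeric sketch of the rollover logic above (the sizes are only
   illustrative): if window 0 holds 28 bytes and window 1 holds 14, a
   7-byte instruction would bring the pair to 49 bytes, so
   process_end_window re-initializes both windows and the instruction
   is placed into a fresh window 0.  */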
39139
39140 /* Print the dispatch window, WINDOW_NUM, to FILE. */
39141
39142 DEBUG_FUNCTION static void
39143 debug_dispatch_window_file (FILE *file, int window_num)
39144 {
39145 dispatch_windows *list;
39146 int i;
39147
39148 if (window_num == 0)
39149 list = dispatch_window_list;
39150 else
39151 list = dispatch_window_list1;
39152
39153 fprintf (file, "Window #%d:\n", list->window_num);
39154 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
39155 list->num_insn, list->num_uops, list->window_size);
39156 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
39157 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
39158
39159 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
39160 list->num_stores);
39161 fprintf (file, " insn info:\n");
39162
39163 for (i = 0; i < MAX_INSN; i++)
39164 {
39165 if (!list->window[i].insn)
39166 break;
39167 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
39168 i, group_name[list->window[i].group],
39169 i, (void *)list->window[i].insn,
39170 i, list->window[i].path,
39171 i, list->window[i].byte_len,
39172 i, list->window[i].imm_bytes);
39173 }
39174 }
39175
39176 /* Print to stdout a dispatch window. */
39177
39178 DEBUG_FUNCTION void
39179 debug_dispatch_window (int window_num)
39180 {
39181 debug_dispatch_window_file (stdout, window_num);
39182 }
39183
39184 /* Print INSN dispatch information to FILE. */
39185
39186 DEBUG_FUNCTION static void
39187 debug_insn_dispatch_info_file (FILE *file, rtx insn)
39188 {
39189 int byte_len;
39190 enum insn_path path;
39191 enum dispatch_group group;
39192 int imm_size;
39193 int num_imm_operand;
39194 int num_imm32_operand;
39195 int num_imm64_operand;
39196
39197 if (INSN_CODE (insn) < 0)
39198 return;
39199
39200 byte_len = min_insn_size (insn);
39201 path = get_insn_path (insn);
39202 group = get_insn_group (insn);
39203 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
39204 &num_imm64_operand);
39205
39206 fprintf (file, " insn info:\n");
39207 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
39208 group_name[group], path, byte_len);
39209 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
39210 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
39211 }
39212
39213 /* Print to STDOUT the status of the ready list with respect to
39214    dispatch windows. */
39215
39216 DEBUG_FUNCTION void
39217 debug_ready_dispatch (void)
39218 {
39219 int i;
39220 int no_ready = number_in_ready ();
39221
39222 fprintf (stdout, "Number of ready: %d\n", no_ready);
39223
39224 for (i = 0; i < no_ready; i++)
39225 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
39226 }
39227
39228 /* This routine is the driver of the dispatch scheduler. */
39229
39230 static void
39231 do_dispatch (rtx insn, int mode)
39232 {
39233 if (mode == DISPATCH_INIT)
39234 init_dispatch_sched ();
39235 else if (mode == ADD_TO_DISPATCH_WINDOW)
39236 add_to_dispatch_window (insn);
39237 }
39238
39239 /* Answer dispatch scheduling query ACTION for INSN; false if not supported. */
39240
39241 static bool
39242 has_dispatch (rtx insn, int action)
39243 {
39244 if ((TARGET_BDVER1 || TARGET_BDVER2)
39245 && flag_dispatch_scheduler)
39246 switch (action)
39247 {
39248 default:
39249 return false;
39250
39251 case IS_DISPATCH_ON:
39252 return true;
39253 break;
39254
39255 case IS_CMP:
39256 return is_cmp (insn);
39257
39258 case DISPATCH_VIOLATION:
39259 return dispatch_violation ();
39260
39261 case FITS_DISPATCH_WINDOW:
39262 return fits_dispatch_window (insn);
39263 }
39264
39265 return false;
39266 }
39267
39268 /* Implementation of the reassociation_width target hook, used by
39269    the reassoc pass to identify the level of parallelism in a
39270    reassociated tree.  The statement's tree code is passed in OPC
39271    and the operand mode in MODE.
39272
39273    Currently parallel reassociation is enabled only for Atom
39274    processors, and we set the reassociation width to 2 because
39275    Atom may issue up to 2 instructions per cycle.
39276
39277    The return value should be adjusted if parallel reassociation is
39278    enabled for other processors. */
39279
39280 static int
39281 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
39282 enum machine_mode mode)
39283 {
39284 int res = 1;
39285
39286 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
39287 res = 2;
39288 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
39289 res = 2;
39290
39291 return res;
39292 }
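
/* For illustration: with a width of 2 the reassoc pass may rewrite a
   dependent chain such as ((a + b) + c) + d into (a + b) + (c + d),
   exposing two independent additions that an Atom core can issue in
   the same cycle.  */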
39293
39294 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
39295 place emms and femms instructions. */
39296
39297 static enum machine_mode
39298 ix86_preferred_simd_mode (enum machine_mode mode)
39299 {
39300 if (!TARGET_SSE)
39301 return word_mode;
39302
39303 switch (mode)
39304 {
39305 case QImode:
39306 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
39307 case HImode:
39308 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
39309 case SImode:
39310 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
39311 case DImode:
39312 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
39313
39314 case SFmode:
39315 if (TARGET_AVX && !TARGET_PREFER_AVX128)
39316 return V8SFmode;
39317 else
39318 return V4SFmode;
39319
39320 case DFmode:
39321 if (!TARGET_VECTORIZE_DOUBLE)
39322 return word_mode;
39323 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
39324 return V4DFmode;
39325 else if (TARGET_SSE2)
39326 return V2DFmode;
39327 /* FALLTHRU */
39328
39329 default:
39330 return word_mode;
39331 }
39332 }
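
/* Example of the mapping above (a sketch, assuming default tuning):
   with -mavx and without -mprefer-avx128, SImode vectorizes in
   V8SImode (eight 32-bit lanes per vector), while plain -msse2 yields
   V4SImode; DFmode additionally requires TARGET_VECTORIZE_DOUBLE or it
   falls back to word_mode, i.e. no vectorization of doubles.  */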
39333
39334 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
39335 vectors. */
39336
39337 static unsigned int
39338 ix86_autovectorize_vector_sizes (void)
39339 {
39340 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
39341 }
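
/* Note for the hook above: the result is interpreted as a bit mask of
   vector sizes (in bytes) for the vectorizer to try, so 32 | 16 allows
   both 256-bit and 128-bit vectors, while returning 0 restricts it to
   the preferred SIMD mode selected above.  */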
39342
39343 /* Validate target specific memory model bits in VAL. */
39344
39345 static unsigned HOST_WIDE_INT
39346 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
39347 {
39348 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
39349 unsigned HOST_WIDE_INT strong;
39350
39351 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
39352 |MEMMODEL_MASK)
39353 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
39354 {
39355 warning (OPT_Winvalid_memory_model,
39356 "Unknown architecture specific memory model");
39357 return MEMMODEL_SEQ_CST;
39358 }
39359 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
39360 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
39361 {
39362 warning (OPT_Winvalid_memory_model,
39363 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
39364 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
39365 }
39366 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
39367 {
39368 warning (OPT_Winvalid_memory_model,
39369 "HLE_RELEASE not used with RELEASE or stronger memory model");
39370 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
39371 }
39372 return val;
39373 }
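
/* For illustration, the HLE bits validated above come from user code
   that combines them with a C11-style memory model, e.g. (assuming a
   configuration where __ATOMIC_HLE_ACQUIRE and __ATOMIC_HLE_RELEASE
   are available, such as -mhle):

       while (__atomic_exchange_n (&lock, 1,
                                   __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
         ;
       ...critical section...
       __atomic_store_n (&lock, 0,
                         __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);

   Pairing IX86_HLE_ACQUIRE with a model weaker than ACQUIRE (or
   IX86_HLE_RELEASE with one weaker than RELEASE) is diagnosed above,
   and the model is forced to MEMMODEL_SEQ_CST while the HLE bit is
   kept.  */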
39374
39375 /* Initialize the GCC target structure. */
39376 #undef TARGET_RETURN_IN_MEMORY
39377 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
39378
39379 #undef TARGET_LEGITIMIZE_ADDRESS
39380 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
39381
39382 #undef TARGET_ATTRIBUTE_TABLE
39383 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
39384 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
39385 # undef TARGET_MERGE_DECL_ATTRIBUTES
39386 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
39387 #endif
39388
39389 #undef TARGET_COMP_TYPE_ATTRIBUTES
39390 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
39391
39392 #undef TARGET_INIT_BUILTINS
39393 #define TARGET_INIT_BUILTINS ix86_init_builtins
39394 #undef TARGET_BUILTIN_DECL
39395 #define TARGET_BUILTIN_DECL ix86_builtin_decl
39396 #undef TARGET_EXPAND_BUILTIN
39397 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
39398
39399 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
39400 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
39401 ix86_builtin_vectorized_function
39402
39403 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
39404 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
39405
39406 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
39407 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
39408
39409 #undef TARGET_VECTORIZE_BUILTIN_GATHER
39410 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
39411
39412 #undef TARGET_BUILTIN_RECIPROCAL
39413 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
39414
39415 #undef TARGET_ASM_FUNCTION_EPILOGUE
39416 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
39417
39418 #undef TARGET_ENCODE_SECTION_INFO
39419 #ifndef SUBTARGET_ENCODE_SECTION_INFO
39420 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
39421 #else
39422 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
39423 #endif
39424
39425 #undef TARGET_ASM_OPEN_PAREN
39426 #define TARGET_ASM_OPEN_PAREN ""
39427 #undef TARGET_ASM_CLOSE_PAREN
39428 #define TARGET_ASM_CLOSE_PAREN ""
39429
39430 #undef TARGET_ASM_BYTE_OP
39431 #define TARGET_ASM_BYTE_OP ASM_BYTE
39432
39433 #undef TARGET_ASM_ALIGNED_HI_OP
39434 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
39435 #undef TARGET_ASM_ALIGNED_SI_OP
39436 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
39437 #ifdef ASM_QUAD
39438 #undef TARGET_ASM_ALIGNED_DI_OP
39439 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
39440 #endif
39441
39442 #undef TARGET_PROFILE_BEFORE_PROLOGUE
39443 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
39444
39445 #undef TARGET_ASM_UNALIGNED_HI_OP
39446 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
39447 #undef TARGET_ASM_UNALIGNED_SI_OP
39448 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
39449 #undef TARGET_ASM_UNALIGNED_DI_OP
39450 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
39451
39452 #undef TARGET_PRINT_OPERAND
39453 #define TARGET_PRINT_OPERAND ix86_print_operand
39454 #undef TARGET_PRINT_OPERAND_ADDRESS
39455 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
39456 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
39457 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
39458 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
39459 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
39460
39461 #undef TARGET_SCHED_INIT_GLOBAL
39462 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
39463 #undef TARGET_SCHED_ADJUST_COST
39464 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
39465 #undef TARGET_SCHED_ISSUE_RATE
39466 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
39467 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
39468 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
39469 ia32_multipass_dfa_lookahead
39470
39471 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
39472 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
39473
39474 #undef TARGET_MEMMODEL_CHECK
39475 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
39476
39477 #ifdef HAVE_AS_TLS
39478 #undef TARGET_HAVE_TLS
39479 #define TARGET_HAVE_TLS true
39480 #endif
39481 #undef TARGET_CANNOT_FORCE_CONST_MEM
39482 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
39483 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
39484 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
39485
39486 #undef TARGET_DELEGITIMIZE_ADDRESS
39487 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
39488
39489 #undef TARGET_MS_BITFIELD_LAYOUT_P
39490 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
39491
39492 #if TARGET_MACHO
39493 #undef TARGET_BINDS_LOCAL_P
39494 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
39495 #endif
39496 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
39497 #undef TARGET_BINDS_LOCAL_P
39498 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
39499 #endif
39500
39501 #undef TARGET_ASM_OUTPUT_MI_THUNK
39502 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
39503 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
39504 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
39505
39506 #undef TARGET_ASM_FILE_START
39507 #define TARGET_ASM_FILE_START x86_file_start
39508
39509 #undef TARGET_OPTION_OVERRIDE
39510 #define TARGET_OPTION_OVERRIDE ix86_option_override
39511
39512 #undef TARGET_REGISTER_MOVE_COST
39513 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
39514 #undef TARGET_MEMORY_MOVE_COST
39515 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
39516 #undef TARGET_RTX_COSTS
39517 #define TARGET_RTX_COSTS ix86_rtx_costs
39518 #undef TARGET_ADDRESS_COST
39519 #define TARGET_ADDRESS_COST ix86_address_cost
39520
39521 #undef TARGET_FIXED_CONDITION_CODE_REGS
39522 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
39523 #undef TARGET_CC_MODES_COMPATIBLE
39524 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
39525
39526 #undef TARGET_MACHINE_DEPENDENT_REORG
39527 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
39528
39529 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
39530 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
39531
39532 #undef TARGET_BUILD_BUILTIN_VA_LIST
39533 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
39534
39535 #undef TARGET_FOLD_BUILTIN
39536 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
39537
39538 #undef TARGET_ENUM_VA_LIST_P
39539 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
39540
39541 #undef TARGET_FN_ABI_VA_LIST
39542 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
39543
39544 #undef TARGET_CANONICAL_VA_LIST_TYPE
39545 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
39546
39547 #undef TARGET_EXPAND_BUILTIN_VA_START
39548 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
39549
39550 #undef TARGET_MD_ASM_CLOBBERS
39551 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
39552
39553 #undef TARGET_PROMOTE_PROTOTYPES
39554 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
39555 #undef TARGET_STRUCT_VALUE_RTX
39556 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
39557 #undef TARGET_SETUP_INCOMING_VARARGS
39558 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
39559 #undef TARGET_MUST_PASS_IN_STACK
39560 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
39561 #undef TARGET_FUNCTION_ARG_ADVANCE
39562 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
39563 #undef TARGET_FUNCTION_ARG
39564 #define TARGET_FUNCTION_ARG ix86_function_arg
39565 #undef TARGET_FUNCTION_ARG_BOUNDARY
39566 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
39567 #undef TARGET_PASS_BY_REFERENCE
39568 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
39569 #undef TARGET_INTERNAL_ARG_POINTER
39570 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
39571 #undef TARGET_UPDATE_STACK_BOUNDARY
39572 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
39573 #undef TARGET_GET_DRAP_RTX
39574 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
39575 #undef TARGET_STRICT_ARGUMENT_NAMING
39576 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
39577 #undef TARGET_STATIC_CHAIN
39578 #define TARGET_STATIC_CHAIN ix86_static_chain
39579 #undef TARGET_TRAMPOLINE_INIT
39580 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
39581 #undef TARGET_RETURN_POPS_ARGS
39582 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
39583
39584 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
39585 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
39586
39587 #undef TARGET_SCALAR_MODE_SUPPORTED_P
39588 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
39589
39590 #undef TARGET_VECTOR_MODE_SUPPORTED_P
39591 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
39592
39593 #undef TARGET_C_MODE_FOR_SUFFIX
39594 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
39595
39596 #ifdef HAVE_AS_TLS
39597 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
39598 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
39599 #endif
39600
39601 #ifdef SUBTARGET_INSERT_ATTRIBUTES
39602 #undef TARGET_INSERT_ATTRIBUTES
39603 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
39604 #endif
39605
39606 #undef TARGET_MANGLE_TYPE
39607 #define TARGET_MANGLE_TYPE ix86_mangle_type
39608
39609 #if !TARGET_MACHO
39610 #undef TARGET_STACK_PROTECT_FAIL
39611 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
39612 #endif
39613
39614 #undef TARGET_FUNCTION_VALUE
39615 #define TARGET_FUNCTION_VALUE ix86_function_value
39616
39617 #undef TARGET_FUNCTION_VALUE_REGNO_P
39618 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
39619
39620 #undef TARGET_PROMOTE_FUNCTION_MODE
39621 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
39622
39623 #undef TARGET_SECONDARY_RELOAD
39624 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
39625
39626 #undef TARGET_CLASS_MAX_NREGS
39627 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
39628
39629 #undef TARGET_PREFERRED_RELOAD_CLASS
39630 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
39631 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
39632 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
39633 #undef TARGET_CLASS_LIKELY_SPILLED_P
39634 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
39635
39636 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
39637 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
39638 ix86_builtin_vectorization_cost
39639 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
39640 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
39641 ix86_vectorize_vec_perm_const_ok
39642 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
39643 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
39644 ix86_preferred_simd_mode
39645 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
39646 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
39647 ix86_autovectorize_vector_sizes
39648
39649 #undef TARGET_SET_CURRENT_FUNCTION
39650 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
39651
39652 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
39653 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
39654
39655 #undef TARGET_OPTION_SAVE
39656 #define TARGET_OPTION_SAVE ix86_function_specific_save
39657
39658 #undef TARGET_OPTION_RESTORE
39659 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
39660
39661 #undef TARGET_OPTION_PRINT
39662 #define TARGET_OPTION_PRINT ix86_function_specific_print
39663
39664 #undef TARGET_CAN_INLINE_P
39665 #define TARGET_CAN_INLINE_P ix86_can_inline_p
39666
39667 #undef TARGET_EXPAND_TO_RTL_HOOK
39668 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
39669
39670 #undef TARGET_LEGITIMATE_ADDRESS_P
39671 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
39672
39673 #undef TARGET_LEGITIMATE_CONSTANT_P
39674 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
39675
39676 #undef TARGET_FRAME_POINTER_REQUIRED
39677 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
39678
39679 #undef TARGET_CAN_ELIMINATE
39680 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
39681
39682 #undef TARGET_EXTRA_LIVE_ON_ENTRY
39683 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
39684
39685 #undef TARGET_ASM_CODE_END
39686 #define TARGET_ASM_CODE_END ix86_code_end
39687
39688 #undef TARGET_CONDITIONAL_REGISTER_USAGE
39689 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
39690
39691 #if TARGET_MACHO
39692 #undef TARGET_INIT_LIBFUNCS
39693 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
39694 #endif
39695
39696 struct gcc_target targetm = TARGET_INITIALIZER;
39697 \f
39698 #include "gt-i386.h"