1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
4 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "reload.h"
51 #include "cgraph.h"
52 #include "gimple.h"
53 #include "dwarf2.h"
54 #include "df.h"
55 #include "tm-constrs.h"
56 #include "params.h"
57 #include "cselib.h"
58 #include "debug.h"
59 #include "sched-int.h"
60 #include "sbitmap.h"
61 #include "fibheap.h"
62 #include "opts.h"
63 #include "diagnostic.h"
64
65 enum upper_128bits_state
66 {
67 unknown = 0,
68 unused,
69 used
70 };
71
72 typedef struct block_info_def
73 {
74 /* State of the upper 128bits of AVX registers at exit. */
75 enum upper_128bits_state state;
76 /* TRUE if state of the upper 128bits of AVX registers is unchanged
77 in this block. */
78 bool unchanged;
79 /* TRUE if block has been processed. */
80 bool processed;
81 /* TRUE if block has been scanned. */
82 bool scanned;
83 /* Previous state of the upper 128bits of AVX registers at entry. */
84 enum upper_128bits_state prev;
85 } *block_info;
86
87 #define BLOCK_INFO(B) ((block_info) (B)->aux)
88
89 enum call_avx256_state
90 {
91 /* Callee returns 256bit AVX register. */
92 callee_return_avx256 = -1,
93 /* Callee returns and passes 256bit AVX register. */
94 callee_return_pass_avx256,
95 /* Callee passes 256bit AVX register. */
96 callee_pass_avx256,
97 /* Callee neither returns nor passes 256bit AVX register, or no
98 256bit AVX register in function return. */
99 call_no_avx256,
100 /* vzeroupper intrinsic. */
101 vzeroupper_intrinsic
102 };
103
104 /* Check if a 256bit AVX register is referenced in stores. */
105
106 static void
107 check_avx256_stores (rtx dest, const_rtx set, void *data)
108 {
109 if ((REG_P (dest)
110 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
111 || (GET_CODE (set) == SET
112 && REG_P (SET_SRC (set))
113 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
114 {
115 enum upper_128bits_state *state
116 = (enum upper_128bits_state *) data;
117 *state = used;
118 }
119 }
120
121 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
122 in basic block BB. Delete it if upper 128bit AVX registers are
123 unused. If it isn't deleted, move it to just before a jump insn.
124
125 STATE is state of the upper 128bits of AVX registers at entry. */
126
127 static void
128 move_or_delete_vzeroupper_2 (basic_block bb,
129 enum upper_128bits_state state)
130 {
131 rtx insn, bb_end;
132 rtx vzeroupper_insn = NULL_RTX;
133 rtx pat;
134 int avx256;
135 bool unchanged;
136
137 if (BLOCK_INFO (bb)->unchanged)
138 {
139 if (dump_file)
140 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
141 bb->index, state);
142
143 BLOCK_INFO (bb)->state = state;
144 return;
145 }
146
147 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
148 {
149 if (dump_file)
150 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
151 bb->index, BLOCK_INFO (bb)->state);
152 return;
153 }
154
155 BLOCK_INFO (bb)->prev = state;
156
157 if (dump_file)
158 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
159 bb->index, state);
160
161 unchanged = true;
162
163 /* BB_END changes when it is deleted. */
164 bb_end = BB_END (bb);
165 insn = BB_HEAD (bb);
166 while (insn != bb_end)
167 {
168 insn = NEXT_INSN (insn);
169
170 if (!NONDEBUG_INSN_P (insn))
171 continue;
172
173 /* Move vzeroupper before jump/call. */
174 if (JUMP_P (insn) || CALL_P (insn))
175 {
176 if (!vzeroupper_insn)
177 continue;
178
179 if (PREV_INSN (insn) != vzeroupper_insn)
180 {
181 if (dump_file)
182 {
183 fprintf (dump_file, "Move vzeroupper after:\n");
184 print_rtl_single (dump_file, PREV_INSN (insn));
185 fprintf (dump_file, "before:\n");
186 print_rtl_single (dump_file, insn);
187 }
188 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
189 PREV_INSN (insn));
190 }
191 vzeroupper_insn = NULL_RTX;
192 continue;
193 }
194
195 pat = PATTERN (insn);
196
197 /* Check insn for vzeroupper intrinsic. */
198 if (GET_CODE (pat) == UNSPEC_VOLATILE
199 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
200 {
201 if (dump_file)
202 {
203 /* Found vzeroupper intrinsic. */
204 fprintf (dump_file, "Found vzeroupper:\n");
205 print_rtl_single (dump_file, insn);
206 }
207 }
208 else
209 {
210 /* Check insn for vzeroall intrinsic. */
211 if (GET_CODE (pat) == PARALLEL
212 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
213 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
214 {
215 state = unused;
216 unchanged = false;
217
218 /* Delete pending vzeroupper insertion. */
219 if (vzeroupper_insn)
220 {
221 delete_insn (vzeroupper_insn);
222 vzeroupper_insn = NULL_RTX;
223 }
224 }
225 else if (state != used)
226 {
227 note_stores (pat, check_avx256_stores, &state);
228 if (state == used)
229 unchanged = false;
230 }
231 continue;
232 }
233
234 /* Process vzeroupper intrinsic. */
235 avx256 = INTVAL (XVECEXP (pat, 0, 0));
236
237 if (state == unused)
238 {
239 /* Since the upper 128bits are cleared, callee must not pass
240 256bit AVX register. We only need to check if callee
241 returns 256bit AVX register. */
242 if (avx256 == callee_return_avx256)
243 {
244 state = used;
245 unchanged = false;
246 }
247
248 /* Remove unnecessary vzeroupper since upper 128bits are
249 cleared. */
250 if (dump_file)
251 {
252 fprintf (dump_file, "Delete redundant vzeroupper:\n");
253 print_rtl_single (dump_file, insn);
254 }
255 delete_insn (insn);
256 }
257 else
258 {
259 /* Set state to UNUSED if callee doesn't return 256bit AVX
260 register. */
261 if (avx256 != callee_return_pass_avx256)
262 state = unused;
263
264 if (avx256 == callee_return_pass_avx256
265 || avx256 == callee_pass_avx256)
266 {
267 /* Must remove vzeroupper since callee passes in 256bit
268 AVX register. */
269 if (dump_file)
270 {
271 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
272 print_rtl_single (dump_file, insn);
273 }
274 delete_insn (insn);
275 }
276 else
277 {
278 vzeroupper_insn = insn;
279 unchanged = false;
280 }
281 }
282 }
283
284 BLOCK_INFO (bb)->state = state;
285 BLOCK_INFO (bb)->unchanged = unchanged;
286 BLOCK_INFO (bb)->scanned = true;
287
288 if (dump_file)
289 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
290 bb->index, unchanged ? "unchanged" : "changed",
291 state);
292 }
293
294 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
295 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
296 as UNUSED if UNKNOWN_IS_UNUSED is true. Return TRUE if the exit
297 state is changed. */
298
299 static bool
300 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
301 {
302 edge e;
303 edge_iterator ei;
304 enum upper_128bits_state state, old_state, new_state;
305 bool seen_unknown;
306
307 if (dump_file)
308 fprintf (dump_file, " Process [bb %i]: status: %d\n",
309 block->index, BLOCK_INFO (block)->processed);
310
311 if (BLOCK_INFO (block)->processed)
312 return false;
313
314 state = unused;
315
316 /* Check all predecessor edges of this block. */
317 seen_unknown = false;
318 FOR_EACH_EDGE (e, ei, block->preds)
319 {
320 if (e->src == block)
321 continue;
322 switch (BLOCK_INFO (e->src)->state)
323 {
324 case unknown:
325 if (!unknown_is_unused)
326 seen_unknown = true;
327 case unused:
328 break;
329 case used:
330 state = used;
331 goto done;
332 }
333 }
334
335 if (seen_unknown)
336 state = unknown;
337
338 done:
339 old_state = BLOCK_INFO (block)->state;
340 move_or_delete_vzeroupper_2 (block, state);
341 new_state = BLOCK_INFO (block)->state;
342
343 if (state != unknown || new_state == used)
344 BLOCK_INFO (block)->processed = true;
345
346 /* Need to rescan if the upper 128bits of AVX registers are changed
347 to USED at exit. */
348 if (new_state != old_state)
349 {
350 if (new_state == used)
351 cfun->machine->rescan_vzeroupper_p = 1;
352 return true;
353 }
354 else
355 return false;
356 }
357
358 /* Go through the instruction stream looking for vzeroupper. Delete
359 it if upper 128bit AVX registers are unused. If it isn't deleted,
360 move it to just before a jump insn. */
361
362 static void
363 move_or_delete_vzeroupper (void)
364 {
365 edge e;
366 edge_iterator ei;
367 basic_block bb;
368 fibheap_t worklist, pending, fibheap_swap;
369 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
370 int *bb_order;
371 int *rc_order;
372 int i;
373
374 /* Set up block info for each basic block. */
375 alloc_aux_for_blocks (sizeof (struct block_info_def));
376
377 /* Process outgoing edges of entry point. */
378 if (dump_file)
379 fprintf (dump_file, "Process outgoing edges of entry point\n");
380
381 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
382 {
383 move_or_delete_vzeroupper_2 (e->dest,
384 cfun->machine->caller_pass_avx256_p
385 ? used : unused);
386 BLOCK_INFO (e->dest)->processed = true;
387 }
388
389 /* Compute reverse completion order of depth first search of the CFG
390 so that the data-flow runs faster. */
391 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
392 bb_order = XNEWVEC (int, last_basic_block);
393 pre_and_rev_post_order_compute (NULL, rc_order, false);
394 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
395 bb_order[rc_order[i]] = i;
396 free (rc_order);
397
398 worklist = fibheap_new ();
399 pending = fibheap_new ();
400 visited = sbitmap_alloc (last_basic_block);
401 in_worklist = sbitmap_alloc (last_basic_block);
402 in_pending = sbitmap_alloc (last_basic_block);
403 sbitmap_zero (in_worklist);
404
405 /* Don't check outgoing edges of entry point. */
406 sbitmap_ones (in_pending);
407 FOR_EACH_BB (bb)
408 if (BLOCK_INFO (bb)->processed)
409 RESET_BIT (in_pending, bb->index);
410 else
411 {
412 move_or_delete_vzeroupper_1 (bb, false);
413 fibheap_insert (pending, bb_order[bb->index], bb);
414 }
415
416 if (dump_file)
417 fprintf (dump_file, "Check remaining basic blocks\n");
418
419 while (!fibheap_empty (pending))
420 {
421 fibheap_swap = pending;
422 pending = worklist;
423 worklist = fibheap_swap;
424 sbitmap_swap = in_pending;
425 in_pending = in_worklist;
426 in_worklist = sbitmap_swap;
427
428 sbitmap_zero (visited);
429
430 cfun->machine->rescan_vzeroupper_p = 0;
431
432 while (!fibheap_empty (worklist))
433 {
434 bb = (basic_block) fibheap_extract_min (worklist);
435 RESET_BIT (in_worklist, bb->index);
436 gcc_assert (!TEST_BIT (visited, bb->index));
437 if (!TEST_BIT (visited, bb->index))
438 {
439 edge_iterator ei;
440
441 SET_BIT (visited, bb->index);
442
443 if (move_or_delete_vzeroupper_1 (bb, false))
444 FOR_EACH_EDGE (e, ei, bb->succs)
445 {
446 if (e->dest == EXIT_BLOCK_PTR
447 || BLOCK_INFO (e->dest)->processed)
448 continue;
449
450 if (TEST_BIT (visited, e->dest->index))
451 {
452 if (!TEST_BIT (in_pending, e->dest->index))
453 {
454 /* Send E->DEST to next round. */
455 SET_BIT (in_pending, e->dest->index);
456 fibheap_insert (pending,
457 bb_order[e->dest->index],
458 e->dest);
459 }
460 }
461 else if (!TEST_BIT (in_worklist, e->dest->index))
462 {
463 /* Add E->DEST to current round. */
464 SET_BIT (in_worklist, e->dest->index);
465 fibheap_insert (worklist, bb_order[e->dest->index],
466 e->dest);
467 }
468 }
469 }
470 }
471
472 if (!cfun->machine->rescan_vzeroupper_p)
473 break;
474 }
475
476 free (bb_order);
477 fibheap_delete (worklist);
478 fibheap_delete (pending);
479 sbitmap_free (visited);
480 sbitmap_free (in_worklist);
481 sbitmap_free (in_pending);
482
483 if (dump_file)
484 fprintf (dump_file, "Process remaining basic blocks\n");
485
486 FOR_EACH_BB (bb)
487 move_or_delete_vzeroupper_1 (bb, true);
488
489 free_aux_for_blocks ();
490 }
491
492 static rtx legitimize_dllimport_symbol (rtx, bool);
493
494 #ifndef CHECK_STACK_LIMIT
495 #define CHECK_STACK_LIMIT (-1)
496 #endif
497
498 /* Return index of given mode in mult and division cost tables. */
499 #define MODE_INDEX(mode) \
500 ((mode) == QImode ? 0 \
501 : (mode) == HImode ? 1 \
502 : (mode) == SImode ? 2 \
503 : (mode) == DImode ? 3 \
504 : 4)
505
506 /* Processor costs (relative to an add) */
507 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
508 #define COSTS_N_BYTES(N) ((N) * 2)
509
510 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
511
512 const
513 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
514 COSTS_N_BYTES (2), /* cost of an add instruction */
515 COSTS_N_BYTES (3), /* cost of a lea instruction */
516 COSTS_N_BYTES (2), /* variable shift costs */
517 COSTS_N_BYTES (3), /* constant shift costs */
518 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
519 COSTS_N_BYTES (3), /* HI */
520 COSTS_N_BYTES (3), /* SI */
521 COSTS_N_BYTES (3), /* DI */
522 COSTS_N_BYTES (5)}, /* other */
523 0, /* cost of multiply per each bit set */
524 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
525 COSTS_N_BYTES (3), /* HI */
526 COSTS_N_BYTES (3), /* SI */
527 COSTS_N_BYTES (3), /* DI */
528 COSTS_N_BYTES (5)}, /* other */
529 COSTS_N_BYTES (3), /* cost of movsx */
530 COSTS_N_BYTES (3), /* cost of movzx */
531 0, /* "large" insn */
532 2, /* MOVE_RATIO */
533 2, /* cost for loading QImode using movzbl */
534 {2, 2, 2}, /* cost of loading integer registers
535 in QImode, HImode and SImode.
536 Relative to reg-reg move (2). */
537 {2, 2, 2}, /* cost of storing integer registers */
538 2, /* cost of reg,reg fld/fst */
539 {2, 2, 2}, /* cost of loading fp registers
540 in SFmode, DFmode and XFmode */
541 {2, 2, 2}, /* cost of storing fp registers
542 in SFmode, DFmode and XFmode */
543 3, /* cost of moving MMX register */
544 {3, 3}, /* cost of loading MMX registers
545 in SImode and DImode */
546 {3, 3}, /* cost of storing MMX registers
547 in SImode and DImode */
548 3, /* cost of moving SSE register */
549 {3, 3, 3}, /* cost of loading SSE registers
550 in SImode, DImode and TImode */
551 {3, 3, 3}, /* cost of storing SSE registers
552 in SImode, DImode and TImode */
553 3, /* MMX or SSE register to integer */
554 0, /* size of l1 cache */
555 0, /* size of l2 cache */
556 0, /* size of prefetch block */
557 0, /* number of parallel prefetches */
558 2, /* Branch cost */
559 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
560 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
561 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
562 COSTS_N_BYTES (2), /* cost of FABS instruction. */
563 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
564 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
565 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
566 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
567 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
568 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
569 1, /* scalar_stmt_cost. */
570 1, /* scalar load_cost. */
571 1, /* scalar_store_cost. */
572 1, /* vec_stmt_cost. */
573 1, /* vec_to_scalar_cost. */
574 1, /* scalar_to_vec_cost. */
575 1, /* vec_align_load_cost. */
576 1, /* vec_unalign_load_cost. */
577 1, /* vec_store_cost. */
578 1, /* cond_taken_branch_cost. */
579 1, /* cond_not_taken_branch_cost. */
580 };
581
582 /* Processor costs (relative to an add) */
583 static const
584 struct processor_costs i386_cost = { /* 386 specific costs */
585 COSTS_N_INSNS (1), /* cost of an add instruction */
586 COSTS_N_INSNS (1), /* cost of a lea instruction */
587 COSTS_N_INSNS (3), /* variable shift costs */
588 COSTS_N_INSNS (2), /* constant shift costs */
589 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
590 COSTS_N_INSNS (6), /* HI */
591 COSTS_N_INSNS (6), /* SI */
592 COSTS_N_INSNS (6), /* DI */
593 COSTS_N_INSNS (6)}, /* other */
594 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
595 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
596 COSTS_N_INSNS (23), /* HI */
597 COSTS_N_INSNS (23), /* SI */
598 COSTS_N_INSNS (23), /* DI */
599 COSTS_N_INSNS (23)}, /* other */
600 COSTS_N_INSNS (3), /* cost of movsx */
601 COSTS_N_INSNS (2), /* cost of movzx */
602 15, /* "large" insn */
603 3, /* MOVE_RATIO */
604 4, /* cost for loading QImode using movzbl */
605 {2, 4, 2}, /* cost of loading integer registers
606 in QImode, HImode and SImode.
607 Relative to reg-reg move (2). */
608 {2, 4, 2}, /* cost of storing integer registers */
609 2, /* cost of reg,reg fld/fst */
610 {8, 8, 8}, /* cost of loading fp registers
611 in SFmode, DFmode and XFmode */
612 {8, 8, 8}, /* cost of storing fp registers
613 in SFmode, DFmode and XFmode */
614 2, /* cost of moving MMX register */
615 {4, 8}, /* cost of loading MMX registers
616 in SImode and DImode */
617 {4, 8}, /* cost of storing MMX registers
618 in SImode and DImode */
619 2, /* cost of moving SSE register */
620 {4, 8, 16}, /* cost of loading SSE registers
621 in SImode, DImode and TImode */
622 {4, 8, 16}, /* cost of storing SSE registers
623 in SImode, DImode and TImode */
624 3, /* MMX or SSE register to integer */
625 0, /* size of l1 cache */
626 0, /* size of l2 cache */
627 0, /* size of prefetch block */
628 0, /* number of parallel prefetches */
629 1, /* Branch cost */
630 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
631 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
632 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
633 COSTS_N_INSNS (22), /* cost of FABS instruction. */
634 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
635 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
636 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
637 DUMMY_STRINGOP_ALGS},
638 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
639 DUMMY_STRINGOP_ALGS},
640 1, /* scalar_stmt_cost. */
641 1, /* scalar load_cost. */
642 1, /* scalar_store_cost. */
643 1, /* vec_stmt_cost. */
644 1, /* vec_to_scalar_cost. */
645 1, /* scalar_to_vec_cost. */
646 1, /* vec_align_load_cost. */
647 2, /* vec_unalign_load_cost. */
648 1, /* vec_store_cost. */
649 3, /* cond_taken_branch_cost. */
650 1, /* cond_not_taken_branch_cost. */
651 };
652
653 static const
654 struct processor_costs i486_cost = { /* 486 specific costs */
655 COSTS_N_INSNS (1), /* cost of an add instruction */
656 COSTS_N_INSNS (1), /* cost of a lea instruction */
657 COSTS_N_INSNS (3), /* variable shift costs */
658 COSTS_N_INSNS (2), /* constant shift costs */
659 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
660 COSTS_N_INSNS (12), /* HI */
661 COSTS_N_INSNS (12), /* SI */
662 COSTS_N_INSNS (12), /* DI */
663 COSTS_N_INSNS (12)}, /* other */
664 1, /* cost of multiply per each bit set */
665 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
666 COSTS_N_INSNS (40), /* HI */
667 COSTS_N_INSNS (40), /* SI */
668 COSTS_N_INSNS (40), /* DI */
669 COSTS_N_INSNS (40)}, /* other */
670 COSTS_N_INSNS (3), /* cost of movsx */
671 COSTS_N_INSNS (2), /* cost of movzx */
672 15, /* "large" insn */
673 3, /* MOVE_RATIO */
674 4, /* cost for loading QImode using movzbl */
675 {2, 4, 2}, /* cost of loading integer registers
676 in QImode, HImode and SImode.
677 Relative to reg-reg move (2). */
678 {2, 4, 2}, /* cost of storing integer registers */
679 2, /* cost of reg,reg fld/fst */
680 {8, 8, 8}, /* cost of loading fp registers
681 in SFmode, DFmode and XFmode */
682 {8, 8, 8}, /* cost of storing fp registers
683 in SFmode, DFmode and XFmode */
684 2, /* cost of moving MMX register */
685 {4, 8}, /* cost of loading MMX registers
686 in SImode and DImode */
687 {4, 8}, /* cost of storing MMX registers
688 in SImode and DImode */
689 2, /* cost of moving SSE register */
690 {4, 8, 16}, /* cost of loading SSE registers
691 in SImode, DImode and TImode */
692 {4, 8, 16}, /* cost of storing SSE registers
693 in SImode, DImode and TImode */
694 3, /* MMX or SSE register to integer */
695 4, /* size of l1 cache. 486 has 8kB cache
696 shared for code and data, so 4kB is
697 not really precise. */
698 4, /* size of l2 cache */
699 0, /* size of prefetch block */
700 0, /* number of parallel prefetches */
701 1, /* Branch cost */
702 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
703 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
704 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
705 COSTS_N_INSNS (3), /* cost of FABS instruction. */
706 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
707 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
708 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
709 DUMMY_STRINGOP_ALGS},
710 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
711 DUMMY_STRINGOP_ALGS},
712 1, /* scalar_stmt_cost. */
713 1, /* scalar load_cost. */
714 1, /* scalar_store_cost. */
715 1, /* vec_stmt_cost. */
716 1, /* vec_to_scalar_cost. */
717 1, /* scalar_to_vec_cost. */
718 1, /* vec_align_load_cost. */
719 2, /* vec_unalign_load_cost. */
720 1, /* vec_store_cost. */
721 3, /* cond_taken_branch_cost. */
722 1, /* cond_not_taken_branch_cost. */
723 };
724
725 static const
726 struct processor_costs pentium_cost = {
727 COSTS_N_INSNS (1), /* cost of an add instruction */
728 COSTS_N_INSNS (1), /* cost of a lea instruction */
729 COSTS_N_INSNS (4), /* variable shift costs */
730 COSTS_N_INSNS (1), /* constant shift costs */
731 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
732 COSTS_N_INSNS (11), /* HI */
733 COSTS_N_INSNS (11), /* SI */
734 COSTS_N_INSNS (11), /* DI */
735 COSTS_N_INSNS (11)}, /* other */
736 0, /* cost of multiply per each bit set */
737 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
738 COSTS_N_INSNS (25), /* HI */
739 COSTS_N_INSNS (25), /* SI */
740 COSTS_N_INSNS (25), /* DI */
741 COSTS_N_INSNS (25)}, /* other */
742 COSTS_N_INSNS (3), /* cost of movsx */
743 COSTS_N_INSNS (2), /* cost of movzx */
744 8, /* "large" insn */
745 6, /* MOVE_RATIO */
746 6, /* cost for loading QImode using movzbl */
747 {2, 4, 2}, /* cost of loading integer registers
748 in QImode, HImode and SImode.
749 Relative to reg-reg move (2). */
750 {2, 4, 2}, /* cost of storing integer registers */
751 2, /* cost of reg,reg fld/fst */
752 {2, 2, 6}, /* cost of loading fp registers
753 in SFmode, DFmode and XFmode */
754 {4, 4, 6}, /* cost of storing fp registers
755 in SFmode, DFmode and XFmode */
756 8, /* cost of moving MMX register */
757 {8, 8}, /* cost of loading MMX registers
758 in SImode and DImode */
759 {8, 8}, /* cost of storing MMX registers
760 in SImode and DImode */
761 2, /* cost of moving SSE register */
762 {4, 8, 16}, /* cost of loading SSE registers
763 in SImode, DImode and TImode */
764 {4, 8, 16}, /* cost of storing SSE registers
765 in SImode, DImode and TImode */
766 3, /* MMX or SSE register to integer */
767 8, /* size of l1 cache. */
768 8, /* size of l2 cache */
769 0, /* size of prefetch block */
770 0, /* number of parallel prefetches */
771 2, /* Branch cost */
772 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
773 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
774 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
775 COSTS_N_INSNS (1), /* cost of FABS instruction. */
776 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
777 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
778 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
779 DUMMY_STRINGOP_ALGS},
780 {{libcall, {{-1, rep_prefix_4_byte}}},
781 DUMMY_STRINGOP_ALGS},
782 1, /* scalar_stmt_cost. */
783 1, /* scalar load_cost. */
784 1, /* scalar_store_cost. */
785 1, /* vec_stmt_cost. */
786 1, /* vec_to_scalar_cost. */
787 1, /* scalar_to_vec_cost. */
788 1, /* vec_align_load_cost. */
789 2, /* vec_unalign_load_cost. */
790 1, /* vec_store_cost. */
791 3, /* cond_taken_branch_cost. */
792 1, /* cond_not_taken_branch_cost. */
793 };
794
795 static const
796 struct processor_costs pentiumpro_cost = {
797 COSTS_N_INSNS (1), /* cost of an add instruction */
798 COSTS_N_INSNS (1), /* cost of a lea instruction */
799 COSTS_N_INSNS (1), /* variable shift costs */
800 COSTS_N_INSNS (1), /* constant shift costs */
801 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
802 COSTS_N_INSNS (4), /* HI */
803 COSTS_N_INSNS (4), /* SI */
804 COSTS_N_INSNS (4), /* DI */
805 COSTS_N_INSNS (4)}, /* other */
806 0, /* cost of multiply per each bit set */
807 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
808 COSTS_N_INSNS (17), /* HI */
809 COSTS_N_INSNS (17), /* SI */
810 COSTS_N_INSNS (17), /* DI */
811 COSTS_N_INSNS (17)}, /* other */
812 COSTS_N_INSNS (1), /* cost of movsx */
813 COSTS_N_INSNS (1), /* cost of movzx */
814 8, /* "large" insn */
815 6, /* MOVE_RATIO */
816 2, /* cost for loading QImode using movzbl */
817 {4, 4, 4}, /* cost of loading integer registers
818 in QImode, HImode and SImode.
819 Relative to reg-reg move (2). */
820 {2, 2, 2}, /* cost of storing integer registers */
821 2, /* cost of reg,reg fld/fst */
822 {2, 2, 6}, /* cost of loading fp registers
823 in SFmode, DFmode and XFmode */
824 {4, 4, 6}, /* cost of storing fp registers
825 in SFmode, DFmode and XFmode */
826 2, /* cost of moving MMX register */
827 {2, 2}, /* cost of loading MMX registers
828 in SImode and DImode */
829 {2, 2}, /* cost of storing MMX registers
830 in SImode and DImode */
831 2, /* cost of moving SSE register */
832 {2, 2, 8}, /* cost of loading SSE registers
833 in SImode, DImode and TImode */
834 {2, 2, 8}, /* cost of storing SSE registers
835 in SImode, DImode and TImode */
836 3, /* MMX or SSE register to integer */
837 8, /* size of l1 cache. */
838 256, /* size of l2 cache */
839 32, /* size of prefetch block */
840 6, /* number of parallel prefetches */
841 2, /* Branch cost */
842 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
843 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
844 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
845 COSTS_N_INSNS (2), /* cost of FABS instruction. */
846 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
847 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
848 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
849 (we ensure the alignment). For small blocks an inline loop is still a
850 noticeable win; for bigger blocks either rep movsl or rep movsb is the
851 way to go. Rep movsb apparently has a more expensive startup time in the
852 CPU, but after 4K the difference is down in the noise. */
853 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
854 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
855 DUMMY_STRINGOP_ALGS},
856 {{rep_prefix_4_byte, {{1024, unrolled_loop},
857 {8192, rep_prefix_4_byte}, {-1, libcall}}},
858 DUMMY_STRINGOP_ALGS},
859 1, /* scalar_stmt_cost. */
860 1, /* scalar load_cost. */
861 1, /* scalar_store_cost. */
862 1, /* vec_stmt_cost. */
863 1, /* vec_to_scalar_cost. */
864 1, /* scalar_to_vec_cost. */
865 1, /* vec_align_load_cost. */
866 2, /* vec_unalign_load_cost. */
867 1, /* vec_store_cost. */
868 3, /* cond_taken_branch_cost. */
869 1, /* cond_not_taken_branch_cost. */
870 };
871
872 static const
873 struct processor_costs geode_cost = {
874 COSTS_N_INSNS (1), /* cost of an add instruction */
875 COSTS_N_INSNS (1), /* cost of a lea instruction */
876 COSTS_N_INSNS (2), /* variable shift costs */
877 COSTS_N_INSNS (1), /* constant shift costs */
878 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
879 COSTS_N_INSNS (4), /* HI */
880 COSTS_N_INSNS (7), /* SI */
881 COSTS_N_INSNS (7), /* DI */
882 COSTS_N_INSNS (7)}, /* other */
883 0, /* cost of multiply per each bit set */
884 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
885 COSTS_N_INSNS (23), /* HI */
886 COSTS_N_INSNS (39), /* SI */
887 COSTS_N_INSNS (39), /* DI */
888 COSTS_N_INSNS (39)}, /* other */
889 COSTS_N_INSNS (1), /* cost of movsx */
890 COSTS_N_INSNS (1), /* cost of movzx */
891 8, /* "large" insn */
892 4, /* MOVE_RATIO */
893 1, /* cost for loading QImode using movzbl */
894 {1, 1, 1}, /* cost of loading integer registers
895 in QImode, HImode and SImode.
896 Relative to reg-reg move (2). */
897 {1, 1, 1}, /* cost of storing integer registers */
898 1, /* cost of reg,reg fld/fst */
899 {1, 1, 1}, /* cost of loading fp registers
900 in SFmode, DFmode and XFmode */
901 {4, 6, 6}, /* cost of storing fp registers
902 in SFmode, DFmode and XFmode */
903
904 1, /* cost of moving MMX register */
905 {1, 1}, /* cost of loading MMX registers
906 in SImode and DImode */
907 {1, 1}, /* cost of storing MMX registers
908 in SImode and DImode */
909 1, /* cost of moving SSE register */
910 {1, 1, 1}, /* cost of loading SSE registers
911 in SImode, DImode and TImode */
912 {1, 1, 1}, /* cost of storing SSE registers
913 in SImode, DImode and TImode */
914 1, /* MMX or SSE register to integer */
915 64, /* size of l1 cache. */
916 128, /* size of l2 cache. */
917 32, /* size of prefetch block */
918 1, /* number of parallel prefetches */
919 1, /* Branch cost */
920 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
921 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
922 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
923 COSTS_N_INSNS (1), /* cost of FABS instruction. */
924 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
925 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
926 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
927 DUMMY_STRINGOP_ALGS},
928 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
929 DUMMY_STRINGOP_ALGS},
930 1, /* scalar_stmt_cost. */
931 1, /* scalar load_cost. */
932 1, /* scalar_store_cost. */
933 1, /* vec_stmt_cost. */
934 1, /* vec_to_scalar_cost. */
935 1, /* scalar_to_vec_cost. */
936 1, /* vec_align_load_cost. */
937 2, /* vec_unalign_load_cost. */
938 1, /* vec_store_cost. */
939 3, /* cond_taken_branch_cost. */
940 1, /* cond_not_taken_branch_cost. */
941 };
942
943 static const
944 struct processor_costs k6_cost = {
945 COSTS_N_INSNS (1), /* cost of an add instruction */
946 COSTS_N_INSNS (2), /* cost of a lea instruction */
947 COSTS_N_INSNS (1), /* variable shift costs */
948 COSTS_N_INSNS (1), /* constant shift costs */
949 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
950 COSTS_N_INSNS (3), /* HI */
951 COSTS_N_INSNS (3), /* SI */
952 COSTS_N_INSNS (3), /* DI */
953 COSTS_N_INSNS (3)}, /* other */
954 0, /* cost of multiply per each bit set */
955 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
956 COSTS_N_INSNS (18), /* HI */
957 COSTS_N_INSNS (18), /* SI */
958 COSTS_N_INSNS (18), /* DI */
959 COSTS_N_INSNS (18)}, /* other */
960 COSTS_N_INSNS (2), /* cost of movsx */
961 COSTS_N_INSNS (2), /* cost of movzx */
962 8, /* "large" insn */
963 4, /* MOVE_RATIO */
964 3, /* cost for loading QImode using movzbl */
965 {4, 5, 4}, /* cost of loading integer registers
966 in QImode, HImode and SImode.
967 Relative to reg-reg move (2). */
968 {2, 3, 2}, /* cost of storing integer registers */
969 4, /* cost of reg,reg fld/fst */
970 {6, 6, 6}, /* cost of loading fp registers
971 in SFmode, DFmode and XFmode */
972 {4, 4, 4}, /* cost of storing fp registers
973 in SFmode, DFmode and XFmode */
974 2, /* cost of moving MMX register */
975 {2, 2}, /* cost of loading MMX registers
976 in SImode and DImode */
977 {2, 2}, /* cost of storing MMX registers
978 in SImode and DImode */
979 2, /* cost of moving SSE register */
980 {2, 2, 8}, /* cost of loading SSE registers
981 in SImode, DImode and TImode */
982 {2, 2, 8}, /* cost of storing SSE registers
983 in SImode, DImode and TImode */
984 6, /* MMX or SSE register to integer */
985 32, /* size of l1 cache. */
986 32, /* size of l2 cache. Some models
987 have integrated l2 cache, but
988 optimizing for k6 is not important
989 enough to worry about that. */
990 32, /* size of prefetch block */
991 1, /* number of parallel prefetches */
992 1, /* Branch cost */
993 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
994 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
995 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
996 COSTS_N_INSNS (2), /* cost of FABS instruction. */
997 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
998 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
999 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1000 DUMMY_STRINGOP_ALGS},
1001 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1002 DUMMY_STRINGOP_ALGS},
1003 1, /* scalar_stmt_cost. */
1004 1, /* scalar load_cost. */
1005 1, /* scalar_store_cost. */
1006 1, /* vec_stmt_cost. */
1007 1, /* vec_to_scalar_cost. */
1008 1, /* scalar_to_vec_cost. */
1009 1, /* vec_align_load_cost. */
1010 2, /* vec_unalign_load_cost. */
1011 1, /* vec_store_cost. */
1012 3, /* cond_taken_branch_cost. */
1013 1, /* cond_not_taken_branch_cost. */
1014 };
1015
1016 static const
1017 struct processor_costs athlon_cost = {
1018 COSTS_N_INSNS (1), /* cost of an add instruction */
1019 COSTS_N_INSNS (2), /* cost of a lea instruction */
1020 COSTS_N_INSNS (1), /* variable shift costs */
1021 COSTS_N_INSNS (1), /* constant shift costs */
1022 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1023 COSTS_N_INSNS (5), /* HI */
1024 COSTS_N_INSNS (5), /* SI */
1025 COSTS_N_INSNS (5), /* DI */
1026 COSTS_N_INSNS (5)}, /* other */
1027 0, /* cost of multiply per each bit set */
1028 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1029 COSTS_N_INSNS (26), /* HI */
1030 COSTS_N_INSNS (42), /* SI */
1031 COSTS_N_INSNS (74), /* DI */
1032 COSTS_N_INSNS (74)}, /* other */
1033 COSTS_N_INSNS (1), /* cost of movsx */
1034 COSTS_N_INSNS (1), /* cost of movzx */
1035 8, /* "large" insn */
1036 9, /* MOVE_RATIO */
1037 4, /* cost for loading QImode using movzbl */
1038 {3, 4, 3}, /* cost of loading integer registers
1039 in QImode, HImode and SImode.
1040 Relative to reg-reg move (2). */
1041 {3, 4, 3}, /* cost of storing integer registers */
1042 4, /* cost of reg,reg fld/fst */
1043 {4, 4, 12}, /* cost of loading fp registers
1044 in SFmode, DFmode and XFmode */
1045 {6, 6, 8}, /* cost of storing fp registers
1046 in SFmode, DFmode and XFmode */
1047 2, /* cost of moving MMX register */
1048 {4, 4}, /* cost of loading MMX registers
1049 in SImode and DImode */
1050 {4, 4}, /* cost of storing MMX registers
1051 in SImode and DImode */
1052 2, /* cost of moving SSE register */
1053 {4, 4, 6}, /* cost of loading SSE registers
1054 in SImode, DImode and TImode */
1055 {4, 4, 5}, /* cost of storing SSE registers
1056 in SImode, DImode and TImode */
1057 5, /* MMX or SSE register to integer */
1058 64, /* size of l1 cache. */
1059 256, /* size of l2 cache. */
1060 64, /* size of prefetch block */
1061 6, /* number of parallel prefetches */
1062 5, /* Branch cost */
1063 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1064 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1065 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1066 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1067 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1068 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1069 /* For some reason, Athlon deals better with REP prefix (relative to loops)
1070 compared to K8. Alignment becomes important after 8 bytes for memcpy and
1071 128 bytes for memset. */
1072 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1073 DUMMY_STRINGOP_ALGS},
1074 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1075 DUMMY_STRINGOP_ALGS},
1076 1, /* scalar_stmt_cost. */
1077 1, /* scalar load_cost. */
1078 1, /* scalar_store_cost. */
1079 1, /* vec_stmt_cost. */
1080 1, /* vec_to_scalar_cost. */
1081 1, /* scalar_to_vec_cost. */
1082 1, /* vec_align_load_cost. */
1083 2, /* vec_unalign_load_cost. */
1084 1, /* vec_store_cost. */
1085 3, /* cond_taken_branch_cost. */
1086 1, /* cond_not_taken_branch_cost. */
1087 };
1088
1089 static const
1090 struct processor_costs k8_cost = {
1091 COSTS_N_INSNS (1), /* cost of an add instruction */
1092 COSTS_N_INSNS (2), /* cost of a lea instruction */
1093 COSTS_N_INSNS (1), /* variable shift costs */
1094 COSTS_N_INSNS (1), /* constant shift costs */
1095 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1096 COSTS_N_INSNS (4), /* HI */
1097 COSTS_N_INSNS (3), /* SI */
1098 COSTS_N_INSNS (4), /* DI */
1099 COSTS_N_INSNS (5)}, /* other */
1100 0, /* cost of multiply per each bit set */
1101 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1102 COSTS_N_INSNS (26), /* HI */
1103 COSTS_N_INSNS (42), /* SI */
1104 COSTS_N_INSNS (74), /* DI */
1105 COSTS_N_INSNS (74)}, /* other */
1106 COSTS_N_INSNS (1), /* cost of movsx */
1107 COSTS_N_INSNS (1), /* cost of movzx */
1108 8, /* "large" insn */
1109 9, /* MOVE_RATIO */
1110 4, /* cost for loading QImode using movzbl */
1111 {3, 4, 3}, /* cost of loading integer registers
1112 in QImode, HImode and SImode.
1113 Relative to reg-reg move (2). */
1114 {3, 4, 3}, /* cost of storing integer registers */
1115 4, /* cost of reg,reg fld/fst */
1116 {4, 4, 12}, /* cost of loading fp registers
1117 in SFmode, DFmode and XFmode */
1118 {6, 6, 8}, /* cost of storing fp registers
1119 in SFmode, DFmode and XFmode */
1120 2, /* cost of moving MMX register */
1121 {3, 3}, /* cost of loading MMX registers
1122 in SImode and DImode */
1123 {4, 4}, /* cost of storing MMX registers
1124 in SImode and DImode */
1125 2, /* cost of moving SSE register */
1126 {4, 3, 6}, /* cost of loading SSE registers
1127 in SImode, DImode and TImode */
1128 {4, 4, 5}, /* cost of storing SSE registers
1129 in SImode, DImode and TImode */
1130 5, /* MMX or SSE register to integer */
1131 64, /* size of l1 cache. */
1132 512, /* size of l2 cache. */
1133 64, /* size of prefetch block */
1134 /* New AMD processors never drop prefetches; if they cannot be performed
1135 immediately, they are queued. We set number of simultaneous prefetches
1136 to a large constant to reflect this (it probably is not a good idea not
1137 to limit number of prefetches at all, as their execution also takes some
1138 time). */
1139 100, /* number of parallel prefetches */
1140 3, /* Branch cost */
1141 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1142 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1143 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1144 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1145 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1146 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1147 /* K8 has an optimized REP instruction for medium-sized blocks, but for
1148 very small blocks it is better to use a loop. For large blocks, a libcall
1149 can do non-temporal accesses and beat inline code considerably. */
1150 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1151 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1152 {{libcall, {{8, loop}, {24, unrolled_loop},
1153 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1154 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1155 4, /* scalar_stmt_cost. */
1156 2, /* scalar load_cost. */
1157 2, /* scalar_store_cost. */
1158 5, /* vec_stmt_cost. */
1159 0, /* vec_to_scalar_cost. */
1160 2, /* scalar_to_vec_cost. */
1161 2, /* vec_align_load_cost. */
1162 3, /* vec_unalign_load_cost. */
1163 3, /* vec_store_cost. */
1164 3, /* cond_taken_branch_cost. */
1165 2, /* cond_not_taken_branch_cost. */
1166 };
1167
1168 struct processor_costs amdfam10_cost = {
1169 COSTS_N_INSNS (1), /* cost of an add instruction */
1170 COSTS_N_INSNS (2), /* cost of a lea instruction */
1171 COSTS_N_INSNS (1), /* variable shift costs */
1172 COSTS_N_INSNS (1), /* constant shift costs */
1173 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1174 COSTS_N_INSNS (4), /* HI */
1175 COSTS_N_INSNS (3), /* SI */
1176 COSTS_N_INSNS (4), /* DI */
1177 COSTS_N_INSNS (5)}, /* other */
1178 0, /* cost of multiply per each bit set */
1179 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1180 COSTS_N_INSNS (35), /* HI */
1181 COSTS_N_INSNS (51), /* SI */
1182 COSTS_N_INSNS (83), /* DI */
1183 COSTS_N_INSNS (83)}, /* other */
1184 COSTS_N_INSNS (1), /* cost of movsx */
1185 COSTS_N_INSNS (1), /* cost of movzx */
1186 8, /* "large" insn */
1187 9, /* MOVE_RATIO */
1188 4, /* cost for loading QImode using movzbl */
1189 {3, 4, 3}, /* cost of loading integer registers
1190 in QImode, HImode and SImode.
1191 Relative to reg-reg move (2). */
1192 {3, 4, 3}, /* cost of storing integer registers */
1193 4, /* cost of reg,reg fld/fst */
1194 {4, 4, 12}, /* cost of loading fp registers
1195 in SFmode, DFmode and XFmode */
1196 {6, 6, 8}, /* cost of storing fp registers
1197 in SFmode, DFmode and XFmode */
1198 2, /* cost of moving MMX register */
1199 {3, 3}, /* cost of loading MMX registers
1200 in SImode and DImode */
1201 {4, 4}, /* cost of storing MMX registers
1202 in SImode and DImode */
1203 2, /* cost of moving SSE register */
1204 {4, 4, 3}, /* cost of loading SSE registers
1205 in SImode, DImode and TImode */
1206 {4, 4, 5}, /* cost of storing SSE registers
1207 in SImode, DImode and TImode */
1208 3, /* MMX or SSE register to integer */
1209 /* On K8:
1210 MOVD reg64, xmmreg Double FSTORE 4
1211 MOVD reg32, xmmreg Double FSTORE 4
1212 On AMDFAM10:
1213 MOVD reg64, xmmreg Double FADD 3
1214 1/1 1/1
1215 MOVD reg32, xmmreg Double FADD 3
1216 1/1 1/1 */
1217 64, /* size of l1 cache. */
1218 512, /* size of l2 cache. */
1219 64, /* size of prefetch block */
1220 /* New AMD processors never drop prefetches; if they cannot be performed
1221 immediately, they are queued. We set number of simultaneous prefetches
1222 to a large constant to reflect this (it probably is not a good idea not
1223 to limit number of prefetches at all, as their execution also takes some
1224 time). */
1225 100, /* number of parallel prefetches */
1226 2, /* Branch cost */
1227 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1228 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1229 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1230 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1231 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1232 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1233
1234 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
1235 very small blocks it is better to use a loop. For large blocks, a libcall
1236 can do non-temporal accesses and beat inline code considerably. */
1237 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1238 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1239 {{libcall, {{8, loop}, {24, unrolled_loop},
1240 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1241 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1242 4, /* scalar_stmt_cost. */
1243 2, /* scalar load_cost. */
1244 2, /* scalar_store_cost. */
1245 6, /* vec_stmt_cost. */
1246 0, /* vec_to_scalar_cost. */
1247 2, /* scalar_to_vec_cost. */
1248 2, /* vec_align_load_cost. */
1249 2, /* vec_unalign_load_cost. */
1250 2, /* vec_store_cost. */
1251 2, /* cond_taken_branch_cost. */
1252 1, /* cond_not_taken_branch_cost. */
1253 };
1254
1255 struct processor_costs bdver1_cost = {
1256 COSTS_N_INSNS (1), /* cost of an add instruction */
1257 COSTS_N_INSNS (1), /* cost of a lea instruction */
1258 COSTS_N_INSNS (1), /* variable shift costs */
1259 COSTS_N_INSNS (1), /* constant shift costs */
1260 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1261 COSTS_N_INSNS (4), /* HI */
1262 COSTS_N_INSNS (4), /* SI */
1263 COSTS_N_INSNS (6), /* DI */
1264 COSTS_N_INSNS (6)}, /* other */
1265 0, /* cost of multiply per each bit set */
1266 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1267 COSTS_N_INSNS (35), /* HI */
1268 COSTS_N_INSNS (51), /* SI */
1269 COSTS_N_INSNS (83), /* DI */
1270 COSTS_N_INSNS (83)}, /* other */
1271 COSTS_N_INSNS (1), /* cost of movsx */
1272 COSTS_N_INSNS (1), /* cost of movzx */
1273 8, /* "large" insn */
1274 9, /* MOVE_RATIO */
1275 4, /* cost for loading QImode using movzbl */
1276 {5, 5, 4}, /* cost of loading integer registers
1277 in QImode, HImode and SImode.
1278 Relative to reg-reg move (2). */
1279 {4, 4, 4}, /* cost of storing integer registers */
1280 2, /* cost of reg,reg fld/fst */
1281 {5, 5, 12}, /* cost of loading fp registers
1282 in SFmode, DFmode and XFmode */
1283 {4, 4, 8}, /* cost of storing fp registers
1284 in SFmode, DFmode and XFmode */
1285 2, /* cost of moving MMX register */
1286 {4, 4}, /* cost of loading MMX registers
1287 in SImode and DImode */
1288 {4, 4}, /* cost of storing MMX registers
1289 in SImode and DImode */
1290 2, /* cost of moving SSE register */
1291 {4, 4, 4}, /* cost of loading SSE registers
1292 in SImode, DImode and TImode */
1293 {4, 4, 4}, /* cost of storing SSE registers
1294 in SImode, DImode and TImode */
1295 2, /* MMX or SSE register to integer */
1296 /* On K8:
1297 MOVD reg64, xmmreg Double FSTORE 4
1298 MOVD reg32, xmmreg Double FSTORE 4
1299 On AMDFAM10:
1300 MOVD reg64, xmmreg Double FADD 3
1301 1/1 1/1
1302 MOVD reg32, xmmreg Double FADD 3
1303 1/1 1/1 */
1304 16, /* size of l1 cache. */
1305 2048, /* size of l2 cache. */
1306 64, /* size of prefetch block */
1307 /* New AMD processors never drop prefetches; if they cannot be performed
1308 immediately, they are queued. We set number of simultaneous prefetches
1309 to a large constant to reflect this (it probably is not a good idea not
1310 to limit number of prefetches at all, as their execution also takes some
1311 time). */
1312 100, /* number of parallel prefetches */
1313 2, /* Branch cost */
1314 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1315 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1316 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1317 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1318 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1319 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1320
1321 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
1322 very small blocks it is better to use a loop. For large blocks, a libcall
1323 can do non-temporal accesses and beat inline code considerably. */
1324 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1325 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1326 {{libcall, {{8, loop}, {24, unrolled_loop},
1327 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1328 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1329 6, /* scalar_stmt_cost. */
1330 4, /* scalar load_cost. */
1331 4, /* scalar_store_cost. */
1332 6, /* vec_stmt_cost. */
1333 0, /* vec_to_scalar_cost. */
1334 2, /* scalar_to_vec_cost. */
1335 4, /* vec_align_load_cost. */
1336 4, /* vec_unalign_load_cost. */
1337 4, /* vec_store_cost. */
1338 2, /* cond_taken_branch_cost. */
1339 1, /* cond_not_taken_branch_cost. */
1340 };
1341
1342 struct processor_costs bdver2_cost = {
1343 COSTS_N_INSNS (1), /* cost of an add instruction */
1344 COSTS_N_INSNS (1), /* cost of a lea instruction */
1345 COSTS_N_INSNS (1), /* variable shift costs */
1346 COSTS_N_INSNS (1), /* constant shift costs */
1347 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1348 COSTS_N_INSNS (4), /* HI */
1349 COSTS_N_INSNS (4), /* SI */
1350 COSTS_N_INSNS (6), /* DI */
1351 COSTS_N_INSNS (6)}, /* other */
1352 0, /* cost of multiply per each bit set */
1353 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1354 COSTS_N_INSNS (35), /* HI */
1355 COSTS_N_INSNS (51), /* SI */
1356 COSTS_N_INSNS (83), /* DI */
1357 COSTS_N_INSNS (83)}, /* other */
1358 COSTS_N_INSNS (1), /* cost of movsx */
1359 COSTS_N_INSNS (1), /* cost of movzx */
1360 8, /* "large" insn */
1361 9, /* MOVE_RATIO */
1362 4, /* cost for loading QImode using movzbl */
1363 {5, 5, 4}, /* cost of loading integer registers
1364 in QImode, HImode and SImode.
1365 Relative to reg-reg move (2). */
1366 {4, 4, 4}, /* cost of storing integer registers */
1367 2, /* cost of reg,reg fld/fst */
1368 {5, 5, 12}, /* cost of loading fp registers
1369 in SFmode, DFmode and XFmode */
1370 {4, 4, 8}, /* cost of storing fp registers
1371 in SFmode, DFmode and XFmode */
1372 2, /* cost of moving MMX register */
1373 {4, 4}, /* cost of loading MMX registers
1374 in SImode and DImode */
1375 {4, 4}, /* cost of storing MMX registers
1376 in SImode and DImode */
1377 2, /* cost of moving SSE register */
1378 {4, 4, 4}, /* cost of loading SSE registers
1379 in SImode, DImode and TImode */
1380 {4, 4, 4}, /* cost of storing SSE registers
1381 in SImode, DImode and TImode */
1382 2, /* MMX or SSE register to integer */
1383 /* On K8:
1384 MOVD reg64, xmmreg Double FSTORE 4
1385 MOVD reg32, xmmreg Double FSTORE 4
1386 On AMDFAM10:
1387 MOVD reg64, xmmreg Double FADD 3
1388 1/1 1/1
1389 MOVD reg32, xmmreg Double FADD 3
1390 1/1 1/1 */
1391 16, /* size of l1 cache. */
1392 2048, /* size of l2 cache. */
1393 64, /* size of prefetch block */
1394 /* New AMD processors never drop prefetches; if they cannot be performed
1395 immediately, they are queued. We set number of simultaneous prefetches
1396 to a large constant to reflect this (it probably is not a good idea not
1397 to limit number of prefetches at all, as their execution also takes some
1398 time). */
1399 100, /* number of parallel prefetches */
1400 2, /* Branch cost */
1401 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1402 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1403 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1404 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1405 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1406 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1407
1408 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1409 very small blocks it is better to use a loop. For large blocks, a libcall
1410 can do non-temporal accesses and beat inline code considerably. */
1411 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1412 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1413 {{libcall, {{8, loop}, {24, unrolled_loop},
1414 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1415 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1416 6, /* scalar_stmt_cost. */
1417 4, /* scalar load_cost. */
1418 4, /* scalar_store_cost. */
1419 6, /* vec_stmt_cost. */
1420 0, /* vec_to_scalar_cost. */
1421 2, /* scalar_to_vec_cost. */
1422 4, /* vec_align_load_cost. */
1423 4, /* vec_unalign_load_cost. */
1424 4, /* vec_store_cost. */
1425 2, /* cond_taken_branch_cost. */
1426 1, /* cond_not_taken_branch_cost. */
1427 };
1428
1429 struct processor_costs btver1_cost = {
1430 COSTS_N_INSNS (1), /* cost of an add instruction */
1431 COSTS_N_INSNS (2), /* cost of a lea instruction */
1432 COSTS_N_INSNS (1), /* variable shift costs */
1433 COSTS_N_INSNS (1), /* constant shift costs */
1434 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1435 COSTS_N_INSNS (4), /* HI */
1436 COSTS_N_INSNS (3), /* SI */
1437 COSTS_N_INSNS (4), /* DI */
1438 COSTS_N_INSNS (5)}, /* other */
1439 0, /* cost of multiply per each bit set */
1440 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1441 COSTS_N_INSNS (35), /* HI */
1442 COSTS_N_INSNS (51), /* SI */
1443 COSTS_N_INSNS (83), /* DI */
1444 COSTS_N_INSNS (83)}, /* other */
1445 COSTS_N_INSNS (1), /* cost of movsx */
1446 COSTS_N_INSNS (1), /* cost of movzx */
1447 8, /* "large" insn */
1448 9, /* MOVE_RATIO */
1449 4, /* cost for loading QImode using movzbl */
1450 {3, 4, 3}, /* cost of loading integer registers
1451 in QImode, HImode and SImode.
1452 Relative to reg-reg move (2). */
1453 {3, 4, 3}, /* cost of storing integer registers */
1454 4, /* cost of reg,reg fld/fst */
1455 {4, 4, 12}, /* cost of loading fp registers
1456 in SFmode, DFmode and XFmode */
1457 {6, 6, 8}, /* cost of storing fp registers
1458 in SFmode, DFmode and XFmode */
1459 2, /* cost of moving MMX register */
1460 {3, 3}, /* cost of loading MMX registers
1461 in SImode and DImode */
1462 {4, 4}, /* cost of storing MMX registers
1463 in SImode and DImode */
1464 2, /* cost of moving SSE register */
1465 {4, 4, 3}, /* cost of loading SSE registers
1466 in SImode, DImode and TImode */
1467 {4, 4, 5}, /* cost of storing SSE registers
1468 in SImode, DImode and TImode */
1469 3, /* MMX or SSE register to integer */
1470 /* On K8:
1471 MOVD reg64, xmmreg Double FSTORE 4
1472 MOVD reg32, xmmreg Double FSTORE 4
1473 On AMDFAM10:
1474 MOVD reg64, xmmreg Double FADD 3
1475 1/1 1/1
1476 MOVD reg32, xmmreg Double FADD 3
1477 1/1 1/1 */
1478 32, /* size of l1 cache. */
1479 512, /* size of l2 cache. */
1480 64, /* size of prefetch block */
1481 100, /* number of parallel prefetches */
1482 2, /* Branch cost */
1483 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1484 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1485 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1486 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1487 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1488 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1489
1490 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1491 very small blocks it is better to use a loop. For large blocks, a libcall
1492 can do non-temporal accesses and beat inline code considerably. */
1493 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1494 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1495 {{libcall, {{8, loop}, {24, unrolled_loop},
1496 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1497 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1498 4, /* scalar_stmt_cost. */
1499 2, /* scalar load_cost. */
1500 2, /* scalar_store_cost. */
1501 6, /* vec_stmt_cost. */
1502 0, /* vec_to_scalar_cost. */
1503 2, /* scalar_to_vec_cost. */
1504 2, /* vec_align_load_cost. */
1505 2, /* vec_unalign_load_cost. */
1506 2, /* vec_store_cost. */
1507 2, /* cond_taken_branch_cost. */
1508 1, /* cond_not_taken_branch_cost. */
1509 };
1510
1511 static const
1512 struct processor_costs pentium4_cost = {
1513 COSTS_N_INSNS (1), /* cost of an add instruction */
1514 COSTS_N_INSNS (3), /* cost of a lea instruction */
1515 COSTS_N_INSNS (4), /* variable shift costs */
1516 COSTS_N_INSNS (4), /* constant shift costs */
1517 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1518 COSTS_N_INSNS (15), /* HI */
1519 COSTS_N_INSNS (15), /* SI */
1520 COSTS_N_INSNS (15), /* DI */
1521 COSTS_N_INSNS (15)}, /* other */
1522 0, /* cost of multiply per each bit set */
1523 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1524 COSTS_N_INSNS (56), /* HI */
1525 COSTS_N_INSNS (56), /* SI */
1526 COSTS_N_INSNS (56), /* DI */
1527 COSTS_N_INSNS (56)}, /* other */
1528 COSTS_N_INSNS (1), /* cost of movsx */
1529 COSTS_N_INSNS (1), /* cost of movzx */
1530 16, /* "large" insn */
1531 6, /* MOVE_RATIO */
1532 2, /* cost for loading QImode using movzbl */
1533 {4, 5, 4}, /* cost of loading integer registers
1534 in QImode, HImode and SImode.
1535 Relative to reg-reg move (2). */
1536 {2, 3, 2}, /* cost of storing integer registers */
1537 2, /* cost of reg,reg fld/fst */
1538 {2, 2, 6}, /* cost of loading fp registers
1539 in SFmode, DFmode and XFmode */
1540 {4, 4, 6}, /* cost of storing fp registers
1541 in SFmode, DFmode and XFmode */
1542 2, /* cost of moving MMX register */
1543 {2, 2}, /* cost of loading MMX registers
1544 in SImode and DImode */
1545 {2, 2}, /* cost of storing MMX registers
1546 in SImode and DImode */
1547 12, /* cost of moving SSE register */
1548 {12, 12, 12}, /* cost of loading SSE registers
1549 in SImode, DImode and TImode */
1550 {2, 2, 8}, /* cost of storing SSE registers
1551 in SImode, DImode and TImode */
1552 10, /* MMX or SSE register to integer */
1553 8, /* size of l1 cache. */
1554 256, /* size of l2 cache. */
1555 64, /* size of prefetch block */
1556 6, /* number of parallel prefetches */
1557 2, /* Branch cost */
1558 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1559 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1560 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1561 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1562 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1563 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1564 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1565 DUMMY_STRINGOP_ALGS},
1566 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1567 {-1, libcall}}},
1568 DUMMY_STRINGOP_ALGS},
1569 1, /* scalar_stmt_cost. */
1570 1, /* scalar load_cost. */
1571 1, /* scalar_store_cost. */
1572 1, /* vec_stmt_cost. */
1573 1, /* vec_to_scalar_cost. */
1574 1, /* scalar_to_vec_cost. */
1575 1, /* vec_align_load_cost. */
1576 2, /* vec_unalign_load_cost. */
1577 1, /* vec_store_cost. */
1578 3, /* cond_taken_branch_cost. */
1579 1, /* cond_not_taken_branch_cost. */
1580 };
1581
1582 static const
1583 struct processor_costs nocona_cost = {
1584 COSTS_N_INSNS (1), /* cost of an add instruction */
1585 COSTS_N_INSNS (1), /* cost of a lea instruction */
1586 COSTS_N_INSNS (1), /* variable shift costs */
1587 COSTS_N_INSNS (1), /* constant shift costs */
1588 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1589 COSTS_N_INSNS (10), /* HI */
1590 COSTS_N_INSNS (10), /* SI */
1591 COSTS_N_INSNS (10), /* DI */
1592 COSTS_N_INSNS (10)}, /* other */
1593 0, /* cost of multiply per each bit set */
1594 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1595 COSTS_N_INSNS (66), /* HI */
1596 COSTS_N_INSNS (66), /* SI */
1597 COSTS_N_INSNS (66), /* DI */
1598 COSTS_N_INSNS (66)}, /* other */
1599 COSTS_N_INSNS (1), /* cost of movsx */
1600 COSTS_N_INSNS (1), /* cost of movzx */
1601 16, /* "large" insn */
1602 17, /* MOVE_RATIO */
1603 4, /* cost for loading QImode using movzbl */
1604 {4, 4, 4}, /* cost of loading integer registers
1605 in QImode, HImode and SImode.
1606 Relative to reg-reg move (2). */
1607 {4, 4, 4}, /* cost of storing integer registers */
1608 3, /* cost of reg,reg fld/fst */
1609 {12, 12, 12}, /* cost of loading fp registers
1610 in SFmode, DFmode and XFmode */
1611 {4, 4, 4}, /* cost of storing fp registers
1612 in SFmode, DFmode and XFmode */
1613 6, /* cost of moving MMX register */
1614 {12, 12}, /* cost of loading MMX registers
1615 in SImode and DImode */
1616 {12, 12}, /* cost of storing MMX registers
1617 in SImode and DImode */
1618 6, /* cost of moving SSE register */
1619 {12, 12, 12}, /* cost of loading SSE registers
1620 in SImode, DImode and TImode */
1621 {12, 12, 12}, /* cost of storing SSE registers
1622 in SImode, DImode and TImode */
1623 8, /* MMX or SSE register to integer */
1624 8, /* size of l1 cache. */
1625 1024, /* size of l2 cache. */
1626 128, /* size of prefetch block */
1627 8, /* number of parallel prefetches */
1628 1, /* Branch cost */
1629 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1630 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1631 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1632 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1633 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1634 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1635 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1636 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1637 {100000, unrolled_loop}, {-1, libcall}}}},
1638 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1639 {-1, libcall}}},
1640 {libcall, {{24, loop}, {64, unrolled_loop},
1641 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1642 1, /* scalar_stmt_cost. */
1643 1, /* scalar load_cost. */
1644 1, /* scalar_store_cost. */
1645 1, /* vec_stmt_cost. */
1646 1, /* vec_to_scalar_cost. */
1647 1, /* scalar_to_vec_cost. */
1648 1, /* vec_align_load_cost. */
1649 2, /* vec_unalign_load_cost. */
1650 1, /* vec_store_cost. */
1651 3, /* cond_taken_branch_cost. */
1652 1, /* cond_not_taken_branch_cost. */
1653 };
1654
1655 static const
1656 struct processor_costs atom_cost = {
1657 COSTS_N_INSNS (1), /* cost of an add instruction */
1658 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1659 COSTS_N_INSNS (1), /* variable shift costs */
1660 COSTS_N_INSNS (1), /* constant shift costs */
1661 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1662 COSTS_N_INSNS (4), /* HI */
1663 COSTS_N_INSNS (3), /* SI */
1664 COSTS_N_INSNS (4), /* DI */
1665 COSTS_N_INSNS (2)}, /* other */
1666 0, /* cost of multiply per each bit set */
1667 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1668 COSTS_N_INSNS (26), /* HI */
1669 COSTS_N_INSNS (42), /* SI */
1670 COSTS_N_INSNS (74), /* DI */
1671 COSTS_N_INSNS (74)}, /* other */
1672 COSTS_N_INSNS (1), /* cost of movsx */
1673 COSTS_N_INSNS (1), /* cost of movzx */
1674 8, /* "large" insn */
1675 17, /* MOVE_RATIO */
1676 4, /* cost for loading QImode using movzbl */
1677 {4, 4, 4}, /* cost of loading integer registers
1678 in QImode, HImode and SImode.
1679 Relative to reg-reg move (2). */
1680 {4, 4, 4}, /* cost of storing integer registers */
1681 4, /* cost of reg,reg fld/fst */
1682 {12, 12, 12}, /* cost of loading fp registers
1683 in SFmode, DFmode and XFmode */
1684 {6, 6, 8}, /* cost of storing fp registers
1685 in SFmode, DFmode and XFmode */
1686 2, /* cost of moving MMX register */
1687 {8, 8}, /* cost of loading MMX registers
1688 in SImode and DImode */
1689 {8, 8}, /* cost of storing MMX registers
1690 in SImode and DImode */
1691 2, /* cost of moving SSE register */
1692 {8, 8, 8}, /* cost of loading SSE registers
1693 in SImode, DImode and TImode */
1694 {8, 8, 8}, /* cost of storing SSE registers
1695 in SImode, DImode and TImode */
1696 5, /* MMX or SSE register to integer */
1697 32, /* size of l1 cache. */
1698 256, /* size of l2 cache. */
1699 64, /* size of prefetch block */
1700 6, /* number of parallel prefetches */
1701 3, /* Branch cost */
1702 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1703 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1704 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1705 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1706 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1707 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1708 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1709 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1710 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1711 {{libcall, {{8, loop}, {15, unrolled_loop},
1712 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1713 {libcall, {{24, loop}, {32, unrolled_loop},
1714 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1715 1, /* scalar_stmt_cost. */
1716 1, /* scalar load_cost. */
1717 1, /* scalar_store_cost. */
1718 1, /* vec_stmt_cost. */
1719 1, /* vec_to_scalar_cost. */
1720 1, /* scalar_to_vec_cost. */
1721 1, /* vec_align_load_cost. */
1722 2, /* vec_unalign_load_cost. */
1723 1, /* vec_store_cost. */
1724 3, /* cond_taken_branch_cost. */
1725 1, /* cond_not_taken_branch_cost. */
1726 };
1727
1728 /* Generic64 should produce code tuned for Nocona and K8. */
1729 static const
1730 struct processor_costs generic64_cost = {
1731 COSTS_N_INSNS (1), /* cost of an add instruction */
1732 /* On all chips taken into consideration lea is 2 cycles and more. With
1733 this cost however our current implementation of synth_mult results in
1734 use of unnecessary temporary registers causing regression on several
1735 SPECfp benchmarks. */
1736 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1737 COSTS_N_INSNS (1), /* variable shift costs */
1738 COSTS_N_INSNS (1), /* constant shift costs */
1739 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1740 COSTS_N_INSNS (4), /* HI */
1741 COSTS_N_INSNS (3), /* SI */
1742 COSTS_N_INSNS (4), /* DI */
1743 COSTS_N_INSNS (2)}, /* other */
1744 0, /* cost of multiply per each bit set */
1745 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1746 COSTS_N_INSNS (26), /* HI */
1747 COSTS_N_INSNS (42), /* SI */
1748 COSTS_N_INSNS (74), /* DI */
1749 COSTS_N_INSNS (74)}, /* other */
1750 COSTS_N_INSNS (1), /* cost of movsx */
1751 COSTS_N_INSNS (1), /* cost of movzx */
1752 8, /* "large" insn */
1753 17, /* MOVE_RATIO */
1754 4, /* cost for loading QImode using movzbl */
1755 {4, 4, 4}, /* cost of loading integer registers
1756 in QImode, HImode and SImode.
1757 Relative to reg-reg move (2). */
1758 {4, 4, 4}, /* cost of storing integer registers */
1759 4, /* cost of reg,reg fld/fst */
1760 {12, 12, 12}, /* cost of loading fp registers
1761 in SFmode, DFmode and XFmode */
1762 {6, 6, 8}, /* cost of storing fp registers
1763 in SFmode, DFmode and XFmode */
1764 2, /* cost of moving MMX register */
1765 {8, 8}, /* cost of loading MMX registers
1766 in SImode and DImode */
1767 {8, 8}, /* cost of storing MMX registers
1768 in SImode and DImode */
1769 2, /* cost of moving SSE register */
1770 {8, 8, 8}, /* cost of loading SSE registers
1771 in SImode, DImode and TImode */
1772 {8, 8, 8}, /* cost of storing SSE registers
1773 in SImode, DImode and TImode */
1774 5, /* MMX or SSE register to integer */
1775 32, /* size of l1 cache. */
1776 512, /* size of l2 cache. */
1777 64, /* size of prefetch block */
1778 6, /* number of parallel prefetches */
1779 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1780 value is increased to the perhaps more appropriate value of 5. */
1781 3, /* Branch cost */
1782 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1783 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1784 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1785 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1786 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1787 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1788 {DUMMY_STRINGOP_ALGS,
1789 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1790 {DUMMY_STRINGOP_ALGS,
1791 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1792 1, /* scalar_stmt_cost. */
1793 1, /* scalar load_cost. */
1794 1, /* scalar_store_cost. */
1795 1, /* vec_stmt_cost. */
1796 1, /* vec_to_scalar_cost. */
1797 1, /* scalar_to_vec_cost. */
1798 1, /* vec_align_load_cost. */
1799 2, /* vec_unalign_load_cost. */
1800 1, /* vec_store_cost. */
1801 3, /* cond_taken_branch_cost. */
1802 1, /* cond_not_taken_branch_cost. */
1803 };
1804
1805 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1806 Athlon and K8. */
1807 static const
1808 struct processor_costs generic32_cost = {
1809 COSTS_N_INSNS (1), /* cost of an add instruction */
1810 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1811 COSTS_N_INSNS (1), /* variable shift costs */
1812 COSTS_N_INSNS (1), /* constant shift costs */
1813 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1814 COSTS_N_INSNS (4), /* HI */
1815 COSTS_N_INSNS (3), /* SI */
1816 COSTS_N_INSNS (4), /* DI */
1817 COSTS_N_INSNS (2)}, /* other */
1818 0, /* cost of multiply per each bit set */
1819 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1820 COSTS_N_INSNS (26), /* HI */
1821 COSTS_N_INSNS (42), /* SI */
1822 COSTS_N_INSNS (74), /* DI */
1823 COSTS_N_INSNS (74)}, /* other */
1824 COSTS_N_INSNS (1), /* cost of movsx */
1825 COSTS_N_INSNS (1), /* cost of movzx */
1826 8, /* "large" insn */
1827 17, /* MOVE_RATIO */
1828 4, /* cost for loading QImode using movzbl */
1829 {4, 4, 4}, /* cost of loading integer registers
1830 in QImode, HImode and SImode.
1831 Relative to reg-reg move (2). */
1832 {4, 4, 4}, /* cost of storing integer registers */
1833 4, /* cost of reg,reg fld/fst */
1834 {12, 12, 12}, /* cost of loading fp registers
1835 in SFmode, DFmode and XFmode */
1836 {6, 6, 8}, /* cost of storing fp registers
1837 in SFmode, DFmode and XFmode */
1838 2, /* cost of moving MMX register */
1839 {8, 8}, /* cost of loading MMX registers
1840 in SImode and DImode */
1841 {8, 8}, /* cost of storing MMX registers
1842 in SImode and DImode */
1843 2, /* cost of moving SSE register */
1844 {8, 8, 8}, /* cost of loading SSE registers
1845 in SImode, DImode and TImode */
1846 {8, 8, 8}, /* cost of storing SSE registers
1847 in SImode, DImode and TImode */
1848 5, /* MMX or SSE register to integer */
1849 32, /* size of l1 cache. */
1850 256, /* size of l2 cache. */
1851 64, /* size of prefetch block */
1852 6, /* number of parallel prefetches */
1853 3, /* Branch cost */
1854 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1855 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1856 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1857 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1858 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1859 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1860 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1861 DUMMY_STRINGOP_ALGS},
1862 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1863 DUMMY_STRINGOP_ALGS},
1864 1, /* scalar_stmt_cost. */
1865 1, /* scalar load_cost. */
1866 1, /* scalar_store_cost. */
1867 1, /* vec_stmt_cost. */
1868 1, /* vec_to_scalar_cost. */
1869 1, /* scalar_to_vec_cost. */
1870 1, /* vec_align_load_cost. */
1871 2, /* vec_unalign_load_cost. */
1872 1, /* vec_store_cost. */
1873 3, /* cond_taken_branch_cost. */
1874 1, /* cond_not_taken_branch_cost. */
1875 };
1876
1877 /* Set by -mtune. */
1878 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1879
1880 /* Set by -mtune or -Os. */
1881 const struct processor_costs *ix86_cost = &pentium_cost;
1882
1883 /* Processor feature/optimization bitmasks. */
1884 #define m_386 (1<<PROCESSOR_I386)
1885 #define m_486 (1<<PROCESSOR_I486)
1886 #define m_PENT (1<<PROCESSOR_PENTIUM)
1887 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1888 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1889 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1890 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1891 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1892 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1893 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1894 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1895 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1896 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1897 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1898 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1899 #define m_ATOM (1<<PROCESSOR_ATOM)
1900
1901 #define m_GEODE (1<<PROCESSOR_GEODE)
1902 #define m_K6 (1<<PROCESSOR_K6)
1903 #define m_K6_GEODE (m_K6 | m_GEODE)
1904 #define m_K8 (1<<PROCESSOR_K8)
1905 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1906 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1907 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1908 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1909 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1910 #define m_BDVER (m_BDVER1 | m_BDVER2)
1911 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1912 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1)
1913
1914 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1915 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1916
1917 /* Generic instruction choice should be a common subset of supported CPUs
1918 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1919 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1920
1921 /* Feature tests against the various tunings. */
1922 unsigned char ix86_tune_features[X86_TUNE_LAST];
1923
1924 /* Feature tests against the various tunings used to create ix86_tune_features
1925 based on the processor mask. */
1926 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1927 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1928 negatively, so enabling it for Generic64 seems like a good code size
1929 tradeoff.  We can't enable it for 32bit generic because it does not
1930 work well with PPro based chips. */
1931 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1932
1933 /* X86_TUNE_PUSH_MEMORY */
1934 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1935
1936 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1937 m_486 | m_PENT,
1938
1939 /* X86_TUNE_UNROLL_STRLEN */
1940 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1941
1942 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1943 on simulation results.  But after P4 was made, no performance benefit
1944 was observed with branch hints.  They also increase the code size.
1945 As a result, icc never generates branch hints. */
1946 0,
1947
1948 /* X86_TUNE_DOUBLE_WITH_ADD */
1949 ~m_386,
1950
1951 /* X86_TUNE_USE_SAHF */
1952 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC,
1953
1954 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1955 partial dependencies. */
1956 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1957
1958 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1959 register stalls on the Generic32 compilation setting as well.  However,
1960 in the current implementation partial register stalls are not eliminated
1961 very well - they can be introduced via subregs synthesized by combine
1962 and can happen in caller/callee saving sequences.  Because this option
1963 pays back little on PPro based chips and conflicts with the partial reg
1964 dependencies used by Athlon/P4 based chips, it is better to leave it off
1965 for generic32 for now. */
1966 m_PPRO,
1967
1968 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1969 m_CORE2I7 | m_GENERIC,
1970
1971 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
1972 * on 16-bit immediate moves into memory on Core2 and Corei7. */
1973 m_CORE2I7 | m_GENERIC,
1974
1975 /* X86_TUNE_USE_HIMODE_FIOP */
1976 m_386 | m_486 | m_K6_GEODE,
1977
1978 /* X86_TUNE_USE_SIMODE_FIOP */
1979 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1980
1981 /* X86_TUNE_USE_MOV0 */
1982 m_K6,
1983
1984 /* X86_TUNE_USE_CLTD */
1985 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1986
1987 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1988 m_PENT4,
1989
1990 /* X86_TUNE_SPLIT_LONG_MOVES */
1991 m_PPRO,
1992
1993 /* X86_TUNE_READ_MODIFY_WRITE */
1994 ~m_PENT,
1995
1996 /* X86_TUNE_READ_MODIFY */
1997 ~(m_PENT | m_PPRO),
1998
1999 /* X86_TUNE_PROMOTE_QIMODE */
2000 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2001
2002 /* X86_TUNE_FAST_PREFIX */
2003 ~(m_386 | m_486 | m_PENT),
2004
2005 /* X86_TUNE_SINGLE_STRINGOP */
2006 m_386 | m_P4_NOCONA,
2007
2008 /* X86_TUNE_QIMODE_MATH */
2009 ~0,
2010
2011 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2012 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
2013 might be considered for Generic32 if our scheme for avoiding partial
2014 stalls was more effective. */
2015 ~m_PPRO,
2016
2017 /* X86_TUNE_PROMOTE_QI_REGS */
2018 0,
2019
2020 /* X86_TUNE_PROMOTE_HI_REGS */
2021 m_PPRO,
2022
2023 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2024 over esp addition. */
2025 m_386 | m_486 | m_PENT | m_PPRO,
2026
2027 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2028 over esp addition. */
2029 m_PENT,
2030
2031 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2032 over esp subtraction. */
2033 m_386 | m_486 | m_PENT | m_K6_GEODE,
2034
2035 /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
2036 over esp subtraction. */
2037 m_PENT | m_K6_GEODE,
2038
2039 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2040 for DFmode copies */
2041 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2042
2043 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2044 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2045
2046 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2047 conflict here between PPro/Pentium4 based chips that treat 128bit
2048 SSE registers as single units and K8 based chips that split SSE
2049 registers into two 64bit halves.  This knob promotes all store
2050 destinations to 128bit to allow register renaming on 128bit SSE units,
2051 but usually results in one extra micro-op on 64bit SSE units.
2052 Experimental results show that disabling this option on P4 brings an
2053 over 20% SPECfp regression, while enabling it on K8 brings roughly a
2054 2.4% regression that can be partly masked by careful scheduling of moves. */
2055 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2056
2057 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2058 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER1,
2059
2060 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2061 m_COREI7 | m_BDVER,
2062
2063 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2064 m_BDVER,
2065
2066 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2067 are resolved on SSE register parts instead of whole registers, so we may
2068 maintain just the lower part of scalar values in the proper format, leaving
2069 the upper part undefined. */
2070 m_ATHLON_K8,
2071
2072 /* X86_TUNE_SSE_TYPELESS_STORES */
2073 m_AMD_MULTIPLE,
2074
2075 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2076 m_PPRO | m_P4_NOCONA,
2077
2078 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2079 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2080
2081 /* X86_TUNE_PROLOGUE_USING_MOVE */
2082 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2083
2084 /* X86_TUNE_EPILOGUE_USING_MOVE */
2085 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2086
2087 /* X86_TUNE_SHIFT1 */
2088 ~m_486,
2089
2090 /* X86_TUNE_USE_FFREEP */
2091 m_AMD_MULTIPLE,
2092
2093 /* X86_TUNE_INTER_UNIT_MOVES */
2094 ~(m_AMD_MULTIPLE | m_GENERIC),
2095
2096 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2097 ~(m_AMDFAM10 | m_BDVER),
2098
2099 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2100 than 4 branch instructions in the 16 byte window. */
2101 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2102
2103 /* X86_TUNE_SCHEDULE */
2104 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2105
2106 /* X86_TUNE_USE_BT */
2107 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2108
2109 /* X86_TUNE_USE_INCDEC */
2110 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2111
2112 /* X86_TUNE_PAD_RETURNS */
2113 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2114
2115 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
2116 m_ATOM,
2117
2118 /* X86_TUNE_EXT_80387_CONSTANTS */
2119 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2120
2121 /* X86_TUNE_SHORTEN_X87_SSE */
2122 ~m_K8,
2123
2124 /* X86_TUNE_AVOID_VECTOR_DECODE */
2125 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2126
2127 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2128 and SImode multiplies, but the 386 and 486 do HImode multiplies faster. */
2129 ~(m_386 | m_486),
2130
2131 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2132 vector path on AMD machines. */
2133 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2134
2135 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2136 machines. */
2137 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2138
2139 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2140 than a MOV. */
2141 m_PENT,
2142
2143 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2144 but one byte longer. */
2145 m_PENT,
2146
2147 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
2148 operand that cannot be represented using a modRM byte. The XOR
2149 replacement is long decoded, so this split helps here as well. */
2150 m_K6,
2151
2152 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2153 from FP to FP. */
2154 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2155
2156 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2157 from integer to FP. */
2158 m_AMDFAM10,
2159
2160 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2161 with a subsequent conditional jump instruction into a single
2162 compare-and-branch uop. */
2163 m_BDVER,
2164
2165 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2166 will impact LEA instruction selection. */
2167 m_ATOM,
2168
2169 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2170 instructions. */
2171 ~m_ATOM,
2172
2173 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2174 at -O3.  For the moment, the prefetching seems badly tuned for Intel
2175 chips. */
2176 m_K6_GEODE | m_AMD_MULTIPLE,
2177
2178 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2179 the auto-vectorizer. */
2180 m_BDVER,
2181
2182 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2183 during reassociation of integer computation. */
2184 m_ATOM,
2185
2186 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2187 during reassociation of fp computation. */
2188 m_ATOM
2189 };
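/* Each entry above is a bitmask of the processors for which the corresponding
   X86_TUNE_* feature is enabled.  At option-override time the bit for the
   selected -mtune processor is tested, roughly:

     ix86_tune_mask = 1u << ix86_tune;
     for (i = 0; i < X86_TUNE_LAST; ++i)
       ix86_tune_features[i]
         = !!(initial_ix86_tune_features[i] & ix86_tune_mask);

   and the TARGET_* tuning macros in i386.h simply index ix86_tune_features.  */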
2190
2191 /* Feature tests against the various architecture variations. */
2192 unsigned char ix86_arch_features[X86_ARCH_LAST];
2193
2194 /* Feature tests against the various architecture variations, used to create
2195 ix86_arch_features based on the processor mask. */
2196 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2197 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2198 ~(m_386 | m_486 | m_PENT | m_K6),
2199
2200 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2201 ~m_386,
2202
2203 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2204 ~(m_386 | m_486),
2205
2206 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2207 ~m_386,
2208
2209 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2210 ~m_386,
2211 };
2212
2213 static const unsigned int x86_accumulate_outgoing_args
2214 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2215
2216 static const unsigned int x86_arch_always_fancy_math_387
2217 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2218
2219 static const unsigned int x86_avx256_split_unaligned_load
2220 = m_COREI7 | m_GENERIC;
2221
2222 static const unsigned int x86_avx256_split_unaligned_store
2223 = m_COREI7 | m_BDVER | m_GENERIC;
2224
2225 /* If the average insn count for a single function invocation is
2226 lower than this constant, emit fast (but longer) prologue and
2227 epilogue code. */
2228 #define FAST_PROLOGUE_INSN_COUNT 20
2229
2230 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2231 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2232 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2233 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2234
2235 /* Array of the smallest class containing reg number REGNO, indexed by
2236 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2237
2238 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2239 {
2240 /* ax, dx, cx, bx */
2241 AREG, DREG, CREG, BREG,
2242 /* si, di, bp, sp */
2243 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2244 /* FP registers */
2245 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2246 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2247 /* arg pointer */
2248 NON_Q_REGS,
2249 /* flags, fpsr, fpcr, frame */
2250 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2251 /* SSE registers */
2252 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2253 SSE_REGS, SSE_REGS,
2254 /* MMX registers */
2255 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2256 MMX_REGS, MMX_REGS,
2257 /* REX registers */
2258 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2259 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2260 /* SSE REX registers */
2261 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2262 SSE_REGS, SSE_REGS,
2263 };
2264
2265 /* The "default" register map used in 32bit mode. */
2266
2267 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2268 {
2269 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2270 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2271 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2272 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2273 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2274 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2275 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2276 };
2277
2278 /* The "default" register map used in 64bit mode. */
2279
2280 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2281 {
2282 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2283 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2284 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2285 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2286 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2287 8,9,10,11,12,13,14,15, /* extended integer registers */
2288 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2289 };
2290
2291 /* Define the register numbers to be used in Dwarf debugging information.
2292 The SVR4 reference port C compiler uses the following register numbers
2293 in its Dwarf output code:
2294 0 for %eax (gcc regno = 0)
2295 1 for %ecx (gcc regno = 2)
2296 2 for %edx (gcc regno = 1)
2297 3 for %ebx (gcc regno = 3)
2298 4 for %esp (gcc regno = 7)
2299 5 for %ebp (gcc regno = 6)
2300 6 for %esi (gcc regno = 4)
2301 7 for %edi (gcc regno = 5)
2302 The following three DWARF register numbers are never generated by
2303 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2304 believes these numbers have these meanings.
2305 8 for %eip (no gcc equivalent)
2306 9 for %eflags (gcc regno = 17)
2307 10 for %trapno (no gcc equivalent)
2308 It is not at all clear how we should number the FP stack registers
2309 for the x86 architecture. If the version of SDB on x86/svr4 were
2310 a bit less brain dead with respect to floating-point then we would
2311 have a precedent to follow with respect to DWARF register numbers
2312 for x86 FP registers, but the SDB on x86/svr4 is so completely
2313 broken with respect to FP registers that it is hardly worth thinking
2314 of it as something to strive for compatibility with.
2315 The version of x86/svr4 SDB I have at the moment does (partially)
2316 seem to believe that DWARF register number 11 is associated with
2317 the x86 register %st(0), but that's about all. Higher DWARF
2318 register numbers don't seem to be associated with anything in
2319 particular, and even for DWARF regno 11, SDB only seems to under-
2320 stand that it should say that a variable lives in %st(0) (when
2321 asked via an `=' command) if we said it was in DWARF regno 11,
2322 but SDB still prints garbage when asked for the value of the
2323 variable in question (via a `/' command).
2324 (Also note that the labels SDB prints for various FP stack regs
2325 when doing an `x' command are all wrong.)
2326 Note that these problems generally don't affect the native SVR4
2327 C compiler because it doesn't allow the use of -O with -g and
2328 because when it is *not* optimizing, it allocates a memory
2329 location for each floating-point variable, and the memory
2330 location is what gets described in the DWARF AT_location
2331 attribute for the variable in question.
2332 Regardless of the severe mental illness of the x86/svr4 SDB, we
2333 do something sensible here and we use the following DWARF
2334 register numbers. Note that these are all stack-top-relative
2335 numbers.
2336 11 for %st(0) (gcc regno = 8)
2337 12 for %st(1) (gcc regno = 9)
2338 13 for %st(2) (gcc regno = 10)
2339 14 for %st(3) (gcc regno = 11)
2340 15 for %st(4) (gcc regno = 12)
2341 16 for %st(5) (gcc regno = 13)
2342 17 for %st(6) (gcc regno = 14)
2343 18 for %st(7) (gcc regno = 15)
2344 */
2345 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2346 {
2347 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2348 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2349 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2350 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2351 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2352 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2353 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2354 };
2355
2356 /* Define parameter passing and return registers. */
2357
2358 static int const x86_64_int_parameter_registers[6] =
2359 {
2360 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2361 };
2362
2363 static int const x86_64_ms_abi_int_parameter_registers[4] =
2364 {
2365 CX_REG, DX_REG, R8_REG, R9_REG
2366 };
2367
2368 static int const x86_64_int_return_registers[4] =
2369 {
2370 AX_REG, DX_REG, DI_REG, SI_REG
2371 };
2372
2373 /* Define the structure for the machine field in struct function. */
2374
2375 struct GTY(()) stack_local_entry {
2376 unsigned short mode;
2377 unsigned short n;
2378 rtx rtl;
2379 struct stack_local_entry *next;
2380 };
2381
2382 /* Structure describing stack frame layout.
2383 Stack grows downward:
2384
2385 [arguments]
2386 <- ARG_POINTER
2387 saved pc
2388
2389 saved static chain if ix86_static_chain_on_stack
2390
2391 saved frame pointer if frame_pointer_needed
2392 <- HARD_FRAME_POINTER
2393 [saved regs]
2394 <- regs_save_offset
2395 [padding0]
2396
2397 [saved SSE regs]
2398 <- sse_regs_save_offset
2399 [padding1] |
2400 | <- FRAME_POINTER
2401 [va_arg registers] |
2402 |
2403 [frame] |
2404 |
2405 [padding2] | = to_allocate
2406 <- STACK_POINTER
2407 */
2408 struct ix86_frame
2409 {
2410 int nsseregs;
2411 int nregs;
2412 int va_arg_size;
2413 int red_zone_size;
2414 int outgoing_arguments_size;
2415
2416 /* The offsets relative to ARG_POINTER. */
2417 HOST_WIDE_INT frame_pointer_offset;
2418 HOST_WIDE_INT hard_frame_pointer_offset;
2419 HOST_WIDE_INT stack_pointer_offset;
2420 HOST_WIDE_INT hfp_save_offset;
2421 HOST_WIDE_INT reg_save_offset;
2422 HOST_WIDE_INT sse_reg_save_offset;
2423
2424 /* When save_regs_using_mov is set, emit prologue using
2425 move instead of push instructions. */
2426 bool save_regs_using_mov;
2427 };
2428
2429 /* Which cpu we are scheduling for. */
2430 enum attr_cpu ix86_schedule;
2431
2432 /* Which cpu we are optimizing for. */
2433 enum processor_type ix86_tune;
2434
2435 /* Which instruction set architecture to use. */
2436 enum processor_type ix86_arch;
2437
2438 /* true if sse prefetch instruction is not NOOP. */
2439 int x86_prefetch_sse;
2440
2441 /* -mstackrealign option */
2442 static const char ix86_force_align_arg_pointer_string[]
2443 = "force_align_arg_pointer";
2444
2445 static rtx (*ix86_gen_leave) (void);
2446 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2447 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2448 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2449 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2450 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2451 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2452 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2453 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2454 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2455 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2456 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2457
2458 /* Preferred alignment for stack boundary in bits. */
2459 unsigned int ix86_preferred_stack_boundary;
2460
2461 /* Alignment for incoming stack boundary in bits specified at
2462 command line. */
2463 static unsigned int ix86_user_incoming_stack_boundary;
2464
2465 /* Default alignment for incoming stack boundary in bits. */
2466 static unsigned int ix86_default_incoming_stack_boundary;
2467
2468 /* Alignment for incoming stack boundary in bits. */
2469 unsigned int ix86_incoming_stack_boundary;
2470
2471 /* Calling abi specific va_list type nodes. */
2472 static GTY(()) tree sysv_va_list_type_node;
2473 static GTY(()) tree ms_va_list_type_node;
2474
2475 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2476 char internal_label_prefix[16];
2477 int internal_label_prefix_len;
2478
2479 /* Fence to use after loop using movnt. */
2480 tree x86_mfence;
2481
2482 /* Register class used for passing a given 64bit part of an argument.
2483 These represent the classes documented by the psABI, with the exception
2484 of the SSESF and SSEDF classes, which are basically the SSE class; gcc
2485 just uses SFmode or DFmode moves instead of DImode ones to avoid
2486 reformatting penalties.
2487 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2488 whenever possible (the upper half then contains padding). */
2489 enum x86_64_reg_class
2490 {
2491 X86_64_NO_CLASS,
2492 X86_64_INTEGER_CLASS,
2493 X86_64_INTEGERSI_CLASS,
2494 X86_64_SSE_CLASS,
2495 X86_64_SSESF_CLASS,
2496 X86_64_SSEDF_CLASS,
2497 X86_64_SSEUP_CLASS,
2498 X86_64_X87_CLASS,
2499 X86_64_X87UP_CLASS,
2500 X86_64_COMPLEX_X87_CLASS,
2501 X86_64_MEMORY_CLASS
2502 };
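/* For example, under the x86-64 psABI a struct { double d; int i; } occupies
   two eightbytes: the first is classified as X86_64_SSEDF_CLASS (the double)
   and the second as X86_64_INTEGERSI_CLASS (the int plus padding), so the
   struct is passed partly in an SSE register and partly in an integer
   register.  The classification is computed by classify_argument below.  */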
2503
2504 #define MAX_CLASSES 4
2505
2506 /* Table of constants used by fldpi, fldln2, etc.... */
2507 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2508 static bool ext_80387_constants_init = 0;
2509
2510 \f
2511 static struct machine_function * ix86_init_machine_status (void);
2512 static rtx ix86_function_value (const_tree, const_tree, bool);
2513 static bool ix86_function_value_regno_p (const unsigned int);
2514 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2515 const_tree);
2516 static rtx ix86_static_chain (const_tree, bool);
2517 static int ix86_function_regparm (const_tree, const_tree);
2518 static void ix86_compute_frame_layout (struct ix86_frame *);
2519 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2520 rtx, rtx, int);
2521 static void ix86_add_new_builtins (HOST_WIDE_INT);
2522 static tree ix86_canonical_va_list_type (tree);
2523 static void predict_jump (int);
2524 static unsigned int split_stack_prologue_scratch_regno (void);
2525 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2526
2527 enum ix86_function_specific_strings
2528 {
2529 IX86_FUNCTION_SPECIFIC_ARCH,
2530 IX86_FUNCTION_SPECIFIC_TUNE,
2531 IX86_FUNCTION_SPECIFIC_MAX
2532 };
2533
2534 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2535 const char *, enum fpmath_unit, bool);
2536 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2537 static void ix86_function_specific_save (struct cl_target_option *);
2538 static void ix86_function_specific_restore (struct cl_target_option *);
2539 static void ix86_function_specific_print (FILE *, int,
2540 struct cl_target_option *);
2541 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2542 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2543 struct gcc_options *);
2544 static bool ix86_can_inline_p (tree, tree);
2545 static void ix86_set_current_function (tree);
2546 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2547
2548 static enum calling_abi ix86_function_abi (const_tree);
2549
2550 \f
2551 #ifndef SUBTARGET32_DEFAULT_CPU
2552 #define SUBTARGET32_DEFAULT_CPU "i386"
2553 #endif
2554
2555 /* The svr4 ABI for the i386 says that records and unions are returned
2556 in memory. */
2557 #ifndef DEFAULT_PCC_STRUCT_RETURN
2558 #define DEFAULT_PCC_STRUCT_RETURN 1
2559 #endif
2560
2561 /* Whether -mtune= or -march= were specified */
2562 static int ix86_tune_defaulted;
2563 static int ix86_arch_specified;
2564
2565 /* Vectorization library interface and handlers. */
2566 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2567
2568 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2569 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2570
2571 /* Processor target table, indexed by processor number */
2572 struct ptt
2573 {
2574 const struct processor_costs *cost; /* Processor costs */
2575 const int align_loop; /* Default alignments. */
2576 const int align_loop_max_skip;
2577 const int align_jump;
2578 const int align_jump_max_skip;
2579 const int align_func;
2580 };
2581
2582 static const struct ptt processor_target_table[PROCESSOR_max] =
2583 {
2584 {&i386_cost, 4, 3, 4, 3, 4},
2585 {&i486_cost, 16, 15, 16, 15, 16},
2586 {&pentium_cost, 16, 7, 16, 7, 16},
2587 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2588 {&geode_cost, 0, 0, 0, 0, 0},
2589 {&k6_cost, 32, 7, 32, 7, 32},
2590 {&athlon_cost, 16, 7, 16, 7, 16},
2591 {&pentium4_cost, 0, 0, 0, 0, 0},
2592 {&k8_cost, 16, 7, 16, 7, 16},
2593 {&nocona_cost, 0, 0, 0, 0, 0},
2594 /* Core 2 32-bit. */
2595 {&generic32_cost, 16, 10, 16, 10, 16},
2596 /* Core 2 64-bit. */
2597 {&generic64_cost, 16, 10, 16, 10, 16},
2598 /* Core i7 32-bit. */
2599 {&generic32_cost, 16, 10, 16, 10, 16},
2600 /* Core i7 64-bit. */
2601 {&generic64_cost, 16, 10, 16, 10, 16},
2602 {&generic32_cost, 16, 7, 16, 7, 16},
2603 {&generic64_cost, 16, 10, 16, 10, 16},
2604 {&amdfam10_cost, 32, 24, 32, 7, 32},
2605 {&bdver1_cost, 32, 24, 32, 7, 32},
2606 {&bdver2_cost, 32, 24, 32, 7, 32},
2607 {&btver1_cost, 32, 24, 32, 7, 32},
2608 {&atom_cost, 16, 15, 16, 7, 16}
2609 };
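/* processor_target_table is indexed by enum processor_type.  When no explicit
   -falign-* options are given, ix86_option_override_internal takes the
   defaults from here, e.g. align_functions from
   processor_target_table[ix86_tune].align_func.  */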
2610
2611 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2612 {
2613 "generic",
2614 "i386",
2615 "i486",
2616 "pentium",
2617 "pentium-mmx",
2618 "pentiumpro",
2619 "pentium2",
2620 "pentium3",
2621 "pentium4",
2622 "pentium-m",
2623 "prescott",
2624 "nocona",
2625 "core2",
2626 "corei7",
2627 "atom",
2628 "geode",
2629 "k6",
2630 "k6-2",
2631 "k6-3",
2632 "athlon",
2633 "athlon-4",
2634 "k8",
2635 "amdfam10",
2636 "bdver1",
2637 "bdver2",
2638 "btver1"
2639 };
2640 \f
2641 /* Return true if a red-zone is in use. */
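/* (The red zone is the 128-byte area below the stack pointer that the
   x86-64 SysV ABI guarantees is not clobbered by signal or interrupt
   handlers, so leaf functions may use it without adjusting %rsp; the
   Windows 64-bit ABI provides no such area.)  */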
2642
2643 static inline bool
2644 ix86_using_red_zone (void)
2645 {
2646 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2647 }
2648 \f
2649 /* Return a string that documents the current -m options. The caller is
2650 responsible for freeing the string. */
2651
2652 static char *
2653 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2654 const char *tune, enum fpmath_unit fpmath,
2655 bool add_nl_p)
2656 {
2657 struct ix86_target_opts
2658 {
2659 const char *option; /* option string */
2660 HOST_WIDE_INT mask; /* isa mask options */
2661 };
2662
2663 /* This table is ordered so that options like -msse4.2 that imply
2664 preceding options are matched first. */
2665 static struct ix86_target_opts isa_opts[] =
2666 {
2667 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2668 { "-mfma", OPTION_MASK_ISA_FMA },
2669 { "-mxop", OPTION_MASK_ISA_XOP },
2670 { "-mlwp", OPTION_MASK_ISA_LWP },
2671 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2672 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2673 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2674 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2675 { "-msse3", OPTION_MASK_ISA_SSE3 },
2676 { "-msse2", OPTION_MASK_ISA_SSE2 },
2677 { "-msse", OPTION_MASK_ISA_SSE },
2678 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2679 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2680 { "-mmmx", OPTION_MASK_ISA_MMX },
2681 { "-mabm", OPTION_MASK_ISA_ABM },
2682 { "-mbmi", OPTION_MASK_ISA_BMI },
2683 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2684 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2685 { "-mhle", OPTION_MASK_ISA_HLE },
2686 { "-mtbm", OPTION_MASK_ISA_TBM },
2687 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2688 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2689 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2690 { "-maes", OPTION_MASK_ISA_AES },
2691 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2692 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2693 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2694 { "-mf16c", OPTION_MASK_ISA_F16C },
2695 { "-mrtm", OPTION_MASK_ISA_RTM },
2696 };
2697
2698 /* Flag options. */
2699 static struct ix86_target_opts flag_opts[] =
2700 {
2701 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2702 { "-m80387", MASK_80387 },
2703 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2704 { "-malign-double", MASK_ALIGN_DOUBLE },
2705 { "-mcld", MASK_CLD },
2706 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2707 { "-mieee-fp", MASK_IEEE_FP },
2708 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2709 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2710 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2711 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2712 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2713 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2714 { "-mno-red-zone", MASK_NO_RED_ZONE },
2715 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2716 { "-mrecip", MASK_RECIP },
2717 { "-mrtd", MASK_RTD },
2718 { "-msseregparm", MASK_SSEREGPARM },
2719 { "-mstack-arg-probe", MASK_STACK_PROBE },
2720 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2721 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2722 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2723 { "-mvzeroupper", MASK_VZEROUPPER },
2724 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2725 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2726 { "-mprefer-avx128", MASK_PREFER_AVX128},
2727 };
2728
2729 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2730
2731 char isa_other[40];
2732 char target_other[40];
2733 unsigned num = 0;
2734 unsigned i, j;
2735 char *ret;
2736 char *ptr;
2737 size_t len;
2738 size_t line_len;
2739 size_t sep_len;
2740 const char *abi;
2741
2742 memset (opts, '\0', sizeof (opts));
2743
2744 /* Add -march= option. */
2745 if (arch)
2746 {
2747 opts[num][0] = "-march=";
2748 opts[num++][1] = arch;
2749 }
2750
2751 /* Add -mtune= option. */
2752 if (tune)
2753 {
2754 opts[num][0] = "-mtune=";
2755 opts[num++][1] = tune;
2756 }
2757
2758 /* Add -m32/-m64/-mx32. */
2759 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2760 {
2761 if ((isa & OPTION_MASK_ABI_64) != 0)
2762 abi = "-m64";
2763 else
2764 abi = "-mx32";
2765 isa &= ~ (OPTION_MASK_ISA_64BIT
2766 | OPTION_MASK_ABI_64
2767 | OPTION_MASK_ABI_X32);
2768 }
2769 else
2770 abi = "-m32";
2771 opts[num++][0] = abi;
2772
2773 /* Pick out the options in isa options. */
2774 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2775 {
2776 if ((isa & isa_opts[i].mask) != 0)
2777 {
2778 opts[num++][0] = isa_opts[i].option;
2779 isa &= ~ isa_opts[i].mask;
2780 }
2781 }
2782
2783 if (isa && add_nl_p)
2784 {
2785 opts[num++][0] = isa_other;
2786 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2787 isa);
2788 }
2789
2790 /* Add flag options. */
2791 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2792 {
2793 if ((flags & flag_opts[i].mask) != 0)
2794 {
2795 opts[num++][0] = flag_opts[i].option;
2796 flags &= ~ flag_opts[i].mask;
2797 }
2798 }
2799
2800 if (flags && add_nl_p)
2801 {
2802 opts[num++][0] = target_other;
2803 sprintf (target_other, "(other flags: %#x)", flags);
2804 }
2805
2806 /* Add -fpmath= option. */
2807 if (fpmath)
2808 {
2809 opts[num][0] = "-mfpmath=";
2810 switch ((int) fpmath)
2811 {
2812 case FPMATH_387:
2813 opts[num++][1] = "387";
2814 break;
2815
2816 case FPMATH_SSE:
2817 opts[num++][1] = "sse";
2818 break;
2819
2820 case FPMATH_387 | FPMATH_SSE:
2821 opts[num++][1] = "sse+387";
2822 break;
2823
2824 default:
2825 gcc_unreachable ();
2826 }
2827 }
2828
2829 /* Any options? */
2830 if (num == 0)
2831 return NULL;
2832
2833 gcc_assert (num < ARRAY_SIZE (opts));
2834
2835 /* Size the string. */
2836 len = 0;
2837 sep_len = (add_nl_p) ? 3 : 1;
2838 for (i = 0; i < num; i++)
2839 {
2840 len += sep_len;
2841 for (j = 0; j < 2; j++)
2842 if (opts[i][j])
2843 len += strlen (opts[i][j]);
2844 }
2845
2846 /* Build the string. */
2847 ret = ptr = (char *) xmalloc (len);
2848 line_len = 0;
2849
2850 for (i = 0; i < num; i++)
2851 {
2852 size_t len2[2];
2853
2854 for (j = 0; j < 2; j++)
2855 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2856
2857 if (i != 0)
2858 {
2859 *ptr++ = ' ';
2860 line_len++;
2861
2862 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2863 {
2864 *ptr++ = '\\';
2865 *ptr++ = '\n';
2866 line_len = 0;
2867 }
2868 }
2869
2870 for (j = 0; j < 2; j++)
2871 if (opts[i][j])
2872 {
2873 memcpy (ptr, opts[i][j], len2[j]);
2874 ptr += len2[j];
2875 line_len += len2[j];
2876 }
2877 }
2878
2879 *ptr = '\0';
2880 gcc_assert (ret + len >= ptr);
2881
2882 return ret;
2883 }
2884
2885 /* Return true if profiling code should be emitted before the
2886 prologue, otherwise false.  On x86 this is the case only when
2887 -mfentry is in effect. */
2888 static bool
2889 ix86_profile_before_prologue (void)
2890 {
2891 return flag_fentry != 0;
2892 }
2893
2894 /* Function that is callable from the debugger to print the current
2895 options. */
2896 void
2897 ix86_debug_options (void)
2898 {
2899 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2900 ix86_arch_string, ix86_tune_string,
2901 ix86_fpmath, true);
2902
2903 if (opts)
2904 {
2905 fprintf (stderr, "%s\n\n", opts);
2906 free (opts);
2907 }
2908 else
2909 fputs ("<no options>\n\n", stderr);
2910
2911 return;
2912 }
2913 \f
2914 /* Override various settings based on options. If MAIN_ARGS_P, the
2915 options are from the command line, otherwise they are from
2916 attributes. */
2917
2918 static void
2919 ix86_option_override_internal (bool main_args_p)
2920 {
2921 int i;
2922 unsigned int ix86_arch_mask, ix86_tune_mask;
2923 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2924 const char *prefix;
2925 const char *suffix;
2926 const char *sw;
2927
2928 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2929 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2930 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2931 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2932 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2933 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2934 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2935 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2936 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2937 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2938 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2939 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2940 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2941 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2942 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2943 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2944 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2945 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2946 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2947 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2948 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2949 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2950 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2951 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2952 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2953 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2954 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2955 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2956 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2957 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2958 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2959 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2960 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2961 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
2962 /* if this reaches 64, need to widen struct pta flags below */
2963
2964 static struct pta
2965 {
2966 const char *const name; /* processor name or nickname. */
2967 const enum processor_type processor;
2968 const enum attr_cpu schedule;
2969 const unsigned HOST_WIDE_INT flags;
2970 }
2971 const processor_alias_table[] =
2972 {
2973 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2974 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2975 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2976 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2977 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2978 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2979 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2980 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2981 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2982 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2983 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2984 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
2985 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2986 PTA_MMX | PTA_SSE},
2987 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2988 PTA_MMX | PTA_SSE},
2989 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2990 PTA_MMX | PTA_SSE | PTA_SSE2},
2991 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2992 PTA_MMX |PTA_SSE | PTA_SSE2},
2993 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2994 PTA_MMX | PTA_SSE | PTA_SSE2},
2995 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2996 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2997 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2998 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2999 | PTA_CX16 | PTA_NO_SAHF},
3000 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
3001 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3002 | PTA_SSSE3 | PTA_CX16},
3003 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
3004 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3005 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16},
3006 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
3007 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3008 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3009 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
3010 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
3011 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3012 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3013 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3014 | PTA_RDRND | PTA_F16C},
3015 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
3016 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3017 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
3018 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3019 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
3020 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE},
3021 {"atom", PROCESSOR_ATOM, CPU_ATOM,
3022 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3023 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
3024 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3025 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A |PTA_PREFETCH_SSE},
3026 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3027 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3028 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3029 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3030 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3031 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3032 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3033 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3034 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3035 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3036 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3037 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3038 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3039 {"x86-64", PROCESSOR_K8, CPU_K8,
3040 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3041 {"k8", PROCESSOR_K8, CPU_K8,
3042 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3043 | PTA_SSE2 | PTA_NO_SAHF},
3044 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3045 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3046 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3047 {"opteron", PROCESSOR_K8, CPU_K8,
3048 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3049 | PTA_SSE2 | PTA_NO_SAHF},
3050 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3051 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3052 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3053 {"athlon64", PROCESSOR_K8, CPU_K8,
3054 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3055 | PTA_SSE2 | PTA_NO_SAHF},
3056 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3057 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3058 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3059 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3060 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3061 | PTA_SSE2 | PTA_NO_SAHF},
3062 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3063 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3064 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3065 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3066 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3067 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3068 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3069 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3070 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3071 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3072 | PTA_XOP | PTA_LWP},
3073 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3074 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3075 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3076 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3077 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3078 | PTA_FMA},
3079 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3080 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3081 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16},
3082 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3083 PTA_HLE /* flags are only used for -march switch. */ },
3084 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3085 PTA_64BIT
3086 | PTA_HLE /* flags are only used for -march switch. */ },
3087 };
3088
3089 /* -mrecip options. */
3090 static struct
3091 {
3092 const char *string; /* option name */
3093 unsigned int mask; /* mask bits to set */
3094 }
3095 const recip_options[] =
3096 {
3097 { "all", RECIP_MASK_ALL },
3098 { "none", RECIP_MASK_NONE },
3099 { "div", RECIP_MASK_DIV },
3100 { "sqrt", RECIP_MASK_SQRT },
3101 { "vec-div", RECIP_MASK_VEC_DIV },
3102 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3103 };
3104
3105 int const pta_size = ARRAY_SIZE (processor_alias_table);
3106
3107 /* Set up prefix/suffix so the error messages refer to either the command
3108 line argument, or the attribute(target). */
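      For example, with the same format string the diagnostic reads
      "bad value (...) for -mtune= switch" when processing the command
      line, but "bad value (...) for option("tune=") attribute" when
      processing attribute(target).  */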
3109 if (main_args_p)
3110 {
3111 prefix = "-m";
3112 suffix = "";
3113 sw = "switch";
3114 }
3115 else
3116 {
3117 prefix = "option(\"";
3118 suffix = "\")";
3119 sw = "attribute";
3120 }
3121
3122 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3123 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3124 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
3125 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3126 #ifdef TARGET_BI_ARCH
3127 else
3128 {
3129 #if TARGET_BI_ARCH == 1
3130 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3131 is on and OPTION_MASK_ABI_X32 is off. We turn off
3132 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3133 -mx32. */
3134 if (TARGET_X32)
3135 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3136 #else
3137 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3138 on and OPTION_MASK_ABI_64 is off. We turn off
3139 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3140 -m64. */
3141 if (TARGET_LP64)
3142 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3143 #endif
3144 }
3145 #endif
3146
3147 if (TARGET_X32)
3148 {
3149 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3150 OPTION_MASK_ABI_64 for TARGET_X32. */
3151 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3152 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3153 }
3154 else if (TARGET_LP64)
3155 {
3156 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3157 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3158 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3159 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3160 }
3161
3162 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3163 SUBTARGET_OVERRIDE_OPTIONS;
3164 #endif
3165
3166 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3167 SUBSUBTARGET_OVERRIDE_OPTIONS;
3168 #endif
3169
3170 /* -fPIC is the default for x86_64. */
3171 if (TARGET_MACHO && TARGET_64BIT)
3172 flag_pic = 2;
3173
3174 /* Need to check -mtune=generic first. */
3175 if (ix86_tune_string)
3176 {
3177 if (!strcmp (ix86_tune_string, "generic")
3178 || !strcmp (ix86_tune_string, "i686")
3179 /* As special support for cross compilers we read -mtune=native
3180 as -mtune=generic. With native compilers we won't see the
3181 -mtune=native, as it was changed by the driver. */
3182 || !strcmp (ix86_tune_string, "native"))
3183 {
3184 if (TARGET_64BIT)
3185 ix86_tune_string = "generic64";
3186 else
3187 ix86_tune_string = "generic32";
3188 }
3189 /* If this call is for setting the option attribute, allow the
3190 generic32/generic64 that was previously set. */
3191 else if (!main_args_p
3192 && (!strcmp (ix86_tune_string, "generic32")
3193 || !strcmp (ix86_tune_string, "generic64")))
3194 ;
3195 else if (!strncmp (ix86_tune_string, "generic", 7))
3196 error ("bad value (%s) for %stune=%s %s",
3197 ix86_tune_string, prefix, suffix, sw);
3198 else if (!strcmp (ix86_tune_string, "x86-64"))
3199 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3200 "%stune=k8%s or %stune=generic%s instead as appropriate",
3201 prefix, suffix, prefix, suffix, prefix, suffix);
3202 }
3203 else
3204 {
3205 if (ix86_arch_string)
3206 ix86_tune_string = ix86_arch_string;
3207 if (!ix86_tune_string)
3208 {
3209 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3210 ix86_tune_defaulted = 1;
3211 }
3212
3213 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3214 need to use a sensible tune option. */
3215 if (!strcmp (ix86_tune_string, "generic")
3216 || !strcmp (ix86_tune_string, "x86-64")
3217 || !strcmp (ix86_tune_string, "i686"))
3218 {
3219 if (TARGET_64BIT)
3220 ix86_tune_string = "generic64";
3221 else
3222 ix86_tune_string = "generic32";
3223 }
3224 }
3225
3226 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3227 {
3228 /* rep; movq isn't available in 32-bit code. */
3229 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3230 ix86_stringop_alg = no_stringop;
3231 }
3232
3233 if (!ix86_arch_string)
3234 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3235 else
3236 ix86_arch_specified = 1;
3237
3238 if (global_options_set.x_ix86_pmode)
3239 {
3240 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3241 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3242 error ("address mode %qs not supported in the %s bit mode",
3243 TARGET_64BIT ? "short" : "long",
3244 TARGET_64BIT ? "64" : "32");
3245 }
3246 else
3247 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3248
3249 if (!global_options_set.x_ix86_abi)
3250 ix86_abi = DEFAULT_ABI;
3251
3252 if (global_options_set.x_ix86_cmodel)
3253 {
3254 switch (ix86_cmodel)
3255 {
3256 case CM_SMALL:
3257 case CM_SMALL_PIC:
3258 if (flag_pic)
3259 ix86_cmodel = CM_SMALL_PIC;
3260 if (!TARGET_64BIT)
3261 error ("code model %qs not supported in the %s bit mode",
3262 "small", "32");
3263 break;
3264
3265 case CM_MEDIUM:
3266 case CM_MEDIUM_PIC:
3267 if (flag_pic)
3268 ix86_cmodel = CM_MEDIUM_PIC;
3269 if (!TARGET_64BIT)
3270 error ("code model %qs not supported in the %s bit mode",
3271 "medium", "32");
3272 else if (TARGET_X32)
3273 error ("code model %qs not supported in x32 mode",
3274 "medium");
3275 break;
3276
3277 case CM_LARGE:
3278 case CM_LARGE_PIC:
3279 if (flag_pic)
3280 ix86_cmodel = CM_LARGE_PIC;
3281 if (!TARGET_64BIT)
3282 error ("code model %qs not supported in the %s bit mode",
3283 "large", "32");
3284 else if (TARGET_X32)
3285 error ("code model %qs not supported in x32 mode",
3286 		   "large");
3287 break;
3288
3289 case CM_32:
3290 if (flag_pic)
3291 error ("code model %s does not support PIC mode", "32");
3292 if (TARGET_64BIT)
3293 error ("code model %qs not supported in the %s bit mode",
3294 "32", "64");
3295 break;
3296
3297 case CM_KERNEL:
3298 if (flag_pic)
3299 {
3300 error ("code model %s does not support PIC mode", "kernel");
3301 ix86_cmodel = CM_32;
3302 }
3303 if (!TARGET_64BIT)
3304 error ("code model %qs not supported in the %s bit mode",
3305 "kernel", "32");
3306 break;
3307
3308 default:
3309 gcc_unreachable ();
3310 }
3311 }
3312 else
3313 {
3314 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3315 use of rip-relative addressing. This eliminates fixups that
3316 would otherwise be needed if this object is to be placed in a
3317 DLL, and is essentially just as efficient as direct addressing. */
3318 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3319 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3320 else if (TARGET_64BIT)
3321 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3322 else
3323 ix86_cmodel = CM_32;
3324 }
3325 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3326 {
3327 error ("-masm=intel not supported in this configuration");
3328 ix86_asm_dialect = ASM_ATT;
3329 }
3330 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3331 sorry ("%i-bit mode not compiled in",
3332 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3333
3334 for (i = 0; i < pta_size; i++)
3335 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3336 {
3337 ix86_schedule = processor_alias_table[i].schedule;
3338 ix86_arch = processor_alias_table[i].processor;
3339 /* Default cpu tuning to the architecture. */
3340 ix86_tune = ix86_arch;
3341
3342 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3343 error ("CPU you selected does not support x86-64 "
3344 "instruction set");
3345
3346 if (processor_alias_table[i].flags & PTA_MMX
3347 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3348 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3349 if (processor_alias_table[i].flags & PTA_3DNOW
3350 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3351 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3352 if (processor_alias_table[i].flags & PTA_3DNOW_A
3353 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3354 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3355 if (processor_alias_table[i].flags & PTA_SSE
3356 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3357 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3358 if (processor_alias_table[i].flags & PTA_SSE2
3359 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3360 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3361 if (processor_alias_table[i].flags & PTA_SSE3
3362 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3363 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3364 if (processor_alias_table[i].flags & PTA_SSSE3
3365 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3366 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3367 if (processor_alias_table[i].flags & PTA_SSE4_1
3368 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3369 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3370 if (processor_alias_table[i].flags & PTA_SSE4_2
3371 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3372 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3373 if (processor_alias_table[i].flags & PTA_AVX
3374 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3375 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3376 if (processor_alias_table[i].flags & PTA_AVX2
3377 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3378 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3379 if (processor_alias_table[i].flags & PTA_FMA
3380 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3381 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3382 if (processor_alias_table[i].flags & PTA_SSE4A
3383 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3384 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3385 if (processor_alias_table[i].flags & PTA_FMA4
3386 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3387 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3388 if (processor_alias_table[i].flags & PTA_XOP
3389 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3390 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3391 if (processor_alias_table[i].flags & PTA_LWP
3392 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3393 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3394 if (processor_alias_table[i].flags & PTA_ABM
3395 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3396 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3397 if (processor_alias_table[i].flags & PTA_BMI
3398 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3399 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3400 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3401 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3402 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3403 if (processor_alias_table[i].flags & PTA_TBM
3404 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3405 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3406 if (processor_alias_table[i].flags & PTA_BMI2
3407 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3408 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3409 if (processor_alias_table[i].flags & PTA_CX16
3410 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3411 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3412 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3413 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3414 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3415 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3416 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3417 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3418 if (processor_alias_table[i].flags & PTA_MOVBE
3419 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3420 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3421 if (processor_alias_table[i].flags & PTA_AES
3422 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3423 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3424 if (processor_alias_table[i].flags & PTA_PCLMUL
3425 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3426 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3427 if (processor_alias_table[i].flags & PTA_FSGSBASE
3428 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3429 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3430 if (processor_alias_table[i].flags & PTA_RDRND
3431 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3432 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3433 if (processor_alias_table[i].flags & PTA_F16C
3434 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3435 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3436 if (processor_alias_table[i].flags & PTA_RTM
3437 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3438 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3439 if (processor_alias_table[i].flags & PTA_HLE
3440 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3441 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3442 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3443 x86_prefetch_sse = true;
3444
3445 break;
3446 }
3447
3448 if (!strcmp (ix86_arch_string, "generic"))
3449 error ("generic CPU can be used only for %stune=%s %s",
3450 prefix, suffix, sw);
3451 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3452 error ("bad value (%s) for %sarch=%s %s",
3453 ix86_arch_string, prefix, suffix, sw);
3454
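   /* ix86_arch_mask is a one-hot mask indexed by the selected processor;
      each entry of initial_ix86_arch_features is a mask of the processors
      for which that feature holds, so the AND below yields a per-feature
      boolean for the chosen -march.  The tune features are derived the
      same way from initial_ix86_tune_features further down.  */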
3455 ix86_arch_mask = 1u << ix86_arch;
3456 for (i = 0; i < X86_ARCH_LAST; ++i)
3457 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3458
3459 for (i = 0; i < pta_size; i++)
3460 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3461 {
3462 ix86_schedule = processor_alias_table[i].schedule;
3463 ix86_tune = processor_alias_table[i].processor;
3464 if (TARGET_64BIT)
3465 {
3466 if (!(processor_alias_table[i].flags & PTA_64BIT))
3467 {
3468 if (ix86_tune_defaulted)
3469 {
3470 ix86_tune_string = "x86-64";
3471 for (i = 0; i < pta_size; i++)
3472 if (! strcmp (ix86_tune_string,
3473 processor_alias_table[i].name))
3474 break;
3475 ix86_schedule = processor_alias_table[i].schedule;
3476 ix86_tune = processor_alias_table[i].processor;
3477 }
3478 else
3479 error ("CPU you selected does not support x86-64 "
3480 "instruction set");
3481 }
3482 }
3483 else
3484 {
3485 /* Adjust tuning when compiling for 32-bit ABI. */
3486 switch (ix86_tune)
3487 {
3488 case PROCESSOR_GENERIC64:
3489 ix86_tune = PROCESSOR_GENERIC32;
3490 ix86_schedule = CPU_PENTIUMPRO;
3491 break;
3492
3493 case PROCESSOR_CORE2_64:
3494 ix86_tune = PROCESSOR_CORE2_32;
3495 break;
3496
3497 case PROCESSOR_COREI7_64:
3498 ix86_tune = PROCESSOR_COREI7_32;
3499 break;
3500
3501 default:
3502 break;
3503 }
3504 }
3505 /* Intel CPUs have always interpreted SSE prefetch instructions as
3506 NOPs; so, we can enable SSE prefetch instructions even when
3507 -mtune (rather than -march) points us to a processor that has them.
3508 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3509 higher processors. */
3510 if (TARGET_CMOV
3511 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3512 x86_prefetch_sse = true;
3513 break;
3514 }
3515
3516 if (ix86_tune_specified && i == pta_size)
3517 error ("bad value (%s) for %stune=%s %s",
3518 ix86_tune_string, prefix, suffix, sw);
3519
3520 ix86_tune_mask = 1u << ix86_tune;
3521 for (i = 0; i < X86_TUNE_LAST; ++i)
3522 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3523
3524 #ifndef USE_IX86_FRAME_POINTER
3525 #define USE_IX86_FRAME_POINTER 0
3526 #endif
3527
3528 #ifndef USE_X86_64_FRAME_POINTER
3529 #define USE_X86_64_FRAME_POINTER 0
3530 #endif
3531
3532 /* Set the default values for switches whose default depends on TARGET_64BIT
3533 in case they weren't overwritten by command line options. */
3534 if (TARGET_64BIT)
3535 {
3536 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3537 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3538 if (flag_asynchronous_unwind_tables == 2)
3539 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3540 if (flag_pcc_struct_return == 2)
3541 flag_pcc_struct_return = 0;
3542 }
3543 else
3544 {
3545 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3546 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3547 if (flag_asynchronous_unwind_tables == 2)
3548 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3549 if (flag_pcc_struct_return == 2)
3550 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3551 }
3552
3553 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3554 if (optimize_size)
3555 ix86_cost = &ix86_size_cost;
3556 else
3557 ix86_cost = ix86_tune_cost;
3558
3559 /* Arrange to set up i386_stack_locals for all functions. */
3560 init_machine_status = ix86_init_machine_status;
3561
3562 /* Validate -mregparm= value. */
3563 if (global_options_set.x_ix86_regparm)
3564 {
3565 if (TARGET_64BIT)
3566 warning (0, "-mregparm is ignored in 64-bit mode");
3567 if (ix86_regparm > REGPARM_MAX)
3568 {
3569 error ("-mregparm=%d is not between 0 and %d",
3570 ix86_regparm, REGPARM_MAX);
3571 ix86_regparm = 0;
3572 }
3573 }
3574 if (TARGET_64BIT)
3575 ix86_regparm = REGPARM_MAX;
3576
3577 /* Default align_* from the processor table. */
3578 if (align_loops == 0)
3579 {
3580 align_loops = processor_target_table[ix86_tune].align_loop;
3581 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3582 }
3583 if (align_jumps == 0)
3584 {
3585 align_jumps = processor_target_table[ix86_tune].align_jump;
3586 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3587 }
3588 if (align_functions == 0)
3589 {
3590 align_functions = processor_target_table[ix86_tune].align_func;
3591 }
3592
3593 /* Provide default for -mbranch-cost= value. */
3594 if (!global_options_set.x_ix86_branch_cost)
3595 ix86_branch_cost = ix86_cost->branch_cost;
3596
3597 if (TARGET_64BIT)
3598 {
3599 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3600
3601 /* Enable by default the SSE and MMX builtins. Do allow the user to
3602 explicitly disable any of these. In particular, disabling SSE and
3603 MMX for kernel code is extremely useful. */
3604 if (!ix86_arch_specified)
3605 ix86_isa_flags
3606 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3607 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3608
3609 if (TARGET_RTD)
3610 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3611 }
3612 else
3613 {
3614 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3615
3616 if (!ix86_arch_specified)
3617 ix86_isa_flags
3618 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3619
3620       /* The i386 ABI does not specify a red zone.  It still makes sense to use it
3621 	 when the programmer takes care to keep the stack from being destroyed.  */
3622 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3623 target_flags |= MASK_NO_RED_ZONE;
3624 }
3625
3626 /* Keep nonleaf frame pointers. */
3627 if (flag_omit_frame_pointer)
3628 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3629 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3630 flag_omit_frame_pointer = 1;
3631
3632 /* If we're doing fast math, we don't care about comparison order
3633 wrt NaNs. This lets us use a shorter comparison sequence. */
3634 if (flag_finite_math_only)
3635 target_flags &= ~MASK_IEEE_FP;
3636
3637 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3638 since the insns won't need emulation. */
3639 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3640 target_flags &= ~MASK_NO_FANCY_MATH_387;
3641
3642 /* Likewise, if the target doesn't have a 387, or we've specified
3643 software floating point, don't use 387 inline intrinsics. */
3644 if (!TARGET_80387)
3645 target_flags |= MASK_NO_FANCY_MATH_387;
3646
3647 /* Turn on MMX builtins for -msse. */
3648 if (TARGET_SSE)
3649 {
3650 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3651 x86_prefetch_sse = true;
3652 }
3653
3654 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3655 if (TARGET_SSE4_2 || TARGET_ABM)
3656 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3657
3658 /* Turn on lzcnt instruction for -mabm. */
3659 if (TARGET_ABM)
3660 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3661
3662 /* Validate -mpreferred-stack-boundary= value or default it to
3663 PREFERRED_STACK_BOUNDARY_DEFAULT. */
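   /* The argument is the log2 of the boundary in bytes; e.g.
      -mpreferred-stack-boundary=4 requests (1 << 4) * BITS_PER_UNIT
      = 128-bit (16-byte) stack alignment.  */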
3664 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3665 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3666 {
3667 int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
3668 int max = (TARGET_SEH ? 4 : 12);
3669
3670 if (ix86_preferred_stack_boundary_arg < min
3671 || ix86_preferred_stack_boundary_arg > max)
3672 {
3673 if (min == max)
3674 error ("-mpreferred-stack-boundary is not supported "
3675 "for this target");
3676 else
3677 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3678 ix86_preferred_stack_boundary_arg, min, max);
3679 }
3680 else
3681 ix86_preferred_stack_boundary
3682 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3683 }
3684
3685 /* Set the default value for -mstackrealign. */
3686 if (ix86_force_align_arg_pointer == -1)
3687 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3688
3689 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3690
3691 /* Validate -mincoming-stack-boundary= value or default it to
3692 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3693 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3694 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3695 {
3696 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3697 || ix86_incoming_stack_boundary_arg > 12)
3698 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3699 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3700 else
3701 {
3702 ix86_user_incoming_stack_boundary
3703 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3704 ix86_incoming_stack_boundary
3705 = ix86_user_incoming_stack_boundary;
3706 }
3707 }
3708
3709 /* Accept -msseregparm only if at least SSE support is enabled. */
3710 if (TARGET_SSEREGPARM
3711 && ! TARGET_SSE)
3712 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3713
3714 if (global_options_set.x_ix86_fpmath)
3715 {
3716 if (ix86_fpmath & FPMATH_SSE)
3717 {
3718 if (!TARGET_SSE)
3719 {
3720 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3721 ix86_fpmath = FPMATH_387;
3722 }
3723 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3724 {
3725 warning (0, "387 instruction set disabled, using SSE arithmetics");
3726 ix86_fpmath = FPMATH_SSE;
3727 }
3728 }
3729 }
3730 else
3731 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3732
3733 /* If the i387 is disabled, then do not return values in it. */
3734 if (!TARGET_80387)
3735 target_flags &= ~MASK_FLOAT_RETURNS;
3736
3737   /* Use an external vectorized library when vectorizing intrinsics.  */
3738 if (global_options_set.x_ix86_veclibabi_type)
3739 switch (ix86_veclibabi_type)
3740 {
3741 case ix86_veclibabi_type_svml:
3742 ix86_veclib_handler = ix86_veclibabi_svml;
3743 break;
3744
3745 case ix86_veclibabi_type_acml:
3746 ix86_veclib_handler = ix86_veclibabi_acml;
3747 break;
3748
3749 default:
3750 gcc_unreachable ();
3751 }
3752
3753 if ((!USE_IX86_FRAME_POINTER
3754 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3755 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3756 && !optimize_size)
3757 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3758
3759 /* ??? Unwind info is not correct around the CFG unless either a frame
3760 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3761 unwind info generation to be aware of the CFG and propagating states
3762 around edges. */
3763 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3764 || flag_exceptions || flag_non_call_exceptions)
3765 && flag_omit_frame_pointer
3766 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3767 {
3768 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3769 warning (0, "unwind tables currently require either a frame pointer "
3770 "or %saccumulate-outgoing-args%s for correctness",
3771 prefix, suffix);
3772 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3773 }
3774
3775 /* If stack probes are required, the space used for large function
3776 arguments on the stack must also be probed, so enable
3777 -maccumulate-outgoing-args so this happens in the prologue. */
3778 if (TARGET_STACK_PROBE
3779 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3780 {
3781 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3782 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3783 "for correctness", prefix, suffix);
3784 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3785 }
3786
3787 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3788 {
3789 char *p;
3790 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3791 p = strchr (internal_label_prefix, 'X');
3792 internal_label_prefix_len = p - internal_label_prefix;
3793 *p = '\0';
3794 }
3795
3796   /* When the scheduling description is not available, disable the scheduler
3797      pass so it won't slow down the compilation and make x87 code slower.  */
3798 if (!TARGET_SCHEDULE)
3799 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3800
3801 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3802 ix86_tune_cost->simultaneous_prefetches,
3803 global_options.x_param_values,
3804 global_options_set.x_param_values);
3805 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3806 ix86_tune_cost->prefetch_block,
3807 global_options.x_param_values,
3808 global_options_set.x_param_values);
3809 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3810 ix86_tune_cost->l1_cache_size,
3811 global_options.x_param_values,
3812 global_options_set.x_param_values);
3813 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3814 ix86_tune_cost->l2_cache_size,
3815 global_options.x_param_values,
3816 global_options_set.x_param_values);
3817
3818   /* Enable software prefetching at -O3 for CPUs where prefetching is helpful.  */
3819 if (flag_prefetch_loop_arrays < 0
3820 && HAVE_prefetch
3821 && optimize >= 3
3822 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3823 flag_prefetch_loop_arrays = 1;
3824
3825 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3826 can be optimized to ap = __builtin_next_arg (0). */
3827 if (!TARGET_64BIT && !flag_split_stack)
3828 targetm.expand_builtin_va_start = NULL;
3829
3830 if (TARGET_64BIT)
3831 {
3832 ix86_gen_leave = gen_leave_rex64;
3833 if (Pmode == DImode)
3834 {
3835 ix86_gen_monitor = gen_sse3_monitor64_di;
3836 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3837 ix86_gen_tls_local_dynamic_base_64
3838 = gen_tls_local_dynamic_base_64_di;
3839 }
3840 else
3841 {
3842 ix86_gen_monitor = gen_sse3_monitor64_si;
3843 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3844 ix86_gen_tls_local_dynamic_base_64
3845 = gen_tls_local_dynamic_base_64_si;
3846 }
3847 }
3848 else
3849 {
3850 ix86_gen_leave = gen_leave;
3851 ix86_gen_monitor = gen_sse3_monitor;
3852 }
3853
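   /* The expanders below are selected on Pmode rather than TARGET_64BIT
      because for x32 TARGET_64BIT is true while pointers, and hence Pmode,
      are only 32 bits wide (SImode).  */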
3854 if (Pmode == DImode)
3855 {
3856 ix86_gen_add3 = gen_adddi3;
3857 ix86_gen_sub3 = gen_subdi3;
3858 ix86_gen_sub3_carry = gen_subdi3_carry;
3859 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3860 ix86_gen_andsp = gen_anddi3;
3861 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3862 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3863 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3864 }
3865 else
3866 {
3867 ix86_gen_add3 = gen_addsi3;
3868 ix86_gen_sub3 = gen_subsi3;
3869 ix86_gen_sub3_carry = gen_subsi3_carry;
3870 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3871 ix86_gen_andsp = gen_andsi3;
3872 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3873 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3874 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3875 }
3876
3877 #ifdef USE_IX86_CLD
3878 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3879 if (!TARGET_64BIT)
3880 target_flags |= MASK_CLD & ~target_flags_explicit;
3881 #endif
3882
3883 if (!TARGET_64BIT && flag_pic)
3884 {
3885 if (flag_fentry > 0)
3886 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3887 "with -fpic");
3888 flag_fentry = 0;
3889 }
3890 else if (TARGET_SEH)
3891 {
3892 if (flag_fentry == 0)
3893 sorry ("-mno-fentry isn%'t compatible with SEH");
3894 flag_fentry = 1;
3895 }
3896 else if (flag_fentry < 0)
3897 {
3898 #if defined(PROFILE_BEFORE_PROLOGUE)
3899 flag_fentry = 1;
3900 #else
3901 flag_fentry = 0;
3902 #endif
3903 }
3904
3905 if (TARGET_AVX)
3906 {
3907       /* When not optimizing for size, enable the vzeroupper optimization for
3908 	 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3909 	 AVX unaligned loads/stores.  */
3910 if (!optimize_size)
3911 {
3912 if (flag_expensive_optimizations
3913 && !(target_flags_explicit & MASK_VZEROUPPER))
3914 target_flags |= MASK_VZEROUPPER;
3915 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3916 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3917 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3918 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3919 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3920 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3921 /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */
3922 if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
3923 target_flags |= MASK_PREFER_AVX128;
3924 }
3925 }
3926 else
3927 {
3928 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3929 target_flags &= ~MASK_VZEROUPPER;
3930 }
3931
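   /* ix86_recip_name holds the argument of -mrecip=: a comma-separated
      list of the names in recip_options above, each optionally prefixed
      with '!' to clear it, plus the special name "default"; e.g.
      "-mrecip=all,!sqrt" enables every RECIP optimization except the
      scalar square root.  */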
3932 if (ix86_recip_name)
3933 {
3934 char *p = ASTRDUP (ix86_recip_name);
3935 char *q;
3936 unsigned int mask, i;
3937 bool invert;
3938
3939 while ((q = strtok (p, ",")) != NULL)
3940 {
3941 p = NULL;
3942 if (*q == '!')
3943 {
3944 invert = true;
3945 q++;
3946 }
3947 else
3948 invert = false;
3949
3950 if (!strcmp (q, "default"))
3951 mask = RECIP_MASK_ALL;
3952 else
3953 {
3954 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3955 if (!strcmp (q, recip_options[i].string))
3956 {
3957 mask = recip_options[i].mask;
3958 break;
3959 }
3960
3961 if (i == ARRAY_SIZE (recip_options))
3962 {
3963 error ("unknown option for -mrecip=%s", q);
3964 invert = false;
3965 mask = RECIP_MASK_NONE;
3966 }
3967 }
3968
3969 recip_mask_explicit |= mask;
3970 if (invert)
3971 recip_mask &= ~mask;
3972 else
3973 recip_mask |= mask;
3974 }
3975 }
3976
3977 if (TARGET_RECIP)
3978 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3979 else if (target_flags_explicit & MASK_RECIP)
3980 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
3981
3982   /* Save the initial options in case the user uses function-specific
3983      options.  */
3984 if (main_args_p)
3985 target_option_default_node = target_option_current_node
3986 = build_target_option_node ();
3987 }
3988
3989 /* Return TRUE if VAL is passed in a register in a 256-bit AVX mode.  */
3990
3991 static bool
3992 function_pass_avx256_p (const_rtx val)
3993 {
3994 if (!val)
3995 return false;
3996
3997 if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
3998 return true;
3999
4000 if (GET_CODE (val) == PARALLEL)
4001 {
4002 int i;
4003 rtx r;
4004
4005 for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
4006 {
4007 r = XVECEXP (val, 0, i);
4008 if (GET_CODE (r) == EXPR_LIST
4009 && XEXP (r, 0)
4010 && REG_P (XEXP (r, 0))
4011 && (GET_MODE (XEXP (r, 0)) == OImode
4012 || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
4013 return true;
4014 }
4015 }
4016
4017 return false;
4018 }
4019
4020 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4021
4022 static void
4023 ix86_option_override (void)
4024 {
4025 ix86_option_override_internal (true);
4026 }
4027
4028 /* Update register usage after having seen the compiler flags. */
4029
4030 static void
4031 ix86_conditional_register_usage (void)
4032 {
4033 int i;
4034 unsigned int j;
4035
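   /* Entries greater than 1 in the FIXED_REGISTERS/CALL_USED_REGISTERS
      tables encode mode-dependent defaults: the loop below resolves 2 to
      "fixed (resp. call-used) only in 32-bit mode" and 3 to "only in
      64-bit mode".  */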
4036 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4037 {
4038 if (fixed_regs[i] > 1)
4039 fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2));
4040 if (call_used_regs[i] > 1)
4041 call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2));
4042 }
4043
4044 /* The PIC register, if it exists, is fixed. */
4045 j = PIC_OFFSET_TABLE_REGNUM;
4046 if (j != INVALID_REGNUM)
4047 fixed_regs[j] = call_used_regs[j] = 1;
4048
4049 /* The 64-bit MS_ABI changes the set of call-used registers. */
4050 if (TARGET_64BIT_MS_ABI)
4051 {
4052 call_used_regs[SI_REG] = 0;
4053 call_used_regs[DI_REG] = 0;
4054 call_used_regs[XMM6_REG] = 0;
4055 call_used_regs[XMM7_REG] = 0;
4056 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4057 call_used_regs[i] = 0;
4058 }
4059
4060 /* The default setting of CLOBBERED_REGS is for 32-bit; add in the
4061 other call-clobbered regs for 64-bit. */
4062 if (TARGET_64BIT)
4063 {
4064 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4065
4066 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4067 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4068 && call_used_regs[i])
4069 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4070 }
4071
4072 /* If MMX is disabled, squash the registers. */
4073 if (! TARGET_MMX)
4074 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4075 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4076 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4077
4078 /* If SSE is disabled, squash the registers. */
4079 if (! TARGET_SSE)
4080 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4081 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4082 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4083
4084 /* If the FPU is disabled, squash the registers. */
4085 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4086 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4087 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4088 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4089
4090 /* If 32-bit, squash the 64-bit registers. */
4091 if (! TARGET_64BIT)
4092 {
4093 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4094 reg_names[i] = "";
4095 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4096 reg_names[i] = "";
4097 }
4098 }
4099
4100 \f
4101 /* Save the current options */
4102
4103 static void
4104 ix86_function_specific_save (struct cl_target_option *ptr)
4105 {
4106 ptr->arch = ix86_arch;
4107 ptr->schedule = ix86_schedule;
4108 ptr->tune = ix86_tune;
4109 ptr->branch_cost = ix86_branch_cost;
4110 ptr->tune_defaulted = ix86_tune_defaulted;
4111 ptr->arch_specified = ix86_arch_specified;
4112 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4113 ptr->ix86_target_flags_explicit = target_flags_explicit;
4114 ptr->x_recip_mask_explicit = recip_mask_explicit;
4115
4116 /* The fields are char but the variables are not; make sure the
4117 values fit in the fields. */
4118 gcc_assert (ptr->arch == ix86_arch);
4119 gcc_assert (ptr->schedule == ix86_schedule);
4120 gcc_assert (ptr->tune == ix86_tune);
4121 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4122 }
4123
4124 /* Restore the current options */
4125
4126 static void
4127 ix86_function_specific_restore (struct cl_target_option *ptr)
4128 {
4129 enum processor_type old_tune = ix86_tune;
4130 enum processor_type old_arch = ix86_arch;
4131 unsigned int ix86_arch_mask, ix86_tune_mask;
4132 int i;
4133
4134 ix86_arch = (enum processor_type) ptr->arch;
4135 ix86_schedule = (enum attr_cpu) ptr->schedule;
4136 ix86_tune = (enum processor_type) ptr->tune;
4137 ix86_branch_cost = ptr->branch_cost;
4138 ix86_tune_defaulted = ptr->tune_defaulted;
4139 ix86_arch_specified = ptr->arch_specified;
4140 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4141 target_flags_explicit = ptr->ix86_target_flags_explicit;
4142 recip_mask_explicit = ptr->x_recip_mask_explicit;
4143
4144 /* Recreate the arch feature tests if the arch changed */
4145 if (old_arch != ix86_arch)
4146 {
4147 ix86_arch_mask = 1u << ix86_arch;
4148 for (i = 0; i < X86_ARCH_LAST; ++i)
4149 ix86_arch_features[i]
4150 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4151 }
4152
4153 /* Recreate the tune optimization tests */
4154 if (old_tune != ix86_tune)
4155 {
4156 ix86_tune_mask = 1u << ix86_tune;
4157 for (i = 0; i < X86_TUNE_LAST; ++i)
4158 ix86_tune_features[i]
4159 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4160 }
4161 }
4162
4163 /* Print the current options */
4164
4165 static void
4166 ix86_function_specific_print (FILE *file, int indent,
4167 struct cl_target_option *ptr)
4168 {
4169 char *target_string
4170 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4171 NULL, NULL, ptr->x_ix86_fpmath, false);
4172
4173 fprintf (file, "%*sarch = %d (%s)\n",
4174 indent, "",
4175 ptr->arch,
4176 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4177 ? cpu_names[ptr->arch]
4178 : "<unknown>"));
4179
4180 fprintf (file, "%*stune = %d (%s)\n",
4181 indent, "",
4182 ptr->tune,
4183 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4184 ? cpu_names[ptr->tune]
4185 : "<unknown>"));
4186
4187 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4188
4189 if (target_string)
4190 {
4191 fprintf (file, "%*s%s\n", indent, "", target_string);
4192 free (target_string);
4193 }
4194 }
4195
4196 \f
4197 /* Inner function to process the attribute((target(...))), takes an argument and
4198 set the current options from the argument. If we have a list, recursively go
4199 over the list. */
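/* For example, attribute((target("sse4.2,no-fma,arch=core2"))) turns on
   SSE4.2, turns off FMA and switches the arch to core2 for the annotated
   function; any of the names in the attrs table below can be combined
   this way.  */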
4200
4201 static bool
4202 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4203 struct gcc_options *enum_opts_set)
4204 {
4205 char *next_optstr;
4206 bool ret = true;
4207
4208 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4209 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4210 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4211 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4212 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4213
4214 enum ix86_opt_type
4215 {
4216 ix86_opt_unknown,
4217 ix86_opt_yes,
4218 ix86_opt_no,
4219 ix86_opt_str,
4220 ix86_opt_enum,
4221 ix86_opt_isa
4222 };
4223
4224 static const struct
4225 {
4226 const char *string;
4227 size_t len;
4228 enum ix86_opt_type type;
4229 int opt;
4230 int mask;
4231 } attrs[] = {
4232 /* isa options */
4233 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4234 IX86_ATTR_ISA ("abm", OPT_mabm),
4235 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4236 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4237 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4238 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4239 IX86_ATTR_ISA ("aes", OPT_maes),
4240 IX86_ATTR_ISA ("avx", OPT_mavx),
4241 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4242 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4243 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4244 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4245 IX86_ATTR_ISA ("sse", OPT_msse),
4246 IX86_ATTR_ISA ("sse2", OPT_msse2),
4247 IX86_ATTR_ISA ("sse3", OPT_msse3),
4248 IX86_ATTR_ISA ("sse4", OPT_msse4),
4249 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4250 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4251 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4252 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4253 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4254 IX86_ATTR_ISA ("fma", OPT_mfma),
4255 IX86_ATTR_ISA ("xop", OPT_mxop),
4256 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4257 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4258 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4259 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4260 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4261 IX86_ATTR_ISA ("hle", OPT_mhle),
4262
4263 /* enum options */
4264 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4265
4266 /* string options */
4267 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4268 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4269
4270 /* flag options */
4271 IX86_ATTR_YES ("cld",
4272 OPT_mcld,
4273 MASK_CLD),
4274
4275 IX86_ATTR_NO ("fancy-math-387",
4276 OPT_mfancy_math_387,
4277 MASK_NO_FANCY_MATH_387),
4278
4279 IX86_ATTR_YES ("ieee-fp",
4280 OPT_mieee_fp,
4281 MASK_IEEE_FP),
4282
4283 IX86_ATTR_YES ("inline-all-stringops",
4284 OPT_minline_all_stringops,
4285 MASK_INLINE_ALL_STRINGOPS),
4286
4287 IX86_ATTR_YES ("inline-stringops-dynamically",
4288 OPT_minline_stringops_dynamically,
4289 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4290
4291 IX86_ATTR_NO ("align-stringops",
4292 OPT_mno_align_stringops,
4293 MASK_NO_ALIGN_STRINGOPS),
4294
4295 IX86_ATTR_YES ("recip",
4296 OPT_mrecip,
4297 MASK_RECIP),
4298
4299 };
4300
4301 /* If this is a list, recurse to get the options. */
4302 if (TREE_CODE (args) == TREE_LIST)
4303 {
4304 bool ret = true;
4305
4306 for (; args; args = TREE_CHAIN (args))
4307 if (TREE_VALUE (args)
4308 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4309 p_strings, enum_opts_set))
4310 ret = false;
4311
4312 return ret;
4313 }
4314
4315 else if (TREE_CODE (args) != STRING_CST)
4316 gcc_unreachable ();
4317
4318 /* Handle multiple arguments separated by commas. */
4319 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4320
4321 while (next_optstr && *next_optstr != '\0')
4322 {
4323 char *p = next_optstr;
4324 char *orig_p = p;
4325 char *comma = strchr (next_optstr, ',');
4326 const char *opt_string;
4327 size_t len, opt_len;
4328 int opt;
4329 bool opt_set_p;
4330 char ch;
4331 unsigned i;
4332 enum ix86_opt_type type = ix86_opt_unknown;
4333 int mask = 0;
4334
4335 if (comma)
4336 {
4337 *comma = '\0';
4338 len = comma - next_optstr;
4339 next_optstr = comma + 1;
4340 }
4341 else
4342 {
4343 len = strlen (p);
4344 next_optstr = NULL;
4345 }
4346
4347 /* Recognize no-xxx. */
4348 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4349 {
4350 opt_set_p = false;
4351 p += 3;
4352 len -= 3;
4353 }
4354 else
4355 opt_set_p = true;
4356
4357 /* Find the option. */
4358 ch = *p;
4359 opt = N_OPTS;
4360 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4361 {
4362 type = attrs[i].type;
4363 opt_len = attrs[i].len;
4364 if (ch == attrs[i].string[0]
4365 && ((type != ix86_opt_str && type != ix86_opt_enum)
4366 ? len == opt_len
4367 : len > opt_len)
4368 && memcmp (p, attrs[i].string, opt_len) == 0)
4369 {
4370 opt = attrs[i].opt;
4371 mask = attrs[i].mask;
4372 opt_string = attrs[i].string;
4373 break;
4374 }
4375 }
4376
4377 /* Process the option. */
4378 if (opt == N_OPTS)
4379 {
4380 error ("attribute(target(\"%s\")) is unknown", orig_p);
4381 ret = false;
4382 }
4383
4384 else if (type == ix86_opt_isa)
4385 {
4386 struct cl_decoded_option decoded;
4387
4388 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4389 ix86_handle_option (&global_options, &global_options_set,
4390 &decoded, input_location);
4391 }
4392
4393 else if (type == ix86_opt_yes || type == ix86_opt_no)
4394 {
4395 if (type == ix86_opt_no)
4396 opt_set_p = !opt_set_p;
4397
4398 if (opt_set_p)
4399 target_flags |= mask;
4400 else
4401 target_flags &= ~mask;
4402 }
4403
4404 else if (type == ix86_opt_str)
4405 {
4406 if (p_strings[opt])
4407 {
4408 error ("option(\"%s\") was already specified", opt_string);
4409 ret = false;
4410 }
4411 else
4412 p_strings[opt] = xstrdup (p + opt_len);
4413 }
4414
4415 else if (type == ix86_opt_enum)
4416 {
4417 bool arg_ok;
4418 int value;
4419
4420 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4421 if (arg_ok)
4422 set_option (&global_options, enum_opts_set, opt, value,
4423 p + opt_len, DK_UNSPECIFIED, input_location,
4424 global_dc);
4425 else
4426 {
4427 error ("attribute(target(\"%s\")) is unknown", orig_p);
4428 ret = false;
4429 }
4430 }
4431
4432 else
4433 gcc_unreachable ();
4434 }
4435
4436 return ret;
4437 }
4438
4439 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4440
4441 tree
4442 ix86_valid_target_attribute_tree (tree args)
4443 {
4444 const char *orig_arch_string = ix86_arch_string;
4445 const char *orig_tune_string = ix86_tune_string;
4446 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4447 int orig_tune_defaulted = ix86_tune_defaulted;
4448 int orig_arch_specified = ix86_arch_specified;
4449 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4450 tree t = NULL_TREE;
4451 int i;
4452 struct cl_target_option *def
4453 = TREE_TARGET_OPTION (target_option_default_node);
4454 struct gcc_options enum_opts_set;
4455
4456 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4457
4458 /* Process each of the options on the chain. */
4459 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4460 &enum_opts_set))
4461 return NULL_TREE;
4462
4463 /* If the changed options are different from the default, rerun
4464 ix86_option_override_internal, and then save the options away.
4465      The string options are attribute options, and will be undone
4466 when we copy the save structure. */
4467 if (ix86_isa_flags != def->x_ix86_isa_flags
4468 || target_flags != def->x_target_flags
4469 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4470 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4471 || enum_opts_set.x_ix86_fpmath)
4472 {
4473 /* If we are using the default tune= or arch=, undo the string assigned,
4474 and use the default. */
4475 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4476 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4477 else if (!orig_arch_specified)
4478 ix86_arch_string = NULL;
4479
4480 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4481 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4482 else if (orig_tune_defaulted)
4483 ix86_tune_string = NULL;
4484
4485 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4486 if (enum_opts_set.x_ix86_fpmath)
4487 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4488 else if (!TARGET_64BIT && TARGET_SSE)
4489 {
4490 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4491 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4492 }
4493
4494 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4495 ix86_option_override_internal (false);
4496
4497 /* Add any builtin functions with the new isa if any. */
4498 ix86_add_new_builtins (ix86_isa_flags);
4499
4500 /* Save the current options unless we are validating options for
4501 #pragma. */
4502 t = build_target_option_node ();
4503
4504 ix86_arch_string = orig_arch_string;
4505 ix86_tune_string = orig_tune_string;
4506 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4507
4508 /* Free up memory allocated to hold the strings */
4509 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4510 free (option_strings[i]);
4511 }
4512
4513 return t;
4514 }
4515
4516 /* Hook to validate attribute((target("string"))). */
4517
4518 static bool
4519 ix86_valid_target_attribute_p (tree fndecl,
4520 tree ARG_UNUSED (name),
4521 tree args,
4522 int ARG_UNUSED (flags))
4523 {
4524 struct cl_target_option cur_target;
4525 bool ret = true;
4526 tree old_optimize = build_optimization_node ();
4527 tree new_target, new_optimize;
4528 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4529
4530 /* If the function changed the optimization levels as well as setting target
4531 options, start with the optimizations specified. */
4532 if (func_optimize && func_optimize != old_optimize)
4533 cl_optimization_restore (&global_options,
4534 TREE_OPTIMIZATION (func_optimize));
4535
4536 /* The target attributes may also change some optimization flags, so update
4537 the optimization options if necessary. */
4538 cl_target_option_save (&cur_target, &global_options);
4539 new_target = ix86_valid_target_attribute_tree (args);
4540 new_optimize = build_optimization_node ();
4541
4542 if (!new_target)
4543 ret = false;
4544
4545 else if (fndecl)
4546 {
4547 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4548
4549 if (old_optimize != new_optimize)
4550 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4551 }
4552
4553 cl_target_option_restore (&global_options, &cur_target);
4554
4555 if (old_optimize != new_optimize)
4556 cl_optimization_restore (&global_options,
4557 TREE_OPTIMIZATION (old_optimize));
4558
4559 return ret;
4560 }
4561
4562 \f
4563 /* Hook to determine if one function can safely inline another. */
4564
4565 static bool
4566 ix86_can_inline_p (tree caller, tree callee)
4567 {
4568 bool ret = false;
4569 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4570 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4571
4572 /* If callee has no option attributes, then it is ok to inline. */
4573 if (!callee_tree)
4574 ret = true;
4575
4576 /* If caller has no option attributes, but callee does then it is not ok to
4577 inline. */
4578 else if (!caller_tree)
4579 ret = false;
4580
4581 else
4582 {
4583 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4584 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4585
4586       /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
4587 	 function can inline an SSE2 function but an SSE2 function can't inline
4588 	 an SSE4 function.  */
4589 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4590 != callee_opts->x_ix86_isa_flags)
4591 ret = false;
4592
4593 /* See if we have the same non-isa options. */
4594 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4595 ret = false;
4596
4597 /* See if arch, tune, etc. are the same. */
4598 else if (caller_opts->arch != callee_opts->arch)
4599 ret = false;
4600
4601 else if (caller_opts->tune != callee_opts->tune)
4602 ret = false;
4603
4604 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4605 ret = false;
4606
4607 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4608 ret = false;
4609
4610 else
4611 ret = true;
4612 }
4613
4614 return ret;
4615 }
4616
4617 \f
4618 /* Remember the last target of ix86_set_current_function. */
4619 static GTY(()) tree ix86_previous_fndecl;
4620
4621 /* Establish appropriate back-end context for processing the function
4622 FNDECL. The argument might be NULL to indicate processing at top
4623 level, outside of any function scope. */
4624 static void
4625 ix86_set_current_function (tree fndecl)
4626 {
4627 /* Only change the context if the function changes. This hook is called
4628 several times in the course of compiling a function, and we don't want to
4629 slow things down too much or call target_reinit when it isn't safe. */
4630 if (fndecl && fndecl != ix86_previous_fndecl)
4631 {
4632 tree old_tree = (ix86_previous_fndecl
4633 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4634 : NULL_TREE);
4635
4636 tree new_tree = (fndecl
4637 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4638 : NULL_TREE);
4639
4640 ix86_previous_fndecl = fndecl;
4641 if (old_tree == new_tree)
4642 ;
4643
4644 else if (new_tree)
4645 {
4646 cl_target_option_restore (&global_options,
4647 TREE_TARGET_OPTION (new_tree));
4648 target_reinit ();
4649 }
4650
4651 else if (old_tree)
4652 {
4653 struct cl_target_option *def
4654 = TREE_TARGET_OPTION (target_option_current_node);
4655
4656 cl_target_option_restore (&global_options, def);
4657 target_reinit ();
4658 }
4659 }
4660 }
4661
4662 \f
4663 /* Return true if this goes in large data/bss. */
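/* Only the medium code models place objects in the large data sections;
   the size cutoff is ix86_section_threshold, which is controlled by
   -mlarge-data-threshold=.  */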
4664
4665 static bool
4666 ix86_in_large_data_p (tree exp)
4667 {
4668 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4669 return false;
4670
4671 /* Functions are never large data. */
4672 if (TREE_CODE (exp) == FUNCTION_DECL)
4673 return false;
4674
4675 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4676 {
4677 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4678 if (strcmp (section, ".ldata") == 0
4679 || strcmp (section, ".lbss") == 0)
4680 return true;
4681 return false;
4682 }
4683 else
4684 {
4685 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4686
4687 /* If this is an incomplete type with size 0, then we can't put it
4688 in data because it might be too big when completed. */
4689 if (!size || size > ix86_section_threshold)
4690 return true;
4691 }
4692
4693 return false;
4694 }
4695
4696 /* Switch to the appropriate section for output of DECL.
4697 DECL is either a `VAR_DECL' node or a constant of some sort.
4698 RELOC indicates whether forming the initial value of DECL requires
4699 link-time relocations. */
4700
4701 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4702 ATTRIBUTE_UNUSED;
4703
4704 static section *
4705 x86_64_elf_select_section (tree decl, int reloc,
4706 unsigned HOST_WIDE_INT align)
4707 {
4708 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4709 && ix86_in_large_data_p (decl))
4710 {
4711 const char *sname = NULL;
4712 unsigned int flags = SECTION_WRITE;
4713 switch (categorize_decl_for_section (decl, reloc))
4714 {
4715 case SECCAT_DATA:
4716 sname = ".ldata";
4717 break;
4718 case SECCAT_DATA_REL:
4719 sname = ".ldata.rel";
4720 break;
4721 case SECCAT_DATA_REL_LOCAL:
4722 sname = ".ldata.rel.local";
4723 break;
4724 case SECCAT_DATA_REL_RO:
4725 sname = ".ldata.rel.ro";
4726 break;
4727 case SECCAT_DATA_REL_RO_LOCAL:
4728 sname = ".ldata.rel.ro.local";
4729 break;
4730 case SECCAT_BSS:
4731 sname = ".lbss";
4732 flags |= SECTION_BSS;
4733 break;
4734 case SECCAT_RODATA:
4735 case SECCAT_RODATA_MERGE_STR:
4736 case SECCAT_RODATA_MERGE_STR_INIT:
4737 case SECCAT_RODATA_MERGE_CONST:
4738 sname = ".lrodata";
4739 flags = 0;
4740 break;
4741 case SECCAT_SRODATA:
4742 case SECCAT_SDATA:
4743 case SECCAT_SBSS:
4744 gcc_unreachable ();
4745 case SECCAT_TEXT:
4746 case SECCAT_TDATA:
4747 case SECCAT_TBSS:
4748 	  /* We don't split these for the medium model.  Place them into
4749 	     default sections and hope for the best.  */
4750 break;
4751 }
4752 if (sname)
4753 {
4754 /* We might get called with string constants, but get_named_section
4755 doesn't like them as they are not DECLs. Also, we need to set
4756 flags in that case. */
4757 if (!DECL_P (decl))
4758 return get_section (sname, flags, NULL);
4759 return get_named_section (decl, sname, reloc);
4760 }
4761 }
4762 return default_elf_select_section (decl, reloc, align);
4763 }
4764
4765 /* Build up a unique section name, expressed as a
4766 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4767 RELOC indicates whether the initial value of EXP requires
4768 link-time relocations. */
4769
4770 static void ATTRIBUTE_UNUSED
4771 x86_64_elf_unique_section (tree decl, int reloc)
4772 {
4773 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4774 && ix86_in_large_data_p (decl))
4775 {
4776 const char *prefix = NULL;
4777 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4778 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4779
4780 switch (categorize_decl_for_section (decl, reloc))
4781 {
4782 case SECCAT_DATA:
4783 case SECCAT_DATA_REL:
4784 case SECCAT_DATA_REL_LOCAL:
4785 case SECCAT_DATA_REL_RO:
4786 case SECCAT_DATA_REL_RO_LOCAL:
4787 prefix = one_only ? ".ld" : ".ldata";
4788 break;
4789 case SECCAT_BSS:
4790 prefix = one_only ? ".lb" : ".lbss";
4791 break;
4792 case SECCAT_RODATA:
4793 case SECCAT_RODATA_MERGE_STR:
4794 case SECCAT_RODATA_MERGE_STR_INIT:
4795 case SECCAT_RODATA_MERGE_CONST:
4796 prefix = one_only ? ".lr" : ".lrodata";
4797 break;
4798 case SECCAT_SRODATA:
4799 case SECCAT_SDATA:
4800 case SECCAT_SBSS:
4801 gcc_unreachable ();
4802 case SECCAT_TEXT:
4803 case SECCAT_TDATA:
4804 case SECCAT_TBSS:
4805 	  /* We don't split these for the medium model.  Place them into
4806 	     default sections and hope for the best.  */
4807 break;
4808 }
4809 if (prefix)
4810 {
4811 const char *name, *linkonce;
4812 char *string;
4813
4814 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4815 name = targetm.strip_name_encoding (name);
4816
4817 /* If we're using one_only, then there needs to be a .gnu.linkonce
4818 prefix to the section name. */
4819 linkonce = one_only ? ".gnu.linkonce" : "";
4820
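 	  /* E.g. a one-only read-only object named "foo" (hypothetical) lands
 	     in ".gnu.linkonce.lr.foo", while an ordinary one goes to
 	     ".lrodata.foo".  */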
4821 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4822
4823 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4824 return;
4825 }
4826 }
4827 default_unique_section (decl, reloc);
4828 }
4829
4830 #ifdef COMMON_ASM_OP
4831 /* This says how to output assembler code to declare an
4832 uninitialized external linkage data object.
4833
4834    For medium model x86-64 we need to use the .largecomm pseudo-op for
4835 large objects. */
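/* E.g. for a hypothetical 1 MiB object "big" aligned to 32 bytes this emits
   ".largecomm	big,1048576,32" rather than the usual COMMON_ASM_OP line.  */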
4836 void
4837 x86_elf_aligned_common (FILE *file,
4838 const char *name, unsigned HOST_WIDE_INT size,
4839 int align)
4840 {
4841 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4842 && size > (unsigned int)ix86_section_threshold)
4843 fputs (".largecomm\t", file);
4844 else
4845 fputs (COMMON_ASM_OP, file);
4846 assemble_name (file, name);
4847 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4848 size, align / BITS_PER_UNIT);
4849 }
4850 #endif
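
/* For example, assuming -mcmodel=medium and an -mlarge-data-threshold
   below the object size, a common symbol "buf" of 1048576 bytes aligned
   to 32 bytes would be announced roughly as

       .largecomm  buf,1048576,32

   while smaller objects keep using the ordinary COMMON_ASM_OP.  */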
4851
4852 /* Utility function for targets to use in implementing
4853 ASM_OUTPUT_ALIGNED_BSS. */
4854
4855 void
4856 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4857 const char *name, unsigned HOST_WIDE_INT size,
4858 int align)
4859 {
4860 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4861 && size > (unsigned int)ix86_section_threshold)
4862 switch_to_section (get_named_section (decl, ".lbss", 0));
4863 else
4864 switch_to_section (bss_section);
4865 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4866 #ifdef ASM_DECLARE_OBJECT_NAME
4867 last_assemble_variable_decl = decl;
4868 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4869 #else
4870 /* Standard thing is just output label for the object. */
4871 ASM_OUTPUT_LABEL (file, name);
4872 #endif /* ASM_DECLARE_OBJECT_NAME */
4873 ASM_OUTPUT_SKIP (file, size ? size : 1);
4874 }
4875 \f
4876 /* Decide whether we must probe the stack before any space allocation
4877 on this target. It's essentially TARGET_STACK_PROBE except when
4878 -fstack-check causes the stack to be already probed differently. */
4879
4880 bool
4881 ix86_target_stack_probe (void)
4882 {
4883 /* Do not probe the stack twice if static stack checking is enabled. */
4884 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4885 return false;
4886
4887 return TARGET_STACK_PROBE;
4888 }
4889 \f
4890 /* Decide whether we can make a sibling call to a function. DECL is the
4891 declaration of the function being targeted by the call and EXP is the
4892 CALL_EXPR representing the call. */
4893
4894 static bool
4895 ix86_function_ok_for_sibcall (tree decl, tree exp)
4896 {
4897 tree type, decl_or_type;
4898 rtx a, b;
4899
4900 /* If we are generating position-independent code, we cannot sibcall
4901 optimize any indirect call, or a direct call to a global function,
4902 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4903 if (!TARGET_MACHO
4904 && !TARGET_64BIT
4905 && flag_pic
4906 && (!decl || !targetm.binds_local_p (decl)))
4907 return false;
4908
4909 /* If we need to align the outgoing stack, then sibcalling would
4910 unalign the stack, which may break the called function. */
4911 if (ix86_minimum_incoming_stack_boundary (true)
4912 < PREFERRED_STACK_BOUNDARY)
4913 return false;
4914
4915 if (decl)
4916 {
4917 decl_or_type = decl;
4918 type = TREE_TYPE (decl);
4919 }
4920 else
4921 {
4922 /* We're looking at the CALL_EXPR, we need the type of the function. */
4923 type = CALL_EXPR_FN (exp); /* pointer expression */
4924 type = TREE_TYPE (type); /* pointer type */
4925 type = TREE_TYPE (type); /* function type */
4926 decl_or_type = type;
4927 }
4928
4929 /* Check that the return value locations are the same. Like
4930 if we are returning floats on the 80387 register stack, we cannot
4931 make a sibcall from a function that doesn't return a float to a
4932 function that does or, conversely, from a function that does return
4933 a float to a function that doesn't; the necessary stack adjustment
4934 would not be executed. This is also the place we notice
4935 differences in the return value ABI. Note that it is ok for one
4936 of the functions to have void return type as long as the return
4937 value of the other is passed in a register. */
4938 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4939 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4940 cfun->decl, false);
4941 if (STACK_REG_P (a) || STACK_REG_P (b))
4942 {
4943 if (!rtx_equal_p (a, b))
4944 return false;
4945 }
4946 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4947 {
4948 /* Disable sibcall if we need to generate vzeroupper after
4949 callee returns. */
4950 if (TARGET_VZEROUPPER
4951 && cfun->machine->callee_return_avx256_p
4952 && !cfun->machine->caller_return_avx256_p)
4953 return false;
4954 }
4955 else if (!rtx_equal_p (a, b))
4956 return false;
4957
4958 if (TARGET_64BIT)
4959 {
4960 /* The SYSV ABI has more call-clobbered registers;
4961 disallow sibcalls from MS to SYSV. */
4962 if (cfun->machine->call_abi == MS_ABI
4963 && ix86_function_type_abi (type) == SYSV_ABI)
4964 return false;
4965 }
4966 else
4967 {
4968 /* If this call is indirect, we'll need to be able to use a
4969 call-clobbered register for the address of the target function.
4970 Make sure that all such registers are not used for passing
4971 parameters. Note that DLLIMPORT functions are indirect. */
4972 if (!decl
4973 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4974 {
4975 if (ix86_function_regparm (type, NULL) >= 3)
4976 {
4977 /* ??? Need to count the actual number of registers to be used,
4978 not the possible number of registers. Fix later. */
4979 return false;
4980 }
4981 }
4982 }
4983
4984 /* Otherwise okay. That also includes certain types of indirect calls. */
4985 return true;
4986 }
4987
4988 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4989 and "sseregparm" calling convention attributes;
4990 arguments as in struct attribute_spec.handler. */
4991
4992 static tree
4993 ix86_handle_cconv_attribute (tree *node, tree name,
4994 tree args,
4995 int flags ATTRIBUTE_UNUSED,
4996 bool *no_add_attrs)
4997 {
4998 if (TREE_CODE (*node) != FUNCTION_TYPE
4999 && TREE_CODE (*node) != METHOD_TYPE
5000 && TREE_CODE (*node) != FIELD_DECL
5001 && TREE_CODE (*node) != TYPE_DECL)
5002 {
5003 warning (OPT_Wattributes, "%qE attribute only applies to functions",
5004 name);
5005 *no_add_attrs = true;
5006 return NULL_TREE;
5007 }
5008
5009 /* Can combine regparm with all attributes but fastcall and thiscall. */
5010 if (is_attribute_p ("regparm", name))
5011 {
5012 tree cst;
5013
5014 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5015 {
5016 error ("fastcall and regparm attributes are not compatible");
5017 }
5018
5019 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5020 {
5021 error ("regparm and thiscall attributes are not compatible");
5022 }
5023
5024 cst = TREE_VALUE (args);
5025 if (TREE_CODE (cst) != INTEGER_CST)
5026 {
5027 warning (OPT_Wattributes,
5028 "%qE attribute requires an integer constant argument",
5029 name);
5030 *no_add_attrs = true;
5031 }
5032 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
5033 {
5034 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
5035 name, REGPARM_MAX);
5036 *no_add_attrs = true;
5037 }
5038
5039 return NULL_TREE;
5040 }
5041
5042 if (TARGET_64BIT)
5043 {
5044 /* Do not warn when emulating the MS ABI. */
5045 if ((TREE_CODE (*node) != FUNCTION_TYPE
5046 && TREE_CODE (*node) != METHOD_TYPE)
5047 || ix86_function_type_abi (*node) != MS_ABI)
5048 warning (OPT_Wattributes, "%qE attribute ignored",
5049 name);
5050 *no_add_attrs = true;
5051 return NULL_TREE;
5052 }
5053
5054 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
5055 if (is_attribute_p ("fastcall", name))
5056 {
5057 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5058 {
5059 error ("fastcall and cdecl attributes are not compatible");
5060 }
5061 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5062 {
5063 error ("fastcall and stdcall attributes are not compatible");
5064 }
5065 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5066 {
5067 error ("fastcall and regparm attributes are not compatible");
5068 }
5069 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5070 {
5071 error ("fastcall and thiscall attributes are not compatible");
5072 }
5073 }
5074
5075 /* Can combine stdcall with fastcall (redundant), regparm and
5076 sseregparm. */
5077 else if (is_attribute_p ("stdcall", name))
5078 {
5079 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5080 {
5081 error ("stdcall and cdecl attributes are not compatible");
5082 }
5083 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5084 {
5085 error ("stdcall and fastcall attributes are not compatible");
5086 }
5087 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5088 {
5089 error ("stdcall and thiscall attributes are not compatible");
5090 }
5091 }
5092
5093 /* Can combine cdecl with regparm and sseregparm. */
5094 else if (is_attribute_p ("cdecl", name))
5095 {
5096 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5097 {
5098 error ("stdcall and cdecl attributes are not compatible");
5099 }
5100 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5101 {
5102 error ("fastcall and cdecl attributes are not compatible");
5103 }
5104 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5105 {
5106 error ("cdecl and thiscall attributes are not compatible");
5107 }
5108 }
5109 else if (is_attribute_p ("thiscall", name))
5110 {
5111 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5112 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5113 name);
5114 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5115 {
5116 error ("stdcall and thiscall attributes are not compatible");
5117 }
5118 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5119 {
5120 error ("fastcall and thiscall attributes are not compatible");
5121 }
5122 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5123 {
5124 error ("cdecl and thiscall attributes are not compatible");
5125 }
5126 }
5127
5128 /* Can combine sseregparm with all attributes. */
5129
5130 return NULL_TREE;
5131 }
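
/* As a small illustration (hypothetical user code), on a 32-bit target

     void __attribute__ ((fastcall, stdcall)) f (int);

   would trigger one of the "... attributes are not compatible" errors
   above, whereas

     int __attribute__ ((regparm (3))) g (int, int, int);

   is accepted because the argument is an INTEGER_CST no larger than
   REGPARM_MAX.  */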
5132
5133 /* The transactional memory builtins are implicitly regparm or fastcall
5134 depending on the ABI. Override the generic do-nothing attribute that
5135 these builtins were declared with, and replace it with one of the two
5136 attributes that we expect elsewhere. */
5137
5138 static tree
5139 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5140 tree args ATTRIBUTE_UNUSED,
5141 int flags ATTRIBUTE_UNUSED,
5142 bool *no_add_attrs)
5143 {
5144 tree alt;
5145
5146 /* In no case do we want to add the placeholder attribute. */
5147 *no_add_attrs = true;
5148
5149 /* The 64-bit ABI is unchanged for transactional memory. */
5150 if (TARGET_64BIT)
5151 return NULL_TREE;
5152
5153 /* ??? Is there a better way to validate 32-bit windows? We have
5154 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5155 if (CHECK_STACK_LIMIT > 0)
5156 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5157 else
5158 {
5159 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5160 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5161 }
5162 decl_attributes (node, alt, flags);
5163
5164 return NULL_TREE;
5165 }
5166
5167 /* This function determines from TYPE the calling-convention. */
5168
5169 unsigned int
5170 ix86_get_callcvt (const_tree type)
5171 {
5172 unsigned int ret = 0;
5173 bool is_stdarg;
5174 tree attrs;
5175
5176 if (TARGET_64BIT)
5177 return IX86_CALLCVT_CDECL;
5178
5179 attrs = TYPE_ATTRIBUTES (type);
5180 if (attrs != NULL_TREE)
5181 {
5182 if (lookup_attribute ("cdecl", attrs))
5183 ret |= IX86_CALLCVT_CDECL;
5184 else if (lookup_attribute ("stdcall", attrs))
5185 ret |= IX86_CALLCVT_STDCALL;
5186 else if (lookup_attribute ("fastcall", attrs))
5187 ret |= IX86_CALLCVT_FASTCALL;
5188 else if (lookup_attribute ("thiscall", attrs))
5189 ret |= IX86_CALLCVT_THISCALL;
5190
5191 /* Regparm isn't allowed for thiscall and fastcall. */
5192 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5193 {
5194 if (lookup_attribute ("regparm", attrs))
5195 ret |= IX86_CALLCVT_REGPARM;
5196 if (lookup_attribute ("sseregparm", attrs))
5197 ret |= IX86_CALLCVT_SSEREGPARM;
5198 }
5199
5200 if (IX86_BASE_CALLCVT(ret) != 0)
5201 return ret;
5202 }
5203
5204 is_stdarg = stdarg_p (type);
5205 if (TARGET_RTD && !is_stdarg)
5206 return IX86_CALLCVT_STDCALL | ret;
5207
5208 if (ret != 0
5209 || is_stdarg
5210 || TREE_CODE (type) != METHOD_TYPE
5211 || ix86_function_type_abi (type) != MS_ABI)
5212 return IX86_CALLCVT_CDECL | ret;
5213
5214 return IX86_CALLCVT_THISCALL;
5215 }
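
/* Illustrative examples, assuming a 32-bit target: a plain prototype
   such as "int f (int);" yields IX86_CALLCVT_CDECL, unless -mrtd is in
   effect, in which case non-stdarg functions default to
   IX86_CALLCVT_STDCALL; "int __attribute__ ((fastcall)) g (int);"
   yields IX86_CALLCVT_FASTCALL, and the regparm/sseregparm bits are
   only folded in when neither fastcall nor thiscall is present.  */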
5216
5217 /* Return 0 if the attributes for two types are incompatible, 1 if they
5218 are compatible, and 2 if they are nearly compatible (which causes a
5219 warning to be generated). */
5220
5221 static int
5222 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5223 {
5224 unsigned int ccvt1, ccvt2;
5225
5226 if (TREE_CODE (type1) != FUNCTION_TYPE
5227 && TREE_CODE (type1) != METHOD_TYPE)
5228 return 1;
5229
5230 ccvt1 = ix86_get_callcvt (type1);
5231 ccvt2 = ix86_get_callcvt (type2);
5232 if (ccvt1 != ccvt2)
5233 return 0;
5234 if (ix86_function_regparm (type1, NULL)
5235 != ix86_function_regparm (type2, NULL))
5236 return 0;
5237
5238 return 1;
5239 }
5240 \f
5241 /* Return the regparm value for a function with the indicated TYPE and DECL.
5242 DECL may be NULL when calling function indirectly
5243 or considering a libcall. */
5244
5245 static int
5246 ix86_function_regparm (const_tree type, const_tree decl)
5247 {
5248 tree attr;
5249 int regparm;
5250 unsigned int ccvt;
5251
5252 if (TARGET_64BIT)
5253 return (ix86_function_type_abi (type) == SYSV_ABI
5254 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5255 ccvt = ix86_get_callcvt (type);
5256 regparm = ix86_regparm;
5257
5258 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5259 {
5260 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5261 if (attr)
5262 {
5263 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5264 return regparm;
5265 }
5266 }
5267 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5268 return 2;
5269 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5270 return 1;
5271
5272 /* Use register calling convention for local functions when possible. */
5273 if (decl
5274 && TREE_CODE (decl) == FUNCTION_DECL
5275 && optimize
5276 && !(profile_flag && !flag_fentry))
5277 {
5278 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5279 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5280 if (i && i->local && i->can_change_signature)
5281 {
5282 int local_regparm, globals = 0, regno;
5283
5284 /* Make sure no regparm register is taken by a
5285 fixed register variable. */
5286 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5287 if (fixed_regs[local_regparm])
5288 break;
5289
5290 /* We don't want to use regparm(3) for nested functions as
5291 these use a static chain pointer in the third argument. */
5292 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5293 local_regparm = 2;
5294
5295 /* In 32-bit mode save a register for the split stack. */
5296 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5297 local_regparm = 2;
5298
5299 /* Each fixed register usage increases register pressure,
5300 so fewer registers should be used for argument passing.
5301 This functionality can be overridden by an explicit
5302 regparm value. */
5303 for (regno = 0; regno <= DI_REG; regno++)
5304 if (fixed_regs[regno])
5305 globals++;
5306
5307 local_regparm
5308 = globals < local_regparm ? local_regparm - globals : 0;
5309
5310 if (local_regparm > regparm)
5311 regparm = local_regparm;
5312 }
5313 }
5314
5315 return regparm;
5316 }
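
/* A worked example (illustrative): a static function with local
   linkage compiled with optimization, e.g.

     static int sum3 (int a, int b, int c) { return a + b + c; }

   can have its regparm value promoted to REGPARM_MAX here when cgraph
   reports it local and able to change signature, so all three arguments
   would travel in EAX, EDX and ECX; an explicit regparm attribute,
   fastcall (2 registers) and thiscall (1 register) bypass this
   promotion.  */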
5317
5318 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5319 DFmode (2) arguments in SSE registers for a function with the
5320 indicated TYPE and DECL. DECL may be NULL when calling function
5321 indirectly or considering a libcall. Otherwise return 0. */
5322
5323 static int
5324 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5325 {
5326 gcc_assert (!TARGET_64BIT);
5327
5328 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5329 by the sseregparm attribute. */
5330 if (TARGET_SSEREGPARM
5331 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5332 {
5333 if (!TARGET_SSE)
5334 {
5335 if (warn)
5336 {
5337 if (decl)
5338 error ("calling %qD with attribute sseregparm without "
5339 "SSE/SSE2 enabled", decl);
5340 else
5341 error ("calling %qT with attribute sseregparm without "
5342 "SSE/SSE2 enabled", type);
5343 }
5344 return 0;
5345 }
5346
5347 return 2;
5348 }
5349
5350 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5351 (and DFmode for SSE2) arguments in SSE registers. */
5352 if (decl && TARGET_SSE_MATH && optimize
5353 && !(profile_flag && !flag_fentry))
5354 {
5355 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5356 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5357 if (i && i->local && i->can_change_signature)
5358 return TARGET_SSE2 ? 2 : 1;
5359 }
5360
5361 return 0;
5362 }
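
/* For instance (illustrative): compiling

     double __attribute__ ((sseregparm)) f (double x);

   with -mno-sse reaches the error above and yields 0, while with SSE
   enabled the attribute yields 2 (both SFmode and DFmode go in SSE
   registers); local functions built with -mfpmath=sse and optimization
   get 2 with SSE2 and 1 with plain SSE.  */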
5363
5364 /* Return true if EAX is live at the start of the function. Used by
5365 ix86_expand_prologue to determine if we need special help before
5366 calling allocate_stack_worker. */
5367
5368 static bool
5369 ix86_eax_live_at_start_p (void)
5370 {
5371 /* Cheat. Don't bother working forward from ix86_function_regparm
5372 to the function type to whether an actual argument is located in
5373 eax. Instead just look at cfg info, which is still close enough
5374 to correct at this point. This gives false positives for broken
5375 functions that might use uninitialized data that happens to be
5376 allocated in eax, but who cares? */
5377 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5378 }
5379
5380 static bool
5381 ix86_keep_aggregate_return_pointer (tree fntype)
5382 {
5383 tree attr;
5384
5385 if (!TARGET_64BIT)
5386 {
5387 attr = lookup_attribute ("callee_pop_aggregate_return",
5388 TYPE_ATTRIBUTES (fntype));
5389 if (attr)
5390 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5391
5392 /* For 32-bit MS-ABI the default is to keep aggregate
5393 return pointer. */
5394 if (ix86_function_type_abi (fntype) == MS_ABI)
5395 return true;
5396 }
5397 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5398 }
5399
5400 /* Value is the number of bytes of arguments automatically
5401 popped when returning from a subroutine call.
5402 FUNDECL is the declaration node of the function (as a tree),
5403 FUNTYPE is the data type of the function (as a tree),
5404 or for a library call it is an identifier node for the subroutine name.
5405 SIZE is the number of bytes of arguments passed on the stack.
5406
5407 On the 80386, the RTD insn may be used to pop them if the number
5408 of args is fixed, but if the number is variable then the caller
5409 must pop them all. RTD can't be used for library calls now
5410 because the library is compiled with the Unix compiler.
5411 Use of RTD is a selectable option, since it is incompatible with
5412 standard Unix calling sequences. If the option is not selected,
5413 the caller must always pop the args.
5414
5415 The attribute stdcall is equivalent to RTD on a per module basis. */
5416
5417 static int
5418 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5419 {
5420 unsigned int ccvt;
5421
5422 /* None of the 64-bit ABIs pop arguments. */
5423 if (TARGET_64BIT)
5424 return 0;
5425
5426 ccvt = ix86_get_callcvt (funtype);
5427
5428 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5429 | IX86_CALLCVT_THISCALL)) != 0
5430 && ! stdarg_p (funtype))
5431 return size;
5432
5433 /* Lose any fake structure return argument if it is passed on the stack. */
5434 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5435 && !ix86_keep_aggregate_return_pointer (funtype))
5436 {
5437 int nregs = ix86_function_regparm (funtype, fundecl);
5438 if (nregs == 0)
5439 return GET_MODE_SIZE (Pmode);
5440 }
5441
5442 return 0;
5443 }
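
/* Illustrative numbers, assuming a 32-bit target:

     void __attribute__ ((stdcall)) f (int, int);

   makes this return 8, so the callee pops its arguments with "ret $8";
   a stdarg prototype such as "void g (const char *, ...);" returns 0
   and leaves popping to the caller.  The aggregate-return case can also
   return GET_MODE_SIZE (Pmode) for the hidden return-slot pointer when
   no argument registers are used.  */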
5444 \f
5445 /* Argument support functions. */
5446
5447 /* Return true when register may be used to pass function parameters. */
5448 bool
5449 ix86_function_arg_regno_p (int regno)
5450 {
5451 int i;
5452 const int *parm_regs;
5453
5454 if (!TARGET_64BIT)
5455 {
5456 if (TARGET_MACHO)
5457 return (regno < REGPARM_MAX
5458 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5459 else
5460 return (regno < REGPARM_MAX
5461 || (TARGET_MMX && MMX_REGNO_P (regno)
5462 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5463 || (TARGET_SSE && SSE_REGNO_P (regno)
5464 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5465 }
5466
5467 if (TARGET_MACHO)
5468 {
5469 if (SSE_REGNO_P (regno) && TARGET_SSE)
5470 return true;
5471 }
5472 else
5473 {
5474 if (TARGET_SSE && SSE_REGNO_P (regno)
5475 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5476 return true;
5477 }
5478
5479 /* TODO: The function should depend on current function ABI but
5480 builtins.c would need updating then. Therefore we use the
5481 default ABI. */
5482
5483 /* RAX is used as hidden argument to va_arg functions. */
5484 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5485 return true;
5486
5487 if (ix86_abi == MS_ABI)
5488 parm_regs = x86_64_ms_abi_int_parameter_registers;
5489 else
5490 parm_regs = x86_64_int_parameter_registers;
5491 for (i = 0; i < (ix86_abi == MS_ABI
5492 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5493 if (regno == parm_regs[i])
5494 return true;
5495 return false;
5496 }
5497
5498 /* Return true if we do not know how to pass TYPE solely in registers. */
5499
5500 static bool
5501 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5502 {
5503 if (must_pass_in_stack_var_size_or_pad (mode, type))
5504 return true;
5505
5506 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5507 The layout_type routine is crafty and tries to trick us into passing
5508 currently unsupported vector types on the stack by using TImode. */
5509 return (!TARGET_64BIT && mode == TImode
5510 && type && TREE_CODE (type) != VECTOR_TYPE);
5511 }
5512
5513 /* Return the size, in bytes, of the area reserved for arguments passed
5514 in registers for the function represented by FNDECL, depending on the
5515 ABI used. */
5516 int
5517 ix86_reg_parm_stack_space (const_tree fndecl)
5518 {
5519 enum calling_abi call_abi = SYSV_ABI;
5520 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5521 call_abi = ix86_function_abi (fndecl);
5522 else
5523 call_abi = ix86_function_type_abi (fndecl);
5524 if (TARGET_64BIT && call_abi == MS_ABI)
5525 return 32;
5526 return 0;
5527 }
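
/* E.g. a 64-bit ms_abi call reserves the usual 32-byte register
   parameter ("home") area covering RCX, RDX, R8 and R9, whereas SysV
   calls reserve nothing.  */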
5528
5529 /* Returns value SYSV_ABI, MS_ABI dependent on fntype, specifying the
5530 call abi used. */
5531 enum calling_abi
5532 ix86_function_type_abi (const_tree fntype)
5533 {
5534 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5535 {
5536 enum calling_abi abi = ix86_abi;
5537 if (abi == SYSV_ABI)
5538 {
5539 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5540 abi = MS_ABI;
5541 }
5542 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5543 abi = SYSV_ABI;
5544 return abi;
5545 }
5546 return ix86_abi;
5547 }
5548
5549 static bool
5550 ix86_function_ms_hook_prologue (const_tree fn)
5551 {
5552 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5553 {
5554 if (decl_function_context (fn) != NULL_TREE)
5555 error_at (DECL_SOURCE_LOCATION (fn),
5556 "ms_hook_prologue is not compatible with nested function");
5557 else
5558 return true;
5559 }
5560 return false;
5561 }
5562
5563 static enum calling_abi
5564 ix86_function_abi (const_tree fndecl)
5565 {
5566 if (! fndecl)
5567 return ix86_abi;
5568 return ix86_function_type_abi (TREE_TYPE (fndecl));
5569 }
5570
5571 /* Returns value SYSV_ABI, MS_ABI dependent on cfun, specifying the
5572 call abi used. */
5573 enum calling_abi
5574 ix86_cfun_abi (void)
5575 {
5576 if (! cfun)
5577 return ix86_abi;
5578 return cfun->machine->call_abi;
5579 }
5580
5581 /* Write the extra assembler code needed to declare a function properly. */
5582
5583 void
5584 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5585 tree decl)
5586 {
5587 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5588
5589 if (is_ms_hook)
5590 {
5591 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5592 unsigned int filler_cc = 0xcccccccc;
5593
5594 for (i = 0; i < filler_count; i += 4)
5595 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5596 }
5597
5598 #ifdef SUBTARGET_ASM_UNWIND_INIT
5599 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5600 #endif
5601
5602 ASM_OUTPUT_LABEL (asm_out_file, fname);
5603
5604 /* Output magic byte marker, if hot-patch attribute is set. */
5605 if (is_ms_hook)
5606 {
5607 if (TARGET_64BIT)
5608 {
5609 /* leaq [%rsp + 0], %rsp */
5610 asm_fprintf (asm_out_file, ASM_BYTE
5611 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5612 }
5613 else
5614 {
5615 /* movl.s %edi, %edi
5616 push %ebp
5617 movl.s %esp, %ebp */
5618 asm_fprintf (asm_out_file, ASM_BYTE
5619 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5620 }
5621 }
5622 }
5623
5624 /* regclass.c */
5625 extern void init_regs (void);
5626
5627 /* Implementation of the call ABI switching target hook.  The call
5628 register set specific to FNDECL is selected.  See also
5629 ix86_conditional_register_usage for more details. */
5630 void
5631 ix86_call_abi_override (const_tree fndecl)
5632 {
5633 if (fndecl == NULL_TREE)
5634 cfun->machine->call_abi = ix86_abi;
5635 else
5636 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5637 }
5638
5639 /* 64-bit MS and SYSV ABIs have different sets of call-used registers.  Avoid
5640 expensive re-initialization of init_regs each time we switch function context
5641 since this is needed only during RTL expansion. */
5642 static void
5643 ix86_maybe_switch_abi (void)
5644 {
5645 if (TARGET_64BIT &&
5646 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5647 reinit_regs ();
5648 }
5649
5650 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5651 for a call to a function whose data type is FNTYPE.
5652 For a library call, FNTYPE is 0. */
5653
5654 void
5655 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5656 tree fntype, /* tree ptr for function decl */
5657 rtx libname, /* SYMBOL_REF of library name or 0 */
5658 tree fndecl,
5659 int caller)
5660 {
5661 struct cgraph_local_info *i;
5662 tree fnret_type;
5663
5664 memset (cum, 0, sizeof (*cum));
5665
5666 /* Initialize for the current callee. */
5667 if (caller)
5668 {
5669 cfun->machine->callee_pass_avx256_p = false;
5670 cfun->machine->callee_return_avx256_p = false;
5671 }
5672
5673 if (fndecl)
5674 {
5675 i = cgraph_local_info (fndecl);
5676 cum->call_abi = ix86_function_abi (fndecl);
5677 fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5678 }
5679 else
5680 {
5681 i = NULL;
5682 cum->call_abi = ix86_function_type_abi (fntype);
5683 if (fntype)
5684 fnret_type = TREE_TYPE (fntype);
5685 else
5686 fnret_type = NULL;
5687 }
5688
5689 if (TARGET_VZEROUPPER && fnret_type)
5690 {
5691 rtx fnret_value = ix86_function_value (fnret_type, fntype,
5692 false);
5693 if (function_pass_avx256_p (fnret_value))
5694 {
5695 /* The return value of this function uses 256bit AVX modes. */
5696 if (caller)
5697 cfun->machine->callee_return_avx256_p = true;
5698 else
5699 cfun->machine->caller_return_avx256_p = true;
5700 }
5701 }
5702
5703 cum->caller = caller;
5704
5705 /* Set up the number of registers to use for passing arguments. */
5706
5707 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5708 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5709 "or subtarget optimization implying it");
5710 cum->nregs = ix86_regparm;
5711 if (TARGET_64BIT)
5712 {
5713 cum->nregs = (cum->call_abi == SYSV_ABI
5714 ? X86_64_REGPARM_MAX
5715 : X86_64_MS_REGPARM_MAX);
5716 }
5717 if (TARGET_SSE)
5718 {
5719 cum->sse_nregs = SSE_REGPARM_MAX;
5720 if (TARGET_64BIT)
5721 {
5722 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5723 ? X86_64_SSE_REGPARM_MAX
5724 : X86_64_MS_SSE_REGPARM_MAX);
5725 }
5726 }
5727 if (TARGET_MMX)
5728 cum->mmx_nregs = MMX_REGPARM_MAX;
5729 cum->warn_avx = true;
5730 cum->warn_sse = true;
5731 cum->warn_mmx = true;
5732
5733 /* Because the type might mismatch between caller and callee, we need to
5734 use actual type of function for local calls.
5735 FIXME: cgraph_analyze can be told to actually record if function uses
5736 va_start so for local functions maybe_vaarg can be made aggressive
5737 helping K&R code.
5738 FIXME: once the type system is fixed, we won't need this code anymore. */
5739 if (i && i->local && i->can_change_signature)
5740 fntype = TREE_TYPE (fndecl);
5741 cum->maybe_vaarg = (fntype
5742 ? (!prototype_p (fntype) || stdarg_p (fntype))
5743 : !libname);
5744
5745 if (!TARGET_64BIT)
5746 {
5747 /* If there are variable arguments, then we won't pass anything
5748 in registers in 32-bit mode. */
5749 if (stdarg_p (fntype))
5750 {
5751 cum->nregs = 0;
5752 cum->sse_nregs = 0;
5753 cum->mmx_nregs = 0;
5754 cum->warn_avx = 0;
5755 cum->warn_sse = 0;
5756 cum->warn_mmx = 0;
5757 return;
5758 }
5759
5760 /* Use ecx and edx registers if function has fastcall attribute,
5761 else look for regparm information. */
5762 if (fntype)
5763 {
5764 unsigned int ccvt = ix86_get_callcvt (fntype);
5765 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5766 {
5767 cum->nregs = 1;
5768 cum->fastcall = 1; /* Same first register as in fastcall. */
5769 }
5770 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5771 {
5772 cum->nregs = 2;
5773 cum->fastcall = 1;
5774 }
5775 else
5776 cum->nregs = ix86_function_regparm (fntype, fndecl);
5777 }
5778
5779 /* Set up the number of SSE registers used for passing SFmode
5780 and DFmode arguments. Warn for mismatching ABI. */
5781 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5782 }
5783 }
5784
5785 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5786 But in the case of vector types, it is some vector mode.
5787
5788 When we have only some of our vector isa extensions enabled, then there
5789 are some modes for which vector_mode_supported_p is false. For these
5790 modes, the generic vector support in gcc will choose some non-vector mode
5791 in order to implement the type. By computing the natural mode, we'll
5792 select the proper ABI location for the operand and not depend on whatever
5793 the middle-end decides to do with these vector types.
5794
5795 The middle-end can't deal with vector types > 16 bytes.  In this
5796 case, we return the original mode and warn about the ABI change if
5797 CUM isn't NULL. */
5798
5799 static enum machine_mode
5800 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5801 {
5802 enum machine_mode mode = TYPE_MODE (type);
5803
5804 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5805 {
5806 HOST_WIDE_INT size = int_size_in_bytes (type);
5807 if ((size == 8 || size == 16 || size == 32)
5808 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5809 && TYPE_VECTOR_SUBPARTS (type) > 1)
5810 {
5811 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5812
5813 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5814 mode = MIN_MODE_VECTOR_FLOAT;
5815 else
5816 mode = MIN_MODE_VECTOR_INT;
5817
5818 /* Get the mode which has this inner mode and number of units. */
5819 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5820 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5821 && GET_MODE_INNER (mode) == innermode)
5822 {
5823 if (size == 32 && !TARGET_AVX)
5824 {
5825 static bool warnedavx;
5826
5827 if (cum
5828 && !warnedavx
5829 && cum->warn_avx)
5830 {
5831 warnedavx = true;
5832 warning (0, "AVX vector argument without AVX "
5833 "enabled changes the ABI");
5834 }
5835 return TYPE_MODE (type);
5836 }
5837 else if ((size == 8 || size == 16) && !TARGET_SSE)
5838 {
5839 static bool warnedsse;
5840
5841 if (cum
5842 && !warnedsse
5843 && cum->warn_sse)
5844 {
5845 warnedsse = true;
5846 warning (0, "SSE vector argument without SSE "
5847 "enabled changes the ABI");
5848 }
5849 return mode;
5850 }
5851 else
5852 return mode;
5853 }
5854
5855 gcc_unreachable ();
5856 }
5857 }
5858
5859 return mode;
5860 }
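
/* An illustrative case: given

     typedef int v8si __attribute__ ((vector_size (32)));

   with AVX disabled the generic code does not use V8SImode, so the loop
   above rediscovers it, warns once about the AVX ABI change (when CUM
   and cum->warn_avx allow it) and returns TYPE_MODE (type) unchanged;
   8- and 16-byte vectors without SSE get the analogous SSE warning but
   still return the vector mode that was found.  */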
5861
5862 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5863 this may not agree with the mode that the type system has chosen for the
5864 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5865 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5866
5867 static rtx
5868 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5869 unsigned int regno)
5870 {
5871 rtx tmp;
5872
5873 if (orig_mode != BLKmode)
5874 tmp = gen_rtx_REG (orig_mode, regno);
5875 else
5876 {
5877 tmp = gen_rtx_REG (mode, regno);
5878 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5879 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5880 }
5881
5882 return tmp;
5883 }
5884
5885 /* x86-64 register passing implementation.  See the x86-64 ABI for details.
5886 The goal of this code is to classify each 8-byte chunk of the incoming
5887 argument by register class and assign registers accordingly. */
5888
5889 /* Return the union class of CLASS1 and CLASS2.
5890 See the x86-64 PS ABI for details. */
5891
5892 static enum x86_64_reg_class
5893 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5894 {
5895 /* Rule #1: If both classes are equal, this is the resulting class. */
5896 if (class1 == class2)
5897 return class1;
5898
5899 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5900 the other class. */
5901 if (class1 == X86_64_NO_CLASS)
5902 return class2;
5903 if (class2 == X86_64_NO_CLASS)
5904 return class1;
5905
5906 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5907 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5908 return X86_64_MEMORY_CLASS;
5909
5910 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5911 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5912 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5913 return X86_64_INTEGERSI_CLASS;
5914 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5915 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5916 return X86_64_INTEGER_CLASS;
5917
5918 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5919 MEMORY is used. */
5920 if (class1 == X86_64_X87_CLASS
5921 || class1 == X86_64_X87UP_CLASS
5922 || class1 == X86_64_COMPLEX_X87_CLASS
5923 || class2 == X86_64_X87_CLASS
5924 || class2 == X86_64_X87UP_CLASS
5925 || class2 == X86_64_COMPLEX_X87_CLASS)
5926 return X86_64_MEMORY_CLASS;
5927
5928 /* Rule #6: Otherwise class SSE is used. */
5929 return X86_64_SSE_CLASS;
5930 }
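
/* A couple of worked examples of the rules above: merging
   X86_64_NO_CLASS with X86_64_SSEDF_CLASS gives X86_64_SSEDF_CLASS by
   rule #2; merging X86_64_INTEGERSI_CLASS with X86_64_SSESF_CLASS gives
   X86_64_INTEGERSI_CLASS by rule #4; and merging an x87 class with an
   SSE class collapses to X86_64_MEMORY_CLASS by rule #5.  */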
5931
5932 /* Classify the argument of type TYPE and mode MODE.
5933 CLASSES will be filled by the register class used to pass each word
5934 of the operand. The number of words is returned. In case the parameter
5935 should be passed in memory, 0 is returned. As a special case for zero
5936 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5937
5938 BIT_OFFSET is used internally for handling records and specifies the
5939 offset in bits modulo 256 to avoid overflow cases.
5940
5941 See the x86-64 PS ABI for details.
5942 */
5943
5944 static int
5945 classify_argument (enum machine_mode mode, const_tree type,
5946 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5947 {
5948 HOST_WIDE_INT bytes =
5949 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5950 int words
5951 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5952
5953 /* Variable sized entities are always passed/returned in memory. */
5954 if (bytes < 0)
5955 return 0;
5956
5957 if (mode != VOIDmode
5958 && targetm.calls.must_pass_in_stack (mode, type))
5959 return 0;
5960
5961 if (type && AGGREGATE_TYPE_P (type))
5962 {
5963 int i;
5964 tree field;
5965 enum x86_64_reg_class subclasses[MAX_CLASSES];
5966
5967 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5968 if (bytes > 32)
5969 return 0;
5970
5971 for (i = 0; i < words; i++)
5972 classes[i] = X86_64_NO_CLASS;
5973
5974 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
5975 signal the memory class, so handle this as a special case. */
5976 if (!words)
5977 {
5978 classes[0] = X86_64_NO_CLASS;
5979 return 1;
5980 }
5981
5982 /* Classify each field of record and merge classes. */
5983 switch (TREE_CODE (type))
5984 {
5985 case RECORD_TYPE:
5986 /* And now merge the fields of structure. */
5987 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5988 {
5989 if (TREE_CODE (field) == FIELD_DECL)
5990 {
5991 int num;
5992
5993 if (TREE_TYPE (field) == error_mark_node)
5994 continue;
5995
5996 /* Bitfields are always classified as integer. Handle them
5997 early, since later code would consider them to be
5998 misaligned integers. */
5999 if (DECL_BIT_FIELD (field))
6000 {
6001 for (i = (int_bit_position (field)
6002 + (bit_offset % 64)) / 8 / 8;
6003 i < ((int_bit_position (field) + (bit_offset % 64))
6004 + tree_low_cst (DECL_SIZE (field), 0)
6005 + 63) / 8 / 8; i++)
6006 classes[i] =
6007 merge_classes (X86_64_INTEGER_CLASS,
6008 classes[i]);
6009 }
6010 else
6011 {
6012 int pos;
6013
6014 type = TREE_TYPE (field);
6015
6016 /* Flexible array member is ignored. */
6017 if (TYPE_MODE (type) == BLKmode
6018 && TREE_CODE (type) == ARRAY_TYPE
6019 && TYPE_SIZE (type) == NULL_TREE
6020 && TYPE_DOMAIN (type) != NULL_TREE
6021 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6022 == NULL_TREE))
6023 {
6024 static bool warned;
6025
6026 if (!warned && warn_psabi)
6027 {
6028 warned = true;
6029 inform (input_location,
6030 "the ABI of passing struct with"
6031 " a flexible array member has"
6032 " changed in GCC 4.4");
6033 }
6034 continue;
6035 }
6036 num = classify_argument (TYPE_MODE (type), type,
6037 subclasses,
6038 (int_bit_position (field)
6039 + bit_offset) % 256);
6040 if (!num)
6041 return 0;
6042 pos = (int_bit_position (field)
6043 + (bit_offset % 64)) / 8 / 8;
6044 for (i = 0; i < num && (i + pos) < words; i++)
6045 classes[i + pos] =
6046 merge_classes (subclasses[i], classes[i + pos]);
6047 }
6048 }
6049 }
6050 break;
6051
6052 case ARRAY_TYPE:
6053 /* Arrays are handled as small records. */
6054 {
6055 int num;
6056 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6057 TREE_TYPE (type), subclasses, bit_offset);
6058 if (!num)
6059 return 0;
6060
6061 /* The partial classes are now full classes. */
6062 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6063 subclasses[0] = X86_64_SSE_CLASS;
6064 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6065 && !((bit_offset % 64) == 0 && bytes == 4))
6066 subclasses[0] = X86_64_INTEGER_CLASS;
6067
6068 for (i = 0; i < words; i++)
6069 classes[i] = subclasses[i % num];
6070
6071 break;
6072 }
6073 case UNION_TYPE:
6074 case QUAL_UNION_TYPE:
6075 /* Unions are similar to RECORD_TYPE but the offset is always 0.  */
6077 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6078 {
6079 if (TREE_CODE (field) == FIELD_DECL)
6080 {
6081 int num;
6082
6083 if (TREE_TYPE (field) == error_mark_node)
6084 continue;
6085
6086 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6087 TREE_TYPE (field), subclasses,
6088 bit_offset);
6089 if (!num)
6090 return 0;
6091 for (i = 0; i < num; i++)
6092 classes[i] = merge_classes (subclasses[i], classes[i]);
6093 }
6094 }
6095 break;
6096
6097 default:
6098 gcc_unreachable ();
6099 }
6100
6101 if (words > 2)
6102 {
6103 /* When the size exceeds 16 bytes, if the first eightbyte isn't
6104 X86_64_SSE_CLASS or any of the others isn't
6105 X86_64_SSEUP_CLASS, everything should be passed in
6106 memory. */
6107 if (classes[0] != X86_64_SSE_CLASS)
6108 return 0;
6109
6110 for (i = 1; i < words; i++)
6111 if (classes[i] != X86_64_SSEUP_CLASS)
6112 return 0;
6113 }
6114
6115 /* Final merger cleanup. */
6116 for (i = 0; i < words; i++)
6117 {
6118 /* If one class is MEMORY, everything should be passed in
6119 memory. */
6120 if (classes[i] == X86_64_MEMORY_CLASS)
6121 return 0;
6122
6123 /* The X86_64_SSEUP_CLASS should always be preceded by
6124 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6125 if (classes[i] == X86_64_SSEUP_CLASS
6126 && classes[i - 1] != X86_64_SSE_CLASS
6127 && classes[i - 1] != X86_64_SSEUP_CLASS)
6128 {
6129 /* The first one should never be X86_64_SSEUP_CLASS. */
6130 gcc_assert (i != 0);
6131 classes[i] = X86_64_SSE_CLASS;
6132 }
6133
6134 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6135 everything should be passed in memory. */
6136 if (classes[i] == X86_64_X87UP_CLASS
6137 && (classes[i - 1] != X86_64_X87_CLASS))
6138 {
6139 static bool warned;
6140
6141 /* The first one should never be X86_64_X87UP_CLASS. */
6142 gcc_assert (i != 0);
6143 if (!warned && warn_psabi)
6144 {
6145 warned = true;
6146 inform (input_location,
6147 "the ABI of passing union with long double"
6148 " has changed in GCC 4.4");
6149 }
6150 return 0;
6151 }
6152 }
6153 return words;
6154 }
6155
6156 /* Compute alignment needed. We align all types to natural boundaries with
6157 the exception of XFmode, which is aligned to 64 bits. */
6158 if (mode != VOIDmode && mode != BLKmode)
6159 {
6160 int mode_alignment = GET_MODE_BITSIZE (mode);
6161
6162 if (mode == XFmode)
6163 mode_alignment = 128;
6164 else if (mode == XCmode)
6165 mode_alignment = 256;
6166 if (COMPLEX_MODE_P (mode))
6167 mode_alignment /= 2;
6168 /* Misaligned fields are always returned in memory. */
6169 if (bit_offset % mode_alignment)
6170 return 0;
6171 }
6172
6173 /* For V1xx modes, just use the base mode.  */
6174 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6175 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6176 mode = GET_MODE_INNER (mode);
6177
6178 /* Classification of atomic types. */
6179 switch (mode)
6180 {
6181 case SDmode:
6182 case DDmode:
6183 classes[0] = X86_64_SSE_CLASS;
6184 return 1;
6185 case TDmode:
6186 classes[0] = X86_64_SSE_CLASS;
6187 classes[1] = X86_64_SSEUP_CLASS;
6188 return 2;
6189 case DImode:
6190 case SImode:
6191 case HImode:
6192 case QImode:
6193 case CSImode:
6194 case CHImode:
6195 case CQImode:
6196 {
6197 int size = (bit_offset % 64) + (int) GET_MODE_BITSIZE (mode);
6198
6199 if (size <= 32)
6200 {
6201 classes[0] = X86_64_INTEGERSI_CLASS;
6202 return 1;
6203 }
6204 else if (size <= 64)
6205 {
6206 classes[0] = X86_64_INTEGER_CLASS;
6207 return 1;
6208 }
6209 else if (size <= 64+32)
6210 {
6211 classes[0] = X86_64_INTEGER_CLASS;
6212 classes[1] = X86_64_INTEGERSI_CLASS;
6213 return 2;
6214 }
6215 else if (size <= 64+64)
6216 {
6217 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6218 return 2;
6219 }
6220 else
6221 gcc_unreachable ();
6222 }
6223 case CDImode:
6224 case TImode:
6225 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6226 return 2;
6227 case COImode:
6228 case OImode:
6229 /* OImode shouldn't be used directly. */
6230 gcc_unreachable ();
6231 case CTImode:
6232 return 0;
6233 case SFmode:
6234 if (!(bit_offset % 64))
6235 classes[0] = X86_64_SSESF_CLASS;
6236 else
6237 classes[0] = X86_64_SSE_CLASS;
6238 return 1;
6239 case DFmode:
6240 classes[0] = X86_64_SSEDF_CLASS;
6241 return 1;
6242 case XFmode:
6243 classes[0] = X86_64_X87_CLASS;
6244 classes[1] = X86_64_X87UP_CLASS;
6245 return 2;
6246 case TFmode:
6247 classes[0] = X86_64_SSE_CLASS;
6248 classes[1] = X86_64_SSEUP_CLASS;
6249 return 2;
6250 case SCmode:
6251 classes[0] = X86_64_SSE_CLASS;
6252 if (!(bit_offset % 64))
6253 return 1;
6254 else
6255 {
6256 static bool warned;
6257
6258 if (!warned && warn_psabi)
6259 {
6260 warned = true;
6261 inform (input_location,
6262 "the ABI of passing structure with complex float"
6263 " member has changed in GCC 4.4");
6264 }
6265 classes[1] = X86_64_SSESF_CLASS;
6266 return 2;
6267 }
6268 case DCmode:
6269 classes[0] = X86_64_SSEDF_CLASS;
6270 classes[1] = X86_64_SSEDF_CLASS;
6271 return 2;
6272 case XCmode:
6273 classes[0] = X86_64_COMPLEX_X87_CLASS;
6274 return 1;
6275 case TCmode:
6276 /* This mode is larger than 16 bytes. */
6277 return 0;
6278 case V8SFmode:
6279 case V8SImode:
6280 case V32QImode:
6281 case V16HImode:
6282 case V4DFmode:
6283 case V4DImode:
6284 classes[0] = X86_64_SSE_CLASS;
6285 classes[1] = X86_64_SSEUP_CLASS;
6286 classes[2] = X86_64_SSEUP_CLASS;
6287 classes[3] = X86_64_SSEUP_CLASS;
6288 return 4;
6289 case V4SFmode:
6290 case V4SImode:
6291 case V16QImode:
6292 case V8HImode:
6293 case V2DFmode:
6294 case V2DImode:
6295 classes[0] = X86_64_SSE_CLASS;
6296 classes[1] = X86_64_SSEUP_CLASS;
6297 return 2;
6298 case V1TImode:
6299 case V1DImode:
6300 case V2SFmode:
6301 case V2SImode:
6302 case V4HImode:
6303 case V8QImode:
6304 classes[0] = X86_64_SSE_CLASS;
6305 return 1;
6306 case BLKmode:
6307 case VOIDmode:
6308 return 0;
6309 default:
6310 gcc_assert (VECTOR_MODE_P (mode));
6311
6312 if (bytes > 16)
6313 return 0;
6314
6315 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6316
6317 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6318 classes[0] = X86_64_INTEGERSI_CLASS;
6319 else
6320 classes[0] = X86_64_INTEGER_CLASS;
6321 classes[1] = X86_64_INTEGER_CLASS;
6322 return 1 + (bytes > 8);
6323 }
6324 }
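
/* Worked examples (illustrative): "struct s { double d; long l; }"
   classifies as { X86_64_SSEDF_CLASS, X86_64_INTEGER_CLASS }, i.e. one
   SSE and one integer eightbyte; __int128 classifies as two
   X86_64_INTEGER_CLASS eightbytes; a 32-byte vector such as __m256
   classifies as X86_64_SSE_CLASS followed by three X86_64_SSEUP_CLASS
   eightbytes; and aggregates larger than 32 bytes, or containing a
   MEMORY class, make this return 0 so they are passed on the stack.  */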
6325
6326 /* Examine the argument and set the number of registers required in each
6327 class.  Return 0 iff the parameter should be passed in memory. */
6328 static int
6329 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6330 int *int_nregs, int *sse_nregs)
6331 {
6332 enum x86_64_reg_class regclass[MAX_CLASSES];
6333 int n = classify_argument (mode, type, regclass, 0);
6334
6335 *int_nregs = 0;
6336 *sse_nregs = 0;
6337 if (!n)
6338 return 0;
6339 for (n--; n >= 0; n--)
6340 switch (regclass[n])
6341 {
6342 case X86_64_INTEGER_CLASS:
6343 case X86_64_INTEGERSI_CLASS:
6344 (*int_nregs)++;
6345 break;
6346 case X86_64_SSE_CLASS:
6347 case X86_64_SSESF_CLASS:
6348 case X86_64_SSEDF_CLASS:
6349 (*sse_nregs)++;
6350 break;
6351 case X86_64_NO_CLASS:
6352 case X86_64_SSEUP_CLASS:
6353 break;
6354 case X86_64_X87_CLASS:
6355 case X86_64_X87UP_CLASS:
6356 if (!in_return)
6357 return 0;
6358 break;
6359 case X86_64_COMPLEX_X87_CLASS:
6360 return in_return ? 2 : 0;
6361 case X86_64_MEMORY_CLASS:
6362 gcc_unreachable ();
6363 }
6364 return 1;
6365 }
6366
6367 /* Construct container for the argument used by GCC interface. See
6368 FUNCTION_ARG for the detailed description. */
6369
6370 static rtx
6371 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6372 const_tree type, int in_return, int nintregs, int nsseregs,
6373 const int *intreg, int sse_regno)
6374 {
6375 /* The following variables hold the static issued_error state. */
6376 static bool issued_sse_arg_error;
6377 static bool issued_sse_ret_error;
6378 static bool issued_x87_ret_error;
6379
6380 enum machine_mode tmpmode;
6381 int bytes =
6382 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6383 enum x86_64_reg_class regclass[MAX_CLASSES];
6384 int n;
6385 int i;
6386 int nexps = 0;
6387 int needed_sseregs, needed_intregs;
6388 rtx exp[MAX_CLASSES];
6389 rtx ret;
6390
6391 n = classify_argument (mode, type, regclass, 0);
6392 if (!n)
6393 return NULL;
6394 if (!examine_argument (mode, type, in_return, &needed_intregs,
6395 &needed_sseregs))
6396 return NULL;
6397 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6398 return NULL;
6399
6400 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6401 some less clueful developer tries to use floating-point anyway. */
6402 if (needed_sseregs && !TARGET_SSE)
6403 {
6404 if (in_return)
6405 {
6406 if (!issued_sse_ret_error)
6407 {
6408 error ("SSE register return with SSE disabled");
6409 issued_sse_ret_error = true;
6410 }
6411 }
6412 else if (!issued_sse_arg_error)
6413 {
6414 error ("SSE register argument with SSE disabled");
6415 issued_sse_arg_error = true;
6416 }
6417 return NULL;
6418 }
6419
6420 /* Likewise, error if the ABI requires us to return values in the
6421 x87 registers and the user specified -mno-80387. */
6422 if (!TARGET_80387 && in_return)
6423 for (i = 0; i < n; i++)
6424 if (regclass[i] == X86_64_X87_CLASS
6425 || regclass[i] == X86_64_X87UP_CLASS
6426 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6427 {
6428 if (!issued_x87_ret_error)
6429 {
6430 error ("x87 register return with x87 disabled");
6431 issued_x87_ret_error = true;
6432 }
6433 return NULL;
6434 }
6435
6436 /* First construct simple cases. Avoid SCmode, since we want to use
6437 a single register to pass this type. */
6438 if (n == 1 && mode != SCmode)
6439 switch (regclass[0])
6440 {
6441 case X86_64_INTEGER_CLASS:
6442 case X86_64_INTEGERSI_CLASS:
6443 return gen_rtx_REG (mode, intreg[0]);
6444 case X86_64_SSE_CLASS:
6445 case X86_64_SSESF_CLASS:
6446 case X86_64_SSEDF_CLASS:
6447 if (mode != BLKmode)
6448 return gen_reg_or_parallel (mode, orig_mode,
6449 SSE_REGNO (sse_regno));
6450 break;
6451 case X86_64_X87_CLASS:
6452 case X86_64_COMPLEX_X87_CLASS:
6453 return gen_rtx_REG (mode, FIRST_STACK_REG);
6454 case X86_64_NO_CLASS:
6455 /* Zero sized array, struct or class. */
6456 return NULL;
6457 default:
6458 gcc_unreachable ();
6459 }
6460 if (n == 2
6461 && regclass[0] == X86_64_SSE_CLASS
6462 && regclass[1] == X86_64_SSEUP_CLASS
6463 && mode != BLKmode)
6464 return gen_reg_or_parallel (mode, orig_mode,
6465 SSE_REGNO (sse_regno));
6466 if (n == 4
6467 && regclass[0] == X86_64_SSE_CLASS
6468 && regclass[1] == X86_64_SSEUP_CLASS
6469 && regclass[2] == X86_64_SSEUP_CLASS
6470 && regclass[3] == X86_64_SSEUP_CLASS
6471 && mode != BLKmode)
6472 return gen_reg_or_parallel (mode, orig_mode,
6473 SSE_REGNO (sse_regno));
6474 if (n == 2
6475 && regclass[0] == X86_64_X87_CLASS
6476 && regclass[1] == X86_64_X87UP_CLASS)
6477 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6478
6479 if (n == 2
6480 && regclass[0] == X86_64_INTEGER_CLASS
6481 && regclass[1] == X86_64_INTEGER_CLASS
6482 && (mode == CDImode || mode == TImode || mode == TFmode)
6483 && intreg[0] + 1 == intreg[1])
6484 return gen_rtx_REG (mode, intreg[0]);
6485
6486 /* Otherwise figure out the entries of the PARALLEL. */
6487 for (i = 0; i < n; i++)
6488 {
6489 int pos;
6490
6491 switch (regclass[i])
6492 {
6493 case X86_64_NO_CLASS:
6494 break;
6495 case X86_64_INTEGER_CLASS:
6496 case X86_64_INTEGERSI_CLASS:
6497 /* Merge TImodes on aligned occasions here too. */
6498 if (i * 8 + 8 > bytes)
6499 tmpmode
6500 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6501 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6502 tmpmode = SImode;
6503 else
6504 tmpmode = DImode;
6505 /* We've requested 24 bytes for which we
6506 don't have a mode.  Use DImode. */
6507 if (tmpmode == BLKmode)
6508 tmpmode = DImode;
6509 exp [nexps++]
6510 = gen_rtx_EXPR_LIST (VOIDmode,
6511 gen_rtx_REG (tmpmode, *intreg),
6512 GEN_INT (i*8));
6513 intreg++;
6514 break;
6515 case X86_64_SSESF_CLASS:
6516 exp [nexps++]
6517 = gen_rtx_EXPR_LIST (VOIDmode,
6518 gen_rtx_REG (SFmode,
6519 SSE_REGNO (sse_regno)),
6520 GEN_INT (i*8));
6521 sse_regno++;
6522 break;
6523 case X86_64_SSEDF_CLASS:
6524 exp [nexps++]
6525 = gen_rtx_EXPR_LIST (VOIDmode,
6526 gen_rtx_REG (DFmode,
6527 SSE_REGNO (sse_regno)),
6528 GEN_INT (i*8));
6529 sse_regno++;
6530 break;
6531 case X86_64_SSE_CLASS:
6532 pos = i;
6533 switch (n)
6534 {
6535 case 1:
6536 tmpmode = DImode;
6537 break;
6538 case 2:
6539 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6540 {
6541 tmpmode = TImode;
6542 i++;
6543 }
6544 else
6545 tmpmode = DImode;
6546 break;
6547 case 4:
6548 gcc_assert (i == 0
6549 && regclass[1] == X86_64_SSEUP_CLASS
6550 && regclass[2] == X86_64_SSEUP_CLASS
6551 && regclass[3] == X86_64_SSEUP_CLASS);
6552 tmpmode = OImode;
6553 i += 3;
6554 break;
6555 default:
6556 gcc_unreachable ();
6557 }
6558 exp [nexps++]
6559 = gen_rtx_EXPR_LIST (VOIDmode,
6560 gen_rtx_REG (tmpmode,
6561 SSE_REGNO (sse_regno)),
6562 GEN_INT (pos*8));
6563 sse_regno++;
6564 break;
6565 default:
6566 gcc_unreachable ();
6567 }
6568 }
6569
6570 /* Empty aligned struct, union or class. */
6571 if (nexps == 0)
6572 return NULL;
6573
6574 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6575 for (i = 0; i < nexps; i++)
6576 XVECEXP (ret, 0, i) = exp [i];
6577 return ret;
6578 }
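
/* A sketch of the result (register numbers simplified): for an argument
   of type "struct s { double d; long l; }" the PARALLEL built above
   looks roughly like

     (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                (expr_list (reg:DI di) (const_int 8))])

   i.e. the first eightbyte travels in an SSE register and the second in
   the next free integer register.  */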
6579
6580 /* Update the data in CUM to advance over an argument of mode MODE
6581 and data type TYPE. (TYPE is null for libcalls where that information
6582 may not be available.) */
6583
6584 static void
6585 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6586 const_tree type, HOST_WIDE_INT bytes,
6587 HOST_WIDE_INT words)
6588 {
6589 switch (mode)
6590 {
6591 default:
6592 break;
6593
6594 case BLKmode:
6595 if (bytes < 0)
6596 break;
6597 /* FALLTHRU */
6598
6599 case DImode:
6600 case SImode:
6601 case HImode:
6602 case QImode:
6603 cum->words += words;
6604 cum->nregs -= words;
6605 cum->regno += words;
6606
6607 if (cum->nregs <= 0)
6608 {
6609 cum->nregs = 0;
6610 cum->regno = 0;
6611 }
6612 break;
6613
6614 case OImode:
6615 /* OImode shouldn't be used directly. */
6616 gcc_unreachable ();
6617
6618 case DFmode:
6619 if (cum->float_in_sse < 2)
6620 break;
6621 case SFmode:
6622 if (cum->float_in_sse < 1)
6623 break;
6624 /* FALLTHRU */
6625
6626 case V8SFmode:
6627 case V8SImode:
6628 case V32QImode:
6629 case V16HImode:
6630 case V4DFmode:
6631 case V4DImode:
6632 case TImode:
6633 case V16QImode:
6634 case V8HImode:
6635 case V4SImode:
6636 case V2DImode:
6637 case V4SFmode:
6638 case V2DFmode:
6639 if (!type || !AGGREGATE_TYPE_P (type))
6640 {
6641 cum->sse_words += words;
6642 cum->sse_nregs -= 1;
6643 cum->sse_regno += 1;
6644 if (cum->sse_nregs <= 0)
6645 {
6646 cum->sse_nregs = 0;
6647 cum->sse_regno = 0;
6648 }
6649 }
6650 break;
6651
6652 case V8QImode:
6653 case V4HImode:
6654 case V2SImode:
6655 case V2SFmode:
6656 case V1TImode:
6657 case V1DImode:
6658 if (!type || !AGGREGATE_TYPE_P (type))
6659 {
6660 cum->mmx_words += words;
6661 cum->mmx_nregs -= 1;
6662 cum->mmx_regno += 1;
6663 if (cum->mmx_nregs <= 0)
6664 {
6665 cum->mmx_nregs = 0;
6666 cum->mmx_regno = 0;
6667 }
6668 }
6669 break;
6670 }
6671 }
6672
6673 static void
6674 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6675 const_tree type, HOST_WIDE_INT words, bool named)
6676 {
6677 int int_nregs, sse_nregs;
6678
6679 /* Unnamed 256bit vector mode parameters are passed on the stack. */
6680 if (!named && VALID_AVX256_REG_MODE (mode))
6681 return;
6682
6683 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6684 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6685 {
6686 cum->nregs -= int_nregs;
6687 cum->sse_nregs -= sse_nregs;
6688 cum->regno += int_nregs;
6689 cum->sse_regno += sse_nregs;
6690 }
6691 else
6692 {
6693 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6694 cum->words = (cum->words + align - 1) & ~(align - 1);
6695 cum->words += words;
6696 }
6697 }
6698
6699 static void
6700 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6701 HOST_WIDE_INT words)
6702 {
6703 /* Otherwise, this should be passed indirect. */
6704 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6705
6706 cum->words += words;
6707 if (cum->nregs > 0)
6708 {
6709 cum->nregs -= 1;
6710 cum->regno += 1;
6711 }
6712 }
6713
6714 /* Update the data in CUM to advance over an argument of mode MODE and
6715 data type TYPE. (TYPE is null for libcalls where that information
6716 may not be available.) */
6717
6718 static void
6719 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6720 const_tree type, bool named)
6721 {
6722 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6723 HOST_WIDE_INT bytes, words;
6724
6725 if (mode == BLKmode)
6726 bytes = int_size_in_bytes (type);
6727 else
6728 bytes = GET_MODE_SIZE (mode);
6729 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6730
6731 if (type)
6732 mode = type_natural_mode (type, NULL);
6733
6734 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6735 function_arg_advance_ms_64 (cum, bytes, words);
6736 else if (TARGET_64BIT)
6737 function_arg_advance_64 (cum, mode, type, words, named);
6738 else
6739 function_arg_advance_32 (cum, mode, type, bytes, words);
6740 }
6741
6742 /* Define where to put the arguments to a function.
6743 Value is zero to push the argument on the stack,
6744 or a hard register in which to store the argument.
6745
6746 MODE is the argument's machine mode.
6747 TYPE is the data type of the argument (as a tree).
6748 This is null for libcalls where that information may
6749 not be available.
6750 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6751 the preceding args and about the function being called.
6752 NAMED is nonzero if this argument is a named parameter
6753 (otherwise it is an extra parameter matching an ellipsis). */
6754
6755 static rtx
6756 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6757 enum machine_mode orig_mode, const_tree type,
6758 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6759 {
6760 static bool warnedsse, warnedmmx;
6761
6762 /* Avoid the AL settings for the Unix64 ABI. */
6763 if (mode == VOIDmode)
6764 return constm1_rtx;
6765
6766 switch (mode)
6767 {
6768 default:
6769 break;
6770
6771 case BLKmode:
6772 if (bytes < 0)
6773 break;
6774 /* FALLTHRU */
6775 case DImode:
6776 case SImode:
6777 case HImode:
6778 case QImode:
6779 if (words <= cum->nregs)
6780 {
6781 int regno = cum->regno;
6782
6783 /* Fastcall allocates the first two DWORD (SImode) or
6784 smaller arguments to ECX and EDX if it isn't an
6785 aggregate type. */
6786 if (cum->fastcall)
6787 {
6788 if (mode == BLKmode
6789 || mode == DImode
6790 || (type && AGGREGATE_TYPE_P (type)))
6791 break;
6792
6793 /* ECX, not EAX, is the first allocated register. */
6794 if (regno == AX_REG)
6795 regno = CX_REG;
6796 }
6797 return gen_rtx_REG (mode, regno);
6798 }
6799 break;
6800
6801 case DFmode:
6802 if (cum->float_in_sse < 2)
6803 break;
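/* FALLTHRU */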
6804 case SFmode:
6805 if (cum->float_in_sse < 1)
6806 break;
6807 /* FALLTHRU */
6808 case TImode:
6809 /* In 32bit, we pass TImode in xmm registers. */
6810 case V16QImode:
6811 case V8HImode:
6812 case V4SImode:
6813 case V2DImode:
6814 case V4SFmode:
6815 case V2DFmode:
6816 if (!type || !AGGREGATE_TYPE_P (type))
6817 {
6818 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6819 {
6820 warnedsse = true;
6821 warning (0, "SSE vector argument without SSE enabled "
6822 "changes the ABI");
6823 }
6824 if (cum->sse_nregs)
6825 return gen_reg_or_parallel (mode, orig_mode,
6826 cum->sse_regno + FIRST_SSE_REG);
6827 }
6828 break;
6829
6830 case OImode:
6831 /* OImode shouldn't be used directly. */
6832 gcc_unreachable ();
6833
6834 case V8SFmode:
6835 case V8SImode:
6836 case V32QImode:
6837 case V16HImode:
6838 case V4DFmode:
6839 case V4DImode:
6840 if (!type || !AGGREGATE_TYPE_P (type))
6841 {
6842 if (cum->sse_nregs)
6843 return gen_reg_or_parallel (mode, orig_mode,
6844 cum->sse_regno + FIRST_SSE_REG);
6845 }
6846 break;
6847
6848 case V8QImode:
6849 case V4HImode:
6850 case V2SImode:
6851 case V2SFmode:
6852 case V1TImode:
6853 case V1DImode:
6854 if (!type || !AGGREGATE_TYPE_P (type))
6855 {
6856 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6857 {
6858 warnedmmx = true;
6859 warning (0, "MMX vector argument without MMX enabled "
6860 "changes the ABI");
6861 }
6862 if (cum->mmx_nregs)
6863 return gen_reg_or_parallel (mode, orig_mode,
6864 cum->mmx_regno + FIRST_MMX_REG);
6865 }
6866 break;
6867 }
6868
6869 return NULL_RTX;
6870 }
6871
6872 static rtx
6873 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6874 enum machine_mode orig_mode, const_tree type, bool named)
6875 {
6876 /* Handle a hidden AL argument containing number of registers
6877 for varargs x86-64 functions. */
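/* For example, a call such as printf ("%g\n", x), where X travels in
   %xmm0, makes the caller load 1 into %al so the callee's prologue
   knows how many vector registers may need to be saved. */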
6878 if (mode == VOIDmode)
6879 return GEN_INT (cum->maybe_vaarg
6880 ? (cum->sse_nregs < 0
6881 ? X86_64_SSE_REGPARM_MAX
6882 : cum->sse_regno)
6883 : -1);
6884
6885 switch (mode)
6886 {
6887 default:
6888 break;
6889
6890 case V8SFmode:
6891 case V8SImode:
6892 case V32QImode:
6893 case V16HImode:
6894 case V4DFmode:
6895 case V4DImode:
6896 /* Unnamed 256bit vector mode parameters are passed on stack. */
6897 if (!named)
6898 return NULL;
6899 break;
6900 }
6901
6902 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6903 cum->sse_nregs,
6904 &x86_64_int_parameter_registers [cum->regno],
6905 cum->sse_regno);
6906 }
6907
6908 static rtx
6909 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6910 enum machine_mode orig_mode, bool named,
6911 HOST_WIDE_INT bytes)
6912 {
6913 unsigned int regno;
6914
6915 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
6916 We use the value -2 to specify that the current function call is MS_ABI. */
6917 if (mode == VOIDmode)
6918 return GEN_INT (-2);
6919
6920 /* If we've run out of registers, it goes on the stack. */
6921 if (cum->nregs == 0)
6922 return NULL_RTX;
6923
6924 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6925
6926 /* Only floating point modes are passed in anything but integer regs. */
6927 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6928 {
6929 if (named)
6930 regno = cum->regno + FIRST_SSE_REG;
6931 else
6932 {
6933 rtx t1, t2;
6934
6935 /* Unnamed floating parameters are passed in both the
6936 SSE and integer registers. */
6937 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6938 t2 = gen_rtx_REG (mode, regno);
6939 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6940 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6941 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6942 }
6943 }
6944 /* Handle aggregate types passed in registers. */
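/* For example, an 8-byte aggregate that still has BLKmode is loaded into
   its parameter register as a single DImode value; aggregates whose size
   is not 1, 2, 4 or 8 bytes never reach this point because
   ix86_pass_by_reference forces them to be passed indirectly. */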
6945 if (orig_mode == BLKmode)
6946 {
6947 if (bytes > 0 && bytes <= 8)
6948 mode = (bytes > 4 ? DImode : SImode);
6949 if (mode == BLKmode)
6950 mode = DImode;
6951 }
6952
6953 return gen_reg_or_parallel (mode, orig_mode, regno);
6954 }
6955
6956 /* Return where to put the arguments to a function.
6957 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6958
6959 MODE is the argument's machine mode. TYPE is the data type of the
6960 argument. It is null for libcalls where that information may not be
6961 available. CUM gives information about the preceding args and about
6962 the function being called. NAMED is nonzero if this argument is a
6963 named parameter (otherwise it is an extra parameter matching an
6964 ellipsis). */
6965
6966 static rtx
6967 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6968 const_tree type, bool named)
6969 {
6970 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6971 enum machine_mode mode = omode;
6972 HOST_WIDE_INT bytes, words;
6973 rtx arg;
6974
6975 if (mode == BLKmode)
6976 bytes = int_size_in_bytes (type);
6977 else
6978 bytes = GET_MODE_SIZE (mode);
6979 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6980
6981 /* To simplify the code below, represent vector types with a vector mode
6982 even if MMX/SSE are not active. */
6983 if (type && TREE_CODE (type) == VECTOR_TYPE)
6984 mode = type_natural_mode (type, cum);
6985
6986 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6987 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6988 else if (TARGET_64BIT)
6989 arg = function_arg_64 (cum, mode, omode, type, named);
6990 else
6991 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6992
6993 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
6994 {
6995 /* This argument uses 256bit AVX modes. */
6996 if (cum->caller)
6997 cfun->machine->callee_pass_avx256_p = true;
6998 else
6999 cfun->machine->caller_pass_avx256_p = true;
7000 }
7001
7002 return arg;
7003 }
7004
7005 /* A C expression that indicates when an argument must be passed by
7006 reference. If nonzero for an argument, a copy of that argument is
7007 made in memory and a pointer to the argument is passed instead of
7008 the argument itself. The pointer is passed in whatever way is
7009 appropriate for passing a pointer to that type. */
7010
7011 static bool
7012 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
7013 enum machine_mode mode ATTRIBUTE_UNUSED,
7014 const_tree type, bool named ATTRIBUTE_UNUSED)
7015 {
7016 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7017
7018 /* See Windows x64 Software Convention. */
7019 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7020 {
7021 int msize = (int) GET_MODE_SIZE (mode);
7022 if (type)
7023 {
7024 /* Arrays are passed by reference. */
7025 if (TREE_CODE (type) == ARRAY_TYPE)
7026 return true;
7027
7028 if (AGGREGATE_TYPE_P (type))
7029 {
7030 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7031 are passed by reference. */
7032 msize = int_size_in_bytes (type);
7033 }
7034 }
7035
7036 /* __m128 is passed by reference. */
7037 switch (msize) {
7038 case 1: case 2: case 4: case 8:
7039 break;
7040 default:
7041 return true;
7042 }
7043 }
7044 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7045 return true;
7046
7047 return false;
7048 }
7049
7050 /* Return true when TYPE should be 128bit aligned for 32bit argument
7051 passing ABI. XXX: This function is obsolete and is only used for
7052 checking psABI compatibility with previous versions of GCC. */
7053
7054 static bool
7055 ix86_compat_aligned_value_p (const_tree type)
7056 {
7057 enum machine_mode mode = TYPE_MODE (type);
7058 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7059 || mode == TDmode
7060 || mode == TFmode
7061 || mode == TCmode)
7062 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7063 return true;
7064 if (TYPE_ALIGN (type) < 128)
7065 return false;
7066
7067 if (AGGREGATE_TYPE_P (type))
7068 {
7069 /* Walk the aggregates recursively. */
7070 switch (TREE_CODE (type))
7071 {
7072 case RECORD_TYPE:
7073 case UNION_TYPE:
7074 case QUAL_UNION_TYPE:
7075 {
7076 tree field;
7077
7078 /* Walk all the structure fields. */
7079 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7080 {
7081 if (TREE_CODE (field) == FIELD_DECL
7082 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7083 return true;
7084 }
7085 break;
7086 }
7087
7088 case ARRAY_TYPE:
7089 /* Just for use if some languages pass arrays by value. */
7090 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7091 return true;
7092 break;
7093
7094 default:
7095 gcc_unreachable ();
7096 }
7097 }
7098 return false;
7099 }
7100
7101 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7102 XXX: This function is obsolete and is only used for checking psABI
7103 compatibility with previous versions of GCC. */
7104
7105 static unsigned int
7106 ix86_compat_function_arg_boundary (enum machine_mode mode,
7107 const_tree type, unsigned int align)
7108 {
7109 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7110 natural boundaries. */
7111 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7112 {
7113 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7114 make an exception for SSE modes since these require 128bit
7115 alignment.
7116
7117 The handling here differs from field_alignment. ICC aligns MMX
7118 arguments to 4 byte boundaries, while structure fields are aligned
7119 to 8 byte boundaries. */
7120 if (!type)
7121 {
7122 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7123 align = PARM_BOUNDARY;
7124 }
7125 else
7126 {
7127 if (!ix86_compat_aligned_value_p (type))
7128 align = PARM_BOUNDARY;
7129 }
7130 }
7131 if (align > BIGGEST_ALIGNMENT)
7132 align = BIGGEST_ALIGNMENT;
7133 return align;
7134 }
7135
7136 /* Return true when TYPE should be 128bit aligned for 32bit argument
7137 passing ABI. */
7138
7139 static bool
7140 ix86_contains_aligned_value_p (const_tree type)
7141 {
7142 enum machine_mode mode = TYPE_MODE (type);
7143
7144 if (mode == XFmode || mode == XCmode)
7145 return false;
7146
7147 if (TYPE_ALIGN (type) < 128)
7148 return false;
7149
7150 if (AGGREGATE_TYPE_P (type))
7151 {
7152 /* Walk the aggregates recursively. */
7153 switch (TREE_CODE (type))
7154 {
7155 case RECORD_TYPE:
7156 case UNION_TYPE:
7157 case QUAL_UNION_TYPE:
7158 {
7159 tree field;
7160
7161 /* Walk all the structure fields. */
7162 for (field = TYPE_FIELDS (type);
7163 field;
7164 field = DECL_CHAIN (field))
7165 {
7166 if (TREE_CODE (field) == FIELD_DECL
7167 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7168 return true;
7169 }
7170 break;
7171 }
7172
7173 case ARRAY_TYPE:
7174 /* Just for use if some languages pass arrays by value. */
7175 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7176 return true;
7177 break;
7178
7179 default:
7180 gcc_unreachable ();
7181 }
7182 }
7183 else
7184 return TYPE_ALIGN (type) >= 128;
7185
7186 return false;
7187 }
7188
7189 /* Gives the alignment boundary, in bits, of an argument with the
7190 specified mode and type. */
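/* For example, on ia32 a plain double argument is passed with only
   PARM_BOUNDARY (32-bit) alignment, while a 16-byte vector type such as
   __m128 keeps its 128-bit alignment; the psABI note below warns when
   this computation disagrees with the pre-4.6 one. */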
7191
7192 static unsigned int
7193 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7194 {
7195 unsigned int align;
7196 if (type)
7197 {
7198 /* The main variant type is what is used for the call, so convert
7199 TYPE to its main variant. */
7200 type = TYPE_MAIN_VARIANT (type);
7201 align = TYPE_ALIGN (type);
7202 }
7203 else
7204 align = GET_MODE_ALIGNMENT (mode);
7205 if (align < PARM_BOUNDARY)
7206 align = PARM_BOUNDARY;
7207 else
7208 {
7209 static bool warned;
7210 unsigned int saved_align = align;
7211
7212 if (!TARGET_64BIT)
7213 {
7214 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7215 if (!type)
7216 {
7217 if (mode == XFmode || mode == XCmode)
7218 align = PARM_BOUNDARY;
7219 }
7220 else if (!ix86_contains_aligned_value_p (type))
7221 align = PARM_BOUNDARY;
7222
7223 if (align < 128)
7224 align = PARM_BOUNDARY;
7225 }
7226
7227 if (warn_psabi
7228 && !warned
7229 && align != ix86_compat_function_arg_boundary (mode, type,
7230 saved_align))
7231 {
7232 warned = true;
7233 inform (input_location,
7234 "The ABI for passing parameters with %d-byte"
7235 " alignment has changed in GCC 4.6",
7236 align / BITS_PER_UNIT);
7237 }
7238 }
7239
7240 return align;
7241 }
7242
7243 /* Return true if N is a possible register number of function value. */
7244
7245 static bool
7246 ix86_function_value_regno_p (const unsigned int regno)
7247 {
7248 switch (regno)
7249 {
7250 case AX_REG:
7251 return true;
7252
7253 case FIRST_FLOAT_REG:
7254 /* TODO: The function should depend on current function ABI but
7255 builtins.c would need updating then. Therefore we use the
7256 default ABI. */
7257 if (TARGET_64BIT && ix86_abi == MS_ABI)
7258 return false;
7259 return TARGET_FLOAT_RETURNS_IN_80387;
7260
7261 case FIRST_SSE_REG:
7262 return TARGET_SSE;
7263
7264 case FIRST_MMX_REG:
7265 if (TARGET_MACHO || TARGET_64BIT)
7266 return false;
7267 return TARGET_MMX;
7268 }
7269
7270 return false;
7271 }
7272
7273 /* Define how to find the value returned by a function.
7274 VALTYPE is the data type of the value (as a tree).
7275 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7276 otherwise, FUNC is 0. */
7277
7278 static rtx
7279 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7280 const_tree fntype, const_tree fn)
7281 {
7282 unsigned int regno;
7283
7284 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7285 we normally prevent this case when mmx is not available. However
7286 some ABIs may require the result to be returned like DImode. */
7287 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7288 regno = FIRST_MMX_REG;
7289
7290 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7291 we prevent this case when sse is not available. However some ABIs
7292 may require the result to be returned like integer TImode. */
7293 else if (mode == TImode
7294 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7295 regno = FIRST_SSE_REG;
7296
7297 /* 32-byte vector modes in %ymm0. */
7298 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7299 regno = FIRST_SSE_REG;
7300
7301 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7302 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7303 regno = FIRST_FLOAT_REG;
7304 else
7305 /* Most things go in %eax. */
7306 regno = AX_REG;
7307
7308 /* Override FP return register with %xmm0 for local functions when
7309 SSE math is enabled or for functions with sseregparm attribute. */
7310 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7311 {
7312 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7313 if ((sse_level >= 1 && mode == SFmode)
7314 || (sse_level == 2 && mode == DFmode))
7315 regno = FIRST_SSE_REG;
7316 }
7317
7318 /* OImode shouldn't be used directly. */
7319 gcc_assert (mode != OImode);
7320
7321 return gen_rtx_REG (orig_mode, regno);
7322 }
7323
7324 static rtx
7325 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7326 const_tree valtype)
7327 {
7328 rtx ret;
7329
7330 /* Handle libcalls, which don't provide a type node. */
7331 if (valtype == NULL)
7332 {
7333 unsigned int regno;
7334
7335 switch (mode)
7336 {
7337 case SFmode:
7338 case SCmode:
7339 case DFmode:
7340 case DCmode:
7341 case TFmode:
7342 case SDmode:
7343 case DDmode:
7344 case TDmode:
7345 regno = FIRST_SSE_REG;
7346 break;
7347 case XFmode:
7348 case XCmode:
7349 regno = FIRST_FLOAT_REG;
7350 break;
7351 case TCmode:
7352 return NULL;
7353 default:
7354 regno = AX_REG;
7355 }
7356
7357 return gen_rtx_REG (mode, regno);
7358 }
7359 else if (POINTER_TYPE_P (valtype))
7360 {
7361 /* Pointers are always returned in word_mode. */
7362 mode = word_mode;
7363 }
7364
7365 ret = construct_container (mode, orig_mode, valtype, 1,
7366 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7367 x86_64_int_return_registers, 0);
7368
7369 /* For zero-sized structures, construct_container returns NULL, but we
7370 need to keep the rest of the compiler happy by returning a meaningful value. */
7371 if (!ret)
7372 ret = gen_rtx_REG (orig_mode, AX_REG);
7373
7374 return ret;
7375 }
7376
7377 static rtx
7378 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7379 {
7380 unsigned int regno = AX_REG;
7381
7382 if (TARGET_SSE)
7383 {
7384 switch (GET_MODE_SIZE (mode))
7385 {
7386 case 16:
7387 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7388 && !COMPLEX_MODE_P (mode))
7389 regno = FIRST_SSE_REG;
7390 break;
7391 case 8:
7392 case 4:
7393 if (mode == SFmode || mode == DFmode)
7394 regno = FIRST_SSE_REG;
7395 break;
7396 default:
7397 break;
7398 }
7399 }
7400 return gen_rtx_REG (orig_mode, regno);
7401 }
7402
7403 static rtx
7404 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7405 enum machine_mode orig_mode, enum machine_mode mode)
7406 {
7407 const_tree fn, fntype;
7408
7409 fn = NULL_TREE;
7410 if (fntype_or_decl && DECL_P (fntype_or_decl))
7411 fn = fntype_or_decl;
7412 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7413
7414 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7415 return function_value_ms_64 (orig_mode, mode);
7416 else if (TARGET_64BIT)
7417 return function_value_64 (orig_mode, mode, valtype);
7418 else
7419 return function_value_32 (orig_mode, mode, fntype, fn);
7420 }
7421
7422 static rtx
7423 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7424 bool outgoing ATTRIBUTE_UNUSED)
7425 {
7426 enum machine_mode mode, orig_mode;
7427
7428 orig_mode = TYPE_MODE (valtype);
7429 mode = type_natural_mode (valtype, NULL);
7430 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7431 }
7432
7433 /* Pointer function arguments and return values are promoted to
7434 word_mode. */
7435
7436 static enum machine_mode
7437 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7438 int *punsignedp, const_tree fntype,
7439 int for_return)
7440 {
7441 if (type != NULL_TREE && POINTER_TYPE_P (type))
7442 {
7443 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7444 return word_mode;
7445 }
7446 return default_promote_function_mode (type, mode, punsignedp, fntype,
7447 for_return);
7448 }
7449
7450 rtx
7451 ix86_libcall_value (enum machine_mode mode)
7452 {
7453 return ix86_function_value_1 (NULL, NULL, mode, mode);
7454 }
7455
7456 /* Return true iff type is returned in memory. */
7457
7458 static bool ATTRIBUTE_UNUSED
7459 return_in_memory_32 (const_tree type, enum machine_mode mode)
7460 {
7461 HOST_WIDE_INT size;
7462
7463 if (mode == BLKmode)
7464 return true;
7465
7466 size = int_size_in_bytes (type);
7467
7468 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7469 return false;
7470
7471 if (VECTOR_MODE_P (mode) || mode == TImode)
7472 {
7473 /* User-created vectors small enough to fit in EAX. */
7474 if (size < 8)
7475 return false;
7476
7477 /* MMX/3dNow values are returned in MM0,
7478 except when it doesn't exist or the ABI prescribes otherwise. */
7479 if (size == 8)
7480 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7481
7482 /* SSE values are returned in XMM0, except when it doesn't exist. */
7483 if (size == 16)
7484 return !TARGET_SSE;
7485
7486 /* AVX values are returned in YMM0, except when it doesn't exist. */
7487 if (size == 32)
7488 return !TARGET_AVX;
7489 }
7490
7491 if (mode == XFmode)
7492 return false;
7493
7494 if (size > 12)
7495 return true;
7496
7497 /* OImode shouldn't be used directly. */
7498 gcc_assert (mode != OImode);
7499
7500 return false;
7501 }
7502
7503 static bool ATTRIBUTE_UNUSED
7504 return_in_memory_64 (const_tree type, enum machine_mode mode)
7505 {
7506 int needed_intregs, needed_sseregs;
7507 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7508 }
7509
7510 static bool ATTRIBUTE_UNUSED
7511 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7512 {
7513 HOST_WIDE_INT size = int_size_in_bytes (type);
7514
7515 /* __m128 is returned in xmm0. */
7516 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7517 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7518 return false;
7519
7520 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7521 return size != 1 && size != 2 && size != 4 && size != 8;
7522 }
7523
7524 static bool
7525 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7526 {
7527 #ifdef SUBTARGET_RETURN_IN_MEMORY
7528 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7529 #else
7530 const enum machine_mode mode = type_natural_mode (type, NULL);
7531
7532 if (TARGET_64BIT)
7533 {
7534 if (ix86_function_type_abi (fntype) == MS_ABI)
7535 return return_in_memory_ms_64 (type, mode);
7536 else
7537 return return_in_memory_64 (type, mode);
7538 }
7539 else
7540 return return_in_memory_32 (type, mode);
7541 #endif
7542 }
7543
7544 /* When returning SSE vector types, we have a choice of either
7545 (1) being ABI incompatible with a -march switch, or
7546 (2) generating an error.
7547 Given no good solution, I think the safest thing is one warning.
7548 The user won't be able to use -Werror, but....
7549
7550 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7551 called in response to actually generating a caller or callee that
7552 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7553 via aggregate_value_p for general type probing from tree-ssa. */
7554
7555 static rtx
7556 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7557 {
7558 static bool warnedsse, warnedmmx;
7559
7560 if (!TARGET_64BIT && type)
7561 {
7562 /* Look at the return type of the function, not the function type. */
7563 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7564
7565 if (!TARGET_SSE && !warnedsse)
7566 {
7567 if (mode == TImode
7568 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7569 {
7570 warnedsse = true;
7571 warning (0, "SSE vector return without SSE enabled "
7572 "changes the ABI");
7573 }
7574 }
7575
7576 if (!TARGET_MMX && !warnedmmx)
7577 {
7578 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7579 {
7580 warnedmmx = true;
7581 warning (0, "MMX vector return without MMX enabled "
7582 "changes the ABI");
7583 }
7584 }
7585 }
7586
7587 return NULL;
7588 }
7589
7590 \f
7591 /* Create the va_list data type. */
7592
7593 /* Returns the calling convention specific va_list data type.
7594 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
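/* For reference, the record built below for SYSV_ABI matches the layout
   required by the x86-64 psABI:

     typedef struct __va_list_tag {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __va_list_tag;

   and the va_list type itself is a one-element array of that record. */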
7595
7596 static tree
7597 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7598 {
7599 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7600
7601 /* For i386 we use a plain pointer to the argument area. */
7602 if (!TARGET_64BIT || abi == MS_ABI)
7603 return build_pointer_type (char_type_node);
7604
7605 record = lang_hooks.types.make_type (RECORD_TYPE);
7606 type_decl = build_decl (BUILTINS_LOCATION,
7607 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7608
7609 f_gpr = build_decl (BUILTINS_LOCATION,
7610 FIELD_DECL, get_identifier ("gp_offset"),
7611 unsigned_type_node);
7612 f_fpr = build_decl (BUILTINS_LOCATION,
7613 FIELD_DECL, get_identifier ("fp_offset"),
7614 unsigned_type_node);
7615 f_ovf = build_decl (BUILTINS_LOCATION,
7616 FIELD_DECL, get_identifier ("overflow_arg_area"),
7617 ptr_type_node);
7618 f_sav = build_decl (BUILTINS_LOCATION,
7619 FIELD_DECL, get_identifier ("reg_save_area"),
7620 ptr_type_node);
7621
7622 va_list_gpr_counter_field = f_gpr;
7623 va_list_fpr_counter_field = f_fpr;
7624
7625 DECL_FIELD_CONTEXT (f_gpr) = record;
7626 DECL_FIELD_CONTEXT (f_fpr) = record;
7627 DECL_FIELD_CONTEXT (f_ovf) = record;
7628 DECL_FIELD_CONTEXT (f_sav) = record;
7629
7630 TYPE_STUB_DECL (record) = type_decl;
7631 TYPE_NAME (record) = type_decl;
7632 TYPE_FIELDS (record) = f_gpr;
7633 DECL_CHAIN (f_gpr) = f_fpr;
7634 DECL_CHAIN (f_fpr) = f_ovf;
7635 DECL_CHAIN (f_ovf) = f_sav;
7636
7637 layout_type (record);
7638
7639 /* The correct type is an array type of one element. */
7640 return build_array_type (record, build_index_type (size_zero_node));
7641 }
7642
7643 /* Set up the builtin va_list data type and, for 64-bit, the additional
7644 calling convention specific va_list data types. */
7645
7646 static tree
7647 ix86_build_builtin_va_list (void)
7648 {
7649 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7650
7651 /* Initialize abi specific va_list builtin types. */
7652 if (TARGET_64BIT)
7653 {
7654 tree t;
7655 if (ix86_abi == MS_ABI)
7656 {
7657 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7658 if (TREE_CODE (t) != RECORD_TYPE)
7659 t = build_variant_type_copy (t);
7660 sysv_va_list_type_node = t;
7661 }
7662 else
7663 {
7664 t = ret;
7665 if (TREE_CODE (t) != RECORD_TYPE)
7666 t = build_variant_type_copy (t);
7667 sysv_va_list_type_node = t;
7668 }
7669 if (ix86_abi != MS_ABI)
7670 {
7671 t = ix86_build_builtin_va_list_abi (MS_ABI);
7672 if (TREE_CODE (t) != RECORD_TYPE)
7673 t = build_variant_type_copy (t);
7674 ms_va_list_type_node = t;
7675 }
7676 else
7677 {
7678 t = ret;
7679 if (TREE_CODE (t) != RECORD_TYPE)
7680 t = build_variant_type_copy (t);
7681 ms_va_list_type_node = t;
7682 }
7683 }
7684
7685 return ret;
7686 }
7687
7688 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7689
7690 static void
7691 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7692 {
7693 rtx save_area, mem;
7694 alias_set_type set;
7695 int i, max;
7696
7697 /* GPR size of varargs save area. */
7698 if (cfun->va_list_gpr_size)
7699 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7700 else
7701 ix86_varargs_gpr_size = 0;
7702
7703 /* FPR size of varargs save area. We don't need it if we don't pass
7704 anything in SSE registers. */
7705 if (TARGET_SSE && cfun->va_list_fpr_size)
7706 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7707 else
7708 ix86_varargs_fpr_size = 0;
7709
7710 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7711 return;
7712
7713 save_area = frame_pointer_rtx;
7714 set = get_varargs_alias_set ();
7715
7716 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7717 if (max > X86_64_REGPARM_MAX)
7718 max = X86_64_REGPARM_MAX;
7719
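/* The first six words of the save area receive %rdi, %rsi, %rdx, %rcx,
   %r8 and %r9 (the psABI integer argument order); the 16-byte SSE slots
   saved below follow after ix86_varargs_gpr_size bytes. */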
7720 for (i = cum->regno; i < max; i++)
7721 {
7722 mem = gen_rtx_MEM (word_mode,
7723 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7724 MEM_NOTRAP_P (mem) = 1;
7725 set_mem_alias_set (mem, set);
7726 emit_move_insn (mem,
7727 gen_rtx_REG (word_mode,
7728 x86_64_int_parameter_registers[i]));
7729 }
7730
7731 if (ix86_varargs_fpr_size)
7732 {
7733 enum machine_mode smode;
7734 rtx label, test;
7735
7736 /* Now emit code to save SSE registers. The AX parameter contains the
7737 number of SSE parameter registers used to call this function, though
7738 all we actually check here is the zero/non-zero status. */
7739
7740 label = gen_label_rtx ();
7741 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7742 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7743 label));
7744
7745 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7746 we used movdqa (i.e. TImode) instead? Perhaps even better would
7747 be if we could determine the real mode of the data, via a hook
7748 into pass_stdarg. Ignore all that for now. */
7749 smode = V4SFmode;
7750 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7751 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7752
7753 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7754 if (max > X86_64_SSE_REGPARM_MAX)
7755 max = X86_64_SSE_REGPARM_MAX;
7756
7757 for (i = cum->sse_regno; i < max; ++i)
7758 {
7759 mem = plus_constant (Pmode, save_area,
7760 i * 16 + ix86_varargs_gpr_size);
7761 mem = gen_rtx_MEM (smode, mem);
7762 MEM_NOTRAP_P (mem) = 1;
7763 set_mem_alias_set (mem, set);
7764 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7765
7766 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7767 }
7768
7769 emit_label (label);
7770 }
7771 }
7772
7773 static void
7774 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7775 {
7776 alias_set_type set = get_varargs_alias_set ();
7777 int i;
7778
7779 /* Reset to zero, as there might be a sysv vaarg used
7780 before. */
7781 ix86_varargs_gpr_size = 0;
7782 ix86_varargs_fpr_size = 0;
7783
7784 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7785 {
7786 rtx reg, mem;
7787
7788 mem = gen_rtx_MEM (Pmode,
7789 plus_constant (Pmode, virtual_incoming_args_rtx,
7790 i * UNITS_PER_WORD));
7791 MEM_NOTRAP_P (mem) = 1;
7792 set_mem_alias_set (mem, set);
7793
7794 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7795 emit_move_insn (mem, reg);
7796 }
7797 }
7798
7799 static void
7800 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7801 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7802 int no_rtl)
7803 {
7804 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7805 CUMULATIVE_ARGS next_cum;
7806 tree fntype;
7807
7808 /* This argument doesn't appear to be used anymore. Which is good,
7809 because the old code here didn't suppress rtl generation. */
7810 gcc_assert (!no_rtl);
7811
7812 if (!TARGET_64BIT)
7813 return;
7814
7815 fntype = TREE_TYPE (current_function_decl);
7816
7817 /* For varargs, we do not want to skip the dummy va_dcl argument.
7818 For stdargs, we do want to skip the last named argument. */
7819 next_cum = *cum;
7820 if (stdarg_p (fntype))
7821 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7822 true);
7823
7824 if (cum->call_abi == MS_ABI)
7825 setup_incoming_varargs_ms_64 (&next_cum);
7826 else
7827 setup_incoming_varargs_64 (&next_cum);
7828 }
7829
7830 /* Check whether TYPE is a va_list that is just a plain char pointer. */
7831
7832 static bool
7833 is_va_list_char_pointer (tree type)
7834 {
7835 tree canonic;
7836
7837 /* For 32-bit it is always true. */
7838 if (!TARGET_64BIT)
7839 return true;
7840 canonic = ix86_canonical_va_list_type (type);
7841 return (canonic == ms_va_list_type_node
7842 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7843 }
7844
7845 /* Implement va_start. */
7846
7847 static void
7848 ix86_va_start (tree valist, rtx nextarg)
7849 {
7850 HOST_WIDE_INT words, n_gpr, n_fpr;
7851 tree f_gpr, f_fpr, f_ovf, f_sav;
7852 tree gpr, fpr, ovf, sav, t;
7853 tree type;
7854 rtx ovf_rtx;
7855
7856 if (flag_split_stack
7857 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7858 {
7859 unsigned int scratch_regno;
7860
7861 /* When we are splitting the stack, we can't refer to the stack
7862 arguments using internal_arg_pointer, because they may be on
7863 the old stack. The split stack prologue will arrange to
7864 leave a pointer to the old stack arguments in a scratch
7865 register, which we here copy to a pseudo-register. The split
7866 stack prologue can't set the pseudo-register directly because
7867 it (the prologue) runs before any registers have been saved. */
7868
7869 scratch_regno = split_stack_prologue_scratch_regno ();
7870 if (scratch_regno != INVALID_REGNUM)
7871 {
7872 rtx reg, seq;
7873
7874 reg = gen_reg_rtx (Pmode);
7875 cfun->machine->split_stack_varargs_pointer = reg;
7876
7877 start_sequence ();
7878 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7879 seq = get_insns ();
7880 end_sequence ();
7881
7882 push_topmost_sequence ();
7883 emit_insn_after (seq, entry_of_function ());
7884 pop_topmost_sequence ();
7885 }
7886 }
7887
7888 /* Only 64bit target needs something special. */
7889 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7890 {
7891 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7892 std_expand_builtin_va_start (valist, nextarg);
7893 else
7894 {
7895 rtx va_r, next;
7896
7897 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7898 next = expand_binop (ptr_mode, add_optab,
7899 cfun->machine->split_stack_varargs_pointer,
7900 crtl->args.arg_offset_rtx,
7901 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7902 convert_move (va_r, next, 0);
7903 }
7904 return;
7905 }
7906
7907 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7908 f_fpr = DECL_CHAIN (f_gpr);
7909 f_ovf = DECL_CHAIN (f_fpr);
7910 f_sav = DECL_CHAIN (f_ovf);
7911
7912 valist = build_simple_mem_ref (valist);
7913 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7914 /* The following should be folded into the MEM_REF offset. */
7915 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7916 f_gpr, NULL_TREE);
7917 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7918 f_fpr, NULL_TREE);
7919 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7920 f_ovf, NULL_TREE);
7921 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7922 f_sav, NULL_TREE);
7923
7924 /* Count number of gp and fp argument registers used. */
7925 words = crtl->args.info.words;
7926 n_gpr = crtl->args.info.regno;
7927 n_fpr = crtl->args.info.sse_regno;
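/* In the save area laid out by setup_incoming_varargs_64 the six GP
   registers occupy bytes 0..47 and the eight SSE registers the 16-byte
   slots that follow, so gp_offset is n_gpr * 8 and fp_offset starts at
   8 * X86_64_REGPARM_MAX (48) plus n_fpr * 16. */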
7928
7929 if (cfun->va_list_gpr_size)
7930 {
7931 type = TREE_TYPE (gpr);
7932 t = build2 (MODIFY_EXPR, type,
7933 gpr, build_int_cst (type, n_gpr * 8));
7934 TREE_SIDE_EFFECTS (t) = 1;
7935 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7936 }
7937
7938 if (TARGET_SSE && cfun->va_list_fpr_size)
7939 {
7940 type = TREE_TYPE (fpr);
7941 t = build2 (MODIFY_EXPR, type, fpr,
7942 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7943 TREE_SIDE_EFFECTS (t) = 1;
7944 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7945 }
7946
7947 /* Find the overflow area. */
7948 type = TREE_TYPE (ovf);
7949 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7950 ovf_rtx = crtl->args.internal_arg_pointer;
7951 else
7952 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7953 t = make_tree (type, ovf_rtx);
7954 if (words != 0)
7955 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7956 t = build2 (MODIFY_EXPR, type, ovf, t);
7957 TREE_SIDE_EFFECTS (t) = 1;
7958 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7959
7960 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7961 {
7962 /* Find the register save area.
7963 The function prologue saves it right above the stack frame. */
7964 type = TREE_TYPE (sav);
7965 t = make_tree (type, frame_pointer_rtx);
7966 if (!ix86_varargs_gpr_size)
7967 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7968 t = build2 (MODIFY_EXPR, type, sav, t);
7969 TREE_SIDE_EFFECTS (t) = 1;
7970 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7971 }
7972 }
7973
7974 /* Implement va_arg. */
7975
7976 static tree
7977 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7978 gimple_seq *post_p)
7979 {
7980 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7981 tree f_gpr, f_fpr, f_ovf, f_sav;
7982 tree gpr, fpr, ovf, sav, t;
7983 int size, rsize;
7984 tree lab_false, lab_over = NULL_TREE;
7985 tree addr, t2;
7986 rtx container;
7987 int indirect_p = 0;
7988 tree ptrtype;
7989 enum machine_mode nat_mode;
7990 unsigned int arg_boundary;
7991
7992 /* Only 64bit target needs something special. */
7993 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7994 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7995
7996 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7997 f_fpr = DECL_CHAIN (f_gpr);
7998 f_ovf = DECL_CHAIN (f_fpr);
7999 f_sav = DECL_CHAIN (f_ovf);
8000
8001 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
8002 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
8003 valist = build_va_arg_indirect_ref (valist);
8004 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
8005 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
8006 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
8007
8008 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8009 if (indirect_p)
8010 type = build_pointer_type (type);
8011 size = int_size_in_bytes (type);
8012 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8013
8014 nat_mode = type_natural_mode (type, NULL);
8015 switch (nat_mode)
8016 {
8017 case V8SFmode:
8018 case V8SImode:
8019 case V32QImode:
8020 case V16HImode:
8021 case V4DFmode:
8022 case V4DImode:
8023 /* Unnamed 256bit vector mode parameters are passed on stack. */
8024 if (!TARGET_64BIT_MS_ABI)
8025 {
8026 container = NULL;
8027 break;
8028 }
8029
8030 default:
8031 container = construct_container (nat_mode, TYPE_MODE (type),
8032 type, 0, X86_64_REGPARM_MAX,
8033 X86_64_SSE_REGPARM_MAX, intreg,
8034 0);
8035 break;
8036 }
8037
8038 /* Pull the value out of the saved registers. */
8039
8040 addr = create_tmp_var (ptr_type_node, "addr");
8041
8042 if (container)
8043 {
8044 int needed_intregs, needed_sseregs;
8045 bool need_temp;
8046 tree int_addr, sse_addr;
8047
8048 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8049 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8050
8051 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8052
8053 need_temp = (!REG_P (container)
8054 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8055 || TYPE_ALIGN (type) > 128));
8056
8057 /* In case we are passing a structure, verify that it is a consecutive
8058 block on the register save area. If not, we need to do moves. */
8059 if (!need_temp && !REG_P (container))
8060 {
8061 /* Verify that all registers are strictly consecutive. */
8062 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8063 {
8064 int i;
8065
8066 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8067 {
8068 rtx slot = XVECEXP (container, 0, i);
8069 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8070 || INTVAL (XEXP (slot, 1)) != i * 16)
8071 need_temp = 1;
8072 }
8073 }
8074 else
8075 {
8076 int i;
8077
8078 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8079 {
8080 rtx slot = XVECEXP (container, 0, i);
8081 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8082 || INTVAL (XEXP (slot, 1)) != i * 8)
8083 need_temp = 1;
8084 }
8085 }
8086 }
8087 if (!need_temp)
8088 {
8089 int_addr = addr;
8090 sse_addr = addr;
8091 }
8092 else
8093 {
8094 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8095 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8096 }
8097
8098 /* First ensure that we fit completely in registers. */
8099 if (needed_intregs)
8100 {
8101 t = build_int_cst (TREE_TYPE (gpr),
8102 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8103 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8104 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8105 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8106 gimplify_and_add (t, pre_p);
8107 }
8108 if (needed_sseregs)
8109 {
8110 t = build_int_cst (TREE_TYPE (fpr),
8111 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8112 + X86_64_REGPARM_MAX * 8);
8113 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8114 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8115 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8116 gimplify_and_add (t, pre_p);
8117 }
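/* For example, an argument needing two GP registers branches to
   lab_false (the overflow path) once gp_offset has reached 40, i.e. as
   soon as fewer than two of the six 8-byte register slots remain. */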
8118
8119 /* Compute index to start of area used for integer regs. */
8120 if (needed_intregs)
8121 {
8122 /* int_addr = gpr + sav; */
8123 t = fold_build_pointer_plus (sav, gpr);
8124 gimplify_assign (int_addr, t, pre_p);
8125 }
8126 if (needed_sseregs)
8127 {
8128 /* sse_addr = fpr + sav; */
8129 t = fold_build_pointer_plus (sav, fpr);
8130 gimplify_assign (sse_addr, t, pre_p);
8131 }
8132 if (need_temp)
8133 {
8134 int i, prev_size = 0;
8135 tree temp = create_tmp_var (type, "va_arg_tmp");
8136
8137 /* addr = &temp; */
8138 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8139 gimplify_assign (addr, t, pre_p);
8140
8141 for (i = 0; i < XVECLEN (container, 0); i++)
8142 {
8143 rtx slot = XVECEXP (container, 0, i);
8144 rtx reg = XEXP (slot, 0);
8145 enum machine_mode mode = GET_MODE (reg);
8146 tree piece_type;
8147 tree addr_type;
8148 tree daddr_type;
8149 tree src_addr, src;
8150 int src_offset;
8151 tree dest_addr, dest;
8152 int cur_size = GET_MODE_SIZE (mode);
8153
8154 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8155 prev_size = INTVAL (XEXP (slot, 1));
8156 if (prev_size + cur_size > size)
8157 {
8158 cur_size = size - prev_size;
8159 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8160 if (mode == BLKmode)
8161 mode = QImode;
8162 }
8163 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8164 if (mode == GET_MODE (reg))
8165 addr_type = build_pointer_type (piece_type);
8166 else
8167 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8168 true);
8169 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8170 true);
8171
8172 if (SSE_REGNO_P (REGNO (reg)))
8173 {
8174 src_addr = sse_addr;
8175 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8176 }
8177 else
8178 {
8179 src_addr = int_addr;
8180 src_offset = REGNO (reg) * 8;
8181 }
8182 src_addr = fold_convert (addr_type, src_addr);
8183 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8184
8185 dest_addr = fold_convert (daddr_type, addr);
8186 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8187 if (cur_size == GET_MODE_SIZE (mode))
8188 {
8189 src = build_va_arg_indirect_ref (src_addr);
8190 dest = build_va_arg_indirect_ref (dest_addr);
8191
8192 gimplify_assign (dest, src, pre_p);
8193 }
8194 else
8195 {
8196 tree copy
8197 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8198 3, dest_addr, src_addr,
8199 size_int (cur_size));
8200 gimplify_and_add (copy, pre_p);
8201 }
8202 prev_size += cur_size;
8203 }
8204 }
8205
8206 if (needed_intregs)
8207 {
8208 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8209 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8210 gimplify_assign (gpr, t, pre_p);
8211 }
8212
8213 if (needed_sseregs)
8214 {
8215 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8216 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8217 gimplify_assign (fpr, t, pre_p);
8218 }
8219
8220 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8221
8222 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8223 }
8224
8225 /* ... otherwise out of the overflow area. */
8226
8227 /* When the caller aligns a parameter on the stack, an alignment
8228 beyond MAX_SUPPORTED_STACK_ALIGNMENT is clamped to
8229 MAX_SUPPORTED_STACK_ALIGNMENT. Match the caller's behavior
8230 here in the callee. */
8231 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8232 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8233 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8234
8235 /* Care for on-stack alignment if needed. */
8236 if (arg_boundary <= 64 || size == 0)
8237 t = ovf;
8238 else
8239 {
8240 HOST_WIDE_INT align = arg_boundary / 8;
8241 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8242 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8243 build_int_cst (TREE_TYPE (t), -align));
8244 }
8245
8246 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8247 gimplify_assign (addr, t, pre_p);
8248
8249 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8250 gimplify_assign (unshare_expr (ovf), t, pre_p);
8251
8252 if (container)
8253 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8254
8255 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8256 addr = fold_convert (ptrtype, addr);
8257
8258 if (indirect_p)
8259 addr = build_va_arg_indirect_ref (addr);
8260 return build_va_arg_indirect_ref (addr);
8261 }
8262 \f
8263 /* Return true if OPNUM's MEM should be matched
8264 in movabs* patterns. */
8265
8266 bool
8267 ix86_check_movabs (rtx insn, int opnum)
8268 {
8269 rtx set, mem;
8270
8271 set = PATTERN (insn);
8272 if (GET_CODE (set) == PARALLEL)
8273 set = XVECEXP (set, 0, 0);
8274 gcc_assert (GET_CODE (set) == SET);
8275 mem = XEXP (set, opnum);
8276 while (GET_CODE (mem) == SUBREG)
8277 mem = SUBREG_REG (mem);
8278 gcc_assert (MEM_P (mem));
8279 return volatile_ok || !MEM_VOLATILE_P (mem);
8280 }
8281 \f
8282 /* Initialize the table of extra 80387 mathematical constants. */
8283
8284 static void
8285 init_ext_80387_constants (void)
8286 {
8287 static const char * cst[5] =
8288 {
8289 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8290 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8291 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8292 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8293 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8294 };
8295 int i;
8296
8297 for (i = 0; i < 5; i++)
8298 {
8299 real_from_string (&ext_80387_constants_table[i], cst[i]);
8300 /* Ensure each constant is rounded to XFmode precision. */
8301 real_convert (&ext_80387_constants_table[i],
8302 XFmode, &ext_80387_constants_table[i]);
8303 }
8304
8305 ext_80387_constants_init = 1;
8306 }
8307
8308 /* Return non-zero if the constant is something that
8309 can be loaded with a special instruction. */
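/* The return value encodes the instruction to use: 0 means the constant
   is not special, 1 fldz, 2 fld1, 3..7 one of the ext_80387_constants
   (fldlg2, fldln2, fldl2e, fldl2t, fldpi), 8 and 9 the fldz;fchs and
   fld1;fchs splits, and -1 that X is not an 80387 float constant at all. */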
8310
8311 int
8312 standard_80387_constant_p (rtx x)
8313 {
8314 enum machine_mode mode = GET_MODE (x);
8315
8316 REAL_VALUE_TYPE r;
8317
8318 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8319 return -1;
8320
8321 if (x == CONST0_RTX (mode))
8322 return 1;
8323 if (x == CONST1_RTX (mode))
8324 return 2;
8325
8326 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8327
8328 /* For XFmode constants, try to find a special 80387 instruction when
8329 optimizing for size or on those CPUs that benefit from them. */
8330 if (mode == XFmode
8331 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8332 {
8333 int i;
8334
8335 if (! ext_80387_constants_init)
8336 init_ext_80387_constants ();
8337
8338 for (i = 0; i < 5; i++)
8339 if (real_identical (&r, &ext_80387_constants_table[i]))
8340 return i + 3;
8341 }
8342
8343 /* A load of the constant -0.0 or -1.0 will be split into an
8344 fldz;fchs or fld1;fchs sequence. */
8345 if (real_isnegzero (&r))
8346 return 8;
8347 if (real_identical (&r, &dconstm1))
8348 return 9;
8349
8350 return 0;
8351 }
8352
8353 /* Return the opcode of the special instruction to be used to load
8354 the constant X. */
8355
8356 const char *
8357 standard_80387_constant_opcode (rtx x)
8358 {
8359 switch (standard_80387_constant_p (x))
8360 {
8361 case 1:
8362 return "fldz";
8363 case 2:
8364 return "fld1";
8365 case 3:
8366 return "fldlg2";
8367 case 4:
8368 return "fldln2";
8369 case 5:
8370 return "fldl2e";
8371 case 6:
8372 return "fldl2t";
8373 case 7:
8374 return "fldpi";
8375 case 8:
8376 case 9:
8377 return "#";
8378 default:
8379 gcc_unreachable ();
8380 }
8381 }
8382
8383 /* Return the CONST_DOUBLE representing the 80387 constant that is
8384 loaded by the specified special instruction. The argument IDX
8385 matches the return value from standard_80387_constant_p. */
8386
8387 rtx
8388 standard_80387_constant_rtx (int idx)
8389 {
8390 int i;
8391
8392 if (! ext_80387_constants_init)
8393 init_ext_80387_constants ();
8394
8395 switch (idx)
8396 {
8397 case 3:
8398 case 4:
8399 case 5:
8400 case 6:
8401 case 7:
8402 i = idx - 3;
8403 break;
8404
8405 default:
8406 gcc_unreachable ();
8407 }
8408
8409 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8410 XFmode);
8411 }
8412
8413 /* Return 1 if X is all 0s and 2 if X is all 1s
8414 in a supported SSE/AVX vector mode. */
8415
8416 int
8417 standard_sse_constant_p (rtx x)
8418 {
8419 enum machine_mode mode = GET_MODE (x);
8420
8421 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8422 return 1;
8423 if (vector_all_ones_operand (x, mode))
8424 switch (mode)
8425 {
8426 case V16QImode:
8427 case V8HImode:
8428 case V4SImode:
8429 case V2DImode:
8430 if (TARGET_SSE2)
8431 return 2;
8432 case V32QImode:
8433 case V16HImode:
8434 case V8SImode:
8435 case V4DImode:
8436 if (TARGET_AVX2)
8437 return 2;
8438 default:
8439 break;
8440 }
8441
8442 return 0;
8443 }
8444
8445 /* Return the opcode of the special instruction to be used to load
8446 the constant X. */
8447
8448 const char *
8449 standard_sse_constant_opcode (rtx insn, rtx x)
8450 {
8451 switch (standard_sse_constant_p (x))
8452 {
8453 case 1:
8454 switch (get_attr_mode (insn))
8455 {
8456 case MODE_TI:
8457 return "%vpxor\t%0, %d0";
8458 case MODE_V2DF:
8459 return "%vxorpd\t%0, %d0";
8460 case MODE_V4SF:
8461 return "%vxorps\t%0, %d0";
8462
8463 case MODE_OI:
8464 return "vpxor\t%x0, %x0, %x0";
8465 case MODE_V4DF:
8466 return "vxorpd\t%x0, %x0, %x0";
8467 case MODE_V8SF:
8468 return "vxorps\t%x0, %x0, %x0";
8469
8470 default:
8471 break;
8472 }
8473
8474 case 2:
8475 if (TARGET_AVX)
8476 return "vpcmpeqd\t%0, %0, %0";
8477 else
8478 return "pcmpeqd\t%0, %0";
8479
8480 default:
8481 break;
8482 }
8483 gcc_unreachable ();
8484 }
8485
8486 /* Return true if OP contains a symbol reference. */
8487
8488 bool
8489 symbolic_reference_mentioned_p (rtx op)
8490 {
8491 const char *fmt;
8492 int i;
8493
8494 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8495 return true;
8496
8497 fmt = GET_RTX_FORMAT (GET_CODE (op));
8498 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8499 {
8500 if (fmt[i] == 'E')
8501 {
8502 int j;
8503
8504 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8505 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8506 return true;
8507 }
8508
8509 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8510 return true;
8511 }
8512
8513 return false;
8514 }
8515
8516 /* Return true if it is appropriate to emit `ret' instructions in the
8517 body of a function. Do this only if the epilogue is simple, needing a
8518 couple of insns. Prior to reloading, we can't tell how many registers
8519 must be saved, so return false then. Return false if there is no frame
8520 marker to de-allocate. */
8521
8522 bool
8523 ix86_can_use_return_insn_p (void)
8524 {
8525 struct ix86_frame frame;
8526
8527 if (! reload_completed || frame_pointer_needed)
8528 return false;
8529
8530 /* Don't allow more than 32k pop, since that's all we can do
8531 with one instruction. */
8532 if (crtl->args.pops_args && crtl->args.size >= 32768)
8533 return false;
8534
8535 ix86_compute_frame_layout (&frame);
8536 return (frame.stack_pointer_offset == UNITS_PER_WORD
8537 && (frame.nregs + frame.nsseregs) == 0);
8538 }
8539 \f
8540 /* Value should be nonzero if functions must have frame pointers.
8541 Zero means the frame pointer need not be set up (and parms may
8542 be accessed via the stack pointer) in functions that seem suitable. */
8543
8544 static bool
8545 ix86_frame_pointer_required (void)
8546 {
8547 /* If we accessed previous frames, then the generated code expects
8548 to be able to access the saved ebp value in our frame. */
8549 if (cfun->machine->accesses_prev_frame)
8550 return true;
8551
8552 /* Several x86 OSes need a frame pointer for other reasons,
8553 usually pertaining to setjmp. */
8554 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8555 return true;
8556
8557 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8558 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8559 return true;
8560
8561 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8562 turns off the frame pointer by default. Turn it back on now if
8563 we've not got a leaf function. */
8564 if (TARGET_OMIT_LEAF_FRAME_POINTER
8565 && (!crtl->is_leaf
8566 || ix86_current_function_calls_tls_descriptor))
8567 return true;
8568
8569 if (crtl->profile && !flag_fentry)
8570 return true;
8571
8572 return false;
8573 }
8574
8575 /* Record that the current function accesses previous call frames. */
8576
8577 void
8578 ix86_setup_frame_addresses (void)
8579 {
8580 cfun->machine->accesses_prev_frame = 1;
8581 }
8582 \f
8583 #ifndef USE_HIDDEN_LINKONCE
8584 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8585 # define USE_HIDDEN_LINKONCE 1
8586 # else
8587 # define USE_HIDDEN_LINKONCE 0
8588 # endif
8589 #endif
8590
8591 static int pic_labels_used;
8592
8593 /* Fills in the label name that should be used for a pc thunk for
8594 the given register. */
8595
8596 static void
8597 get_pc_thunk_name (char name[32], unsigned int regno)
8598 {
8599 gcc_assert (!TARGET_64BIT);
8600
8601 if (USE_HIDDEN_LINKONCE)
8602 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8603 else
8604 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8605 }
8606
8607
8608 /* This function emits the pc thunk functions used to set up the PIC
8609 register: each one loads its register with the return address of the
caller and then returns. */
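/* Roughly, the thunk emitted for %ebx looks like

     __x86.get_pc_thunk.bx:
        movl    (%esp), %ebx
        ret

   and output_set_got pairs it with a call followed by an add of
   $_GLOBAL_OFFSET_TABLE_ to form the PIC register load. */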
8610
8611 static void
8612 ix86_code_end (void)
8613 {
8614 rtx xops[2];
8615 int regno;
8616
8617 for (regno = AX_REG; regno <= SP_REG; regno++)
8618 {
8619 char name[32];
8620 tree decl;
8621
8622 if (!(pic_labels_used & (1 << regno)))
8623 continue;
8624
8625 get_pc_thunk_name (name, regno);
8626
8627 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8628 get_identifier (name),
8629 build_function_type_list (void_type_node, NULL_TREE));
8630 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8631 NULL_TREE, void_type_node);
8632 TREE_PUBLIC (decl) = 1;
8633 TREE_STATIC (decl) = 1;
8634 DECL_IGNORED_P (decl) = 1;
8635
8636 #if TARGET_MACHO
8637 if (TARGET_MACHO)
8638 {
8639 switch_to_section (darwin_sections[text_coal_section]);
8640 fputs ("\t.weak_definition\t", asm_out_file);
8641 assemble_name (asm_out_file, name);
8642 fputs ("\n\t.private_extern\t", asm_out_file);
8643 assemble_name (asm_out_file, name);
8644 putc ('\n', asm_out_file);
8645 ASM_OUTPUT_LABEL (asm_out_file, name);
8646 DECL_WEAK (decl) = 1;
8647 }
8648 else
8649 #endif
8650 if (USE_HIDDEN_LINKONCE)
8651 {
8652 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8653
8654 targetm.asm_out.unique_section (decl, 0);
8655 switch_to_section (get_named_section (decl, NULL, 0));
8656
8657 targetm.asm_out.globalize_label (asm_out_file, name);
8658 fputs ("\t.hidden\t", asm_out_file);
8659 assemble_name (asm_out_file, name);
8660 putc ('\n', asm_out_file);
8661 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8662 }
8663 else
8664 {
8665 switch_to_section (text_section);
8666 ASM_OUTPUT_LABEL (asm_out_file, name);
8667 }
8668
8669 DECL_INITIAL (decl) = make_node (BLOCK);
8670 current_function_decl = decl;
8671 init_function_start (decl);
8672 first_function_block_is_cold = false;
8673 /* Make sure unwind info is emitted for the thunk if needed. */
8674 final_start_function (emit_barrier (), asm_out_file, 1);
8675
8676 /* Pad stack IP move with 4 instructions (two NOPs count
8677 as one instruction). */
8678 if (TARGET_PAD_SHORT_FUNCTION)
8679 {
8680 int i = 8;
8681
8682 while (i--)
8683 fputs ("\tnop\n", asm_out_file);
8684 }
8685
8686 xops[0] = gen_rtx_REG (Pmode, regno);
8687 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8688 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8689 fputs ("\tret\n", asm_out_file);
8690 final_end_function ();
8691 init_insn_lengths ();
8692 free_after_compilation (cfun);
8693 set_cfun (NULL);
8694 current_function_decl = NULL;
8695 }
8696
8697 if (flag_split_stack)
8698 file_end_indicate_split_stack ();
8699 }
8700
8701 /* Emit code for the SET_GOT patterns. */
8702
8703 const char *
8704 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8705 {
8706 rtx xops[3];
8707
8708 xops[0] = dest;
8709
8710 if (TARGET_VXWORKS_RTP && flag_pic)
8711 {
8712 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8713 xops[2] = gen_rtx_MEM (Pmode,
8714 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8715 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8716
8717 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8718 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8719 an unadorned address. */
8720 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8721 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8722 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8723 return "";
8724 }
8725
8726 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8727
8728 if (!flag_pic)
8729 {
8730 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8731
8732 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8733
8734 #if TARGET_MACHO
8735 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8736 is what will be referenced by the Mach-O PIC subsystem. */
8737 if (!label)
8738 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8739 #endif
8740
8741 targetm.asm_out.internal_label (asm_out_file, "L",
8742 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8743 }
8744 else
8745 {
8746 char name[32];
8747 get_pc_thunk_name (name, REGNO (dest));
8748 pic_labels_used |= 1 << REGNO (dest);
8749
8750 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8751 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8752 output_asm_insn ("call\t%X2", xops);
8753 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8754 is what will be referenced by the Mach-O PIC subsystem. */
8755 #if TARGET_MACHO
8756 if (!label)
8757 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8758 else
8759 targetm.asm_out.internal_label (asm_out_file, "L",
8760 CODE_LABEL_NUMBER (label));
8761 #endif
8762 }
8763
8764 if (!TARGET_MACHO)
8765 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8766
8767 return "";
8768 }
8769
8770 /* Generate a "push" pattern for input ARG. */
8771
8772 static rtx
8773 gen_push (rtx arg)
8774 {
8775 struct machine_function *m = cfun->machine;
8776
8777 if (m->fs.cfa_reg == stack_pointer_rtx)
8778 m->fs.cfa_offset += UNITS_PER_WORD;
8779 m->fs.sp_offset += UNITS_PER_WORD;
8780
8781 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8782 arg = gen_rtx_REG (word_mode, REGNO (arg));
8783
8784 return gen_rtx_SET (VOIDmode,
8785 gen_rtx_MEM (word_mode,
8786 gen_rtx_PRE_DEC (Pmode,
8787 stack_pointer_rtx)),
8788 arg);
8789 }
8790
8791 /* Generate a "pop" pattern for input ARG. */
8792
8793 static rtx
8794 gen_pop (rtx arg)
8795 {
8796 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8797 arg = gen_rtx_REG (word_mode, REGNO (arg));
8798
8799 return gen_rtx_SET (VOIDmode,
8800 arg,
8801 gen_rtx_MEM (word_mode,
8802 gen_rtx_POST_INC (Pmode,
8803 stack_pointer_rtx)));
8804 }
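/* Illustrative only: with word_mode == DImode (x86-64) the two helpers
   above build RTL of the form

       (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI arg))    <- gen_push
       (set (reg:DI arg) (mem:DI (post_inc:DI (reg:DI sp))))   <- gen_pop

   which match the push/pop insn patterns in i386.md. */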
8805
8806 /* Return >= 0 if there is an unused call-clobbered register available
8807 for the entire function. */
8808
8809 static unsigned int
8810 ix86_select_alt_pic_regnum (void)
8811 {
8812 if (crtl->is_leaf
8813 && !crtl->profile
8814 && !ix86_current_function_calls_tls_descriptor)
8815 {
8816 int i, drap;
8817 /* Can't use the same register for both PIC and DRAP. */
8818 if (crtl->drap_reg)
8819 drap = REGNO (crtl->drap_reg);
8820 else
8821 drap = -1;
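 /* Hard registers 0..2 are %eax, %edx and %ecx; try %ecx first. */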
8822 for (i = 2; i >= 0; --i)
8823 if (i != drap && !df_regs_ever_live_p (i))
8824 return i;
8825 }
8826
8827 return INVALID_REGNUM;
8828 }
8829
8830 /* Return TRUE if we need to save REGNO. */
8831
8832 static bool
8833 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8834 {
8835 if (pic_offset_table_rtx
8836 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8837 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8838 || crtl->profile
8839 || crtl->calls_eh_return
8840 || crtl->uses_const_pool))
8841 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8842
8843 if (crtl->calls_eh_return && maybe_eh_return)
8844 {
8845 unsigned i;
8846 for (i = 0; ; i++)
8847 {
8848 unsigned test = EH_RETURN_DATA_REGNO (i);
8849 if (test == INVALID_REGNUM)
8850 break;
8851 if (test == regno)
8852 return true;
8853 }
8854 }
8855
8856 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8857 return true;
8858
8859 return (df_regs_ever_live_p (regno)
8860 && !call_used_regs[regno]
8861 && !fixed_regs[regno]
8862 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8863 }
8864
8865 /* Return the number of saved general purpose registers. */
8866
8867 static int
8868 ix86_nsaved_regs (void)
8869 {
8870 int nregs = 0;
8871 int regno;
8872
8873 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8874 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8875 nregs ++;
8876 return nregs;
8877 }
8878
8879 /* Return the number of saved SSE registers. */
8880
8881 static int
8882 ix86_nsaved_sseregs (void)
8883 {
8884 int nregs = 0;
8885 int regno;
8886
8887 if (!TARGET_64BIT_MS_ABI)
8888 return 0;
8889 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8890 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8891 nregs ++;
8892 return nregs;
8893 }
8894
8895 /* Given FROM and TO register numbers, say whether this elimination is
8896 allowed. If stack alignment is needed, we can only replace argument
8897 pointer with hard frame pointer, or replace frame pointer with stack
8898 pointer. Otherwise, frame pointer elimination is automatically
8899 handled and all other eliminations are valid. */
8900
8901 static bool
8902 ix86_can_eliminate (const int from, const int to)
8903 {
8904 if (stack_realign_fp)
8905 return ((from == ARG_POINTER_REGNUM
8906 && to == HARD_FRAME_POINTER_REGNUM)
8907 || (from == FRAME_POINTER_REGNUM
8908 && to == STACK_POINTER_REGNUM));
8909 else
8910 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8911 }
8912
8913 /* Return the offset between two registers, one to be eliminated, and the other
8914 its replacement, at the start of a routine. */
8915
8916 HOST_WIDE_INT
8917 ix86_initial_elimination_offset (int from, int to)
8918 {
8919 struct ix86_frame frame;
8920 ix86_compute_frame_layout (&frame);
8921
8922 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8923 return frame.hard_frame_pointer_offset;
8924 else if (from == FRAME_POINTER_REGNUM
8925 && to == HARD_FRAME_POINTER_REGNUM)
8926 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8927 else
8928 {
8929 gcc_assert (to == STACK_POINTER_REGNUM);
8930
8931 if (from == ARG_POINTER_REGNUM)
8932 return frame.stack_pointer_offset;
8933
8934 gcc_assert (from == FRAME_POINTER_REGNUM);
8935 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8936 }
8937 }
8938
8939 /* In a dynamically-aligned function, we can't know the offset from
8940 stack pointer to frame pointer, so we must ensure that setjmp
8941 eliminates fp against the hard fp (%ebp) rather than trying to
8942 index from %esp up to the top of the frame across a gap that is
8943 of unknown (at compile-time) size. */
8944 static rtx
8945 ix86_builtin_setjmp_frame_value (void)
8946 {
8947 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8948 }
8949
8950 /* When using -fsplit-stack, the allocation routines set a field in
8951 the TCB to the bottom of the stack plus this much space, measured
8952 in bytes. */
8953
8954 #define SPLIT_STACK_AVAILABLE 256
8955
8956 /* Fill the ix86_frame structure describing the frame of the currently compiled function. */
8957
8958 static void
8959 ix86_compute_frame_layout (struct ix86_frame *frame)
8960 {
8961 unsigned HOST_WIDE_INT stack_alignment_needed;
8962 HOST_WIDE_INT offset;
8963 unsigned HOST_WIDE_INT preferred_alignment;
8964 HOST_WIDE_INT size = get_frame_size ();
8965 HOST_WIDE_INT to_allocate;
8966
8967 frame->nregs = ix86_nsaved_regs ();
8968 frame->nsseregs = ix86_nsaved_sseregs ();
8969
8970 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8971 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8972
8973 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
8974 except for function prologues and leaf functions. */
8975 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8976 && (!crtl->is_leaf || cfun->calls_alloca != 0
8977 || ix86_current_function_calls_tls_descriptor))
8978 {
8979 preferred_alignment = 16;
8980 stack_alignment_needed = 16;
8981 crtl->preferred_stack_boundary = 128;
8982 crtl->stack_alignment_needed = 128;
8983 }
8984
8985 gcc_assert (!size || stack_alignment_needed);
8986 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8987 gcc_assert (preferred_alignment <= stack_alignment_needed);
8988
8989 /* For SEH we have to limit the amount of code movement into the prologue.
8990 At present we do this via a BLOCKAGE, at which point there's very little
8991 scheduling that can be done, which means that there's very little point
8992 in doing anything except PUSHs. */
8993 if (TARGET_SEH)
8994 cfun->machine->use_fast_prologue_epilogue = false;
8995
8996 /* During reload iteration the number of registers saved can change.
8997 Recompute the value as needed. Do not recompute when the number of
8998 registers didn't change, as reload makes multiple calls to this function
8999 and does not expect the decision to change within a single iteration. */
9000 else if (!optimize_function_for_size_p (cfun)
9001 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
9002 {
9003 int count = frame->nregs;
9004 struct cgraph_node *node = cgraph_get_node (current_function_decl);
9005
9006 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9007
9008 /* The fast prologue uses move instead of push to save registers. This
9009 is significantly longer, but also executes faster as modern hardware
9010 can execute the moves in parallel, but can't do that for push/pop.
9011
9012 Be careful about choosing which prologue to emit: when the function
9013 takes many instructions to execute, we may use the slow version, as
9014 well as when the function is known to be outside a hot spot (the
9015 latter is known only with profile feedback). Weight the size of the
9016 function by the number of registers to save, as it is cheap to use
9017 one or two push instructions but very slow to use many of them. */
9018 if (count)
9019 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9020 if (node->frequency < NODE_FREQUENCY_NORMAL
9021 || (flag_branch_probabilities
9022 && node->frequency < NODE_FREQUENCY_HOT))
9023 cfun->machine->use_fast_prologue_epilogue = false;
9024 else
9025 cfun->machine->use_fast_prologue_epilogue
9026 = !expensive_function_p (count);
9027 }
9028
9029 frame->save_regs_using_mov
9030 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9031 /* If static stack checking is enabled and done with probes,
9032 the registers need to be saved before allocating the frame. */
9033 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9034
9035 /* Skip return address. */
9036 offset = UNITS_PER_WORD;
9037
9038 /* Skip pushed static chain. */
9039 if (ix86_static_chain_on_stack)
9040 offset += UNITS_PER_WORD;
9041
9042 /* Skip saved base pointer. */
9043 if (frame_pointer_needed)
9044 offset += UNITS_PER_WORD;
9045 frame->hfp_save_offset = offset;
9046
9047 /* The traditional frame pointer location is at the top of the frame. */
9048 frame->hard_frame_pointer_offset = offset;
9049
9050 /* Register save area */
9051 offset += frame->nregs * UNITS_PER_WORD;
9052 frame->reg_save_offset = offset;
9053
9054 /* Align and set SSE register save area. */
9055 if (frame->nsseregs)
9056 {
9057 /* The only ABI that has saved SSE registers (Win64) also has a
9058 16-byte aligned default stack, and thus we don't need to be
9059 within the re-aligned local stack frame to save them. */
9060 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9061 offset = (offset + 16 - 1) & -16;
9062 offset += frame->nsseregs * 16;
9063 }
9064 frame->sse_reg_save_offset = offset;
9065
9066 /* The re-aligned stack starts here. Values before this point are not
9067 directly comparable with values below this point. In order to make
9068 sure that no value happens to be the same before and after, force
9069 the alignment computation below to add a non-zero value. */
9070 if (stack_realign_fp)
9071 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9072
9073 /* Va-arg area */
9074 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9075 offset += frame->va_arg_size;
9076
9077 /* Align start of frame for local function. */
9078 if (stack_realign_fp
9079 || offset != frame->sse_reg_save_offset
9080 || size != 0
9081 || !crtl->is_leaf
9082 || cfun->calls_alloca
9083 || ix86_current_function_calls_tls_descriptor)
9084 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9085
9086 /* Frame pointer points here. */
9087 frame->frame_pointer_offset = offset;
9088
9089 offset += size;
9090
9091 /* Add outgoing arguments area. Can be skipped if we eliminated
9092 all the function calls as dead code.
9093 Skipping is however impossible when function calls alloca. Alloca
9094 expander assumes that last crtl->outgoing_args_size
9095 of stack frame are unused. */
9096 if (ACCUMULATE_OUTGOING_ARGS
9097 && (!crtl->is_leaf || cfun->calls_alloca
9098 || ix86_current_function_calls_tls_descriptor))
9099 {
9100 offset += crtl->outgoing_args_size;
9101 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9102 }
9103 else
9104 frame->outgoing_arguments_size = 0;
9105
9106 /* Align stack boundary. Only needed if we're calling another function
9107 or using alloca. */
9108 if (!crtl->is_leaf || cfun->calls_alloca
9109 || ix86_current_function_calls_tls_descriptor)
9110 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9111
9112 /* We've reached end of stack frame. */
9113 frame->stack_pointer_offset = offset;
9114
9115 /* Size prologue needs to allocate. */
9116 to_allocate = offset - frame->sse_reg_save_offset;
9117
9118 if ((!to_allocate && frame->nregs <= 1)
9119 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9120 frame->save_regs_using_mov = false;
9121
9122 if (ix86_using_red_zone ()
9123 && crtl->sp_is_unchanging
9124 && crtl->is_leaf
9125 && !ix86_current_function_calls_tls_descriptor)
9126 {
9127 frame->red_zone_size = to_allocate;
9128 if (frame->save_regs_using_mov)
9129 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9130 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9131 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9132 }
9133 else
9134 frame->red_zone_size = 0;
9135 frame->stack_pointer_offset -= frame->red_zone_size;
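 /* For example, on x86-64 the red zone is the 128 bytes below %rsp, so a
    small leaf function can keep its whole frame there and needs no stack
    pointer adjustment at all. */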
9136
9137 /* The SEH frame pointer location is near the bottom of the frame.
9138 This is enforced by the fact that the difference between the
9139 stack pointer and the frame pointer is limited to 240 bytes in
9140 the unwind data structure. */
9141 if (TARGET_SEH)
9142 {
9143 HOST_WIDE_INT diff;
9144
9145 /* If we can leave the frame pointer where it is, do so. */
9146 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9147 if (diff > 240 || (diff & 15) != 0)
9148 {
9149 /* Ideally we'd determine what portion of the local stack frame
9150 (within the constraint of the lowest 240) is most heavily used.
9151 But without that complication, simply bias the frame pointer
9152 by 128 bytes so as to maximize the amount of the local stack
9153 frame that is addressable with 8-bit offsets. */
9154 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9155 }
9156 }
9157 }
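/* Illustrative summary of the layout computed above, going from the CFA
   towards lower addresses: return address, optional pushed static chain,
   optional saved frame pointer, general register save area, 16-byte
   aligned SSE register save area, va_arg register save area, local
   variables, outgoing argument area, and finally (when enabled) the red
   zone below the resulting stack pointer. */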
9158
9159 /* This is semi-inlined memory_address_length, but simplified
9160 since we know that we're always dealing with reg+offset, and
9161 to avoid having to create and discard all that rtl. */
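/* For example (illustrative): a zero offset from %ebp or %r13 still needs
   a one-byte displacement, so the length is 1; an offset of -8 from %eax
   fits in a disp8, also length 1; an offset of 512 from %esp needs a
   four-byte displacement plus a SIB byte, for a length of 5. */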
9162
9163 static inline int
9164 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9165 {
9166 int len = 4;
9167
9168 if (offset == 0)
9169 {
9170 /* EBP and R13 cannot be encoded without an offset. */
9171 len = (regno == BP_REG || regno == R13_REG);
9172 }
9173 else if (IN_RANGE (offset, -128, 127))
9174 len = 1;
9175
9176 /* ESP and R12 must be encoded with a SIB byte. */
9177 if (regno == SP_REG || regno == R12_REG)
9178 len++;
9179
9180 return len;
9181 }
9182
9183 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9184 The valid base registers are taken from CFUN->MACHINE->FS. */
9185
9186 static rtx
9187 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9188 {
9189 const struct machine_function *m = cfun->machine;
9190 rtx base_reg = NULL;
9191 HOST_WIDE_INT base_offset = 0;
9192
9193 if (m->use_fast_prologue_epilogue)
9194 {
9195 /* Choose the base register most likely to allow the most scheduling
9196 opportunities. Generally FP is valid throughout the function,
9197 while DRAP must be reloaded within the epilogue. But choose either
9198 over the SP due to increased encoding size. */
9199
9200 if (m->fs.fp_valid)
9201 {
9202 base_reg = hard_frame_pointer_rtx;
9203 base_offset = m->fs.fp_offset - cfa_offset;
9204 }
9205 else if (m->fs.drap_valid)
9206 {
9207 base_reg = crtl->drap_reg;
9208 base_offset = 0 - cfa_offset;
9209 }
9210 else if (m->fs.sp_valid)
9211 {
9212 base_reg = stack_pointer_rtx;
9213 base_offset = m->fs.sp_offset - cfa_offset;
9214 }
9215 }
9216 else
9217 {
9218 HOST_WIDE_INT toffset;
9219 int len = 16, tlen;
9220
9221 /* Choose the base register with the smallest address encoding.
9222 With a tie, choose FP > DRAP > SP. */
9223 if (m->fs.sp_valid)
9224 {
9225 base_reg = stack_pointer_rtx;
9226 base_offset = m->fs.sp_offset - cfa_offset;
9227 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9228 }
9229 if (m->fs.drap_valid)
9230 {
9231 toffset = 0 - cfa_offset;
9232 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9233 if (tlen <= len)
9234 {
9235 base_reg = crtl->drap_reg;
9236 base_offset = toffset;
9237 len = tlen;
9238 }
9239 }
9240 if (m->fs.fp_valid)
9241 {
9242 toffset = m->fs.fp_offset - cfa_offset;
9243 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9244 if (tlen <= len)
9245 {
9246 base_reg = hard_frame_pointer_rtx;
9247 base_offset = toffset;
9248 len = tlen;
9249 }
9250 }
9251 }
9252 gcc_assert (base_reg != NULL);
9253
9254 return plus_constant (Pmode, base_reg, base_offset);
9255 }
9256
9257 /* Emit code to save registers in the prologue. */
9258
9259 static void
9260 ix86_emit_save_regs (void)
9261 {
9262 unsigned int regno;
9263 rtx insn;
9264
9265 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9266 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9267 {
9268 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9269 RTX_FRAME_RELATED_P (insn) = 1;
9270 }
9271 }
9272
9273 /* Emit a single register save at CFA - CFA_OFFSET. */
9274
9275 static void
9276 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9277 HOST_WIDE_INT cfa_offset)
9278 {
9279 struct machine_function *m = cfun->machine;
9280 rtx reg = gen_rtx_REG (mode, regno);
9281 rtx mem, addr, base, insn;
9282
9283 addr = choose_baseaddr (cfa_offset);
9284 mem = gen_frame_mem (mode, addr);
9285
9286 /* For SSE saves, we need to indicate the 128-bit alignment. */
9287 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9288
9289 insn = emit_move_insn (mem, reg);
9290 RTX_FRAME_RELATED_P (insn) = 1;
9291
9292 base = addr;
9293 if (GET_CODE (base) == PLUS)
9294 base = XEXP (base, 0);
9295 gcc_checking_assert (REG_P (base));
9296
9297 /* When saving registers into a re-aligned local stack frame, avoid
9298 any tricky guessing by dwarf2out. */
9299 if (m->fs.realigned)
9300 {
9301 gcc_checking_assert (stack_realign_drap);
9302
9303 if (regno == REGNO (crtl->drap_reg))
9304 {
9305 /* A bit of a hack. We force the DRAP register to be saved in
9306 the re-aligned stack frame, which provides us with a copy
9307 of the CFA that will last past the prologue. Install it. */
9308 gcc_checking_assert (cfun->machine->fs.fp_valid);
9309 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9310 cfun->machine->fs.fp_offset - cfa_offset);
9311 mem = gen_rtx_MEM (mode, addr);
9312 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9313 }
9314 else
9315 {
9316 /* The frame pointer is a stable reference within the
9317 aligned frame. Use it. */
9318 gcc_checking_assert (cfun->machine->fs.fp_valid);
9319 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9320 cfun->machine->fs.fp_offset - cfa_offset);
9321 mem = gen_rtx_MEM (mode, addr);
9322 add_reg_note (insn, REG_CFA_EXPRESSION,
9323 gen_rtx_SET (VOIDmode, mem, reg));
9324 }
9325 }
9326
9327 /* The memory may not be relative to the current CFA register,
9328 which means that we may need to generate a new pattern for
9329 use by the unwind info. */
9330 else if (base != m->fs.cfa_reg)
9331 {
9332 addr = plus_constant (Pmode, m->fs.cfa_reg,
9333 m->fs.cfa_offset - cfa_offset);
9334 mem = gen_rtx_MEM (mode, addr);
9335 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9336 }
9337 }
9338
9339 /* Emit code to save registers using MOV insns.
9340 First register is stored at CFA - CFA_OFFSET. */
9341 static void
9342 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9343 {
9344 unsigned int regno;
9345
9346 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9347 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9348 {
9349 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9350 cfa_offset -= UNITS_PER_WORD;
9351 }
9352 }
9353
9354 /* Emit code to save SSE registers using MOV insns.
9355 First register is stored at CFA - CFA_OFFSET. */
9356 static void
9357 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9358 {
9359 unsigned int regno;
9360
9361 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9362 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9363 {
9364 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9365 cfa_offset -= 16;
9366 }
9367 }
9368
9369 static GTY(()) rtx queued_cfa_restores;
9370
9371 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9372 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9373 Don't add the note if the previously saved value will be left untouched
9374 within the stack red zone until return, as unwinders can find the same
9375 value in the register and on the stack. */
9376
9377 static void
9378 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9379 {
9380 if (!crtl->shrink_wrapped
9381 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9382 return;
9383
9384 if (insn)
9385 {
9386 add_reg_note (insn, REG_CFA_RESTORE, reg);
9387 RTX_FRAME_RELATED_P (insn) = 1;
9388 }
9389 else
9390 queued_cfa_restores
9391 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9392 }
9393
9394 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9395
9396 static void
9397 ix86_add_queued_cfa_restore_notes (rtx insn)
9398 {
9399 rtx last;
9400 if (!queued_cfa_restores)
9401 return;
9402 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9403 ;
9404 XEXP (last, 1) = REG_NOTES (insn);
9405 REG_NOTES (insn) = queued_cfa_restores;
9406 queued_cfa_restores = NULL_RTX;
9407 RTX_FRAME_RELATED_P (insn) = 1;
9408 }
9409
9410 /* Expand a prologue or epilogue stack adjustment.
9411 The pattern exists to put a dependency on all ebp-based memory accesses.
9412 STYLE should be negative if the instructions should be marked as frame
9413 related, zero if the %r11 register is live and cannot be freely used,
9414 and positive otherwise. */
9415
9416 static void
9417 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9418 int style, bool set_cfa)
9419 {
9420 struct machine_function *m = cfun->machine;
9421 rtx insn;
9422 bool add_frame_related_expr = false;
9423
9424 if (Pmode == SImode)
9425 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9426 else if (x86_64_immediate_operand (offset, DImode))
9427 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9428 else
9429 {
9430 rtx tmp;
9431 /* r11 is used by indirect sibcall return as well, set before the
9432 epilogue and used after the epilogue. */
9433 if (style)
9434 tmp = gen_rtx_REG (DImode, R11_REG);
9435 else
9436 {
9437 gcc_assert (src != hard_frame_pointer_rtx
9438 && dest != hard_frame_pointer_rtx);
9439 tmp = hard_frame_pointer_rtx;
9440 }
9441 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9442 if (style < 0)
9443 add_frame_related_expr = true;
9444
9445 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9446 }
9447
9448 insn = emit_insn (insn);
9449 if (style >= 0)
9450 ix86_add_queued_cfa_restore_notes (insn);
9451
9452 if (set_cfa)
9453 {
9454 rtx r;
9455
9456 gcc_assert (m->fs.cfa_reg == src);
9457 m->fs.cfa_offset += INTVAL (offset);
9458 m->fs.cfa_reg = dest;
9459
9460 r = gen_rtx_PLUS (Pmode, src, offset);
9461 r = gen_rtx_SET (VOIDmode, dest, r);
9462 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9463 RTX_FRAME_RELATED_P (insn) = 1;
9464 }
9465 else if (style < 0)
9466 {
9467 RTX_FRAME_RELATED_P (insn) = 1;
9468 if (add_frame_related_expr)
9469 {
9470 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9471 r = gen_rtx_SET (VOIDmode, dest, r);
9472 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9473 }
9474 }
9475
9476 if (dest == stack_pointer_rtx)
9477 {
9478 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9479 bool valid = m->fs.sp_valid;
9480
9481 if (src == hard_frame_pointer_rtx)
9482 {
9483 valid = m->fs.fp_valid;
9484 ooffset = m->fs.fp_offset;
9485 }
9486 else if (src == crtl->drap_reg)
9487 {
9488 valid = m->fs.drap_valid;
9489 ooffset = 0;
9490 }
9491 else
9492 {
9493 /* Else there are two possibilities: SP itself, which we set
9494 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9495 taken care of by hand along the eh_return path. */
9496 gcc_checking_assert (src == stack_pointer_rtx
9497 || offset == const0_rtx);
9498 }
9499
9500 m->fs.sp_offset = ooffset - INTVAL (offset);
9501 m->fs.sp_valid = valid;
9502 }
9503 }
9504
9505 /* Find an available register to be used as the dynamic realign argument
9506 pointer register. Such a register will be written in the prologue and
9507 used at the beginning of the body, so it must not be
9508 1. a parameter passing register.
9509 2. the GOT pointer.
9510 We reuse the static-chain register if it is available. Otherwise, we
9511 use DI for i386 and R13 for x86-64. We chose R13 since it has a
9512 shorter encoding.
9513
9514 Return: the regno of the chosen register. */
9515
9516 static unsigned int
9517 find_drap_reg (void)
9518 {
9519 tree decl = cfun->decl;
9520
9521 if (TARGET_64BIT)
9522 {
9523 /* Use R13 for nested functions or functions that need a static
9524 chain. Since a function with a tail call may use any caller-saved
9525 register in the epilogue, DRAP must not use a caller-saved
9526 register in such a case. */
9527 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9528 return R13_REG;
9529
9530 return R10_REG;
9531 }
9532 else
9533 {
9534 /* Use DI for nested functions or functions that need a static
9535 chain. Since a function with a tail call may use any caller-saved
9536 register in the epilogue, DRAP must not use a caller-saved
9537 register in such a case. */
9538 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9539 return DI_REG;
9540
9541 /* Reuse static chain register if it isn't used for parameter
9542 passing. */
9543 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9544 {
9545 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9546 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9547 return CX_REG;
9548 }
9549 return DI_REG;
9550 }
9551 }
9552
9553 /* Return minimum incoming stack alignment. */
9554
9555 static unsigned int
9556 ix86_minimum_incoming_stack_boundary (bool sibcall)
9557 {
9558 unsigned int incoming_stack_boundary;
9559
9560 /* Prefer the one specified at command line. */
9561 if (ix86_user_incoming_stack_boundary)
9562 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9563 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9564 if -mstackrealign is used, this isn't a sibcall check, and the
9565 estimated stack alignment is 128 bits. */
9566 else if (!sibcall
9567 && !TARGET_64BIT
9568 && ix86_force_align_arg_pointer
9569 && crtl->stack_alignment_estimated == 128)
9570 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9571 else
9572 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9573
9574 /* Incoming stack alignment can be changed on individual functions
9575 via force_align_arg_pointer attribute. We use the smallest
9576 incoming stack boundary. */
9577 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9578 && lookup_attribute (ix86_force_align_arg_pointer_string,
9579 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9580 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9581
9582 /* The incoming stack frame has to be aligned at least at
9583 parm_stack_boundary. */
9584 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9585 incoming_stack_boundary = crtl->parm_stack_boundary;
9586
9587 /* Stack at entrance of main is aligned by runtime. We use the
9588 smallest incoming stack boundary. */
9589 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9590 && DECL_NAME (current_function_decl)
9591 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9592 && DECL_FILE_SCOPE_P (current_function_decl))
9593 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9594
9595 return incoming_stack_boundary;
9596 }
9597
9598 /* Update incoming stack boundary and estimated stack alignment. */
9599
9600 static void
9601 ix86_update_stack_boundary (void)
9602 {
9603 ix86_incoming_stack_boundary
9604 = ix86_minimum_incoming_stack_boundary (false);
9605
9606 /* x86_64 varargs needs a 16-byte stack alignment for the register save
9607 area. */
9608 if (TARGET_64BIT
9609 && cfun->stdarg
9610 && crtl->stack_alignment_estimated < 128)
9611 crtl->stack_alignment_estimated = 128;
9612 }
9613
9614 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9615 needed or an rtx for DRAP otherwise. */
9616
9617 static rtx
9618 ix86_get_drap_rtx (void)
9619 {
9620 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9621 crtl->need_drap = true;
9622
9623 if (stack_realign_drap)
9624 {
9625 /* Assign DRAP to vDRAP and return vDRAP. */
9626 unsigned int regno = find_drap_reg ();
9627 rtx drap_vreg;
9628 rtx arg_ptr;
9629 rtx seq, insn;
9630
9631 arg_ptr = gen_rtx_REG (Pmode, regno);
9632 crtl->drap_reg = arg_ptr;
9633
9634 start_sequence ();
9635 drap_vreg = copy_to_reg (arg_ptr);
9636 seq = get_insns ();
9637 end_sequence ();
9638
9639 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9640 if (!optimize)
9641 {
9642 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9643 RTX_FRAME_RELATED_P (insn) = 1;
9644 }
9645 return drap_vreg;
9646 }
9647 else
9648 return NULL;
9649 }
9650
9651 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9652
9653 static rtx
9654 ix86_internal_arg_pointer (void)
9655 {
9656 return virtual_incoming_args_rtx;
9657 }
9658
9659 struct scratch_reg {
9660 rtx reg;
9661 bool saved;
9662 };
9663
9664 /* Return a short-lived scratch register for use on function entry.
9665 In 32-bit mode, it is valid only after the registers are saved
9666 in the prologue. This register must be released by means of
9667 release_scratch_register_on_entry once it is dead. */
9668
9669 static void
9670 get_scratch_register_on_entry (struct scratch_reg *sr)
9671 {
9672 int regno;
9673
9674 sr->saved = false;
9675
9676 if (TARGET_64BIT)
9677 {
9678 /* We always use R11 in 64-bit mode. */
9679 regno = R11_REG;
9680 }
9681 else
9682 {
9683 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9684 bool fastcall_p
9685 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9686 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9687 int regparm = ix86_function_regparm (fntype, decl);
9688 int drap_regno
9689 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9690
9691 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9692 for the static chain register. */
9693 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9694 && drap_regno != AX_REG)
9695 regno = AX_REG;
9696 else if (regparm < 2 && drap_regno != DX_REG)
9697 regno = DX_REG;
9698 /* ecx is the static chain register. */
9699 else if (regparm < 3 && !fastcall_p && !static_chain_p
9700 && drap_regno != CX_REG)
9701 regno = CX_REG;
9702 else if (ix86_save_reg (BX_REG, true))
9703 regno = BX_REG;
9704 /* esi is the static chain register. */
9705 else if (!(regparm == 3 && static_chain_p)
9706 && ix86_save_reg (SI_REG, true))
9707 regno = SI_REG;
9708 else if (ix86_save_reg (DI_REG, true))
9709 regno = DI_REG;
9710 else
9711 {
9712 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9713 sr->saved = true;
9714 }
9715 }
9716
9717 sr->reg = gen_rtx_REG (Pmode, regno);
9718 if (sr->saved)
9719 {
9720 rtx insn = emit_insn (gen_push (sr->reg));
9721 RTX_FRAME_RELATED_P (insn) = 1;
9722 }
9723 }
9724
9725 /* Release a scratch register obtained from the preceding function. */
9726
9727 static void
9728 release_scratch_register_on_entry (struct scratch_reg *sr)
9729 {
9730 if (sr->saved)
9731 {
9732 rtx x, insn = emit_insn (gen_pop (sr->reg));
9733
9734 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9735 RTX_FRAME_RELATED_P (insn) = 1;
9736 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9737 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9738 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9739 }
9740 }
9741
9742 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
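/* With the default STACK_CHECK_PROBE_INTERVAL_EXP (normally 12) this is
   4096 bytes, i.e. one probe per page. */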
9743
9744 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9745
9746 static void
9747 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9748 {
9749 /* We skip the probe for the first interval + a small dope of 4 words and
9750 probe that many bytes past the specified size to maintain a protection
9751 area at the bottom of the stack. */
9752 const int dope = 4 * UNITS_PER_WORD;
9753 rtx size_rtx = GEN_INT (size), last;
9754
9755 /* See if we have a constant small number of probes to generate. If so,
9756 that's the easy case. The run-time loop is made up of 11 insns in the
9757 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9758 for n # of intervals. */
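/* For illustration, with a 4096-byte interval and SIZE == 10000 the
   unrolled case below adjusts the stack and probes at the original SP
   minus 8192+dope, 12288+dope and 14096+dope bytes, then adds back
   PROBE_INTERVAL + dope so the net adjustment is exactly SIZE. */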
9759 if (size <= 5 * PROBE_INTERVAL)
9760 {
9761 HOST_WIDE_INT i, adjust;
9762 bool first_probe = true;
9763
9764 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9765 values of N from 1 until it exceeds SIZE. If only one probe is
9766 needed, this will not generate any code. Then adjust and probe
9767 to PROBE_INTERVAL + SIZE. */
9768 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9769 {
9770 if (first_probe)
9771 {
9772 adjust = 2 * PROBE_INTERVAL + dope;
9773 first_probe = false;
9774 }
9775 else
9776 adjust = PROBE_INTERVAL;
9777
9778 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9779 plus_constant (Pmode, stack_pointer_rtx,
9780 -adjust)));
9781 emit_stack_probe (stack_pointer_rtx);
9782 }
9783
9784 if (first_probe)
9785 adjust = size + PROBE_INTERVAL + dope;
9786 else
9787 adjust = size + PROBE_INTERVAL - i;
9788
9789 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9790 plus_constant (Pmode, stack_pointer_rtx,
9791 -adjust)));
9792 emit_stack_probe (stack_pointer_rtx);
9793
9794 /* Adjust back to account for the additional first interval. */
9795 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9796 plus_constant (Pmode, stack_pointer_rtx,
9797 PROBE_INTERVAL + dope)));
9798 }
9799
9800 /* Otherwise, do the same as above, but in a loop. Note that we must be
9801 extra careful with variables wrapping around because we might be at
9802 the very top (or the very bottom) of the address space and we have
9803 to be able to handle this case properly; in particular, we use an
9804 equality test for the loop condition. */
9805 else
9806 {
9807 HOST_WIDE_INT rounded_size;
9808 struct scratch_reg sr;
9809
9810 get_scratch_register_on_entry (&sr);
9811
9812
9813 /* Step 1: round SIZE to the previous multiple of the interval. */
9814
9815 rounded_size = size & -PROBE_INTERVAL;
9816
9817
9818 /* Step 2: compute initial and final value of the loop counter. */
9819
9820 /* SP = SP_0 + PROBE_INTERVAL. */
9821 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9822 plus_constant (Pmode, stack_pointer_rtx,
9823 - (PROBE_INTERVAL + dope))));
9824
9825 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9826 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9827 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9828 gen_rtx_PLUS (Pmode, sr.reg,
9829 stack_pointer_rtx)));
9830
9831
9832 /* Step 3: the loop
9833
9834 while (SP != LAST_ADDR)
9835 {
9836 SP = SP + PROBE_INTERVAL
9837 probe at SP
9838 }
9839
9840 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9841 values of N from 1 until it is equal to ROUNDED_SIZE. */
9842
9843 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9844
9845
9846 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9847 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9848
9849 if (size != rounded_size)
9850 {
9851 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9852 plus_constant (Pmode, stack_pointer_rtx,
9853 rounded_size - size)));
9854 emit_stack_probe (stack_pointer_rtx);
9855 }
9856
9857 /* Adjust back to account for the additional first interval. */
9858 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9859 plus_constant (Pmode, stack_pointer_rtx,
9860 PROBE_INTERVAL + dope)));
9861
9862 release_scratch_register_on_entry (&sr);
9863 }
9864
9865 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9866
9867 /* Even if the stack pointer isn't the CFA register, we need to correctly
9868 describe the adjustments made to it, in particular differentiate the
9869 frame-related ones from the frame-unrelated ones. */
9870 if (size > 0)
9871 {
9872 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9873 XVECEXP (expr, 0, 0)
9874 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9875 plus_constant (Pmode, stack_pointer_rtx, -size));
9876 XVECEXP (expr, 0, 1)
9877 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9878 plus_constant (Pmode, stack_pointer_rtx,
9879 PROBE_INTERVAL + dope + size));
9880 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9881 RTX_FRAME_RELATED_P (last) = 1;
9882
9883 cfun->machine->fs.sp_offset += size;
9884 }
9885
9886 /* Make sure nothing is scheduled before we are done. */
9887 emit_insn (gen_blockage ());
9888 }
9889
9890 /* Adjust the stack pointer up to REG while probing it. */
9891
9892 const char *
9893 output_adjust_stack_and_probe (rtx reg)
9894 {
9895 static int labelno = 0;
9896 char loop_lab[32], end_lab[32];
9897 rtx xops[2];
9898
9899 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9900 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9901
9902 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9903
9904 /* Jump to END_LAB if SP == LAST_ADDR. */
9905 xops[0] = stack_pointer_rtx;
9906 xops[1] = reg;
9907 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9908 fputs ("\tje\t", asm_out_file);
9909 assemble_name_raw (asm_out_file, end_lab);
9910 fputc ('\n', asm_out_file);
9911
9912 /* SP = SP + PROBE_INTERVAL. */
9913 xops[1] = GEN_INT (PROBE_INTERVAL);
9914 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9915
9916 /* Probe at SP. */
9917 xops[1] = const0_rtx;
9918 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9919
9920 fprintf (asm_out_file, "\tjmp\t");
9921 assemble_name_raw (asm_out_file, loop_lab);
9922 fputc ('\n', asm_out_file);
9923
9924 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9925
9926 return "";
9927 }
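/* A minimal sketch of the loop emitted above (AT&T syntax, assuming a
   4096-byte probe interval and %r11 as the scratch register; the label
   spelling is target dependent):

       .LPSRL0:  cmpq  %r11, %rsp
                 je    .LPSRE0
                 subq  $4096, %rsp
                 orq   $0, (%rsp)
                 jmp   .LPSRL0
       .LPSRE0:                                                            */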
9928
9929 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9930 inclusive. These are offsets from the current stack pointer. */
9931
9932 static void
9933 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9934 {
9935 /* See if we have a constant small number of probes to generate. If so,
9936 that's the easy case. The run-time loop is made up of 7 insns in the
9937 generic case while the compile-time loop is made up of n insns for n #
9938 of intervals. */
9939 if (size <= 7 * PROBE_INTERVAL)
9940 {
9941 HOST_WIDE_INT i;
9942
9943 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9944 it exceeds SIZE. If only one probe is needed, this will not
9945 generate any code. Then probe at FIRST + SIZE. */
9946 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9947 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9948 -(first + i)));
9949
9950 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9951 -(first + size)));
9952 }
9953
9954 /* Otherwise, do the same as above, but in a loop. Note that we must be
9955 extra careful with variables wrapping around because we might be at
9956 the very top (or the very bottom) of the address space and we have
9957 to be able to handle this case properly; in particular, we use an
9958 equality test for the loop condition. */
9959 else
9960 {
9961 HOST_WIDE_INT rounded_size, last;
9962 struct scratch_reg sr;
9963
9964 get_scratch_register_on_entry (&sr);
9965
9966
9967 /* Step 1: round SIZE to the previous multiple of the interval. */
9968
9969 rounded_size = size & -PROBE_INTERVAL;
9970
9971
9972 /* Step 2: compute initial and final value of the loop counter. */
9973
9974 /* TEST_OFFSET = FIRST. */
9975 emit_move_insn (sr.reg, GEN_INT (-first));
9976
9977 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9978 last = first + rounded_size;
9979
9980
9981 /* Step 3: the loop
9982
9983 while (TEST_ADDR != LAST_ADDR)
9984 {
9985 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9986 probe at TEST_ADDR
9987 }
9988
9989 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9990 until it is equal to ROUNDED_SIZE. */
9991
9992 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9993
9994
9995 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9996 that SIZE is equal to ROUNDED_SIZE. */
9997
9998 if (size != rounded_size)
9999 emit_stack_probe (plus_constant (Pmode,
10000 gen_rtx_PLUS (Pmode,
10001 stack_pointer_rtx,
10002 sr.reg),
10003 rounded_size - size));
10004
10005 release_scratch_register_on_entry (&sr);
10006 }
10007
10008 /* Make sure nothing is scheduled before we are done. */
10009 emit_insn (gen_blockage ());
10010 }
10011
10012 /* Probe a range of stack addresses from REG to END, inclusive. These are
10013 offsets from the current stack pointer. */
10014
10015 const char *
10016 output_probe_stack_range (rtx reg, rtx end)
10017 {
10018 static int labelno = 0;
10019 char loop_lab[32], end_lab[32];
10020 rtx xops[3];
10021
10022 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10023 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10024
10025 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10026
10027 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10028 xops[0] = reg;
10029 xops[1] = end;
10030 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10031 fputs ("\tje\t", asm_out_file);
10032 assemble_name_raw (asm_out_file, end_lab);
10033 fputc ('\n', asm_out_file);
10034
10035 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10036 xops[1] = GEN_INT (PROBE_INTERVAL);
10037 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10038
10039 /* Probe at TEST_ADDR. */
10040 xops[0] = stack_pointer_rtx;
10041 xops[1] = reg;
10042 xops[2] = const0_rtx;
10043 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10044
10045 fprintf (asm_out_file, "\tjmp\t");
10046 assemble_name_raw (asm_out_file, loop_lab);
10047 fputc ('\n', asm_out_file);
10048
10049 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10050
10051 return "";
10052 }
10053
10054 /* Finalize the stack_realign_needed flag, which guides generation of the
10055 prologue/epilogue in the correct form. */
10056 static void
10057 ix86_finalize_stack_realign_flags (void)
10058 {
10059 /* Check if stack realignment is really needed after reload, and
10060 store the result in cfun. */
10061 unsigned int incoming_stack_boundary
10062 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10063 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10064 unsigned int stack_realign = (incoming_stack_boundary
10065 < (crtl->is_leaf
10066 ? crtl->max_used_stack_slot_alignment
10067 : crtl->stack_alignment_needed));
10068
10069 if (crtl->stack_realign_finalized)
10070 {
10071 /* After stack_realign_needed is finalized, we can no longer
10072 change it. */
10073 gcc_assert (crtl->stack_realign_needed == stack_realign);
10074 return;
10075 }
10076
10077 /* If the only reason for frame_pointer_needed is that we conservatively
10078 assumed stack realignment might be needed, but in the end nothing that
10079 needed the stack alignment had been spilled, clear frame_pointer_needed
10080 and say we don't need stack realignment. */
10081 if (stack_realign
10082 && !crtl->need_drap
10083 && frame_pointer_needed
10084 && crtl->is_leaf
10085 && flag_omit_frame_pointer
10086 && crtl->sp_is_unchanging
10087 && !ix86_current_function_calls_tls_descriptor
10088 && !crtl->accesses_prior_frames
10089 && !cfun->calls_alloca
10090 && !crtl->calls_eh_return
10091 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10092 && !ix86_frame_pointer_required ()
10093 && get_frame_size () == 0
10094 && ix86_nsaved_sseregs () == 0
10095 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10096 {
10097 HARD_REG_SET set_up_by_prologue, prologue_used;
10098 basic_block bb;
10099
10100 CLEAR_HARD_REG_SET (prologue_used);
10101 CLEAR_HARD_REG_SET (set_up_by_prologue);
10102 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10103 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10104 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10105 HARD_FRAME_POINTER_REGNUM);
10106 FOR_EACH_BB (bb)
10107 {
10108 rtx insn;
10109 FOR_BB_INSNS (bb, insn)
10110 if (NONDEBUG_INSN_P (insn)
10111 && requires_stack_frame_p (insn, prologue_used,
10112 set_up_by_prologue))
10113 {
10114 crtl->stack_realign_needed = stack_realign;
10115 crtl->stack_realign_finalized = true;
10116 return;
10117 }
10118 }
10119
10120 frame_pointer_needed = false;
10121 stack_realign = false;
10122 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10123 crtl->stack_alignment_needed = incoming_stack_boundary;
10124 crtl->stack_alignment_estimated = incoming_stack_boundary;
10125 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10126 crtl->preferred_stack_boundary = incoming_stack_boundary;
10127 df_finish_pass (true);
10128 df_scan_alloc (NULL);
10129 df_scan_blocks ();
10130 df_compute_regs_ever_live (true);
10131 df_analyze ();
10132 }
10133
10134 crtl->stack_realign_needed = stack_realign;
10135 crtl->stack_realign_finalized = true;
10136 }
10137
10138 /* Expand the prologue into a bunch of separate insns. */
10139
10140 void
10141 ix86_expand_prologue (void)
10142 {
10143 struct machine_function *m = cfun->machine;
10144 rtx insn, t;
10145 bool pic_reg_used;
10146 struct ix86_frame frame;
10147 HOST_WIDE_INT allocate;
10148 bool int_registers_saved;
10149
10150 ix86_finalize_stack_realign_flags ();
10151
10152 /* DRAP should not coexist with stack_realign_fp */
10153 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10154
10155 memset (&m->fs, 0, sizeof (m->fs));
10156
10157 /* Initialize CFA state for before the prologue. */
10158 m->fs.cfa_reg = stack_pointer_rtx;
10159 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10160
10161 /* Track SP offset to the CFA. We continue tracking this after we've
10162 swapped the CFA register away from SP. In the case of re-alignment
10163 this is fudged; we're interested in offsets within the local frame. */
10164 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10165 m->fs.sp_valid = true;
10166
10167 ix86_compute_frame_layout (&frame);
10168
10169 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10170 {
10171 /* We should have already generated an error for any use of
10172 ms_hook on a nested function. */
10173 gcc_checking_assert (!ix86_static_chain_on_stack);
10174
10175 /* Check if profiling is active and whether we shall use the
10176 profiling-before-prologue variant. If so, sorry. */
10177 if (crtl->profile && flag_fentry != 0)
10178 sorry ("ms_hook_prologue attribute isn%'t compatible "
10179 "with -mfentry for 32-bit");
10180
10181 /* In ix86_asm_output_function_label we emitted:
10182 8b ff movl.s %edi,%edi
10183 55 push %ebp
10184 8b ec movl.s %esp,%ebp
10185
10186 This matches the hookable function prologue in Win32 API
10187 functions in Microsoft Windows XP Service Pack 2 and newer.
10188 Wine uses this to enable Windows apps to hook the Win32 API
10189 functions provided by Wine.
10190
10191 What that means is that we've already set up the frame pointer. */
10192
10193 if (frame_pointer_needed
10194 && !(crtl->drap_reg && crtl->stack_realign_needed))
10195 {
10196 rtx push, mov;
10197
10198 /* We've decided to use the frame pointer already set up.
10199 Describe this to the unwinder by pretending that both
10200 push and mov insns happen right here.
10201
10202 Putting the unwind info here at the end of the ms_hook
10203 is done so that we can make absolutely certain we get
10204 the required byte sequence at the start of the function,
10205 rather than relying on an assembler that can produce
10206 the exact encoding required.
10207
10208 However it does mean (in the unpatched case) that we have
10209 a 1 insn window where the asynchronous unwind info is
10210 incorrect. However, if we placed the unwind info at
10211 its correct location we would have incorrect unwind info
10212 in the patched case. Which is probably all moot since
10213 I don't expect Wine generates dwarf2 unwind info for the
10214 system libraries that use this feature. */
10215
10216 insn = emit_insn (gen_blockage ());
10217
10218 push = gen_push (hard_frame_pointer_rtx);
10219 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10220 stack_pointer_rtx);
10221 RTX_FRAME_RELATED_P (push) = 1;
10222 RTX_FRAME_RELATED_P (mov) = 1;
10223
10224 RTX_FRAME_RELATED_P (insn) = 1;
10225 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10226 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10227
10228 /* Note that gen_push incremented m->fs.cfa_offset, even
10229 though we didn't emit the push insn here. */
10230 m->fs.cfa_reg = hard_frame_pointer_rtx;
10231 m->fs.fp_offset = m->fs.cfa_offset;
10232 m->fs.fp_valid = true;
10233 }
10234 else
10235 {
10236 /* The frame pointer is not needed so pop %ebp again.
10237 This leaves us with a pristine state. */
10238 emit_insn (gen_pop (hard_frame_pointer_rtx));
10239 }
10240 }
10241
10242 /* The first insn of a function that accepts its static chain on the
10243 stack is to push the register that would be filled in by a direct
10244 call. This insn will be skipped by the trampoline. */
10245 else if (ix86_static_chain_on_stack)
10246 {
10247 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10248 emit_insn (gen_blockage ());
10249
10250 /* We don't want to interpret this push insn as a register save,
10251 only as a stack adjustment. The real copy of the register as
10252 a save will be done later, if needed. */
10253 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10254 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10255 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10256 RTX_FRAME_RELATED_P (insn) = 1;
10257 }
10258
10259 /* Emit prologue code to adjust the stack alignment and set up DRAP, in
10260 case DRAP is needed and stack realignment is really needed after reload. */
10261 if (stack_realign_drap)
10262 {
10263 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10264
10265 /* Only need to push parameter pointer reg if it is caller saved. */
10266 if (!call_used_regs[REGNO (crtl->drap_reg)])
10267 {
10268 /* Push arg pointer reg */
10269 insn = emit_insn (gen_push (crtl->drap_reg));
10270 RTX_FRAME_RELATED_P (insn) = 1;
10271 }
10272
10273 /* Grab the argument pointer. */
10274 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10275 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10276 RTX_FRAME_RELATED_P (insn) = 1;
10277 m->fs.cfa_reg = crtl->drap_reg;
10278 m->fs.cfa_offset = 0;
10279
10280 /* Align the stack. */
10281 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10282 stack_pointer_rtx,
10283 GEN_INT (-align_bytes)));
10284 RTX_FRAME_RELATED_P (insn) = 1;
10285
10286 /* Replicate the return address on the stack so that the return
10287 address can be reached via the (argp - 1) slot. This is needed
10288 to implement the macro RETURN_ADDR_RTX and the intrinsic function
10289 expand_builtin_return_addr, etc. */
10290 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10291 t = gen_frame_mem (word_mode, t);
10292 insn = emit_insn (gen_push (t));
10293 RTX_FRAME_RELATED_P (insn) = 1;
10294
10295 /* For the purposes of frame and register save area addressing,
10296 we've started over with a new frame. */
10297 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10298 m->fs.realigned = true;
10299 }
10300
10301 if (frame_pointer_needed && !m->fs.fp_valid)
10302 {
10303 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10304 slower on all targets. Also sdb doesn't like it. */
10305 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10306 RTX_FRAME_RELATED_P (insn) = 1;
10307
10308 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10309 {
10310 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10311 RTX_FRAME_RELATED_P (insn) = 1;
10312
10313 if (m->fs.cfa_reg == stack_pointer_rtx)
10314 m->fs.cfa_reg = hard_frame_pointer_rtx;
10315 m->fs.fp_offset = m->fs.sp_offset;
10316 m->fs.fp_valid = true;
10317 }
10318 }
10319
10320 int_registers_saved = (frame.nregs == 0);
10321
10322 if (!int_registers_saved)
10323 {
10324 /* If saving registers via PUSH, do so now. */
10325 if (!frame.save_regs_using_mov)
10326 {
10327 ix86_emit_save_regs ();
10328 int_registers_saved = true;
10329 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10330 }
10331
10332 /* When using the red zone we may start saving registers before
10333 allocating the stack frame, saving one cycle of the prologue. However,
10334 avoid doing this if we have to probe the stack; at least on x86_64 the
10335 stack probe can turn into a call that clobbers a red zone location. */
10336 else if (ix86_using_red_zone ()
10337 && (! TARGET_STACK_PROBE
10338 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10339 {
10340 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10341 int_registers_saved = true;
10342 }
10343 }
10344
10345 if (stack_realign_fp)
10346 {
10347 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10348 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10349
10350 /* The computation of the size of the re-aligned stack frame means
10351 that we must allocate the size of the register save area before
10352 performing the actual alignment. Otherwise we cannot guarantee
10353 that there's enough storage above the realignment point. */
10354 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10355 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10356 GEN_INT (m->fs.sp_offset
10357 - frame.sse_reg_save_offset),
10358 -1, false);
10359
10360 /* Align the stack. */
10361 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10362 stack_pointer_rtx,
10363 GEN_INT (-align_bytes)));
10364
10365 /* For the purposes of register save area addressing, the stack
10366 pointer is no longer valid. As for the value of sp_offset,
10367 see ix86_compute_frame_layout, which we need to match in order
10368 to pass verification of stack_pointer_offset at the end. */
10369 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10370 m->fs.sp_valid = false;
10371 }
10372
10373 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10374
10375 if (flag_stack_usage_info)
10376 {
10377 /* We start to count from ARG_POINTER. */
10378 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10379
10380 /* If it was realigned, take into account the fake frame. */
10381 if (stack_realign_drap)
10382 {
10383 if (ix86_static_chain_on_stack)
10384 stack_size += UNITS_PER_WORD;
10385
10386 if (!call_used_regs[REGNO (crtl->drap_reg)])
10387 stack_size += UNITS_PER_WORD;
10388
10389 /* This over-estimates by 1 minimal-stack-alignment-unit but
10390 mitigates that by counting in the new return address slot. */
10391 current_function_dynamic_stack_size
10392 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10393 }
10394
10395 current_function_static_stack_size = stack_size;
10396 }
10397
10398 /* The stack has already been decremented by the instruction calling us
10399 so probe if the size is non-negative to preserve the protection area. */
10400 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10401 {
10402 /* We expect the registers to be saved when probes are used. */
10403 gcc_assert (int_registers_saved);
10404
10405 if (STACK_CHECK_MOVING_SP)
10406 {
10407 ix86_adjust_stack_and_probe (allocate);
10408 allocate = 0;
10409 }
10410 else
10411 {
10412 HOST_WIDE_INT size = allocate;
10413
10414 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10415 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10416
10417 if (TARGET_STACK_PROBE)
10418 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10419 else
10420 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10421 }
10422 }
10423
10424 if (allocate == 0)
10425 ;
10426 else if (!ix86_target_stack_probe ()
10427 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10428 {
10429 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10430 GEN_INT (-allocate), -1,
10431 m->fs.cfa_reg == stack_pointer_rtx);
10432 }
10433 else
10434 {
10435 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10436 rtx r10 = NULL;
10437 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10438
10439 bool eax_live = false;
10440 bool r10_live = false;
10441
10442 if (TARGET_64BIT)
10443 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10444 if (!TARGET_64BIT_MS_ABI)
10445 eax_live = ix86_eax_live_at_start_p ();
10446
10447 if (eax_live)
10448 {
10449 emit_insn (gen_push (eax));
10450 allocate -= UNITS_PER_WORD;
10451 }
10452 if (r10_live)
10453 {
10454 r10 = gen_rtx_REG (Pmode, R10_REG);
10455 emit_insn (gen_push (r10));
10456 allocate -= UNITS_PER_WORD;
10457 }
10458
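/* The stack-allocation worker takes the byte count in AX and typically
   expands to a call to the target's stack-probe routine, which may clobber
   AX and R10; any live values in them were pushed above and are reloaded
   from the frame once the allocation is done.  */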
10459 emit_move_insn (eax, GEN_INT (allocate));
10460 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10461
10462 /* Use the fact that AX still contains ALLOCATE. */
10463 adjust_stack_insn = (Pmode == DImode
10464 ? gen_pro_epilogue_adjust_stack_di_sub
10465 : gen_pro_epilogue_adjust_stack_si_sub);
10466
10467 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10468 stack_pointer_rtx, eax));
10469
10470 /* Note that SEH directives need to continue tracking the stack
10471 pointer even after the frame pointer has been set up. */
10472 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10473 {
10474 if (m->fs.cfa_reg == stack_pointer_rtx)
10475 m->fs.cfa_offset += allocate;
10476
10477 RTX_FRAME_RELATED_P (insn) = 1;
10478 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10479 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10480 plus_constant (Pmode, stack_pointer_rtx,
10481 -allocate)));
10482 }
10483 m->fs.sp_offset += allocate;
10484
10485 if (r10_live && eax_live)
10486 {
10487 t = choose_baseaddr (m->fs.sp_offset - allocate);
10488 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10489 gen_frame_mem (word_mode, t));
10490 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10491 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10492 gen_frame_mem (word_mode, t));
10493 }
10494 else if (eax_live || r10_live)
10495 {
10496 t = choose_baseaddr (m->fs.sp_offset - allocate);
10497 emit_move_insn (gen_rtx_REG (word_mode,
10498 (eax_live ? AX_REG : R10_REG)),
10499 gen_frame_mem (word_mode, t));
10500 }
10501 }
10502 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10503
10504 /* If we haven't already set up the frame pointer, do so now. */
10505 if (frame_pointer_needed && !m->fs.fp_valid)
10506 {
10507 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10508 GEN_INT (frame.stack_pointer_offset
10509 - frame.hard_frame_pointer_offset));
10510 insn = emit_insn (insn);
10511 RTX_FRAME_RELATED_P (insn) = 1;
10512 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10513
10514 if (m->fs.cfa_reg == stack_pointer_rtx)
10515 m->fs.cfa_reg = hard_frame_pointer_rtx;
10516 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10517 m->fs.fp_valid = true;
10518 }
10519
10520 if (!int_registers_saved)
10521 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10522 if (frame.nsseregs)
10523 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10524
10525 pic_reg_used = false;
10526 if (pic_offset_table_rtx
10527 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10528 || crtl->profile))
10529 {
10530 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10531
10532 if (alt_pic_reg_used != INVALID_REGNUM)
10533 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10534
10535 pic_reg_used = true;
10536 }
10537
10538 if (pic_reg_used)
10539 {
10540 if (TARGET_64BIT)
10541 {
10542 if (ix86_cmodel == CM_LARGE_PIC)
10543 {
10544 rtx label, tmp_reg;
10545
10546 gcc_assert (Pmode == DImode);
10547 label = gen_label_rtx ();
10548 emit_label (label);
10549 LABEL_PRESERVE_P (label) = 1;
10550 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10551 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10552 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10553 label));
10554 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10555 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10556 pic_offset_table_rtx, tmp_reg));
10557 }
10558 else
10559 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10560 }
10561 else
10562 {
10563 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10564 RTX_FRAME_RELATED_P (insn) = 1;
10565 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10566 }
10567 }
10568
10569 /* In the pic_reg_used case, make sure that the got load isn't deleted
10570 when mcount needs it. A blockage to avoid call movement across the
10571 mcount call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10572 note. */
10573 if (crtl->profile && !flag_fentry && pic_reg_used)
10574 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10575
10576 if (crtl->drap_reg && !crtl->stack_realign_needed)
10577 {
10578 /* The vDRAP is set up, but after reload it turns out that stack
10579 realignment isn't necessary; here we emit the prologue to set up
10580 the DRAP without the stack realignment adjustment. */
10581 t = choose_baseaddr (0);
10582 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10583 }
10584
10585 /* Prevent instructions from being scheduled into the register save push
10586 sequence when access to the red zone area is done through the frame
10587 pointer. The offset between the frame pointer and the stack pointer is
10588 calculated relative to the value of the stack pointer at the end of the
10589 function prologue, and moving instructions that access the red zone area
10590 via the frame pointer into the push sequence violates this assumption. */
10591 if (frame_pointer_needed && frame.red_zone_size)
10592 emit_insn (gen_memory_blockage ());
10593
10594 /* Emit cld instruction if stringops are used in the function. */
10595 if (TARGET_CLD && ix86_current_function_needs_cld)
10596 emit_insn (gen_cld ());
10597
10598 /* SEH requires that the prologue end within 256 bytes of the start of
10599 the function. Prevent instruction schedules that would extend that.
10600 Further, prevent alloca modifications to the stack pointer from being
10601 combined with prologue modifications. */
10602 if (TARGET_SEH)
10603 emit_insn (gen_prologue_use (stack_pointer_rtx));
10604 }
10605
10606 /* Emit code to restore REG using a POP insn. */
10607
10608 static void
10609 ix86_emit_restore_reg_using_pop (rtx reg)
10610 {
10611 struct machine_function *m = cfun->machine;
10612 rtx insn = emit_insn (gen_pop (reg));
10613
10614 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10615 m->fs.sp_offset -= UNITS_PER_WORD;
10616
10617 if (m->fs.cfa_reg == crtl->drap_reg
10618 && REGNO (reg) == REGNO (crtl->drap_reg))
10619 {
10620 /* Previously we'd represented the CFA as an expression
10621 like *(%ebp - 8). We've just popped that value from
10622 the stack, which means we need to reset the CFA to
10623 the drap register. This will remain until we restore
10624 the stack pointer. */
10625 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10626 RTX_FRAME_RELATED_P (insn) = 1;
10627
10628 /* This means that the DRAP register is valid for addressing too. */
10629 m->fs.drap_valid = true;
10630 return;
10631 }
10632
10633 if (m->fs.cfa_reg == stack_pointer_rtx)
10634 {
10635 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10636 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10637 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10638 RTX_FRAME_RELATED_P (insn) = 1;
10639
10640 m->fs.cfa_offset -= UNITS_PER_WORD;
10641 }
10642
10643 /* When the frame pointer is the CFA, and we pop it, we are
10644 swapping back to the stack pointer as the CFA. This happens
10645 for stack frames that don't allocate other data, so we assume
10646 the stack pointer is now pointing at the return address, i.e.
10647 the function entry state, which makes the offset be 1 word. */
10648 if (reg == hard_frame_pointer_rtx)
10649 {
10650 m->fs.fp_valid = false;
10651 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10652 {
10653 m->fs.cfa_reg = stack_pointer_rtx;
10654 m->fs.cfa_offset -= UNITS_PER_WORD;
10655
10656 add_reg_note (insn, REG_CFA_DEF_CFA,
10657 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10658 GEN_INT (m->fs.cfa_offset)));
10659 RTX_FRAME_RELATED_P (insn) = 1;
10660 }
10661 }
10662 }
10663
10664 /* Emit code to restore saved registers using POP insns. */
10665
10666 static void
10667 ix86_emit_restore_regs_using_pop (void)
10668 {
10669 unsigned int regno;
10670
10671 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10672 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10673 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10674 }
10675
10676 /* Emit code and notes for the LEAVE instruction. */
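/* (LEAVE is equivalent to "movl %ebp, %esp; popl %ebp", which is why the
   stack pointer becomes valid again at fp_offset - UNITS_PER_WORD below.)  */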
10677
10678 static void
10679 ix86_emit_leave (void)
10680 {
10681 struct machine_function *m = cfun->machine;
10682 rtx insn = emit_insn (ix86_gen_leave ());
10683
10684 ix86_add_queued_cfa_restore_notes (insn);
10685
10686 gcc_assert (m->fs.fp_valid);
10687 m->fs.sp_valid = true;
10688 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10689 m->fs.fp_valid = false;
10690
10691 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10692 {
10693 m->fs.cfa_reg = stack_pointer_rtx;
10694 m->fs.cfa_offset = m->fs.sp_offset;
10695
10696 add_reg_note (insn, REG_CFA_DEF_CFA,
10697 plus_constant (Pmode, stack_pointer_rtx,
10698 m->fs.sp_offset));
10699 RTX_FRAME_RELATED_P (insn) = 1;
10700 }
10701 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10702 m->fs.fp_offset);
10703 }
10704
10705 /* Emit code to restore saved registers using MOV insns.
10706 First register is restored from CFA - CFA_OFFSET. */
10707 static void
10708 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10709 bool maybe_eh_return)
10710 {
10711 struct machine_function *m = cfun->machine;
10712 unsigned int regno;
10713
10714 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10715 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10716 {
10717 rtx reg = gen_rtx_REG (word_mode, regno);
10718 rtx insn, mem;
10719
10720 mem = choose_baseaddr (cfa_offset);
10721 mem = gen_frame_mem (word_mode, mem);
10722 insn = emit_move_insn (reg, mem);
10723
10724 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10725 {
10726 /* Previously we'd represented the CFA as an expression
10727 like *(%ebp - 8). We've just popped that value from
10728 the stack, which means we need to reset the CFA to
10729 the drap register. This will remain until we restore
10730 the stack pointer. */
10731 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10732 RTX_FRAME_RELATED_P (insn) = 1;
10733
10734 /* This means that the DRAP register is valid for addressing. */
10735 m->fs.drap_valid = true;
10736 }
10737 else
10738 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10739
10740 cfa_offset -= UNITS_PER_WORD;
10741 }
10742 }
10743
10744 /* Emit code to restore saved SSE registers using MOV insns.
10745 First register is restored from CFA - CFA_OFFSET. */
10746 static void
10747 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10748 bool maybe_eh_return)
10749 {
10750 unsigned int regno;
10751
10752 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10753 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10754 {
10755 rtx reg = gen_rtx_REG (V4SFmode, regno);
10756 rtx mem;
10757
10758 mem = choose_baseaddr (cfa_offset);
10759 mem = gen_rtx_MEM (V4SFmode, mem);
10760 set_mem_align (mem, 128);
10761 emit_move_insn (reg, mem);
10762
10763 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10764
10765 cfa_offset -= 16;
10766 }
10767 }
10768
10769 /* Emit vzeroupper if needed. */
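/* (vzeroupper clears the upper 128 bits of all YMM registers, avoiding the
   AVX/SSE transition penalty when the caller contains legacy SSE code; it
   must be skipped when the caller expects a 256-bit AVX return value, since
   clearing the upper halves would destroy it.)  */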
10770
10771 void
10772 ix86_maybe_emit_epilogue_vzeroupper (void)
10773 {
10774 if (TARGET_VZEROUPPER
10775 && !TREE_THIS_VOLATILE (cfun->decl)
10776 && !cfun->machine->caller_return_avx256_p)
10777 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
10778 }
10779
10780 /* Restore function stack, frame, and registers. */
10781
10782 void
10783 ix86_expand_epilogue (int style)
10784 {
10785 struct machine_function *m = cfun->machine;
10786 struct machine_frame_state frame_state_save = m->fs;
10787 struct ix86_frame frame;
10788 bool restore_regs_via_mov;
10789 bool using_drap;
10790
10791 ix86_finalize_stack_realign_flags ();
10792 ix86_compute_frame_layout (&frame);
10793
10794 m->fs.sp_valid = (!frame_pointer_needed
10795 || (crtl->sp_is_unchanging
10796 && !stack_realign_fp));
10797 gcc_assert (!m->fs.sp_valid
10798 || m->fs.sp_offset == frame.stack_pointer_offset);
10799
10800 /* The FP must be valid if the frame pointer is present. */
10801 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10802 gcc_assert (!m->fs.fp_valid
10803 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10804
10805 /* We must have *some* valid pointer to the stack frame. */
10806 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10807
10808 /* The DRAP is never valid at this point. */
10809 gcc_assert (!m->fs.drap_valid);
10810
10811 /* See the comment about red zone and frame
10812 pointer usage in ix86_expand_prologue. */
10813 if (frame_pointer_needed && frame.red_zone_size)
10814 emit_insn (gen_memory_blockage ());
10815
10816 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10817 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10818
10819 /* Determine the CFA offset of the end of the red-zone. */
10820 m->fs.red_zone_offset = 0;
10821 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10822 {
10823 /* The red-zone begins below the return address. */
10824 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10825
10826 /* When the register save area is in the aligned portion of
10827 the stack, determine the maximum runtime displacement that
10828 matches up with the aligned frame. */
10829 if (stack_realign_drap)
10830 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10831 + UNITS_PER_WORD);
10832 }
10833
10834 /* Special care must be taken for the normal return case of a function
10835 using eh_return: the eax and edx registers are marked as saved, but
10836 not restored along this path. Adjust the save location to match. */
10837 if (crtl->calls_eh_return && style != 2)
10838 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10839
10840 /* EH_RETURN requires the use of moves to function properly. */
10841 if (crtl->calls_eh_return)
10842 restore_regs_via_mov = true;
10843 /* SEH requires the use of pops to identify the epilogue. */
10844 else if (TARGET_SEH)
10845 restore_regs_via_mov = false;
10846 /* If we're only restoring one register and sp is not valid, then
10847 use a move instruction to restore the register, since that's
10848 less work than reloading sp and popping the register. */
10849 else if (!m->fs.sp_valid && frame.nregs <= 1)
10850 restore_regs_via_mov = true;
10851 else if (TARGET_EPILOGUE_USING_MOVE
10852 && cfun->machine->use_fast_prologue_epilogue
10853 && (frame.nregs > 1
10854 || m->fs.sp_offset != frame.reg_save_offset))
10855 restore_regs_via_mov = true;
10856 else if (frame_pointer_needed
10857 && !frame.nregs
10858 && m->fs.sp_offset != frame.reg_save_offset)
10859 restore_regs_via_mov = true;
10860 else if (frame_pointer_needed
10861 && TARGET_USE_LEAVE
10862 && cfun->machine->use_fast_prologue_epilogue
10863 && frame.nregs == 1)
10864 restore_regs_via_mov = true;
10865 else
10866 restore_regs_via_mov = false;
10867
10868 if (restore_regs_via_mov || frame.nsseregs)
10869 {
10870 /* Ensure that the entire register save area is addressable via
10871 the stack pointer, if we will restore via sp. */
10872 if (TARGET_64BIT
10873 && m->fs.sp_offset > 0x7fffffff
10874 && !(m->fs.fp_valid || m->fs.drap_valid)
10875 && (frame.nsseregs + frame.nregs) != 0)
10876 {
10877 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10878 GEN_INT (m->fs.sp_offset
10879 - frame.sse_reg_save_offset),
10880 style,
10881 m->fs.cfa_reg == stack_pointer_rtx);
10882 }
10883 }
10884
10885 /* If there are any SSE registers to restore, then we have to do it
10886 via moves, since there's obviously no pop for SSE regs. */
10887 if (frame.nsseregs)
10888 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10889 style == 2);
10890
10891 if (restore_regs_via_mov)
10892 {
10893 rtx t;
10894
10895 if (frame.nregs)
10896 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10897
10898 /* eh_return epilogues need %ecx added to the stack pointer. */
10899 if (style == 2)
10900 {
10901 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10902
10903 /* Stack align doesn't work with eh_return. */
10904 gcc_assert (!stack_realign_drap);
10905 /* Neither do regparm nested functions. */
10906 gcc_assert (!ix86_static_chain_on_stack);
10907
10908 if (frame_pointer_needed)
10909 {
10910 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10911 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
10912 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10913
10914 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10915 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10916
10917 /* Note that we use SA as a temporary CFA, as the return
10918 address is at the proper place relative to it. We
10919 pretend this happens at the FP restore insn because
10920 prior to this insn the FP would be stored at the wrong
10921 offset relative to SA, and after this insn we have no
10922 other reasonable register to use for the CFA. We don't
10923 bother resetting the CFA to the SP for the duration of
10924 the return insn. */
10925 add_reg_note (insn, REG_CFA_DEF_CFA,
10926 plus_constant (Pmode, sa, UNITS_PER_WORD));
10927 ix86_add_queued_cfa_restore_notes (insn);
10928 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10929 RTX_FRAME_RELATED_P (insn) = 1;
10930
10931 m->fs.cfa_reg = sa;
10932 m->fs.cfa_offset = UNITS_PER_WORD;
10933 m->fs.fp_valid = false;
10934
10935 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10936 const0_rtx, style, false);
10937 }
10938 else
10939 {
10940 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10941 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
10942 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10943 ix86_add_queued_cfa_restore_notes (insn);
10944
10945 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10946 if (m->fs.cfa_offset != UNITS_PER_WORD)
10947 {
10948 m->fs.cfa_offset = UNITS_PER_WORD;
10949 add_reg_note (insn, REG_CFA_DEF_CFA,
10950 plus_constant (Pmode, stack_pointer_rtx,
10951 UNITS_PER_WORD));
10952 RTX_FRAME_RELATED_P (insn) = 1;
10953 }
10954 }
10955 m->fs.sp_offset = UNITS_PER_WORD;
10956 m->fs.sp_valid = true;
10957 }
10958 }
10959 else
10960 {
10961 /* SEH requires that the function end with (1) a stack adjustment
10962 if necessary, (2) a sequence of pops, and (3) a return or
10963 jump instruction. Prevent insns from the function body from
10964 being scheduled into this sequence. */
10965 if (TARGET_SEH)
10966 {
10967 /* Prevent a catch region from being adjacent to the standard
10968 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
10969 several other flags that would be interesting to test are
10970 set up yet. */
10971 if (flag_non_call_exceptions)
10972 emit_insn (gen_nops (const1_rtx));
10973 else
10974 emit_insn (gen_blockage ());
10975 }
10976
10977 /* First step is to deallocate the stack frame so that we can
10978 pop the registers. */
10979 if (!m->fs.sp_valid)
10980 {
10981 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
10982 GEN_INT (m->fs.fp_offset
10983 - frame.reg_save_offset),
10984 style, false);
10985 }
10986 else if (m->fs.sp_offset != frame.reg_save_offset)
10987 {
10988 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10989 GEN_INT (m->fs.sp_offset
10990 - frame.reg_save_offset),
10991 style,
10992 m->fs.cfa_reg == stack_pointer_rtx);
10993 }
10994
10995 ix86_emit_restore_regs_using_pop ();
10996 }
10997
10998 /* If we used a frame pointer and haven't already got rid of it,
10999 then do so now. */
11000 if (m->fs.fp_valid)
11001 {
11002 /* If the stack pointer is valid and pointing at the frame
11003 pointer store address, then we only need a pop. */
11004 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11005 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11006 /* Leave results in shorter dependency chains on CPUs that are
11007 able to grok it fast. */
11008 else if (TARGET_USE_LEAVE
11009 || optimize_function_for_size_p (cfun)
11010 || !cfun->machine->use_fast_prologue_epilogue)
11011 ix86_emit_leave ();
11012 else
11013 {
11014 pro_epilogue_adjust_stack (stack_pointer_rtx,
11015 hard_frame_pointer_rtx,
11016 const0_rtx, style, !using_drap);
11017 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11018 }
11019 }
11020
11021 if (using_drap)
11022 {
11023 int param_ptr_offset = UNITS_PER_WORD;
11024 rtx insn;
11025
11026 gcc_assert (stack_realign_drap);
11027
11028 if (ix86_static_chain_on_stack)
11029 param_ptr_offset += UNITS_PER_WORD;
11030 if (!call_used_regs[REGNO (crtl->drap_reg)])
11031 param_ptr_offset += UNITS_PER_WORD;
11032
11033 insn = emit_insn (gen_rtx_SET
11034 (VOIDmode, stack_pointer_rtx,
11035 gen_rtx_PLUS (Pmode,
11036 crtl->drap_reg,
11037 GEN_INT (-param_ptr_offset))));
11038 m->fs.cfa_reg = stack_pointer_rtx;
11039 m->fs.cfa_offset = param_ptr_offset;
11040 m->fs.sp_offset = param_ptr_offset;
11041 m->fs.realigned = false;
11042
11043 add_reg_note (insn, REG_CFA_DEF_CFA,
11044 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11045 GEN_INT (param_ptr_offset)));
11046 RTX_FRAME_RELATED_P (insn) = 1;
11047
11048 if (!call_used_regs[REGNO (crtl->drap_reg)])
11049 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11050 }
11051
11052 /* At this point the stack pointer must be valid, and we must have
11053 restored all of the registers. We may not have deallocated the
11054 entire stack frame. We've delayed this until now because it may
11055 be possible to merge the local stack deallocation with the
11056 deallocation forced by ix86_static_chain_on_stack. */
11057 gcc_assert (m->fs.sp_valid);
11058 gcc_assert (!m->fs.fp_valid);
11059 gcc_assert (!m->fs.realigned);
11060 if (m->fs.sp_offset != UNITS_PER_WORD)
11061 {
11062 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11063 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11064 style, true);
11065 }
11066 else
11067 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11068
11069 /* Sibcall epilogues don't want a return instruction. */
11070 if (style == 0)
11071 {
11072 m->fs = frame_state_save;
11073 return;
11074 }
11075
11076 /* Emit vzeroupper if needed. */
11077 ix86_maybe_emit_epilogue_vzeroupper ();
11078
11079 if (crtl->args.pops_args && crtl->args.size)
11080 {
11081 rtx popc = GEN_INT (crtl->args.pops_args);
11082
11083 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
11084 address, do an explicit add, and jump indirectly to the caller. */
11085
11086 if (crtl->args.pops_args >= 65536)
11087 {
11088 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11089 rtx insn;
11090
11091 /* There is no "pascal" calling convention in any 64bit ABI. */
11092 gcc_assert (!TARGET_64BIT);
11093
11094 insn = emit_insn (gen_pop (ecx));
11095 m->fs.cfa_offset -= UNITS_PER_WORD;
11096 m->fs.sp_offset -= UNITS_PER_WORD;
11097
11098 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11099 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11100 add_reg_note (insn, REG_CFA_REGISTER,
11101 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11102 RTX_FRAME_RELATED_P (insn) = 1;
11103
11104 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11105 popc, -1, true);
11106 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11107 }
11108 else
11109 emit_jump_insn (gen_simple_return_pop_internal (popc));
11110 }
11111 else
11112 emit_jump_insn (gen_simple_return_internal ());
11113
11114 /* Restore the state back to the state from the prologue,
11115 so that it's correct for the next epilogue. */
11116 m->fs = frame_state_save;
11117 }
11118
11119 /* Reset from the function's potential modifications. */
11120
11121 static void
11122 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11123 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11124 {
11125 if (pic_offset_table_rtx)
11126 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11127 #if TARGET_MACHO
11128 /* Mach-O doesn't support labels at the end of objects, so if
11129 it looks like we might want one, insert a NOP. */
11130 {
11131 rtx insn = get_last_insn ();
11132 rtx deleted_debug_label = NULL_RTX;
11133 while (insn
11134 && NOTE_P (insn)
11135 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11136 {
11137 /* For NOTE_INSN_DELETED_DEBUG_LABEL notes only, don't insert
11138 a nop; instead set their CODE_LABEL_NUMBER to -1, otherwise
11139 there would be code generation differences
11140 between -g and -g0. */
11141 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11142 deleted_debug_label = insn;
11143 insn = PREV_INSN (insn);
11144 }
11145 if (insn
11146 && (LABEL_P (insn)
11147 || (NOTE_P (insn)
11148 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11149 fputs ("\tnop\n", file);
11150 else if (deleted_debug_label)
11151 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11152 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11153 CODE_LABEL_NUMBER (insn) = -1;
11154 }
11155 #endif
11156
11157 }
11158
11159 /* Return a scratch register to use in the split stack prologue. The
11160 split stack prologue is used for -fsplit-stack. It consists of the first
11161 instructions in the function, even before the regular prologue.
11162 The scratch register can be any caller-saved register which is not
11163 used for parameters or for the static chain. */
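/* (On 32-bit targets the only call-clobbered integer registers are %eax,
   %ecx and %edx, which is why the choice below depends on how many of them
   fastcall/regparm already use for arguments.)  */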
11164
11165 static unsigned int
11166 split_stack_prologue_scratch_regno (void)
11167 {
11168 if (TARGET_64BIT)
11169 return R11_REG;
11170 else
11171 {
11172 bool is_fastcall;
11173 int regparm;
11174
11175 is_fastcall = (lookup_attribute ("fastcall",
11176 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11177 != NULL);
11178 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11179
11180 if (is_fastcall)
11181 {
11182 if (DECL_STATIC_CHAIN (cfun->decl))
11183 {
11184 sorry ("-fsplit-stack does not support fastcall with "
11185 "nested function");
11186 return INVALID_REGNUM;
11187 }
11188 return AX_REG;
11189 }
11190 else if (regparm < 3)
11191 {
11192 if (!DECL_STATIC_CHAIN (cfun->decl))
11193 return CX_REG;
11194 else
11195 {
11196 if (regparm >= 2)
11197 {
11198 sorry ("-fsplit-stack does not support 2 register "
11199 " parameters for a nested function");
11200 return INVALID_REGNUM;
11201 }
11202 return DX_REG;
11203 }
11204 }
11205 else
11206 {
11207 /* FIXME: We could make this work by pushing a register
11208 around the addition and comparison. */
11209 sorry ("-fsplit-stack does not support 3 register parameters");
11210 return INVALID_REGNUM;
11211 }
11212 }
11213 }
11214
11215 /* A SYMBOL_REF for the function which allocates new stack space for
11216 -fsplit-stack. */
11217
11218 static GTY(()) rtx split_stack_fn;
11219
11220 /* A SYMBOL_REF for the more-stack function to use when the large
11221 code model is in effect. */
11222
11223 static GTY(()) rtx split_stack_fn_large;
11224
11225 /* Handle -fsplit-stack. These are the first instructions in the
11226 function, even before the regular prologue. */
11227
11228 void
11229 ix86_expand_split_stack_prologue (void)
11230 {
11231 struct ix86_frame frame;
11232 HOST_WIDE_INT allocate;
11233 unsigned HOST_WIDE_INT args_size;
11234 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11235 rtx scratch_reg = NULL_RTX;
11236 rtx varargs_label = NULL_RTX;
11237 rtx fn;
11238
11239 gcc_assert (flag_split_stack && reload_completed);
11240
11241 ix86_finalize_stack_realign_flags ();
11242 ix86_compute_frame_layout (&frame);
11243 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11244
11245 /* This is the label we will branch to if we have enough stack
11246 space. We expect the basic block reordering pass to reverse this
11247 branch if optimizing, so that we branch in the unlikely case. */
11248 label = gen_label_rtx ();
11249
11250 /* We need to compare the stack pointer minus the frame size with
11251 the stack boundary in the TCB. The stack boundary always gives
11252 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11253 can compare directly. Otherwise we need to do an addition. */
11254
11255 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11256 UNSPEC_STACK_CHECK);
11257 limit = gen_rtx_CONST (Pmode, limit);
11258 limit = gen_rtx_MEM (Pmode, limit);
11259 if (allocate < SPLIT_STACK_AVAILABLE)
11260 current = stack_pointer_rtx;
11261 else
11262 {
11263 unsigned int scratch_regno;
11264 rtx offset;
11265
11266 /* We need a scratch register to hold the stack pointer minus
11267 the required frame size. Since this is the very start of the
11268 function, the scratch register can be any caller-saved
11269 register which is not used for parameters. */
11270 offset = GEN_INT (- allocate);
11271 scratch_regno = split_stack_prologue_scratch_regno ();
11272 if (scratch_regno == INVALID_REGNUM)
11273 return;
11274 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11275 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11276 {
11277 /* We don't use ix86_gen_add3 in this case because it will
11278 want to split to lea, but when not optimizing the insn
11279 will not be split after this point. */
11280 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11281 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11282 offset)));
11283 }
11284 else
11285 {
11286 emit_move_insn (scratch_reg, offset);
11287 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11288 stack_pointer_rtx));
11289 }
11290 current = scratch_reg;
11291 }
11292
11293 ix86_expand_branch (GEU, current, limit, label);
11294 jump_insn = get_last_insn ();
11295 JUMP_LABEL (jump_insn) = label;
11296
11297 /* Mark the jump as very likely to be taken. */
11298 add_reg_note (jump_insn, REG_BR_PROB,
11299 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11300
11301 if (split_stack_fn == NULL_RTX)
11302 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11303 fn = split_stack_fn;
11304
11305 /* Get more stack space. We pass in the desired stack space and the
11306 size of the arguments to copy to the new stack. In 32-bit mode
11307 we push the parameters; __morestack will return on a new stack
11308 anyhow. In 64-bit mode we pass the parameters in r10 and
11309 r11. */
11310 allocate_rtx = GEN_INT (allocate);
11311 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11312 call_fusage = NULL_RTX;
11313 if (TARGET_64BIT)
11314 {
11315 rtx reg10, reg11;
11316
11317 reg10 = gen_rtx_REG (Pmode, R10_REG);
11318 reg11 = gen_rtx_REG (Pmode, R11_REG);
11319
11320 /* If this function uses a static chain, it will be in %r10.
11321 Preserve it across the call to __morestack. */
11322 if (DECL_STATIC_CHAIN (cfun->decl))
11323 {
11324 rtx rax;
11325
11326 rax = gen_rtx_REG (word_mode, AX_REG);
11327 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11328 use_reg (&call_fusage, rax);
11329 }
11330
11331 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11332 {
11333 HOST_WIDE_INT argval;
11334
11335 gcc_assert (Pmode == DImode);
11336 /* When using the large model we need to load the address
11337 into a register, and we've run out of registers. So we
11338 switch to a different calling convention, and we call a
11339 different function: __morestack_large_model. We pass the
11340 argument size in the upper 32 bits of r10 and pass the
11341 frame size in the lower 32 bits. */
11342 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11343 gcc_assert ((args_size & 0xffffffff) == args_size);
11344
11345 if (split_stack_fn_large == NULL_RTX)
11346 split_stack_fn_large =
11347 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11348
11349 if (ix86_cmodel == CM_LARGE_PIC)
11350 {
11351 rtx label, x;
11352
11353 label = gen_label_rtx ();
11354 emit_label (label);
11355 LABEL_PRESERVE_P (label) = 1;
11356 emit_insn (gen_set_rip_rex64 (reg10, label));
11357 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11358 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11359 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11360 UNSPEC_GOT);
11361 x = gen_rtx_CONST (Pmode, x);
11362 emit_move_insn (reg11, x);
11363 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11364 x = gen_const_mem (Pmode, x);
11365 emit_move_insn (reg11, x);
11366 }
11367 else
11368 emit_move_insn (reg11, split_stack_fn_large);
11369
11370 fn = reg11;
11371
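/* The argument size goes in the upper 32 bits and the frame size in the
   lower 32 bits; the shift is written as two 16-bit shifts, presumably so
   the shift count stays below the width of the type on hosts where
   HOST_WIDE_INT is only 32 bits wide.  */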
11372 argval = ((args_size << 16) << 16) + allocate;
11373 emit_move_insn (reg10, GEN_INT (argval));
11374 }
11375 else
11376 {
11377 emit_move_insn (reg10, allocate_rtx);
11378 emit_move_insn (reg11, GEN_INT (args_size));
11379 use_reg (&call_fusage, reg11);
11380 }
11381
11382 use_reg (&call_fusage, reg10);
11383 }
11384 else
11385 {
11386 emit_insn (gen_push (GEN_INT (args_size)));
11387 emit_insn (gen_push (allocate_rtx));
11388 }
11389 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11390 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11391 NULL_RTX, false);
11392 add_function_usage_to (call_insn, call_fusage);
11393
11394 /* In order to make call/return prediction work right, we now need
11395 to execute a return instruction. See
11396 libgcc/config/i386/morestack.S for the details on how this works.
11397
11398 For flow purposes gcc must not see this as a return
11399 instruction--we need control flow to continue at the subsequent
11400 label. Therefore, we use an unspec. */
11401 gcc_assert (crtl->args.pops_args < 65536);
11402 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11403
11404 /* If we are in 64-bit mode and this function uses a static chain,
11405 we saved %r10 in %rax before calling __morestack. */
11406 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11407 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11408 gen_rtx_REG (word_mode, AX_REG));
11409
11410 /* If this function calls va_start, we need to store a pointer to
11411 the arguments on the old stack, because they may not have been
11412 all copied to the new stack. At this point the old stack can be
11413 found at the frame pointer value used by __morestack, because
11414 __morestack has set that up before calling back to us. Here we
11415 store that pointer in a scratch register, and in
11416 ix86_expand_prologue we store the scratch register in a stack
11417 slot. */
11418 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11419 {
11420 unsigned int scratch_regno;
11421 rtx frame_reg;
11422 int words;
11423
11424 scratch_regno = split_stack_prologue_scratch_regno ();
11425 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11426 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11427
11428 /* 64-bit:
11429 fp -> old fp value
11430 return address within this function
11431 return address of caller of this function
11432 stack arguments
11433 So we add three words to get to the stack arguments.
11434
11435 32-bit:
11436 fp -> old fp value
11437 return address within this function
11438 first argument to __morestack
11439 second argument to __morestack
11440 return address of caller of this function
11441 stack arguments
11442 So we add five words to get to the stack arguments.
11443 */
11444 words = TARGET_64BIT ? 3 : 5;
11445 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11446 gen_rtx_PLUS (Pmode, frame_reg,
11447 GEN_INT (words * UNITS_PER_WORD))));
11448
11449 varargs_label = gen_label_rtx ();
11450 emit_jump_insn (gen_jump (varargs_label));
11451 JUMP_LABEL (get_last_insn ()) = varargs_label;
11452
11453 emit_barrier ();
11454 }
11455
11456 emit_label (label);
11457 LABEL_NUSES (label) = 1;
11458
11459 /* If this function calls va_start, we now have to set the scratch
11460 register for the case where we do not call __morestack. In this
11461 case we need to set it based on the stack pointer. */
11462 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11463 {
11464 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11465 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11466 GEN_INT (UNITS_PER_WORD))));
11467
11468 emit_label (varargs_label);
11469 LABEL_NUSES (varargs_label) = 1;
11470 }
11471 }
11472
11473 /* We may have to tell the dataflow pass that the split stack prologue
11474 is initializing a scratch register. */
11475
11476 static void
11477 ix86_live_on_entry (bitmap regs)
11478 {
11479 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11480 {
11481 gcc_assert (flag_split_stack);
11482 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11483 }
11484 }
11485 \f
11486 /* Determine if OP is a suitable SUBREG RTX for an address. */
11487
11488 static bool
11489 ix86_address_subreg_operand (rtx op)
11490 {
11491 enum machine_mode mode;
11492
11493 if (!REG_P (op))
11494 return false;
11495
11496 mode = GET_MODE (op);
11497
11498 if (GET_MODE_CLASS (mode) != MODE_INT)
11499 return false;
11500
11501 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11502 failures when the register is one word out of a two word structure. */
11503 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11504 return false;
11505
11506 /* Allow only SUBREGs of non-eliminable hard registers. */
11507 return register_no_elim_operand (op, mode);
11508 }
11509
11510 /* Extract the parts of an RTL expression that is a valid memory address
11511 for an instruction. Return 0 if the structure of the address is
11512 grossly off. Return -1 if the address contains ASHIFT, so it is not
11513 strictly valid, but is still used for computing the length of an lea instruction. */
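/* For example, the address (plus:SI (plus:SI (mult:SI (reg:SI bx)
                                                        (const_int 4))
                                              (reg:SI si))
                             (const_int 16))
   decomposes into base = %esi, index = %ebx, scale = 4, disp = 16,
   i.e. the x86 operand 16(%esi,%ebx,4).  */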
11514
11515 int
11516 ix86_decompose_address (rtx addr, struct ix86_address *out)
11517 {
11518 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11519 rtx base_reg, index_reg;
11520 HOST_WIDE_INT scale = 1;
11521 rtx scale_rtx = NULL_RTX;
11522 rtx tmp;
11523 int retval = 1;
11524 enum ix86_address_seg seg = SEG_DEFAULT;
11525
11526 /* Allow zero-extended SImode addresses;
11527 they will be emitted with the addr32 prefix. */
11528 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11529 {
11530 if (GET_CODE (addr) == ZERO_EXTEND
11531 && GET_MODE (XEXP (addr, 0)) == SImode)
11532 addr = XEXP (addr, 0);
11533 else if (GET_CODE (addr) == AND
11534 && const_32bit_mask (XEXP (addr, 1), DImode))
11535 {
11536 addr = XEXP (addr, 0);
11537
11538 /* Adjust SUBREGs. */
11539 if (GET_CODE (addr) == SUBREG
11540 && GET_MODE (SUBREG_REG (addr)) == SImode)
11541 addr = SUBREG_REG (addr);
11542 else if (GET_MODE (addr) == DImode)
11543 addr = gen_rtx_SUBREG (SImode, addr, 0);
11544 else if (GET_MODE (addr) != VOIDmode)
11545 return 0;
11546 }
11547 }
11548
11549 if (REG_P (addr))
11550 base = addr;
11551 else if (GET_CODE (addr) == SUBREG)
11552 {
11553 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11554 base = addr;
11555 else
11556 return 0;
11557 }
11558 else if (GET_CODE (addr) == PLUS)
11559 {
11560 rtx addends[4], op;
11561 int n = 0, i;
11562
11563 op = addr;
11564 do
11565 {
11566 if (n >= 4)
11567 return 0;
11568 addends[n++] = XEXP (op, 1);
11569 op = XEXP (op, 0);
11570 }
11571 while (GET_CODE (op) == PLUS);
11572 if (n >= 4)
11573 return 0;
11574 addends[n] = op;
11575
11576 for (i = n; i >= 0; --i)
11577 {
11578 op = addends[i];
11579 switch (GET_CODE (op))
11580 {
11581 case MULT:
11582 if (index)
11583 return 0;
11584 index = XEXP (op, 0);
11585 scale_rtx = XEXP (op, 1);
11586 break;
11587
11588 case ASHIFT:
11589 if (index)
11590 return 0;
11591 index = XEXP (op, 0);
11592 tmp = XEXP (op, 1);
11593 if (!CONST_INT_P (tmp))
11594 return 0;
11595 scale = INTVAL (tmp);
11596 if ((unsigned HOST_WIDE_INT) scale > 3)
11597 return 0;
11598 scale = 1 << scale;
11599 break;
11600
11601 case ZERO_EXTEND:
11602 op = XEXP (op, 0);
11603 if (GET_CODE (op) != UNSPEC)
11604 return 0;
11605 /* FALLTHRU */
11606
11607 case UNSPEC:
11608 if (XINT (op, 1) == UNSPEC_TP
11609 && TARGET_TLS_DIRECT_SEG_REFS
11610 && seg == SEG_DEFAULT)
11611 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11612 else
11613 return 0;
11614 break;
11615
11616 case SUBREG:
11617 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11618 return 0;
11619 /* FALLTHRU */
11620
11621 case REG:
11622 if (!base)
11623 base = op;
11624 else if (!index)
11625 index = op;
11626 else
11627 return 0;
11628 break;
11629
11630 case CONST:
11631 case CONST_INT:
11632 case SYMBOL_REF:
11633 case LABEL_REF:
11634 if (disp)
11635 return 0;
11636 disp = op;
11637 break;
11638
11639 default:
11640 return 0;
11641 }
11642 }
11643 }
11644 else if (GET_CODE (addr) == MULT)
11645 {
11646 index = XEXP (addr, 0); /* index*scale */
11647 scale_rtx = XEXP (addr, 1);
11648 }
11649 else if (GET_CODE (addr) == ASHIFT)
11650 {
11651 /* We're called for lea too, which implements ashift on occasion. */
11652 index = XEXP (addr, 0);
11653 tmp = XEXP (addr, 1);
11654 if (!CONST_INT_P (tmp))
11655 return 0;
11656 scale = INTVAL (tmp);
11657 if ((unsigned HOST_WIDE_INT) scale > 3)
11658 return 0;
11659 scale = 1 << scale;
11660 retval = -1;
11661 }
11662 else
11663 disp = addr; /* displacement */
11664
11665 if (index)
11666 {
11667 if (REG_P (index))
11668 ;
11669 else if (GET_CODE (index) == SUBREG
11670 && ix86_address_subreg_operand (SUBREG_REG (index)))
11671 ;
11672 else
11673 return 0;
11674 }
11675
11676 /* Address override works only on the (%reg) part of %fs:(%reg). */
11677 if (seg != SEG_DEFAULT
11678 && ((base && GET_MODE (base) != word_mode)
11679 || (index && GET_MODE (index) != word_mode)))
11680 return 0;
11681
11682 /* Extract the integral value of scale. */
11683 if (scale_rtx)
11684 {
11685 if (!CONST_INT_P (scale_rtx))
11686 return 0;
11687 scale = INTVAL (scale_rtx);
11688 }
11689
11690 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11691 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11692
11693 /* Avoid useless 0 displacement. */
11694 if (disp == const0_rtx && (base || index))
11695 disp = NULL_RTX;
11696
11697 /* Allow the arg pointer and stack pointer as the index if there is no scaling. */
11698 if (base_reg && index_reg && scale == 1
11699 && (index_reg == arg_pointer_rtx
11700 || index_reg == frame_pointer_rtx
11701 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11702 {
11703 rtx tmp;
11704 tmp = base, base = index, index = tmp;
11705 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11706 }
11707
11708 /* Special case: %ebp cannot be encoded as a base without a displacement.
11709 Similarly %r13. */
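/* (In the ModR/M encoding, mod=00 with base register 101b means "disp32
   only" (or RIP-relative in 64-bit mode), so a base of %ebp or %r13 always
   needs at least a zero displacement byte.)  */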
11710 if (!disp
11711 && base_reg
11712 && (base_reg == hard_frame_pointer_rtx
11713 || base_reg == frame_pointer_rtx
11714 || base_reg == arg_pointer_rtx
11715 || (REG_P (base_reg)
11716 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11717 || REGNO (base_reg) == R13_REG))))
11718 disp = const0_rtx;
11719
11720 /* Special case: on K6, [%esi] makes the instruction vector decoded.
11721 Avoid this by transforming to [%esi+0].
11722 Reload calls address legitimization without cfun defined, so we need
11723 to test cfun for being non-NULL. */
11724 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11725 && base_reg && !index_reg && !disp
11726 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11727 disp = const0_rtx;
11728
11729 /* Special case: encode reg+reg instead of reg*2. */
11730 if (!base && index && scale == 2)
11731 base = index, base_reg = index_reg, scale = 1;
11732
11733 /* Special case: scaling cannot be encoded without base or displacement. */
11734 if (!base && !disp && index && scale != 1)
11735 disp = const0_rtx;
11736
11737 out->base = base;
11738 out->index = index;
11739 out->disp = disp;
11740 out->scale = scale;
11741 out->seg = seg;
11742
11743 return retval;
11744 }
11745 \f
11746 /* Return the cost of the memory address X.
11747 For i386, it is better to use a complex address than to let gcc copy
11748 the address into a reg and make a new pseudo. But not if the address
11749 requires two regs - that would mean more pseudos with longer
11750 lifetimes. */
11751 static int
11752 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
11753 {
11754 struct ix86_address parts;
11755 int cost = 1;
11756 int ok = ix86_decompose_address (x, &parts);
11757
11758 gcc_assert (ok);
11759
11760 if (parts.base && GET_CODE (parts.base) == SUBREG)
11761 parts.base = SUBREG_REG (parts.base);
11762 if (parts.index && GET_CODE (parts.index) == SUBREG)
11763 parts.index = SUBREG_REG (parts.index);
11764
11765 /* Attempt to minimize number of registers in the address. */
11766 if ((parts.base
11767 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11768 || (parts.index
11769 && (!REG_P (parts.index)
11770 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11771 cost++;
11772
11773 if (parts.base
11774 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11775 && parts.index
11776 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11777 && parts.base != parts.index)
11778 cost++;
11779
11780 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11781 since its predecode logic can't detect the length of such instructions
11782 and decoding degenerates to the vector decoder. Increase the cost of such
11783 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
11784 to split such addresses or even refuse such addresses at all.
11785
11786 Following addressing modes are affected:
11787 [base+scale*index]
11788 [scale*index+disp]
11789 [base+index]
11790
11791 The first and last cases may be avoidable by explicitly coding the zero in
11792 the memory address, but I don't have an AMD-K6 machine handy to check
11793 this theory. */
11794
11795 if (TARGET_K6
11796 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11797 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11798 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11799 cost += 10;
11800
11801 return cost;
11802 }
11803 \f
11804 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
11805 this is used to form addresses to local data when -fPIC is in
11806 use. */
11807
11808 static bool
11809 darwin_local_data_pic (rtx disp)
11810 {
11811 return (GET_CODE (disp) == UNSPEC
11812 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11813 }
11814
11815 /* Determine if a given RTX is a valid constant. We already know this
11816 satisfies CONSTANT_P. */
11817
11818 static bool
11819 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11820 {
11821 switch (GET_CODE (x))
11822 {
11823 case CONST:
11824 x = XEXP (x, 0);
11825
11826 if (GET_CODE (x) == PLUS)
11827 {
11828 if (!CONST_INT_P (XEXP (x, 1)))
11829 return false;
11830 x = XEXP (x, 0);
11831 }
11832
11833 if (TARGET_MACHO && darwin_local_data_pic (x))
11834 return true;
11835
11836 /* Only some unspecs are valid as "constants". */
11837 if (GET_CODE (x) == UNSPEC)
11838 switch (XINT (x, 1))
11839 {
11840 case UNSPEC_GOT:
11841 case UNSPEC_GOTOFF:
11842 case UNSPEC_PLTOFF:
11843 return TARGET_64BIT;
11844 case UNSPEC_TPOFF:
11845 case UNSPEC_NTPOFF:
11846 x = XVECEXP (x, 0, 0);
11847 return (GET_CODE (x) == SYMBOL_REF
11848 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11849 case UNSPEC_DTPOFF:
11850 x = XVECEXP (x, 0, 0);
11851 return (GET_CODE (x) == SYMBOL_REF
11852 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11853 default:
11854 return false;
11855 }
11856
11857 /* We must have drilled down to a symbol. */
11858 if (GET_CODE (x) == LABEL_REF)
11859 return true;
11860 if (GET_CODE (x) != SYMBOL_REF)
11861 return false;
11862 /* FALLTHRU */
11863
11864 case SYMBOL_REF:
11865 /* TLS symbols are never valid. */
11866 if (SYMBOL_REF_TLS_MODEL (x))
11867 return false;
11868
11869 /* DLLIMPORT symbols are never valid. */
11870 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11871 && SYMBOL_REF_DLLIMPORT_P (x))
11872 return false;
11873
11874 #if TARGET_MACHO
11875 /* mdynamic-no-pic */
11876 if (MACHO_DYNAMIC_NO_PIC_P)
11877 return machopic_symbol_defined_p (x);
11878 #endif
11879 break;
11880
11881 case CONST_DOUBLE:
11882 if (GET_MODE (x) == TImode
11883 && x != CONST0_RTX (TImode)
11884 && !TARGET_64BIT)
11885 return false;
11886 break;
11887
11888 case CONST_VECTOR:
11889 if (!standard_sse_constant_p (x))
11890 return false;
11891
11892 default:
11893 break;
11894 }
11895
11896 /* Otherwise we handle everything else in the move patterns. */
11897 return true;
11898 }
11899
11900 /* Determine if it's legal to put X into the constant pool. This
11901 is not possible for the address of thread-local symbols, which
11902 is checked above. */
11903
11904 static bool
11905 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11906 {
11907 /* We can always put integral constants and vectors in memory. */
11908 switch (GET_CODE (x))
11909 {
11910 case CONST_INT:
11911 case CONST_DOUBLE:
11912 case CONST_VECTOR:
11913 return false;
11914
11915 default:
11916 break;
11917 }
11918 return !ix86_legitimate_constant_p (mode, x);
11919 }
11920
11921
11922 /* Nonzero if the constant value X is a legitimate general operand
11923 when generating PIC code. It is given that flag_pic is on and
11924 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11925
11926 bool
11927 legitimate_pic_operand_p (rtx x)
11928 {
11929 rtx inner;
11930
11931 switch (GET_CODE (x))
11932 {
11933 case CONST:
11934 inner = XEXP (x, 0);
11935 if (GET_CODE (inner) == PLUS
11936 && CONST_INT_P (XEXP (inner, 1)))
11937 inner = XEXP (inner, 0);
11938
11939 /* Only some unspecs are valid as "constants". */
11940 if (GET_CODE (inner) == UNSPEC)
11941 switch (XINT (inner, 1))
11942 {
11943 case UNSPEC_GOT:
11944 case UNSPEC_GOTOFF:
11945 case UNSPEC_PLTOFF:
11946 return TARGET_64BIT;
11947 case UNSPEC_TPOFF:
11948 x = XVECEXP (inner, 0, 0);
11949 return (GET_CODE (x) == SYMBOL_REF
11950 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11951 case UNSPEC_MACHOPIC_OFFSET:
11952 return legitimate_pic_address_disp_p (x);
11953 default:
11954 return false;
11955 }
11956 /* FALLTHRU */
11957
11958 case SYMBOL_REF:
11959 case LABEL_REF:
11960 return legitimate_pic_address_disp_p (x);
11961
11962 default:
11963 return true;
11964 }
11965 }
11966
11967 /* Determine if a given CONST RTX is a valid memory displacement
11968 in PIC mode. */
11969
11970 bool
11971 legitimate_pic_address_disp_p (rtx disp)
11972 {
11973 bool saw_plus;
11974
11975 /* In 64bit mode we can allow direct addresses of symbols and labels
11976 when they are not dynamic symbols. */
11977 if (TARGET_64BIT)
11978 {
11979 rtx op0 = disp, op1;
11980
11981 switch (GET_CODE (disp))
11982 {
11983 case LABEL_REF:
11984 return true;
11985
11986 case CONST:
11987 if (GET_CODE (XEXP (disp, 0)) != PLUS)
11988 break;
11989 op0 = XEXP (XEXP (disp, 0), 0);
11990 op1 = XEXP (XEXP (disp, 0), 1);
11991 if (!CONST_INT_P (op1)
11992 || INTVAL (op1) >= 16*1024*1024
11993 || INTVAL (op1) < -16*1024*1024)
11994 break;
11995 if (GET_CODE (op0) == LABEL_REF)
11996 return true;
11997 if (GET_CODE (op0) == CONST
11998 && GET_CODE (XEXP (op0, 0)) == UNSPEC
11999 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12000 return true;
12001 if (GET_CODE (op0) == UNSPEC
12002 && XINT (op0, 1) == UNSPEC_PCREL)
12003 return true;
12004 if (GET_CODE (op0) != SYMBOL_REF)
12005 break;
12006 /* FALLTHRU */
12007
12008 case SYMBOL_REF:
12009 /* TLS references should always be enclosed in UNSPEC. */
12010 if (SYMBOL_REF_TLS_MODEL (op0))
12011 return false;
12012 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
12013 && ix86_cmodel != CM_LARGE_PIC)
12014 return true;
12015 break;
12016
12017 default:
12018 break;
12019 }
12020 }
12021 if (GET_CODE (disp) != CONST)
12022 return false;
12023 disp = XEXP (disp, 0);
12024
12025 if (TARGET_64BIT)
12026 {
12027 /* It is not safe to allow PLUS expressions. This limits the allowed
12028 distance of GOT tables. We should not need these anyway. */
12029 if (GET_CODE (disp) != UNSPEC
12030 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12031 && XINT (disp, 1) != UNSPEC_GOTOFF
12032 && XINT (disp, 1) != UNSPEC_PCREL
12033 && XINT (disp, 1) != UNSPEC_PLTOFF))
12034 return false;
12035
12036 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12037 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12038 return false;
12039 return true;
12040 }
12041
12042 saw_plus = false;
12043 if (GET_CODE (disp) == PLUS)
12044 {
12045 if (!CONST_INT_P (XEXP (disp, 1)))
12046 return false;
12047 disp = XEXP (disp, 0);
12048 saw_plus = true;
12049 }
12050
12051 if (TARGET_MACHO && darwin_local_data_pic (disp))
12052 return true;
12053
12054 if (GET_CODE (disp) != UNSPEC)
12055 return false;
12056
12057 switch (XINT (disp, 1))
12058 {
12059 case UNSPEC_GOT:
12060 if (saw_plus)
12061 return false;
12062 /* We need to check for both symbols and labels because VxWorks loads
12063 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12064 details. */
12065 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12066 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12067 case UNSPEC_GOTOFF:
12068 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12069 While the ABI also specifies a 32bit relocation, we don't produce it in
12070 the small PIC model at all. */
12071 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12072 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12073 && !TARGET_64BIT)
12074 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12075 return false;
12076 case UNSPEC_GOTTPOFF:
12077 case UNSPEC_GOTNTPOFF:
12078 case UNSPEC_INDNTPOFF:
12079 if (saw_plus)
12080 return false;
12081 disp = XVECEXP (disp, 0, 0);
12082 return (GET_CODE (disp) == SYMBOL_REF
12083 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12084 case UNSPEC_NTPOFF:
12085 disp = XVECEXP (disp, 0, 0);
12086 return (GET_CODE (disp) == SYMBOL_REF
12087 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12088 case UNSPEC_DTPOFF:
12089 disp = XVECEXP (disp, 0, 0);
12090 return (GET_CODE (disp) == SYMBOL_REF
12091 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12092 }
12093
12094 return false;
12095 }
12096
12097 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12098 replace the input X, or the original X if no replacement is called for.
12099 The output parameter *WIN is 1 if the calling macro should goto WIN,
12100 0 if it should not. */
12101
12102 bool
12103 ix86_legitimize_reload_address (rtx x,
12104 enum machine_mode mode ATTRIBUTE_UNUSED,
12105 int opnum, int type,
12106 int ind_levels ATTRIBUTE_UNUSED)
12107 {
12108 /* Reload can generate:
12109
12110 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12111 (reg:DI 97))
12112 (reg:DI 2 cx))
12113
12114 This RTX is rejected by ix86_legitimate_address_p due to
12115 the non-strictness of base register 97. Following this rejection,
12116 reload pushes all three components into separate registers,
12117 creating an invalid memory address RTX.
12118
12119 The following code reloads only the invalid part of the
12120 memory address RTX. */
12121
12122 if (GET_CODE (x) == PLUS
12123 && REG_P (XEXP (x, 1))
12124 && GET_CODE (XEXP (x, 0)) == PLUS
12125 && REG_P (XEXP (XEXP (x, 0), 1)))
12126 {
12127 rtx base, index;
12128 bool something_reloaded = false;
12129
12130 base = XEXP (XEXP (x, 0), 1);
12131 if (!REG_OK_FOR_BASE_STRICT_P (base))
12132 {
12133 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12134 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12135 opnum, (enum reload_type) type);
12136 something_reloaded = true;
12137 }
12138
12139 index = XEXP (x, 1);
12140 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12141 {
12142 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12143 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12144 opnum, (enum reload_type) type);
12145 something_reloaded = true;
12146 }
12147
12148 gcc_assert (something_reloaded);
12149 return true;
12150 }
12151
12152 return false;
12153 }
12154
12155 /* Recognizes RTL expressions that are valid memory addresses for an
12156 instruction. The MODE argument is the machine mode for the MEM
12157 expression that wants to use this address.
12158
12159 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12160 convert common non-canonical forms to canonical form so that they will
12161 be recognized. */
12162
12163 static bool
12164 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12165 rtx addr, bool strict)
12166 {
12167 struct ix86_address parts;
12168 rtx base, index, disp;
12169 HOST_WIDE_INT scale;
12170
12171 /* Since a constant address in x32 is sign-extended to 64bit,
12172 we have to reject addresses from 0x80000000 to 0xffffffff. */
12173 if (TARGET_X32
12174 && CONST_INT_P (addr)
12175 && INTVAL (addr) < 0)
12176 return false;
12177
12178 if (ix86_decompose_address (addr, &parts) <= 0)
12179 /* Decomposition failed. */
12180 return false;
12181
12182 base = parts.base;
12183 index = parts.index;
12184 disp = parts.disp;
12185 scale = parts.scale;
12186
12187 /* Validate base register. */
12188 if (base)
12189 {
12190 rtx reg;
12191
12192 if (REG_P (base))
12193 reg = base;
12194 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12195 reg = SUBREG_REG (base);
12196 else
12197 /* Base is not a register. */
12198 return false;
12199
12200 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12201 return false;
12202
12203 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12204 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12205 /* Base is not valid. */
12206 return false;
12207 }
12208
12209 /* Validate index register. */
12210 if (index)
12211 {
12212 rtx reg;
12213
12214 if (REG_P (index))
12215 reg = index;
12216 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12217 reg = SUBREG_REG (index);
12218 else
12219 /* Index is not a register. */
12220 return false;
12221
12222 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12223 return false;
12224
12225 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12226 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12227 /* Index is not valid. */
12228 return false;
12229 }
12230
12231 /* Index and base should have the same mode. */
12232 if (base && index
12233 && GET_MODE (base) != GET_MODE (index))
12234 return false;
12235
12236 /* Validate scale factor. */
12237 if (scale != 1)
12238 {
12239 if (!index)
12240 /* Scale without index. */
12241 return false;
12242
12243 if (scale != 2 && scale != 4 && scale != 8)
12244 /* Scale is not a valid multiplier. */
12245 return false;
12246 }
12247
12248 /* Validate displacement. */
12249 if (disp)
12250 {
12251 if (GET_CODE (disp) == CONST
12252 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12253 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12254 switch (XINT (XEXP (disp, 0), 1))
12255 {
12256 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12257 used. While the ABI also specifies 32bit relocations, we don't
12258 produce them at all and use IP-relative addressing instead. */
12259 case UNSPEC_GOT:
12260 case UNSPEC_GOTOFF:
12261 gcc_assert (flag_pic);
12262 if (!TARGET_64BIT)
12263 goto is_legitimate_pic;
12264
12265 /* 64bit address unspec. */
12266 return false;
12267
12268 case UNSPEC_GOTPCREL:
12269 case UNSPEC_PCREL:
12270 gcc_assert (flag_pic);
12271 goto is_legitimate_pic;
12272
12273 case UNSPEC_GOTTPOFF:
12274 case UNSPEC_GOTNTPOFF:
12275 case UNSPEC_INDNTPOFF:
12276 case UNSPEC_NTPOFF:
12277 case UNSPEC_DTPOFF:
12278 break;
12279
12280 case UNSPEC_STACK_CHECK:
12281 gcc_assert (flag_split_stack);
12282 break;
12283
12284 default:
12285 /* Invalid address unspec. */
12286 return false;
12287 }
12288
12289 else if (SYMBOLIC_CONST (disp)
12290 && (flag_pic
12291 || (TARGET_MACHO
12292 #if TARGET_MACHO
12293 && MACHOPIC_INDIRECT
12294 && !machopic_operand_p (disp)
12295 #endif
12296 )))
12297 {
12298
12299 is_legitimate_pic:
12300 if (TARGET_64BIT && (index || base))
12301 {
12302 /* foo@dtpoff(%rX) is ok. */
12303 if (GET_CODE (disp) != CONST
12304 || GET_CODE (XEXP (disp, 0)) != PLUS
12305 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12306 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12307 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12308 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12309 /* Non-constant pic memory reference. */
12310 return false;
12311 }
12312 else if ((!TARGET_MACHO || flag_pic)
12313 && ! legitimate_pic_address_disp_p (disp))
12314 /* Displacement is an invalid pic construct. */
12315 return false;
12316 #if TARGET_MACHO
12317 else if (MACHO_DYNAMIC_NO_PIC_P
12318 && !ix86_legitimate_constant_p (Pmode, disp))
12319 /* Displacement must be referenced via non_lazy_pointer. */
12320 return false;
12321 #endif
12322
12323 /* This code used to verify that a symbolic pic displacement
12324 includes the pic_offset_table_rtx register.
12325
12326 While this is a good idea, unfortunately these constructs may
12327 be created by the "adds using lea" optimization for incorrect
12328 code like:
12329
12330 int a;
12331 int foo(int i)
12332 {
12333 return *(&a+i);
12334 }
12335
12336 This code is nonsensical, but results in addressing the
12337 GOT table with a pic_offset_table_rtx base. We can't
12338 just refuse it easily, since it gets matched by the
12339 "addsi3" pattern, which later gets split to lea when the
12340 output register differs from the input. While this
12341 could be handled by a separate addsi pattern for this case
12342 that never results in lea, disabling this test seems to be
12343 the easier and correct fix for the crash. */
12344 }
12345 else if (GET_CODE (disp) != LABEL_REF
12346 && !CONST_INT_P (disp)
12347 && (GET_CODE (disp) != CONST
12348 || !ix86_legitimate_constant_p (Pmode, disp))
12349 && (GET_CODE (disp) != SYMBOL_REF
12350 || !ix86_legitimate_constant_p (Pmode, disp)))
12351 /* Displacement is not constant. */
12352 return false;
12353 else if (TARGET_64BIT
12354 && !x86_64_immediate_operand (disp, VOIDmode))
12355 /* Displacement is out of range. */
12356 return false;
12357 }
12358
12359 /* Everything looks valid. */
12360 return true;
12361 }
12362
12363 /* Determine if a given RTX is a valid constant address. */
12364
12365 bool
12366 constant_address_p (rtx x)
12367 {
12368 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12369 }
12370 \f
12371 /* Return a unique alias set for the GOT. */
12372
12373 static alias_set_type
12374 ix86_GOT_alias_set (void)
12375 {
12376 static alias_set_type set = -1;
12377 if (set == -1)
12378 set = new_alias_set ();
12379 return set;
12380 }
12381
12382 /* Return a legitimate reference for ORIG (an address) using the
12383 register REG. If REG is 0, a new pseudo is generated.
12384
12385 There are two types of references that must be handled:
12386
12387 1. Global data references must load the address from the GOT, via
12388 the PIC reg. An insn is emitted to do this load, and the reg is
12389 returned.
12390
12391 2. Static data references, constant pool addresses, and code labels
12392 compute the address as an offset from the GOT, whose base is in
12393 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12394 differentiate them from global data objects. The returned
12395 address is the PIC reg + an unspec constant.
12396
12397 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12398 reg also appears in the address. */
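/* As an illustration (not exhaustive): on ia32 a local symbol is
typically rewritten to
(plus (reg pic) (const (unspec [symbol_ref] UNSPEC_GOTOFF)))
while a global symbol becomes a load from the GOT,
(mem (plus (reg pic) (const (unspec [symbol_ref] UNSPEC_GOT)))),
matching the RTL constructed below. */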
12399
12400 static rtx
12401 legitimize_pic_address (rtx orig, rtx reg)
12402 {
12403 rtx addr = orig;
12404 rtx new_rtx = orig;
12405 rtx base;
12406
12407 #if TARGET_MACHO
12408 if (TARGET_MACHO && !TARGET_64BIT)
12409 {
12410 if (reg == 0)
12411 reg = gen_reg_rtx (Pmode);
12412 /* Use the generic Mach-O PIC machinery. */
12413 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12414 }
12415 #endif
12416
12417 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12418 new_rtx = addr;
12419 else if (TARGET_64BIT
12420 && ix86_cmodel != CM_SMALL_PIC
12421 && gotoff_operand (addr, Pmode))
12422 {
12423 rtx tmpreg;
12424 /* This symbol may be referenced via a displacement from the PIC
12425 base address (@GOTOFF). */
12426
12427 if (reload_in_progress)
12428 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12429 if (GET_CODE (addr) == CONST)
12430 addr = XEXP (addr, 0);
12431 if (GET_CODE (addr) == PLUS)
12432 {
12433 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12434 UNSPEC_GOTOFF);
12435 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12436 }
12437 else
12438 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12439 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12440 if (!reg)
12441 tmpreg = gen_reg_rtx (Pmode);
12442 else
12443 tmpreg = reg;
12444 emit_move_insn (tmpreg, new_rtx);
12445
12446 if (reg != 0)
12447 {
12448 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12449 tmpreg, 1, OPTAB_DIRECT);
12450 new_rtx = reg;
12451 }
12452 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12453 }
12454 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12455 {
12456 /* This symbol may be referenced via a displacement from the PIC
12457 base address (@GOTOFF). */
12458
12459 if (reload_in_progress)
12460 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12461 if (GET_CODE (addr) == CONST)
12462 addr = XEXP (addr, 0);
12463 if (GET_CODE (addr) == PLUS)
12464 {
12465 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12466 UNSPEC_GOTOFF);
12467 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12468 }
12469 else
12470 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12471 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12472 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12473
12474 if (reg != 0)
12475 {
12476 emit_move_insn (reg, new_rtx);
12477 new_rtx = reg;
12478 }
12479 }
12480 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12481 /* We can't use @GOTOFF for text labels on VxWorks;
12482 see gotoff_operand. */
12483 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12484 {
12485 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12486 {
12487 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12488 return legitimize_dllimport_symbol (addr, true);
12489 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12490 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12491 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12492 {
12493 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12494 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12495 }
12496 }
12497
12498 /* For x64 PE-COFF there is no GOT table, so we use the address
12499 directly. */
12500 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12501 {
12502 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12503 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12504
12505 if (reg == 0)
12506 reg = gen_reg_rtx (Pmode);
12507 emit_move_insn (reg, new_rtx);
12508 new_rtx = reg;
12509 }
12510 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12511 {
12512 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12513 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12514 new_rtx = gen_const_mem (Pmode, new_rtx);
12515 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12516
12517 if (reg == 0)
12518 reg = gen_reg_rtx (Pmode);
12519 /* Use gen_movsi directly, otherwise the address is loaded
12520 into a register for CSE. We don't want to CSE these addresses;
12521 instead we CSE addresses from the GOT table, so skip this. */
12522 emit_insn (gen_movsi (reg, new_rtx));
12523 new_rtx = reg;
12524 }
12525 else
12526 {
12527 /* This symbol must be referenced via a load from the
12528 Global Offset Table (@GOT). */
12529
12530 if (reload_in_progress)
12531 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12532 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12533 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12534 if (TARGET_64BIT)
12535 new_rtx = force_reg (Pmode, new_rtx);
12536 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12537 new_rtx = gen_const_mem (Pmode, new_rtx);
12538 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12539
12540 if (reg == 0)
12541 reg = gen_reg_rtx (Pmode);
12542 emit_move_insn (reg, new_rtx);
12543 new_rtx = reg;
12544 }
12545 }
12546 else
12547 {
12548 if (CONST_INT_P (addr)
12549 && !x86_64_immediate_operand (addr, VOIDmode))
12550 {
12551 if (reg)
12552 {
12553 emit_move_insn (reg, addr);
12554 new_rtx = reg;
12555 }
12556 else
12557 new_rtx = force_reg (Pmode, addr);
12558 }
12559 else if (GET_CODE (addr) == CONST)
12560 {
12561 addr = XEXP (addr, 0);
12562
12563 /* We must match stuff we generate before. Assume the only
12564 unspecs that can get here are ours. Not that we could do
12565 anything with them anyway.... */
12566 if (GET_CODE (addr) == UNSPEC
12567 || (GET_CODE (addr) == PLUS
12568 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12569 return orig;
12570 gcc_assert (GET_CODE (addr) == PLUS);
12571 }
12572 if (GET_CODE (addr) == PLUS)
12573 {
12574 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12575
12576 /* Check first to see if this is a constant offset from a @GOTOFF
12577 symbol reference. */
12578 if (gotoff_operand (op0, Pmode)
12579 && CONST_INT_P (op1))
12580 {
12581 if (!TARGET_64BIT)
12582 {
12583 if (reload_in_progress)
12584 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12585 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12586 UNSPEC_GOTOFF);
12587 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12588 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12589 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12590
12591 if (reg != 0)
12592 {
12593 emit_move_insn (reg, new_rtx);
12594 new_rtx = reg;
12595 }
12596 }
12597 else
12598 {
12599 if (INTVAL (op1) < -16*1024*1024
12600 || INTVAL (op1) >= 16*1024*1024)
12601 {
12602 if (!x86_64_immediate_operand (op1, Pmode))
12603 op1 = force_reg (Pmode, op1);
12604 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12605 }
12606 }
12607 }
12608 else
12609 {
12610 base = legitimize_pic_address (XEXP (addr, 0), reg);
12611 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12612 base == reg ? NULL_RTX : reg);
12613
12614 if (CONST_INT_P (new_rtx))
12615 new_rtx = plus_constant (Pmode, base, INTVAL (new_rtx));
12616 else
12617 {
12618 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12619 {
12620 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12621 new_rtx = XEXP (new_rtx, 1);
12622 }
12623 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12624 }
12625 }
12626 }
12627 }
12628 return new_rtx;
12629 }
12630 \f
12631 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12632
12633 static rtx
12634 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12635 {
12636 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12637
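/* On x32 the thread pointer unspec is SImode while a DImode reference
may be requested; zero-extend it in that case. */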
12638 if (GET_MODE (tp) != tp_mode)
12639 {
12640 gcc_assert (GET_MODE (tp) == SImode);
12641 gcc_assert (tp_mode == DImode);
12642
12643 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12644 }
12645
12646 if (to_reg)
12647 tp = copy_to_mode_reg (tp_mode, tp);
12648
12649 return tp;
12650 }
12651
12652 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12653
12654 static GTY(()) rtx ix86_tls_symbol;
12655
12656 static rtx
12657 ix86_tls_get_addr (void)
12658 {
12659 if (!ix86_tls_symbol)
12660 {
12661 const char *sym
12662 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12663 ? "___tls_get_addr" : "__tls_get_addr");
12664
12665 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12666 }
12667
12668 return ix86_tls_symbol;
12669 }
12670
12671 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12672
12673 static GTY(()) rtx ix86_tls_module_base_symbol;
12674
12675 rtx
12676 ix86_tls_module_base (void)
12677 {
12678 if (!ix86_tls_module_base_symbol)
12679 {
12680 ix86_tls_module_base_symbol
12681 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12682
12683 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12684 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12685 }
12686
12687 return ix86_tls_module_base_symbol;
12688 }
12689
12690 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12691 false if we expect this to be used for a memory address and true if
12692 we expect to load the address into a register. */
12693
12694 static rtx
12695 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12696 {
12697 rtx dest, base, off;
12698 rtx pic = NULL_RTX, tp = NULL_RTX;
12699 enum machine_mode tp_mode = Pmode;
12700 int type;
12701
12702 switch (model)
12703 {
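/* Global-dynamic: obtain the address by calling __tls_get_addr, or via
the GNU2 TLS descriptor sequence when TARGET_GNU2_TLS. */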
12704 case TLS_MODEL_GLOBAL_DYNAMIC:
12705 dest = gen_reg_rtx (Pmode);
12706
12707 if (!TARGET_64BIT)
12708 {
12709 if (flag_pic)
12710 pic = pic_offset_table_rtx;
12711 else
12712 {
12713 pic = gen_reg_rtx (Pmode);
12714 emit_insn (gen_set_got (pic));
12715 }
12716 }
12717
12718 if (TARGET_GNU2_TLS)
12719 {
12720 if (TARGET_64BIT)
12721 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12722 else
12723 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12724
12725 tp = get_thread_pointer (Pmode, true);
12726 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12727
12728 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12729 }
12730 else
12731 {
12732 rtx caddr = ix86_tls_get_addr ();
12733
12734 if (TARGET_64BIT)
12735 {
12736 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
12737
12738 start_sequence ();
12739 emit_call_insn (ix86_gen_tls_global_dynamic_64 (rax, x,
12740 caddr));
12741 insns = get_insns ();
12742 end_sequence ();
12743
12744 RTL_CONST_CALL_P (insns) = 1;
12745 emit_libcall_block (insns, dest, rax, x);
12746 }
12747 else
12748 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12749 }
12750 break;
12751
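/* Local-dynamic: one call computes the TLS module base; individual
variables are then addressed at a @dtpoff offset from that base. */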
12752 case TLS_MODEL_LOCAL_DYNAMIC:
12753 base = gen_reg_rtx (Pmode);
12754
12755 if (!TARGET_64BIT)
12756 {
12757 if (flag_pic)
12758 pic = pic_offset_table_rtx;
12759 else
12760 {
12761 pic = gen_reg_rtx (Pmode);
12762 emit_insn (gen_set_got (pic));
12763 }
12764 }
12765
12766 if (TARGET_GNU2_TLS)
12767 {
12768 rtx tmp = ix86_tls_module_base ();
12769
12770 if (TARGET_64BIT)
12771 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12772 else
12773 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12774
12775 tp = get_thread_pointer (Pmode, true);
12776 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12777 gen_rtx_MINUS (Pmode, tmp, tp));
12778 }
12779 else
12780 {
12781 rtx caddr = ix86_tls_get_addr ();
12782
12783 if (TARGET_64BIT)
12784 {
12785 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
12786
12787 start_sequence ();
12788 emit_call_insn (ix86_gen_tls_local_dynamic_base_64 (rax,
12789 caddr));
12790 insns = get_insns ();
12791 end_sequence ();
12792
12793 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12794 share the LD_BASE result with other LD model accesses. */
12795 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12796 UNSPEC_TLS_LD_BASE);
12797
12798 RTL_CONST_CALL_P (insns) = 1;
12799 emit_libcall_block (insns, base, rax, eqv);
12800 }
12801 else
12802 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12803 }
12804
12805 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12806 off = gen_rtx_CONST (Pmode, off);
12807
12808 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12809
12810 if (TARGET_GNU2_TLS)
12811 {
12812 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12813
12814 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12815 }
12816 break;
12817
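/* Initial-exec: the offset from the thread pointer is loaded from the
GOT (@gottpoff, @gotntpoff or @indntpoff) and combined with the
thread pointer, either by addition or, without GNU TLS on 32bit,
by subtraction. */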
12818 case TLS_MODEL_INITIAL_EXEC:
12819 if (TARGET_64BIT)
12820 {
12821 if (TARGET_SUN_TLS && !TARGET_X32)
12822 {
12823 /* The Sun linker took the AMD64 TLS spec literally
12824 and can only handle %rax as destination of the
12825 initial executable code sequence. */
12826
12827 dest = gen_reg_rtx (DImode);
12828 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12829 return dest;
12830 }
12831
12832 /* Generate DImode references to avoid %fs:(%reg32)
12833 problems and the linker IE->LE relaxation bug. */
12834 tp_mode = DImode;
12835 pic = NULL;
12836 type = UNSPEC_GOTNTPOFF;
12837 }
12838 else if (flag_pic)
12839 {
12840 if (reload_in_progress)
12841 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12842 pic = pic_offset_table_rtx;
12843 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12844 }
12845 else if (!TARGET_ANY_GNU_TLS)
12846 {
12847 pic = gen_reg_rtx (Pmode);
12848 emit_insn (gen_set_got (pic));
12849 type = UNSPEC_GOTTPOFF;
12850 }
12851 else
12852 {
12853 pic = NULL;
12854 type = UNSPEC_INDNTPOFF;
12855 }
12856
12857 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
12858 off = gen_rtx_CONST (tp_mode, off);
12859 if (pic)
12860 off = gen_rtx_PLUS (tp_mode, pic, off);
12861 off = gen_const_mem (tp_mode, off);
12862 set_mem_alias_set (off, ix86_GOT_alias_set ());
12863
12864 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12865 {
12866 base = get_thread_pointer (tp_mode,
12867 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12868 off = force_reg (tp_mode, off);
12869 return gen_rtx_PLUS (tp_mode, base, off);
12870 }
12871 else
12872 {
12873 base = get_thread_pointer (Pmode, true);
12874 dest = gen_reg_rtx (Pmode);
12875 emit_insn (ix86_gen_sub3 (dest, base, off));
12876 }
12877 break;
12878
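/* Local-exec: the variable lives at a link-time constant offset from
the thread pointer; either add the @ntpoff offset to the thread
pointer, or (without GNU TLS on 32bit) subtract the @tpoff offset
from it. */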
12879 case TLS_MODEL_LOCAL_EXEC:
12880 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12881 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12882 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12883 off = gen_rtx_CONST (Pmode, off);
12884
12885 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12886 {
12887 base = get_thread_pointer (Pmode,
12888 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12889 return gen_rtx_PLUS (Pmode, base, off);
12890 }
12891 else
12892 {
12893 base = get_thread_pointer (Pmode, true);
12894 dest = gen_reg_rtx (Pmode);
12895 emit_insn (ix86_gen_sub3 (dest, base, off));
12896 }
12897 break;
12898
12899 default:
12900 gcc_unreachable ();
12901 }
12902
12903 return dest;
12904 }
12905
12906 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12907 to symbol DECL. */
12908
12909 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12910 htab_t dllimport_map;
12911
12912 static tree
12913 get_dllimport_decl (tree decl)
12914 {
12915 struct tree_map *h, in;
12916 void **loc;
12917 const char *name;
12918 const char *prefix;
12919 size_t namelen, prefixlen;
12920 char *imp_name;
12921 tree to;
12922 rtx rtl;
12923
12924 if (!dllimport_map)
12925 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12926
12927 in.hash = htab_hash_pointer (decl);
12928 in.base.from = decl;
12929 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12930 h = (struct tree_map *) *loc;
12931 if (h)
12932 return h->to;
12933
12934 *loc = h = ggc_alloc_tree_map ();
12935 h->hash = in.hash;
12936 h->base.from = decl;
12937 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
12938 VAR_DECL, NULL, ptr_type_node);
12939 DECL_ARTIFICIAL (to) = 1;
12940 DECL_IGNORED_P (to) = 1;
12941 DECL_EXTERNAL (to) = 1;
12942 TREE_READONLY (to) = 1;
12943
12944 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
12945 name = targetm.strip_name_encoding (name);
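/* Build the "*__imp_" (or "*__imp__" when user labels carry a leading
underscore) prefixed import name, e.g. "foo" -> "*__imp_foo". */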
12946 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
12947 ? "*__imp_" : "*__imp__";
12948 namelen = strlen (name);
12949 prefixlen = strlen (prefix);
12950 imp_name = (char *) alloca (namelen + prefixlen + 1);
12951 memcpy (imp_name, prefix, prefixlen);
12952 memcpy (imp_name + prefixlen, name, namelen + 1);
12953
12954 name = ggc_alloc_string (imp_name, namelen + prefixlen);
12955 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
12956 SET_SYMBOL_REF_DECL (rtl, to);
12957 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
12958
12959 rtl = gen_const_mem (Pmode, rtl);
12960 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
12961
12962 SET_DECL_RTL (to, rtl);
12963 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
12964
12965 return to;
12966 }
12967
12968 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
12969 true if we require the result be a register. */
12970
12971 static rtx
12972 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
12973 {
12974 tree imp_decl;
12975 rtx x;
12976
12977 gcc_assert (SYMBOL_REF_DECL (symbol));
12978 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
12979
12980 x = DECL_RTL (imp_decl);
12981 if (want_reg)
12982 x = force_reg (Pmode, x);
12983 return x;
12984 }
12985
12986 /* Try machine-dependent ways of modifying an illegitimate address
12987 to be legitimate. If we find one, return the new, valid address.
12988 This macro is used in only one place: `memory_address' in explow.c.
12989
12990 OLDX is the address as it was before break_out_memory_refs was called.
12991 In some cases it is useful to look at this to decide what needs to be done.
12992
12993 It is always safe for this macro to do nothing. It exists to recognize
12994 opportunities to optimize the output.
12995
12996 For the 80386, we handle X+REG by loading X into a register R and
12997 using R+REG. R will go in a general reg and indexing will be used.
12998 However, if REG is a broken-out memory address or multiplication,
12999 nothing needs to be done because REG can certainly go in a general reg.
13000
13001 When -fpic is used, special handling is needed for symbolic references.
13002 See comments by legitimize_pic_address in i386.c for details. */
13003
13004 static rtx
13005 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13006 enum machine_mode mode)
13007 {
13008 int changed = 0;
13009 unsigned log;
13010
13011 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13012 if (log)
13013 return legitimize_tls_address (x, (enum tls_model) log, false);
13014 if (GET_CODE (x) == CONST
13015 && GET_CODE (XEXP (x, 0)) == PLUS
13016 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13017 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13018 {
13019 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13020 (enum tls_model) log, false);
13021 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13022 }
13023
13024 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13025 {
13026 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
13027 return legitimize_dllimport_symbol (x, true);
13028 if (GET_CODE (x) == CONST
13029 && GET_CODE (XEXP (x, 0)) == PLUS
13030 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13031 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
13032 {
13033 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
13034 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13035 }
13036 }
13037
13038 if (flag_pic && SYMBOLIC_CONST (x))
13039 return legitimize_pic_address (x, 0);
13040
13041 #if TARGET_MACHO
13042 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13043 return machopic_indirect_data_reference (x, 0);
13044 #endif
13045
13046 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
13047 if (GET_CODE (x) == ASHIFT
13048 && CONST_INT_P (XEXP (x, 1))
13049 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13050 {
13051 changed = 1;
13052 log = INTVAL (XEXP (x, 1));
13053 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13054 GEN_INT (1 << log));
13055 }
13056
13057 if (GET_CODE (x) == PLUS)
13058 {
13059 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13060
13061 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13062 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13063 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13064 {
13065 changed = 1;
13066 log = INTVAL (XEXP (XEXP (x, 0), 1));
13067 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13068 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13069 GEN_INT (1 << log));
13070 }
13071
13072 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13073 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13074 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13075 {
13076 changed = 1;
13077 log = INTVAL (XEXP (XEXP (x, 1), 1));
13078 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13079 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13080 GEN_INT (1 << log));
13081 }
13082
13083 /* Put multiply first if it isn't already. */
13084 if (GET_CODE (XEXP (x, 1)) == MULT)
13085 {
13086 rtx tmp = XEXP (x, 0);
13087 XEXP (x, 0) = XEXP (x, 1);
13088 XEXP (x, 1) = tmp;
13089 changed = 1;
13090 }
13091
13092 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13093 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13094 created by virtual register instantiation, register elimination, and
13095 similar optimizations. */
13096 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13097 {
13098 changed = 1;
13099 x = gen_rtx_PLUS (Pmode,
13100 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13101 XEXP (XEXP (x, 1), 0)),
13102 XEXP (XEXP (x, 1), 1));
13103 }
13104
13105 /* Canonicalize
13106 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13107 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13108 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13109 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13110 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13111 && CONSTANT_P (XEXP (x, 1)))
13112 {
13113 rtx constant;
13114 rtx other = NULL_RTX;
13115
13116 if (CONST_INT_P (XEXP (x, 1)))
13117 {
13118 constant = XEXP (x, 1);
13119 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13120 }
13121 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13122 {
13123 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13124 other = XEXP (x, 1);
13125 }
13126 else
13127 constant = 0;
13128
13129 if (constant)
13130 {
13131 changed = 1;
13132 x = gen_rtx_PLUS (Pmode,
13133 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13134 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13135 plus_constant (Pmode, other,
13136 INTVAL (constant)));
13137 }
13138 }
13139
13140 if (changed && ix86_legitimate_address_p (mode, x, false))
13141 return x;
13142
13143 if (GET_CODE (XEXP (x, 0)) == MULT)
13144 {
13145 changed = 1;
13146 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13147 }
13148
13149 if (GET_CODE (XEXP (x, 1)) == MULT)
13150 {
13151 changed = 1;
13152 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13153 }
13154
13155 if (changed
13156 && REG_P (XEXP (x, 1))
13157 && REG_P (XEXP (x, 0)))
13158 return x;
13159
13160 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13161 {
13162 changed = 1;
13163 x = legitimize_pic_address (x, 0);
13164 }
13165
13166 if (changed && ix86_legitimate_address_p (mode, x, false))
13167 return x;
13168
13169 if (REG_P (XEXP (x, 0)))
13170 {
13171 rtx temp = gen_reg_rtx (Pmode);
13172 rtx val = force_operand (XEXP (x, 1), temp);
13173 if (val != temp)
13174 {
13175 if (GET_MODE (val) != Pmode)
13176 val = convert_to_mode (Pmode, val, 1);
13177 emit_move_insn (temp, val);
13178 }
13179
13180 XEXP (x, 1) = temp;
13181 return x;
13182 }
13183
13184 else if (REG_P (XEXP (x, 1)))
13185 {
13186 rtx temp = gen_reg_rtx (Pmode);
13187 rtx val = force_operand (XEXP (x, 0), temp);
13188 if (val != temp)
13189 {
13190 if (GET_MODE (val) != Pmode)
13191 val = convert_to_mode (Pmode, val, 1);
13192 emit_move_insn (temp, val);
13193 }
13194
13195 XEXP (x, 0) = temp;
13196 return x;
13197 }
13198 }
13199
13200 return x;
13201 }
13202 \f
13203 /* Print an integer constant expression in assembler syntax. Addition
13204 and subtraction are the only arithmetic that may appear in these
13205 expressions. FILE is the stdio stream to write to, X is the rtx, and
13206 CODE is the operand print code from the output string. */
13207
13208 static void
13209 output_pic_addr_const (FILE *file, rtx x, int code)
13210 {
13211 char buf[256];
13212
13213 switch (GET_CODE (x))
13214 {
13215 case PC:
13216 gcc_assert (flag_pic);
13217 putc ('.', file);
13218 break;
13219
13220 case SYMBOL_REF:
13221 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13222 output_addr_const (file, x);
13223 else
13224 {
13225 const char *name = XSTR (x, 0);
13226
13227 /* Mark the decl as referenced so that cgraph will
13228 output the function. */
13229 if (SYMBOL_REF_DECL (x))
13230 mark_decl_referenced (SYMBOL_REF_DECL (x));
13231
13232 #if TARGET_MACHO
13233 if (MACHOPIC_INDIRECT
13234 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13235 name = machopic_indirection_name (x, /*stub_p=*/true);
13236 #endif
13237 assemble_name (file, name);
13238 }
13239 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13240 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13241 fputs ("@PLT", file);
13242 break;
13243
13244 case LABEL_REF:
13245 x = XEXP (x, 0);
13246 /* FALLTHRU */
13247 case CODE_LABEL:
13248 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13249 assemble_name (asm_out_file, buf);
13250 break;
13251
13252 case CONST_INT:
13253 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13254 break;
13255
13256 case CONST:
13257 /* This used to output parentheses around the expression,
13258 but that does not work on the 386 (either ATT or BSD assembler). */
13259 output_pic_addr_const (file, XEXP (x, 0), code);
13260 break;
13261
13262 case CONST_DOUBLE:
13263 if (GET_MODE (x) == VOIDmode)
13264 {
13265 /* We can use %d if the number is <32 bits and positive. */
13266 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13267 fprintf (file, "0x%lx%08lx",
13268 (unsigned long) CONST_DOUBLE_HIGH (x),
13269 (unsigned long) CONST_DOUBLE_LOW (x));
13270 else
13271 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13272 }
13273 else
13274 /* We can't handle floating point constants;
13275 TARGET_PRINT_OPERAND must handle them. */
13276 output_operand_lossage ("floating constant misused");
13277 break;
13278
13279 case PLUS:
13280 /* Some assemblers need integer constants to appear first. */
13281 if (CONST_INT_P (XEXP (x, 0)))
13282 {
13283 output_pic_addr_const (file, XEXP (x, 0), code);
13284 putc ('+', file);
13285 output_pic_addr_const (file, XEXP (x, 1), code);
13286 }
13287 else
13288 {
13289 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13290 output_pic_addr_const (file, XEXP (x, 1), code);
13291 putc ('+', file);
13292 output_pic_addr_const (file, XEXP (x, 0), code);
13293 }
13294 break;
13295
13296 case MINUS:
13297 if (!TARGET_MACHO)
13298 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13299 output_pic_addr_const (file, XEXP (x, 0), code);
13300 putc ('-', file);
13301 output_pic_addr_const (file, XEXP (x, 1), code);
13302 if (!TARGET_MACHO)
13303 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13304 break;
13305
13306 case UNSPEC:
13307 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13308 {
13309 bool f = i386_asm_output_addr_const_extra (file, x);
13310 gcc_assert (f);
13311 break;
13312 }
13313
13314 gcc_assert (XVECLEN (x, 0) == 1);
13315 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13316 switch (XINT (x, 1))
13317 {
13318 case UNSPEC_GOT:
13319 fputs ("@GOT", file);
13320 break;
13321 case UNSPEC_GOTOFF:
13322 fputs ("@GOTOFF", file);
13323 break;
13324 case UNSPEC_PLTOFF:
13325 fputs ("@PLTOFF", file);
13326 break;
13327 case UNSPEC_PCREL:
13328 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13329 "(%rip)" : "[rip]", file);
13330 break;
13331 case UNSPEC_GOTPCREL:
13332 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13333 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13334 break;
13335 case UNSPEC_GOTTPOFF:
13336 /* FIXME: This might be @TPOFF in Sun ld too. */
13337 fputs ("@gottpoff", file);
13338 break;
13339 case UNSPEC_TPOFF:
13340 fputs ("@tpoff", file);
13341 break;
13342 case UNSPEC_NTPOFF:
13343 if (TARGET_64BIT)
13344 fputs ("@tpoff", file);
13345 else
13346 fputs ("@ntpoff", file);
13347 break;
13348 case UNSPEC_DTPOFF:
13349 fputs ("@dtpoff", file);
13350 break;
13351 case UNSPEC_GOTNTPOFF:
13352 if (TARGET_64BIT)
13353 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13354 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13355 else
13356 fputs ("@gotntpoff", file);
13357 break;
13358 case UNSPEC_INDNTPOFF:
13359 fputs ("@indntpoff", file);
13360 break;
13361 #if TARGET_MACHO
13362 case UNSPEC_MACHOPIC_OFFSET:
13363 putc ('-', file);
13364 machopic_output_function_base_name (file);
13365 break;
13366 #endif
13367 default:
13368 output_operand_lossage ("invalid UNSPEC as operand");
13369 break;
13370 }
13371 break;
13372
13373 default:
13374 output_operand_lossage ("invalid expression as operand");
13375 }
13376 }
13377
13378 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13379 We need to emit DTP-relative relocations. */
13380
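/* For SIZE 4 this emits the symbol with an @dtpoff suffix after
ASM_LONG; for SIZE 8 a ", 0" upper half is appended. */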
13381 static void ATTRIBUTE_UNUSED
13382 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13383 {
13384 fputs (ASM_LONG, file);
13385 output_addr_const (file, x);
13386 fputs ("@dtpoff", file);
13387 switch (size)
13388 {
13389 case 4:
13390 break;
13391 case 8:
13392 fputs (", 0", file);
13393 break;
13394 default:
13395 gcc_unreachable ();
13396 }
13397 }
13398
13399 /* Return true if X is a representation of the PIC register. This copes
13400 with calls from ix86_find_base_term, where the register might have
13401 been replaced by a cselib value. */
13402
13403 static bool
13404 ix86_pic_register_p (rtx x)
13405 {
13406 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13407 return (pic_offset_table_rtx
13408 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13409 else
13410 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13411 }
13412
13413 /* Helper function for ix86_delegitimize_address.
13414 Attempt to delegitimize TLS local-exec accesses. */
13415
13416 static rtx
13417 ix86_delegitimize_tls_address (rtx orig_x)
13418 {
13419 rtx x = orig_x, unspec;
13420 struct ix86_address addr;
13421
13422 if (!TARGET_TLS_DIRECT_SEG_REFS)
13423 return orig_x;
13424 if (MEM_P (x))
13425 x = XEXP (x, 0);
13426 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13427 return orig_x;
13428 if (ix86_decompose_address (x, &addr) == 0
13429 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13430 || addr.disp == NULL_RTX
13431 || GET_CODE (addr.disp) != CONST)
13432 return orig_x;
13433 unspec = XEXP (addr.disp, 0);
13434 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13435 unspec = XEXP (unspec, 0);
13436 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13437 return orig_x;
13438 x = XVECEXP (unspec, 0, 0);
13439 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13440 if (unspec != XEXP (addr.disp, 0))
13441 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13442 if (addr.index)
13443 {
13444 rtx idx = addr.index;
13445 if (addr.scale != 1)
13446 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13447 x = gen_rtx_PLUS (Pmode, idx, x);
13448 }
13449 if (addr.base)
13450 x = gen_rtx_PLUS (Pmode, addr.base, x);
13451 if (MEM_P (orig_x))
13452 x = replace_equiv_address_nv (orig_x, x);
13453 return x;
13454 }
13455
13456 /* In the name of slightly smaller debug output, and to cater to
13457 general assembler lossage, recognize PIC+GOTOFF and turn it back
13458 into a direct symbol reference.
13459
13460 On Darwin, this is necessary to avoid a crash, because Darwin
13461 has a different PIC label for each routine but the DWARF debugging
13462 information is not associated with any particular routine, so it's
13463 necessary to remove references to the PIC label from RTL stored by
13464 the DWARF output code. */
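/* For example, (plus (reg pic) (const (unspec [symbol_ref] UNSPEC_GOTOFF)))
is turned back into the bare symbol_ref (plus any constant addend). */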
13465
13466 static rtx
13467 ix86_delegitimize_address (rtx x)
13468 {
13469 rtx orig_x = delegitimize_mem_from_attrs (x);
13470 /* addend is NULL or some rtx if x is something+GOTOFF where
13471 something doesn't include the PIC register. */
13472 rtx addend = NULL_RTX;
13473 /* reg_addend is NULL or a multiple of some register. */
13474 rtx reg_addend = NULL_RTX;
13475 /* const_addend is NULL or a const_int. */
13476 rtx const_addend = NULL_RTX;
13477 /* This is the result, or NULL. */
13478 rtx result = NULL_RTX;
13479
13480 x = orig_x;
13481
13482 if (MEM_P (x))
13483 x = XEXP (x, 0);
13484
13485 if (TARGET_64BIT)
13486 {
13487 if (GET_CODE (x) == CONST
13488 && GET_CODE (XEXP (x, 0)) == PLUS
13489 && GET_MODE (XEXP (x, 0)) == Pmode
13490 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13491 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13492 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13493 {
13494 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13495 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13496 if (MEM_P (orig_x))
13497 x = replace_equiv_address_nv (orig_x, x);
13498 return x;
13499 }
13500 if (GET_CODE (x) != CONST
13501 || GET_CODE (XEXP (x, 0)) != UNSPEC
13502 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13503 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13504 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13505 return ix86_delegitimize_tls_address (orig_x);
13506 x = XVECEXP (XEXP (x, 0), 0, 0);
13507 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13508 {
13509 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13510 GET_MODE (x), 0);
13511 if (x == NULL_RTX)
13512 return orig_x;
13513 }
13514 return x;
13515 }
13516
13517 if (GET_CODE (x) != PLUS
13518 || GET_CODE (XEXP (x, 1)) != CONST)
13519 return ix86_delegitimize_tls_address (orig_x);
13520
13521 if (ix86_pic_register_p (XEXP (x, 0)))
13522 /* %ebx + GOT/GOTOFF */
13523 ;
13524 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13525 {
13526 /* %ebx + %reg * scale + GOT/GOTOFF */
13527 reg_addend = XEXP (x, 0);
13528 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13529 reg_addend = XEXP (reg_addend, 1);
13530 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13531 reg_addend = XEXP (reg_addend, 0);
13532 else
13533 {
13534 reg_addend = NULL_RTX;
13535 addend = XEXP (x, 0);
13536 }
13537 }
13538 else
13539 addend = XEXP (x, 0);
13540
13541 x = XEXP (XEXP (x, 1), 0);
13542 if (GET_CODE (x) == PLUS
13543 && CONST_INT_P (XEXP (x, 1)))
13544 {
13545 const_addend = XEXP (x, 1);
13546 x = XEXP (x, 0);
13547 }
13548
13549 if (GET_CODE (x) == UNSPEC
13550 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13551 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13552 result = XVECEXP (x, 0, 0);
13553
13554 if (TARGET_MACHO && darwin_local_data_pic (x)
13555 && !MEM_P (orig_x))
13556 result = XVECEXP (x, 0, 0);
13557
13558 if (! result)
13559 return ix86_delegitimize_tls_address (orig_x);
13560
13561 if (const_addend)
13562 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13563 if (reg_addend)
13564 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13565 if (addend)
13566 {
13567 /* If the rest of original X doesn't involve the PIC register, add
13568 addend and subtract pic_offset_table_rtx. This can happen e.g.
13569 for code like:
13570 leal (%ebx, %ecx, 4), %ecx
13571 ...
13572 movl foo@GOTOFF(%ecx), %edx
13573 in which case we return (%ecx - %ebx) + foo. */
13574 if (pic_offset_table_rtx)
13575 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13576 pic_offset_table_rtx),
13577 result);
13578 else
13579 return orig_x;
13580 }
13581 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13582 {
13583 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13584 if (result == NULL_RTX)
13585 return orig_x;
13586 }
13587 return result;
13588 }
13589
13590 /* If X is a machine specific address (i.e. a symbol or label being
13591 referenced as a displacement from the GOT implemented using an
13592 UNSPEC), then return the base term. Otherwise return X. */
13593
13594 rtx
13595 ix86_find_base_term (rtx x)
13596 {
13597 rtx term;
13598
13599 if (TARGET_64BIT)
13600 {
13601 if (GET_CODE (x) != CONST)
13602 return x;
13603 term = XEXP (x, 0);
13604 if (GET_CODE (term) == PLUS
13605 && (CONST_INT_P (XEXP (term, 1))
13606 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13607 term = XEXP (term, 0);
13608 if (GET_CODE (term) != UNSPEC
13609 || (XINT (term, 1) != UNSPEC_GOTPCREL
13610 && XINT (term, 1) != UNSPEC_PCREL))
13611 return x;
13612
13613 return XVECEXP (term, 0, 0);
13614 }
13615
13616 return ix86_delegitimize_address (x);
13617 }
13618 \f
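/* Print to FILE the suffix that selects comparison CODE of mode MODE for
a setcc/cmov style instruction. If REVERSE is true the condition is
reversed first. FP is true when the suffix is for a floating-point
cmov/fcmov, where some assemblers want e.g. "nbe" instead of "a". */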
13619 static void
13620 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
13621 bool fp, FILE *file)
13622 {
13623 const char *suffix;
13624
13625 if (mode == CCFPmode || mode == CCFPUmode)
13626 {
13627 code = ix86_fp_compare_code_to_integer (code);
13628 mode = CCmode;
13629 }
13630 if (reverse)
13631 code = reverse_condition (code);
13632
13633 switch (code)
13634 {
13635 case EQ:
13636 switch (mode)
13637 {
13638 case CCAmode:
13639 suffix = "a";
13640 break;
13641
13642 case CCCmode:
13643 suffix = "c";
13644 break;
13645
13646 case CCOmode:
13647 suffix = "o";
13648 break;
13649
13650 case CCSmode:
13651 suffix = "s";
13652 break;
13653
13654 default:
13655 suffix = "e";
13656 }
13657 break;
13658 case NE:
13659 switch (mode)
13660 {
13661 case CCAmode:
13662 suffix = "na";
13663 break;
13664
13665 case CCCmode:
13666 suffix = "nc";
13667 break;
13668
13669 case CCOmode:
13670 suffix = "no";
13671 break;
13672
13673 case CCSmode:
13674 suffix = "ns";
13675 break;
13676
13677 default:
13678 suffix = "ne";
13679 }
13680 break;
13681 case GT:
13682 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13683 suffix = "g";
13684 break;
13685 case GTU:
13686 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13687 Those same assemblers have the same but opposite lossage on cmov. */
13688 if (mode == CCmode)
13689 suffix = fp ? "nbe" : "a";
13690 else if (mode == CCCmode)
13691 suffix = "b";
13692 else
13693 gcc_unreachable ();
13694 break;
13695 case LT:
13696 switch (mode)
13697 {
13698 case CCNOmode:
13699 case CCGOCmode:
13700 suffix = "s";
13701 break;
13702
13703 case CCmode:
13704 case CCGCmode:
13705 suffix = "l";
13706 break;
13707
13708 default:
13709 gcc_unreachable ();
13710 }
13711 break;
13712 case LTU:
13713 gcc_assert (mode == CCmode || mode == CCCmode);
13714 suffix = "b";
13715 break;
13716 case GE:
13717 switch (mode)
13718 {
13719 case CCNOmode:
13720 case CCGOCmode:
13721 suffix = "ns";
13722 break;
13723
13724 case CCmode:
13725 case CCGCmode:
13726 suffix = "ge";
13727 break;
13728
13729 default:
13730 gcc_unreachable ();
13731 }
13732 break;
13733 case GEU:
13734 /* ??? As above. */
13735 gcc_assert (mode == CCmode || mode == CCCmode);
13736 suffix = fp ? "nb" : "ae";
13737 break;
13738 case LE:
13739 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13740 suffix = "le";
13741 break;
13742 case LEU:
13743 /* ??? As above. */
13744 if (mode == CCmode)
13745 suffix = "be";
13746 else if (mode == CCCmode)
13747 suffix = fp ? "nb" : "ae";
13748 else
13749 gcc_unreachable ();
13750 break;
13751 case UNORDERED:
13752 suffix = fp ? "u" : "p";
13753 break;
13754 case ORDERED:
13755 suffix = fp ? "nu" : "np";
13756 break;
13757 default:
13758 gcc_unreachable ();
13759 }
13760 fputs (suffix, file);
13761 }
13762
13763 /* Print the name of register X to FILE based on its machine mode and number.
13764 If CODE is 'w', pretend the mode is HImode.
13765 If CODE is 'b', pretend the mode is QImode.
13766 If CODE is 'k', pretend the mode is SImode.
13767 If CODE is 'q', pretend the mode is DImode.
13768 If CODE is 'x', pretend the mode is V4SFmode.
13769 If CODE is 't', pretend the mode is V8SFmode.
13770 If CODE is 'h', pretend the reg is the 'high' byte register.
13771 If CODE is 'y', print "st(0)" instead of "st", if the reg is a stack op.
13772 If CODE is 'd', duplicate the operand for an AVX instruction.
13773 */
13774
13775 void
13776 print_reg (rtx x, int code, FILE *file)
13777 {
13778 const char *reg;
13779 bool duplicated = code == 'd' && TARGET_AVX;
13780
13781 gcc_assert (x == pc_rtx
13782 || (REGNO (x) != ARG_POINTER_REGNUM
13783 && REGNO (x) != FRAME_POINTER_REGNUM
13784 && REGNO (x) != FLAGS_REG
13785 && REGNO (x) != FPSR_REG
13786 && REGNO (x) != FPCR_REG));
13787
13788 if (ASSEMBLER_DIALECT == ASM_ATT)
13789 putc ('%', file);
13790
13791 if (x == pc_rtx)
13792 {
13793 gcc_assert (TARGET_64BIT);
13794 fputs ("rip", file);
13795 return;
13796 }
13797
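/* Reuse CODE to hold the requested operand size in bytes; the special
values 0, 3, 16 and 32 stand for a high byte register, an x87 stack
register, an SSE register and an AVX register respectively. */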
13798 if (code == 'w' || MMX_REG_P (x))
13799 code = 2;
13800 else if (code == 'b')
13801 code = 1;
13802 else if (code == 'k')
13803 code = 4;
13804 else if (code == 'q')
13805 code = 8;
13806 else if (code == 'y')
13807 code = 3;
13808 else if (code == 'h')
13809 code = 0;
13810 else if (code == 'x')
13811 code = 16;
13812 else if (code == 't')
13813 code = 32;
13814 else
13815 code = GET_MODE_SIZE (GET_MODE (x));
13816
13817 /* Irritatingly, AMD extended registers use a different naming convention
13818 from the normal registers: "r%d[bwd]". */
13819 if (REX_INT_REG_P (x))
13820 {
13821 gcc_assert (TARGET_64BIT);
13822 putc ('r', file);
13823 fprint_ul (file, REGNO (x) - FIRST_REX_INT_REG + 8);
13824 switch (code)
13825 {
13826 case 0:
13827 error ("extended registers have no high halves");
13828 break;
13829 case 1:
13830 putc ('b', file);
13831 break;
13832 case 2:
13833 putc ('w', file);
13834 break;
13835 case 4:
13836 putc ('d', file);
13837 break;
13838 case 8:
13839 /* no suffix */
13840 break;
13841 default:
13842 error ("unsupported operand size for extended register");
13843 break;
13844 }
13845 return;
13846 }
13847
13848 reg = NULL;
13849 switch (code)
13850 {
13851 case 3:
13852 if (STACK_TOP_P (x))
13853 {
13854 reg = "st(0)";
13855 break;
13856 }
13857 /* FALLTHRU */
13858 case 8:
13859 case 4:
13860 case 12:
13861 if (! ANY_FP_REG_P (x))
13862 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13863 /* FALLTHRU */
13864 case 16:
13865 case 2:
13866 normal:
13867 reg = hi_reg_name[REGNO (x)];
13868 break;
13869 case 1:
13870 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
13871 goto normal;
13872 reg = qi_reg_name[REGNO (x)];
13873 break;
13874 case 0:
13875 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
13876 goto normal;
13877 reg = qi_high_reg_name[REGNO (x)];
13878 break;
13879 case 32:
13880 if (SSE_REG_P (x))
13881 {
13882 gcc_assert (!duplicated);
13883 putc ('y', file);
13884 fputs (hi_reg_name[REGNO (x)] + 1, file);
13885 return;
13886 }
13887 break;
13888 default:
13889 gcc_unreachable ();
13890 }
13891
13892 fputs (reg, file);
13893 if (duplicated)
13894 {
13895 if (ASSEMBLER_DIALECT == ASM_ATT)
13896 fprintf (file, ", %%%s", reg);
13897 else
13898 fprintf (file, ", %s", reg);
13899 }
13900 }
13901
13902 /* Locate some local-dynamic symbol still in use by this function
13903 so that we can print its name in some tls_local_dynamic_base
13904 pattern. */
13905
13906 static int
13907 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13908 {
13909 rtx x = *px;
13910
13911 if (GET_CODE (x) == SYMBOL_REF
13912 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13913 {
13914 cfun->machine->some_ld_name = XSTR (x, 0);
13915 return 1;
13916 }
13917
13918 return 0;
13919 }
13920
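/* Return the name of some local-dynamic TLS symbol referenced by the
current function, caching it in cfun->machine->some_ld_name; return
NULL if no such symbol is found. */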
13921 static const char *
13922 get_some_local_dynamic_name (void)
13923 {
13924 rtx insn;
13925
13926 if (cfun->machine->some_ld_name)
13927 return cfun->machine->some_ld_name;
13928
13929 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13930 if (NONDEBUG_INSN_P (insn)
13931 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13932 return cfun->machine->some_ld_name;
13933
13934 return NULL;
13935 }
13936
13937 /* Meaning of CODE:
13938 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
13939 C -- print opcode suffix for set/cmov insn.
13940 c -- like C, but print reversed condition
13941 F,f -- likewise, but for floating-point.
13942 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
13943 otherwise nothing
13944 R -- print the prefix for register names.
13945 z -- print the opcode suffix for the size of the current operand.
13946 Z -- likewise, with special suffixes for x87 instructions.
13947 * -- print a star (in certain assembler syntax)
13948 A -- print an absolute memory reference.
13949 E -- print address with DImode register names if TARGET_64BIT.
13950 w -- print the operand as if it's a "word" (HImode) even if it isn't.
13951 s -- print a shift double count, followed by the assembler's argument
13952 delimiter.
13953 b -- print the QImode name of the register for the indicated operand.
13954 %b0 would print %al if operands[0] is reg 0.
13955 w -- likewise, print the HImode name of the register.
13956 k -- likewise, print the SImode name of the register.
13957 q -- likewise, print the DImode name of the register.
13958 x -- likewise, print the V4SFmode name of the register.
13959 t -- likewise, print the V8SFmode name of the register.
13960 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
13961 y -- print "st(0)" instead of "st" as a register.
13962 d -- print duplicated register operand for AVX instruction.
13963 D -- print condition for SSE cmp instruction.
13964 P -- if PIC, print an @PLT suffix.
13965 p -- print raw symbol name.
13966 X -- don't print any sort of PIC '@' suffix for a symbol.
13967 & -- print some in-use local-dynamic symbol name.
13968 H -- print a memory address offset by 8; used for sse high-parts
13969 Y -- print condition for XOP pcom* instruction.
13970 + -- print a branch hint as 'cs' or 'ds' prefix
13971 ; -- print a semicolon (after prefixes due to bug in older gas).
13972 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
13973 @ -- print a segment register of thread base pointer load
13974 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
13975 */
13976
13977 void
13978 ix86_print_operand (FILE *file, rtx x, int code)
13979 {
13980 if (code)
13981 {
13982 switch (code)
13983 {
13984 case 'A':
13985 switch (ASSEMBLER_DIALECT)
13986 {
13987 case ASM_ATT:
13988 putc ('*', file);
13989 break;
13990
13991 case ASM_INTEL:
13992 /* Intel syntax. For absolute addresses, registers should not
13993 be surrounded by braces. */
13994 if (!REG_P (x))
13995 {
13996 putc ('[', file);
13997 ix86_print_operand (file, x, 0);
13998 putc (']', file);
13999 return;
14000 }
14001 break;
14002
14003 default:
14004 gcc_unreachable ();
14005 }
14006
14007 ix86_print_operand (file, x, 0);
14008 return;
14009
14010 case 'E':
14011 /* Wrap address in an UNSPEC to declare special handling. */
14012 if (TARGET_64BIT)
14013 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14014
14015 output_address (x);
14016 return;
14017
14018 case 'L':
14019 if (ASSEMBLER_DIALECT == ASM_ATT)
14020 putc ('l', file);
14021 return;
14022
14023 case 'W':
14024 if (ASSEMBLER_DIALECT == ASM_ATT)
14025 putc ('w', file);
14026 return;
14027
14028 case 'B':
14029 if (ASSEMBLER_DIALECT == ASM_ATT)
14030 putc ('b', file);
14031 return;
14032
14033 case 'Q':
14034 if (ASSEMBLER_DIALECT == ASM_ATT)
14035 putc ('l', file);
14036 return;
14037
14038 case 'S':
14039 if (ASSEMBLER_DIALECT == ASM_ATT)
14040 putc ('s', file);
14041 return;
14042
14043 case 'T':
14044 if (ASSEMBLER_DIALECT == ASM_ATT)
14045 putc ('t', file);
14046 return;
14047
14048 case 'O':
14049 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14050 if (ASSEMBLER_DIALECT != ASM_ATT)
14051 return;
14052
14053 switch (GET_MODE_SIZE (GET_MODE (x)))
14054 {
14055 case 2:
14056 putc ('w', file);
14057 break;
14058
14059 case 4:
14060 putc ('l', file);
14061 break;
14062
14063 case 8:
14064 putc ('q', file);
14065 break;
14066
14067 default:
14068 output_operand_lossage
14069 ("invalid operand size for operand code 'O'");
14070 return;
14071 }
14072
14073 putc ('.', file);
14074 #endif
14075 return;
14076
14077 case 'z':
14078 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14079 {
14080 /* Opcodes don't get size suffixes if using Intel opcodes. */
14081 if (ASSEMBLER_DIALECT == ASM_INTEL)
14082 return;
14083
14084 switch (GET_MODE_SIZE (GET_MODE (x)))
14085 {
14086 case 1:
14087 putc ('b', file);
14088 return;
14089
14090 case 2:
14091 putc ('w', file);
14092 return;
14093
14094 case 4:
14095 putc ('l', file);
14096 return;
14097
14098 case 8:
14099 putc ('q', file);
14100 return;
14101
14102 default:
14103 output_operand_lossage
14104 ("invalid operand size for operand code 'z'");
14105 return;
14106 }
14107 }
14108
14109 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14110 warning
14111 (0, "non-integer operand used with operand code 'z'");
14112 /* FALLTHRU */
14113
14114 case 'Z':
14115 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
14116 if (ASSEMBLER_DIALECT == ASM_INTEL)
14117 return;
14118
14119 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14120 {
14121 switch (GET_MODE_SIZE (GET_MODE (x)))
14122 {
14123 case 2:
14124 #ifdef HAVE_AS_IX86_FILDS
14125 putc ('s', file);
14126 #endif
14127 return;
14128
14129 case 4:
14130 putc ('l', file);
14131 return;
14132
14133 case 8:
14134 #ifdef HAVE_AS_IX86_FILDQ
14135 putc ('q', file);
14136 #else
14137 fputs ("ll", file);
14138 #endif
14139 return;
14140
14141 default:
14142 break;
14143 }
14144 }
14145 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14146 {
14147 /* 387 opcodes don't get size suffixes
14148 if the operands are registers. */
14149 if (STACK_REG_P (x))
14150 return;
14151
14152 switch (GET_MODE_SIZE (GET_MODE (x)))
14153 {
14154 case 4:
14155 putc ('s', file);
14156 return;
14157
14158 case 8:
14159 putc ('l', file);
14160 return;
14161
14162 case 12:
14163 case 16:
14164 putc ('t', file);
14165 return;
14166
14167 default:
14168 break;
14169 }
14170 }
14171 else
14172 {
14173 output_operand_lossage
14174 ("invalid operand type used with operand code 'Z'");
14175 return;
14176 }
14177
14178 output_operand_lossage
14179 ("invalid operand size for operand code 'Z'");
14180 return;
14181
14182 case 'd':
14183 case 'b':
14184 case 'w':
14185 case 'k':
14186 case 'q':
14187 case 'h':
14188 case 't':
14189 case 'y':
14190 case 'x':
14191 case 'X':
14192 case 'P':
14193 case 'p':
14194 break;
14195
14196 case 's':
14197 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14198 {
14199 ix86_print_operand (file, x, 0);
14200 fputs (", ", file);
14201 }
14202 return;
14203
14204 case 'Y':
14205 switch (GET_CODE (x))
14206 {
14207 case NE:
14208 fputs ("neq", file);
14209 break;
14210 case EQ:
14211 fputs ("eq", file);
14212 break;
14213 case GE:
14214 case GEU:
14215 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14216 break;
14217 case GT:
14218 case GTU:
14219 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14220 break;
14221 case LE:
14222 case LEU:
14223 fputs ("le", file);
14224 break;
14225 case LT:
14226 case LTU:
14227 fputs ("lt", file);
14228 break;
14229 case UNORDERED:
14230 fputs ("unord", file);
14231 break;
14232 case ORDERED:
14233 fputs ("ord", file);
14234 break;
14235 case UNEQ:
14236 fputs ("ueq", file);
14237 break;
14238 case UNGE:
14239 fputs ("nlt", file);
14240 break;
14241 case UNGT:
14242 fputs ("nle", file);
14243 break;
14244 case UNLE:
14245 fputs ("ule", file);
14246 break;
14247 case UNLT:
14248 fputs ("ult", file);
14249 break;
14250 case LTGT:
14251 fputs ("une", file);
14252 break;
14253 default:
14254 output_operand_lossage ("operand is not a condition code, "
14255 "invalid operand code 'Y'");
14256 return;
14257 }
14258 return;
14259
14260 case 'D':
14261 /* A little bit of brain damage here.  The SSE compare instructions
14262 use completely different names for the comparisons than the
14263 fp conditional moves do.  */
14264 switch (GET_CODE (x))
14265 {
14266 case UNEQ:
14267 if (TARGET_AVX)
14268 {
14269 fputs ("eq_us", file);
14270 break;
14271 }
14272 case EQ:
14273 fputs ("eq", file);
14274 break;
14275 case UNLT:
14276 if (TARGET_AVX)
14277 {
14278 fputs ("nge", file);
14279 break;
14280 }
14281 case LT:
14282 fputs ("lt", file);
14283 break;
14284 case UNLE:
14285 if (TARGET_AVX)
14286 {
14287 fputs ("ngt", file);
14288 break;
14289 }
14290 case LE:
14291 fputs ("le", file);
14292 break;
14293 case UNORDERED:
14294 fputs ("unord", file);
14295 break;
14296 case LTGT:
14297 if (TARGET_AVX)
14298 {
14299 fputs ("neq_oq", file);
14300 break;
14301 }
14302 case NE:
14303 fputs ("neq", file);
14304 break;
14305 case GE:
14306 if (TARGET_AVX)
14307 {
14308 fputs ("ge", file);
14309 break;
14310 }
14311 case UNGE:
14312 fputs ("nlt", file);
14313 break;
14314 case GT:
14315 if (TARGET_AVX)
14316 {
14317 fputs ("gt", file);
14318 break;
14319 }
14320 case UNGT:
14321 fputs ("nle", file);
14322 break;
14323 case ORDERED:
14324 fputs ("ord", file);
14325 break;
14326 default:
14327 output_operand_lossage ("operand is not a condition code, "
14328 "invalid operand code 'D'");
14329 return;
14330 }
14331 return;
14332
14333 case 'F':
14334 case 'f':
14335 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14336 if (ASSEMBLER_DIALECT == ASM_ATT)
14337 putc ('.', file);
14338 #endif
14339
14340 case 'C':
14341 case 'c':
14342 if (!COMPARISON_P (x))
14343 {
14344 output_operand_lossage ("operand is not a condition code, "
14345 "invalid operand code '%c'", code);
14346 return;
14347 }
14348 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14349 code == 'c' || code == 'f',
14350 code == 'F' || code == 'f',
14351 file);
14352 return;
14353
14354 case 'H':
14355 if (!offsettable_memref_p (x))
14356 {
14357 output_operand_lossage ("operand is not an offsettable memory "
14358 "reference, invalid operand code 'H'");
14359 return;
14360 }
14361 /* It doesn't actually matter what mode we use here, as we're
14362 only going to use this for printing. */
14363 x = adjust_address_nv (x, DImode, 8);
14364 break;
14365
14366 case 'K':
14367 gcc_assert (CONST_INT_P (x));
14368
14369 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14370 #ifdef HAVE_AS_IX86_HLE
14371 fputs ("xacquire ", file);
14372 #else
14373 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14374 #endif
14375 else if (INTVAL (x) & IX86_HLE_RELEASE)
14376 #ifdef HAVE_AS_IX86_HLE
14377 fputs ("xrelease ", file);
14378 #else
14379 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14380 #endif
14381 /* We do not want to print the value of the operand.  */
14382 return;
14383
14384 case '*':
14385 if (ASSEMBLER_DIALECT == ASM_ATT)
14386 putc ('*', file);
14387 return;
14388
14389 case '&':
14390 {
14391 const char *name = get_some_local_dynamic_name ();
14392 if (name == NULL)
14393 output_operand_lossage ("'%%&' used without any "
14394 "local dynamic TLS references");
14395 else
14396 assemble_name (file, name);
14397 return;
14398 }
14399
14400 case '+':
14401 {
14402 rtx x;
14403
14404 if (!optimize
14405 || optimize_function_for_size_p (cfun)
14406 || !TARGET_BRANCH_PREDICTION_HINTS)
14407 return;
14408
14409 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14410 if (x)
14411 {
14412 int pred_val = INTVAL (XEXP (x, 0));
14413
14414 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14415 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14416 {
14417 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14418 bool cputaken
14419 = final_forward_branch_p (current_output_insn) == 0;
14420
14421 /* Emit hints only in the case where the default branch prediction
14422 heuristics would fail.  */
14423 if (taken != cputaken)
14424 {
14425 /* We use 3e (DS) prefix for taken branches and
14426 2e (CS) prefix for not taken branches. */
14427 if (taken)
14428 fputs ("ds ; ", file);
14429 else
14430 fputs ("cs ; ", file);
14431 }
14432 }
14433 }
14434 return;
14435 }
14436
14437 case ';':
14438 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14439 putc (';', file);
14440 #endif
14441 return;
14442
14443 case '@':
14444 if (ASSEMBLER_DIALECT == ASM_ATT)
14445 putc ('%', file);
14446
14447 /* The kernel uses a different segment register for performance
14448 reasons; a system call would not have to trash the userspace
14449 segment register, which would be expensive. */
14450 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14451 fputs ("fs", file);
14452 else
14453 fputs ("gs", file);
14454 return;
14455
14456 case '~':
14457 putc (TARGET_AVX2 ? 'i' : 'f', file);
14458 return;
14459
14460 case '^':
14461 if (TARGET_64BIT && Pmode != word_mode)
14462 fputs ("addr32 ", file);
14463 return;
14464
14465 default:
14466 output_operand_lossage ("invalid operand code '%c'", code);
14467 }
14468 }
14469
14470 if (REG_P (x))
14471 print_reg (x, code, file);
14472
14473 else if (MEM_P (x))
14474 {
14475 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14476 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14477 && GET_MODE (x) != BLKmode)
14478 {
14479 const char * size;
14480 switch (GET_MODE_SIZE (GET_MODE (x)))
14481 {
14482 case 1: size = "BYTE"; break;
14483 case 2: size = "WORD"; break;
14484 case 4: size = "DWORD"; break;
14485 case 8: size = "QWORD"; break;
14486 case 12: size = "TBYTE"; break;
14487 case 16:
14488 if (GET_MODE (x) == XFmode)
14489 size = "TBYTE";
14490 else
14491 size = "XMMWORD";
14492 break;
14493 case 32: size = "YMMWORD"; break;
14494 default:
14495 gcc_unreachable ();
14496 }
14497
14498 /* Check for explicit size override (codes 'b', 'w', 'k',
14499 'q' and 'x') */
14500 if (code == 'b')
14501 size = "BYTE";
14502 else if (code == 'w')
14503 size = "WORD";
14504 else if (code == 'k')
14505 size = "DWORD";
14506 else if (code == 'q')
14507 size = "QWORD";
14508 else if (code == 'x')
14509 size = "XMMWORD";
14510
14511 fputs (size, file);
14512 fputs (" PTR ", file);
14513 }
14514
14515 x = XEXP (x, 0);
14516 /* Avoid (%rip) for call operands. */
14517 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14518 && !CONST_INT_P (x))
14519 output_addr_const (file, x);
14520 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14521 output_operand_lossage ("invalid constraints for operand");
14522 else
14523 output_address (x);
14524 }
14525
14526 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14527 {
14528 REAL_VALUE_TYPE r;
14529 long l;
14530
14531 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14532 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14533
14534 if (ASSEMBLER_DIALECT == ASM_ATT)
14535 putc ('$', file);
14536 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14537 if (code == 'q')
14538 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14539 else
14540 fprintf (file, "0x%08x", (unsigned int) l);
14541 }
14542
14543 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14544 {
14545 REAL_VALUE_TYPE r;
14546 long l[2];
14547
14548 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14549 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14550
14551 if (ASSEMBLER_DIALECT == ASM_ATT)
14552 putc ('$', file);
14553 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14554 }
14555
14556 /* These float cases don't actually occur as immediate operands. */
14557 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14558 {
14559 char dstr[30];
14560
14561 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14562 fputs (dstr, file);
14563 }
14564
14565 else
14566 {
14567 /* We have patterns that allow zero sets of memory, for instance.
14568 In 64-bit mode, we should probably support all 8-byte vectors,
14569 since we can in fact encode that into an immediate. */
14570 if (GET_CODE (x) == CONST_VECTOR)
14571 {
14572 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14573 x = const0_rtx;
14574 }
14575
14576 if (code != 'P' && code != 'p')
14577 {
14578 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14579 {
14580 if (ASSEMBLER_DIALECT == ASM_ATT)
14581 putc ('$', file);
14582 }
14583 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14584 || GET_CODE (x) == LABEL_REF)
14585 {
14586 if (ASSEMBLER_DIALECT == ASM_ATT)
14587 putc ('$', file);
14588 else
14589 fputs ("OFFSET FLAT:", file);
14590 }
14591 }
14592 if (CONST_INT_P (x))
14593 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14594 else if (flag_pic || MACHOPIC_INDIRECT)
14595 output_pic_addr_const (file, x, code);
14596 else
14597 output_addr_const (file, x);
14598 }
14599 }
14600
14601 static bool
14602 ix86_print_operand_punct_valid_p (unsigned char code)
14603 {
14604 return (code == '@' || code == '*' || code == '+' || code == '&'
14605 || code == ';' || code == '~' || code == '^');
14606 }
14607 \f
14608 /* Print a memory operand whose address is ADDR. */
14609
14610 static void
14611 ix86_print_operand_address (FILE *file, rtx addr)
14612 {
14613 struct ix86_address parts;
14614 rtx base, index, disp;
14615 int scale;
14616 int ok;
14617 bool vsib = false;
14618 int code = 0;
14619
14620 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14621 {
14622 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14623 gcc_assert (parts.index == NULL_RTX);
14624 parts.index = XVECEXP (addr, 0, 1);
14625 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14626 addr = XVECEXP (addr, 0, 0);
14627 vsib = true;
14628 }
14629 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14630 {
14631 gcc_assert (TARGET_64BIT);
14632 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14633 code = 'q';
14634 }
14635 else
14636 ok = ix86_decompose_address (addr, &parts);
14637
14638 gcc_assert (ok);
14639
14640 if (parts.base && GET_CODE (parts.base) == SUBREG)
14641 {
14642 rtx tmp = SUBREG_REG (parts.base);
14643 parts.base = simplify_subreg (GET_MODE (parts.base),
14644 tmp, GET_MODE (tmp), 0);
14645 }
14646
14647 if (parts.index && GET_CODE (parts.index) == SUBREG)
14648 {
14649 rtx tmp = SUBREG_REG (parts.index);
14650 parts.index = simplify_subreg (GET_MODE (parts.index),
14651 tmp, GET_MODE (tmp), 0);
14652 }
14653
14654 base = parts.base;
14655 index = parts.index;
14656 disp = parts.disp;
14657 scale = parts.scale;
14658
14659 switch (parts.seg)
14660 {
14661 case SEG_DEFAULT:
14662 break;
14663 case SEG_FS:
14664 case SEG_GS:
14665 if (ASSEMBLER_DIALECT == ASM_ATT)
14666 putc ('%', file);
14667 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14668 break;
14669 default:
14670 gcc_unreachable ();
14671 }
14672
14673 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14674 if (TARGET_64BIT && !base && !index)
14675 {
14676 rtx symbol = disp;
14677
14678 if (GET_CODE (disp) == CONST
14679 && GET_CODE (XEXP (disp, 0)) == PLUS
14680 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14681 symbol = XEXP (XEXP (disp, 0), 0);
14682
14683 if (GET_CODE (symbol) == LABEL_REF
14684 || (GET_CODE (symbol) == SYMBOL_REF
14685 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14686 base = pc_rtx;
14687 }
14688 if (!base && !index)
14689 {
14690 /* A displacement-only address requires special attention.  */
14691
14692 if (CONST_INT_P (disp))
14693 {
14694 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14695 fputs ("ds:", file);
14696 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14697 }
14698 else if (flag_pic)
14699 output_pic_addr_const (file, disp, 0);
14700 else
14701 output_addr_const (file, disp);
14702 }
14703 else
14704 {
14705 /* Print SImode register names for zero-extended
14706 addresses to force addr32 prefix. */
14707 if (TARGET_64BIT
14708 && (GET_CODE (addr) == ZERO_EXTEND
14709 || GET_CODE (addr) == AND))
14710 {
14711 gcc_assert (!code);
14712 code = 'l';
14713 }
14714
14715 if (ASSEMBLER_DIALECT == ASM_ATT)
14716 {
14717 if (disp)
14718 {
14719 if (flag_pic)
14720 output_pic_addr_const (file, disp, 0);
14721 else if (GET_CODE (disp) == LABEL_REF)
14722 output_asm_label (disp);
14723 else
14724 output_addr_const (file, disp);
14725 }
14726
14727 putc ('(', file);
14728 if (base)
14729 print_reg (base, code, file);
14730 if (index)
14731 {
14732 putc (',', file);
14733 print_reg (index, vsib ? 0 : code, file);
14734 if (scale != 1 || vsib)
14735 fprintf (file, ",%d", scale);
14736 }
14737 putc (')', file);
14738 }
14739 else
14740 {
14741 rtx offset = NULL_RTX;
14742
14743 if (disp)
14744 {
14745 /* Pull out the offset of a symbol; print any symbol itself. */
14746 if (GET_CODE (disp) == CONST
14747 && GET_CODE (XEXP (disp, 0)) == PLUS
14748 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14749 {
14750 offset = XEXP (XEXP (disp, 0), 1);
14751 disp = gen_rtx_CONST (VOIDmode,
14752 XEXP (XEXP (disp, 0), 0));
14753 }
14754
14755 if (flag_pic)
14756 output_pic_addr_const (file, disp, 0);
14757 else if (GET_CODE (disp) == LABEL_REF)
14758 output_asm_label (disp);
14759 else if (CONST_INT_P (disp))
14760 offset = disp;
14761 else
14762 output_addr_const (file, disp);
14763 }
14764
14765 putc ('[', file);
14766 if (base)
14767 {
14768 print_reg (base, code, file);
14769 if (offset)
14770 {
14771 if (INTVAL (offset) >= 0)
14772 putc ('+', file);
14773 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14774 }
14775 }
14776 else if (offset)
14777 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14778 else
14779 putc ('0', file);
14780
14781 if (index)
14782 {
14783 putc ('+', file);
14784 print_reg (index, vsib ? 0 : code, file);
14785 if (scale != 1 || vsib)
14786 fprintf (file, "*%d", scale);
14787 }
14788 putc (']', file);
14789 }
14790 }
14791 }
14792
14793 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14794
14795 static bool
14796 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14797 {
14798 rtx op;
14799
14800 if (GET_CODE (x) != UNSPEC)
14801 return false;
14802
14803 op = XVECEXP (x, 0, 0);
14804 switch (XINT (x, 1))
14805 {
14806 case UNSPEC_GOTTPOFF:
14807 output_addr_const (file, op);
14808 /* FIXME: This might be @TPOFF in Sun ld. */
14809 fputs ("@gottpoff", file);
14810 break;
14811 case UNSPEC_TPOFF:
14812 output_addr_const (file, op);
14813 fputs ("@tpoff", file);
14814 break;
14815 case UNSPEC_NTPOFF:
14816 output_addr_const (file, op);
14817 if (TARGET_64BIT)
14818 fputs ("@tpoff", file);
14819 else
14820 fputs ("@ntpoff", file);
14821 break;
14822 case UNSPEC_DTPOFF:
14823 output_addr_const (file, op);
14824 fputs ("@dtpoff", file);
14825 break;
14826 case UNSPEC_GOTNTPOFF:
14827 output_addr_const (file, op);
14828 if (TARGET_64BIT)
14829 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14830 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14831 else
14832 fputs ("@gotntpoff", file);
14833 break;
14834 case UNSPEC_INDNTPOFF:
14835 output_addr_const (file, op);
14836 fputs ("@indntpoff", file);
14837 break;
14838 #if TARGET_MACHO
14839 case UNSPEC_MACHOPIC_OFFSET:
14840 output_addr_const (file, op);
14841 putc ('-', file);
14842 machopic_output_function_base_name (file);
14843 break;
14844 #endif
14845
14846 case UNSPEC_STACK_CHECK:
14847 {
14848 int offset;
14849
14850 gcc_assert (flag_split_stack);
14851
14852 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14853 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14854 #else
14855 gcc_unreachable ();
14856 #endif
14857
14858 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14859 }
14860 break;
14861
14862 default:
14863 return false;
14864 }
14865
14866 return true;
14867 }
14868 \f
14869 /* Split one or more double-mode RTL references into pairs of half-mode
14870 references. The RTL can be REG, offsettable MEM, integer constant, or
14871 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14872 split and "num" is its length. lo_half and hi_half are output arrays
14873 that parallel "operands". */
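/* For example, in 32-bit compilation a DImode MEM splits into the
   SImode MEM at offset 0 (lo_half) and the SImode MEM at offset 4
   (hi_half); x86 is little-endian, so the low word comes first.  */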
14874
14875 void
14876 split_double_mode (enum machine_mode mode, rtx operands[],
14877 int num, rtx lo_half[], rtx hi_half[])
14878 {
14879 enum machine_mode half_mode;
14880 unsigned int byte;
14881
14882 switch (mode)
14883 {
14884 case TImode:
14885 half_mode = DImode;
14886 break;
14887 case DImode:
14888 half_mode = SImode;
14889 break;
14890 default:
14891 gcc_unreachable ();
14892 }
14893
14894 byte = GET_MODE_SIZE (half_mode);
14895
14896 while (num--)
14897 {
14898 rtx op = operands[num];
14899
14900 /* simplify_subreg refuses to split volatile memory addresses,
14901 but we still have to handle them.  */
14902 if (MEM_P (op))
14903 {
14904 lo_half[num] = adjust_address (op, half_mode, 0);
14905 hi_half[num] = adjust_address (op, half_mode, byte);
14906 }
14907 else
14908 {
14909 lo_half[num] = simplify_gen_subreg (half_mode, op,
14910 GET_MODE (op) == VOIDmode
14911 ? mode : GET_MODE (op), 0);
14912 hi_half[num] = simplify_gen_subreg (half_mode, op,
14913 GET_MODE (op) == VOIDmode
14914 ? mode : GET_MODE (op), byte);
14915 }
14916 }
14917 }
14918 \f
14919 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
14920 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
14921 is the expression of the binary operation. The output may either be
14922 emitted here, or returned to the caller, like all output_* functions.
14923
14924 There is no guarantee that the operands are the same mode, as they
14925 might be within FLOAT or FLOAT_EXTEND expressions. */
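/* A sketch of one common path: for (plus:DF (reg:DF st0) (mem:DF ...))
   with the result in %st(0), the routine builds "fadd%Z2\t%2", which
   prints as "faddl <mem>" in AT&T syntax.  */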
14926
14927 #ifndef SYSV386_COMPAT
14928 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
14929 wants to fix the assemblers because that causes incompatibility
14930 with gcc. No-one wants to fix gcc because that causes
14931 incompatibility with assemblers... You can use the option of
14932 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
14933 #define SYSV386_COMPAT 1
14934 #endif
14935
14936 const char *
14937 output_387_binary_op (rtx insn, rtx *operands)
14938 {
14939 static char buf[40];
14940 const char *p;
14941 const char *ssep;
14942 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
14943
14944 #ifdef ENABLE_CHECKING
14945 /* Even if we do not want to check the inputs, this documents the input
14946 constraints, which helps in understanding the following code.  */
14947 if (STACK_REG_P (operands[0])
14948 && ((REG_P (operands[1])
14949 && REGNO (operands[0]) == REGNO (operands[1])
14950 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
14951 || (REG_P (operands[2])
14952 && REGNO (operands[0]) == REGNO (operands[2])
14953 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
14954 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
14955 ; /* ok */
14956 else
14957 gcc_assert (is_sse);
14958 #endif
14959
14960 switch (GET_CODE (operands[3]))
14961 {
14962 case PLUS:
14963 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14964 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14965 p = "fiadd";
14966 else
14967 p = "fadd";
14968 ssep = "vadd";
14969 break;
14970
14971 case MINUS:
14972 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14973 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14974 p = "fisub";
14975 else
14976 p = "fsub";
14977 ssep = "vsub";
14978 break;
14979
14980 case MULT:
14981 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14982 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14983 p = "fimul";
14984 else
14985 p = "fmul";
14986 ssep = "vmul";
14987 break;
14988
14989 case DIV:
14990 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14991 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14992 p = "fidiv";
14993 else
14994 p = "fdiv";
14995 ssep = "vdiv";
14996 break;
14997
14998 default:
14999 gcc_unreachable ();
15000 }
15001
15002 if (is_sse)
15003 {
15004 if (TARGET_AVX)
15005 {
15006 strcpy (buf, ssep);
15007 if (GET_MODE (operands[0]) == SFmode)
15008 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15009 else
15010 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15011 }
15012 else
15013 {
15014 strcpy (buf, ssep + 1);
15015 if (GET_MODE (operands[0]) == SFmode)
15016 strcat (buf, "ss\t{%2, %0|%0, %2}");
15017 else
15018 strcat (buf, "sd\t{%2, %0|%0, %2}");
15019 }
15020 return buf;
15021 }
15022 strcpy (buf, p);
15023
15024 switch (GET_CODE (operands[3]))
15025 {
15026 case MULT:
15027 case PLUS:
15028 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15029 {
15030 rtx temp = operands[2];
15031 operands[2] = operands[1];
15032 operands[1] = temp;
15033 }
15034
15035 /* We know operands[0] == operands[1].  */
15036
15037 if (MEM_P (operands[2]))
15038 {
15039 p = "%Z2\t%2";
15040 break;
15041 }
15042
15043 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15044 {
15045 if (STACK_TOP_P (operands[0]))
15046 /* How is it that we are storing to a dead operand[2]?
15047 Well, presumably operands[1] is dead too. We can't
15048 store the result to st(0) as st(0) gets popped on this
15049 instruction. Instead store to operands[2] (which I
15050 think has to be st(1)). st(1) will be popped later.
15051 gcc <= 2.8.1 didn't have this check and generated
15052 assembly code that the Unixware assembler rejected. */
15053 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15054 else
15055 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15056 break;
15057 }
15058
15059 if (STACK_TOP_P (operands[0]))
15060 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15061 else
15062 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15063 break;
15064
15065 case MINUS:
15066 case DIV:
15067 if (MEM_P (operands[1]))
15068 {
15069 p = "r%Z1\t%1";
15070 break;
15071 }
15072
15073 if (MEM_P (operands[2]))
15074 {
15075 p = "%Z2\t%2";
15076 break;
15077 }
15078
15079 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15080 {
15081 #if SYSV386_COMPAT
15082 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15083 derived assemblers, confusingly reverse the direction of
15084 the operation for fsub{r} and fdiv{r} when the
15085 destination register is not st(0). The Intel assembler
15086 doesn't have this brain damage. Read !SYSV386_COMPAT to
15087 figure out what the hardware really does. */
15088 if (STACK_TOP_P (operands[0]))
15089 p = "{p\t%0, %2|rp\t%2, %0}";
15090 else
15091 p = "{rp\t%2, %0|p\t%0, %2}";
15092 #else
15093 if (STACK_TOP_P (operands[0]))
15094 /* As above for fmul/fadd, we can't store to st(0). */
15095 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15096 else
15097 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15098 #endif
15099 break;
15100 }
15101
15102 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15103 {
15104 #if SYSV386_COMPAT
15105 if (STACK_TOP_P (operands[0]))
15106 p = "{rp\t%0, %1|p\t%1, %0}";
15107 else
15108 p = "{p\t%1, %0|rp\t%0, %1}";
15109 #else
15110 if (STACK_TOP_P (operands[0]))
15111 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15112 else
15113 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15114 #endif
15115 break;
15116 }
15117
15118 if (STACK_TOP_P (operands[0]))
15119 {
15120 if (STACK_TOP_P (operands[1]))
15121 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15122 else
15123 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15124 break;
15125 }
15126 else if (STACK_TOP_P (operands[1]))
15127 {
15128 #if SYSV386_COMPAT
15129 p = "{\t%1, %0|r\t%0, %1}";
15130 #else
15131 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15132 #endif
15133 }
15134 else
15135 {
15136 #if SYSV386_COMPAT
15137 p = "{r\t%2, %0|\t%0, %2}";
15138 #else
15139 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15140 #endif
15141 }
15142 break;
15143
15144 default:
15145 gcc_unreachable ();
15146 }
15147
15148 strcat (buf, p);
15149 return buf;
15150 }
15151
15152 /* Return needed mode for entity in optimize_mode_switching pass. */
15153
15154 int
15155 ix86_mode_needed (int entity, rtx insn)
15156 {
15157 enum attr_i387_cw mode;
15158
15159 /* The mode UNINITIALIZED is used to store the control word after a
15160 function call or ASM pattern.  The mode ANY specifies that the function
15161 has no requirements on the control word and makes no changes in the
15162 bits we are interested in.  */
15163
15164 if (CALL_P (insn)
15165 || (NONJUMP_INSN_P (insn)
15166 && (asm_noperands (PATTERN (insn)) >= 0
15167 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15168 return I387_CW_UNINITIALIZED;
15169
15170 if (recog_memoized (insn) < 0)
15171 return I387_CW_ANY;
15172
15173 mode = get_attr_i387_cw (insn);
15174
15175 switch (entity)
15176 {
15177 case I387_TRUNC:
15178 if (mode == I387_CW_TRUNC)
15179 return mode;
15180 break;
15181
15182 case I387_FLOOR:
15183 if (mode == I387_CW_FLOOR)
15184 return mode;
15185 break;
15186
15187 case I387_CEIL:
15188 if (mode == I387_CW_CEIL)
15189 return mode;
15190 break;
15191
15192 case I387_MASK_PM:
15193 if (mode == I387_CW_MASK_PM)
15194 return mode;
15195 break;
15196
15197 default:
15198 gcc_unreachable ();
15199 }
15200
15201 return I387_CW_ANY;
15202 }
15203
15204 /* Output code to initialize control word copies used by trunc?f?i and
15205 rounding patterns.  CURRENT_MODE is set to the current control word,
15206 while NEW_MODE is set to the new control word.  */
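/* For reference, the x87 control word's rounding-control field is bits
   10-11 (mask 0x0c00): 00 = to nearest, 01 = down, 10 = up, 11 = toward
   zero; bit 5 (0x0020) masks the precision exception.  That is what the
   0x0400/0x0800/0x0c00/0x0020 constants below encode.  */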
15207
15208 void
15209 emit_i387_cw_initialization (int mode)
15210 {
15211 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15212 rtx new_mode;
15213
15214 enum ix86_stack_slot slot;
15215
15216 rtx reg = gen_reg_rtx (HImode);
15217
15218 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15219 emit_move_insn (reg, copy_rtx (stored_mode));
15220
15221 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15222 || optimize_function_for_size_p (cfun))
15223 {
15224 switch (mode)
15225 {
15226 case I387_CW_TRUNC:
15227 /* round toward zero (truncate) */
15228 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15229 slot = SLOT_CW_TRUNC;
15230 break;
15231
15232 case I387_CW_FLOOR:
15233 /* round down toward -oo */
15234 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15235 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15236 slot = SLOT_CW_FLOOR;
15237 break;
15238
15239 case I387_CW_CEIL:
15240 /* round up toward +oo */
15241 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15242 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15243 slot = SLOT_CW_CEIL;
15244 break;
15245
15246 case I387_CW_MASK_PM:
15247 /* mask precision exception for nearbyint() */
15248 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15249 slot = SLOT_CW_MASK_PM;
15250 break;
15251
15252 default:
15253 gcc_unreachable ();
15254 }
15255 }
15256 else
15257 {
15258 switch (mode)
15259 {
15260 case I387_CW_TRUNC:
15261 /* round toward zero (truncate) */
15262 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15263 slot = SLOT_CW_TRUNC;
15264 break;
15265
15266 case I387_CW_FLOOR:
15267 /* round down toward -oo */
15268 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15269 slot = SLOT_CW_FLOOR;
15270 break;
15271
15272 case I387_CW_CEIL:
15273 /* round up toward +oo */
15274 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15275 slot = SLOT_CW_CEIL;
15276 break;
15277
15278 case I387_CW_MASK_PM:
15279 /* mask precision exception for nearbyint() */
15280 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15281 slot = SLOT_CW_MASK_PM;
15282 break;
15283
15284 default:
15285 gcc_unreachable ();
15286 }
15287 }
15288
15289 gcc_assert (slot < MAX_386_STACK_LOCALS);
15290
15291 new_mode = assign_386_stack_local (HImode, slot);
15292 emit_move_insn (new_mode, reg);
15293 }
15294
15295 /* Output code for INSN to convert a float to a signed int. OPERANDS
15296 are the insn operands. The output may be [HSD]Imode and the input
15297 operand may be [SDX]Fmode. */
15298
15299 const char *
15300 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15301 {
15302 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15303 int dimode_p = GET_MODE (operands[0]) == DImode;
15304 int round_mode = get_attr_i387_cw (insn);
15305
15306 /* Jump through a hoop or two for DImode, since the hardware has no
15307 non-popping instruction. We used to do this a different way, but
15308 that was somewhat fragile and broke with post-reload splitters. */
15309 if ((dimode_p || fisttp) && !stack_top_dies)
15310 output_asm_insn ("fld\t%y1", operands);
15311
15312 gcc_assert (STACK_TOP_P (operands[1]));
15313 gcc_assert (MEM_P (operands[0]));
15314 gcc_assert (GET_MODE (operands[1]) != TFmode);
15315
15316 if (fisttp)
15317 output_asm_insn ("fisttp%Z0\t%0", operands);
15318 else
15319 {
15320 if (round_mode != I387_CW_ANY)
15321 output_asm_insn ("fldcw\t%3", operands);
15322 if (stack_top_dies || dimode_p)
15323 output_asm_insn ("fistp%Z0\t%0", operands);
15324 else
15325 output_asm_insn ("fist%Z0\t%0", operands);
15326 if (round_mode != I387_CW_ANY)
15327 output_asm_insn ("fldcw\t%2", operands);
15328 }
15329
15330 return "";
15331 }
15332
15333 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15334 have the values zero or one, indicates the ffreep insn's operand
15335 from the OPERANDS array. */
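/* When the assembler lacks the ffreep mnemonic, the raw encoding is
   emitted instead: ffreep %st(N) is the byte pair 0xdf, 0xc0+N, so for
   example ASM_SHORT "0xc1df" stores the little-endian bytes 0xdf 0xc1
   for ffreep %st(1).  */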
15336
15337 static const char *
15338 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15339 {
15340 if (TARGET_USE_FFREEP)
15341 #ifdef HAVE_AS_IX86_FFREEP
15342 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15343 #else
15344 {
15345 static char retval[32];
15346 int regno = REGNO (operands[opno]);
15347
15348 gcc_assert (FP_REGNO_P (regno));
15349
15350 regno -= FIRST_STACK_REG;
15351
15352 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15353 return retval;
15354 }
15355 #endif
15356
15357 return opno ? "fstp\t%y1" : "fstp\t%y0";
15358 }
15359
15360
15361 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15362 should be used. UNORDERED_P is true when fucom should be used. */
15363
15364 const char *
15365 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15366 {
15367 int stack_top_dies;
15368 rtx cmp_op0, cmp_op1;
15369 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15370
15371 if (eflags_p)
15372 {
15373 cmp_op0 = operands[0];
15374 cmp_op1 = operands[1];
15375 }
15376 else
15377 {
15378 cmp_op0 = operands[1];
15379 cmp_op1 = operands[2];
15380 }
15381
15382 if (is_sse)
15383 {
15384 if (GET_MODE (operands[0]) == SFmode)
15385 if (unordered_p)
15386 return "%vucomiss\t{%1, %0|%0, %1}";
15387 else
15388 return "%vcomiss\t{%1, %0|%0, %1}";
15389 else
15390 if (unordered_p)
15391 return "%vucomisd\t{%1, %0|%0, %1}";
15392 else
15393 return "%vcomisd\t{%1, %0|%0, %1}";
15394 }
15395
15396 gcc_assert (STACK_TOP_P (cmp_op0));
15397
15398 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15399
15400 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15401 {
15402 if (stack_top_dies)
15403 {
15404 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15405 return output_387_ffreep (operands, 1);
15406 }
15407 else
15408 return "ftst\n\tfnstsw\t%0";
15409 }
15410
15411 if (STACK_REG_P (cmp_op1)
15412 && stack_top_dies
15413 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15414 && REGNO (cmp_op1) != FIRST_STACK_REG)
15415 {
15416 /* If both the top of the 387 stack and the other operand (also a
15417 stack register) die, then this must be a `fcompp' float
15418 compare.  */
15419
15420 if (eflags_p)
15421 {
15422 /* There is no double popping fcomi variant. Fortunately,
15423 eflags is immune from the fstp's cc clobbering. */
15424 if (unordered_p)
15425 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15426 else
15427 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15428 return output_387_ffreep (operands, 0);
15429 }
15430 else
15431 {
15432 if (unordered_p)
15433 return "fucompp\n\tfnstsw\t%0";
15434 else
15435 return "fcompp\n\tfnstsw\t%0";
15436 }
15437 }
15438 else
15439 {
15440 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
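/* For example, eflags_p = 1, a float operand, unordered_p = 1 and a
   dying stack top give mask 11 and select "fucomip".  */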
15441
15442 static const char * const alt[16] =
15443 {
15444 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15445 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15446 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15447 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15448
15449 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15450 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15451 NULL,
15452 NULL,
15453
15454 "fcomi\t{%y1, %0|%0, %y1}",
15455 "fcomip\t{%y1, %0|%0, %y1}",
15456 "fucomi\t{%y1, %0|%0, %y1}",
15457 "fucomip\t{%y1, %0|%0, %y1}",
15458
15459 NULL,
15460 NULL,
15461 NULL,
15462 NULL
15463 };
15464
15465 int mask;
15466 const char *ret;
15467
15468 mask = eflags_p << 3;
15469 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15470 mask |= unordered_p << 1;
15471 mask |= stack_top_dies;
15472
15473 gcc_assert (mask < 16);
15474 ret = alt[mask];
15475 gcc_assert (ret);
15476
15477 return ret;
15478 }
15479 }
15480
15481 void
15482 ix86_output_addr_vec_elt (FILE *file, int value)
15483 {
15484 const char *directive = ASM_LONG;
15485
15486 #ifdef ASM_QUAD
15487 if (TARGET_LP64)
15488 directive = ASM_QUAD;
15489 #else
15490 gcc_assert (!TARGET_64BIT);
15491 #endif
15492
15493 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15494 }
15495
15496 void
15497 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15498 {
15499 const char *directive = ASM_LONG;
15500
15501 #ifdef ASM_QUAD
15502 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15503 directive = ASM_QUAD;
15504 #else
15505 gcc_assert (!TARGET_64BIT);
15506 #endif
15507 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15508 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15509 fprintf (file, "%s%s%d-%s%d\n",
15510 directive, LPREFIX, value, LPREFIX, rel);
15511 else if (HAVE_AS_GOTOFF_IN_DATA)
15512 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15513 #if TARGET_MACHO
15514 else if (TARGET_MACHO)
15515 {
15516 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15517 machopic_output_function_base_name (file);
15518 putc ('\n', file);
15519 }
15520 #endif
15521 else
15522 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15523 GOT_SYMBOL_NAME, LPREFIX, value);
15524 }
15525 \f
15526 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15527 for the target. */
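/* For instance, clearing %ax is widened to %eax and emitted either as
   "xorl %eax, %eax" (with a flags clobber) or as "movl $0, %eax",
   depending on the tuning checks below.  */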
15528
15529 void
15530 ix86_expand_clear (rtx dest)
15531 {
15532 rtx tmp;
15533
15534 /* We play register width games, which are only valid after reload. */
15535 gcc_assert (reload_completed);
15536
15537 /* Avoid HImode and its attendant prefix byte. */
15538 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15539 dest = gen_rtx_REG (SImode, REGNO (dest));
15540 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15541
15542 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15543 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
15544 {
15545 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15546 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15547 }
15548
15549 emit_insn (tmp);
15550 }
15551
15552 /* X is an unchanging MEM. If it is a constant pool reference, return
15553 the constant pool rtx, else NULL. */
15554
15555 rtx
15556 maybe_get_pool_constant (rtx x)
15557 {
15558 x = ix86_delegitimize_address (XEXP (x, 0));
15559
15560 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15561 return get_pool_constant (x);
15562
15563 return NULL_RTX;
15564 }
15565
15566 void
15567 ix86_expand_move (enum machine_mode mode, rtx operands[])
15568 {
15569 rtx op0, op1;
15570 enum tls_model model;
15571
15572 op0 = operands[0];
15573 op1 = operands[1];
15574
15575 if (GET_CODE (op1) == SYMBOL_REF)
15576 {
15577 model = SYMBOL_REF_TLS_MODEL (op1);
15578 if (model)
15579 {
15580 op1 = legitimize_tls_address (op1, model, true);
15581 op1 = force_operand (op1, op0);
15582 if (op1 == op0)
15583 return;
15584 if (GET_MODE (op1) != mode)
15585 op1 = convert_to_mode (mode, op1, 1);
15586 }
15587 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15588 && SYMBOL_REF_DLLIMPORT_P (op1))
15589 op1 = legitimize_dllimport_symbol (op1, false);
15590 }
15591 else if (GET_CODE (op1) == CONST
15592 && GET_CODE (XEXP (op1, 0)) == PLUS
15593 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15594 {
15595 rtx addend = XEXP (XEXP (op1, 0), 1);
15596 rtx symbol = XEXP (XEXP (op1, 0), 0);
15597 rtx tmp = NULL;
15598
15599 model = SYMBOL_REF_TLS_MODEL (symbol);
15600 if (model)
15601 tmp = legitimize_tls_address (symbol, model, true);
15602 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15603 && SYMBOL_REF_DLLIMPORT_P (symbol))
15604 tmp = legitimize_dllimport_symbol (symbol, true);
15605
15606 if (tmp)
15607 {
15608 tmp = force_operand (tmp, NULL);
15609 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15610 op0, 1, OPTAB_DIRECT);
15611 if (tmp == op0)
15612 return;
15613 if (GET_MODE (tmp) != mode)
15614 op1 = convert_to_mode (mode, tmp, 1);
15615 }
15616 }
15617
15618 if ((flag_pic || MACHOPIC_INDIRECT)
15619 && symbolic_operand (op1, mode))
15620 {
15621 if (TARGET_MACHO && !TARGET_64BIT)
15622 {
15623 #if TARGET_MACHO
15624 /* dynamic-no-pic */
15625 if (MACHOPIC_INDIRECT)
15626 {
15627 rtx temp = ((reload_in_progress
15628 || ((op0 && REG_P (op0))
15629 && mode == Pmode))
15630 ? op0 : gen_reg_rtx (Pmode));
15631 op1 = machopic_indirect_data_reference (op1, temp);
15632 if (MACHOPIC_PURE)
15633 op1 = machopic_legitimize_pic_address (op1, mode,
15634 temp == op1 ? 0 : temp);
15635 }
15636 if (op0 != op1 && GET_CODE (op0) != MEM)
15637 {
15638 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15639 emit_insn (insn);
15640 return;
15641 }
15642 if (GET_CODE (op0) == MEM)
15643 op1 = force_reg (Pmode, op1);
15644 else
15645 {
15646 rtx temp = op0;
15647 if (GET_CODE (temp) != REG)
15648 temp = gen_reg_rtx (Pmode);
15649 temp = legitimize_pic_address (op1, temp);
15650 if (temp == op0)
15651 return;
15652 op1 = temp;
15653 }
15654 /* dynamic-no-pic */
15655 #endif
15656 }
15657 else
15658 {
15659 if (MEM_P (op0))
15660 op1 = force_reg (mode, op1);
15661 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15662 {
15663 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15664 op1 = legitimize_pic_address (op1, reg);
15665 if (op0 == op1)
15666 return;
15667 if (GET_MODE (op1) != mode)
15668 op1 = convert_to_mode (mode, op1, 1);
15669 }
15670 }
15671 }
15672 else
15673 {
15674 if (MEM_P (op0)
15675 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15676 || !push_operand (op0, mode))
15677 && MEM_P (op1))
15678 op1 = force_reg (mode, op1);
15679
15680 if (push_operand (op0, mode)
15681 && ! general_no_elim_operand (op1, mode))
15682 op1 = copy_to_mode_reg (mode, op1);
15683
15684 /* Force large constants in 64bit compilation into a register
15685 to get them CSEd.  */
15686 if (can_create_pseudo_p ()
15687 && (mode == DImode) && TARGET_64BIT
15688 && immediate_operand (op1, mode)
15689 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15690 && !register_operand (op0, mode)
15691 && optimize)
15692 op1 = copy_to_mode_reg (mode, op1);
15693
15694 if (can_create_pseudo_p ()
15695 && FLOAT_MODE_P (mode)
15696 && GET_CODE (op1) == CONST_DOUBLE)
15697 {
15698 /* If we are loading a floating point constant to a register,
15699 force the value to memory now, since we'll get better code
15700 out of the back end.  */
15701
15702 op1 = validize_mem (force_const_mem (mode, op1));
15703 if (!register_operand (op0, mode))
15704 {
15705 rtx temp = gen_reg_rtx (mode);
15706 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15707 emit_move_insn (op0, temp);
15708 return;
15709 }
15710 }
15711 }
15712
15713 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15714 }
15715
15716 void
15717 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15718 {
15719 rtx op0 = operands[0], op1 = operands[1];
15720 unsigned int align = GET_MODE_ALIGNMENT (mode);
15721
15722 /* Force constants other than zero into memory.  We do not know how
15723 the instructions used to build constants modify the upper 64 bits
15724 of the register; once we have that information we may be able
15725 to handle some of them more efficiently.  */
15726 if (can_create_pseudo_p ()
15727 && register_operand (op0, mode)
15728 && (CONSTANT_P (op1)
15729 || (GET_CODE (op1) == SUBREG
15730 && CONSTANT_P (SUBREG_REG (op1))))
15731 && !standard_sse_constant_p (op1))
15732 op1 = validize_mem (force_const_mem (mode, op1));
15733
15734 /* We need to check memory alignment for SSE mode since attributes
15735 can make operands unaligned.  */
15736 if (can_create_pseudo_p ()
15737 && SSE_REG_MODE_P (mode)
15738 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15739 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15740 {
15741 rtx tmp[2];
15742
15743 /* ix86_expand_vector_move_misalign() does not like constants ... */
15744 if (CONSTANT_P (op1)
15745 || (GET_CODE (op1) == SUBREG
15746 && CONSTANT_P (SUBREG_REG (op1))))
15747 op1 = validize_mem (force_const_mem (mode, op1));
15748
15749 /* ... nor both arguments in memory. */
15750 if (!register_operand (op0, mode)
15751 && !register_operand (op1, mode))
15752 op1 = force_reg (mode, op1);
15753
15754 tmp[0] = op0; tmp[1] = op1;
15755 ix86_expand_vector_move_misalign (mode, tmp);
15756 return;
15757 }
15758
15759 /* Make operand1 a register if it isn't already. */
15760 if (can_create_pseudo_p ()
15761 && !register_operand (op0, mode)
15762 && !register_operand (op1, mode))
15763 {
15764 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15765 return;
15766 }
15767
15768 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15769 }
15770
15771 /* Split 32-byte AVX unaligned load and store if needed. */
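/* On targets where unaligned 256-bit accesses are slow, an unaligned
   load is done as two 128-bit halves recombined with VEC_CONCAT, and an
   unaligned store as two vextractf128 stores; otherwise a single
   unaligned 256-bit move is emitted.  */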
15772
15773 static void
15774 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15775 {
15776 rtx m;
15777 rtx (*extract) (rtx, rtx, rtx);
15778 rtx (*move_unaligned) (rtx, rtx);
15779 enum machine_mode mode;
15780
15781 switch (GET_MODE (op0))
15782 {
15783 default:
15784 gcc_unreachable ();
15785 case V32QImode:
15786 extract = gen_avx_vextractf128v32qi;
15787 move_unaligned = gen_avx_movdqu256;
15788 mode = V16QImode;
15789 break;
15790 case V8SFmode:
15791 extract = gen_avx_vextractf128v8sf;
15792 move_unaligned = gen_avx_movups256;
15793 mode = V4SFmode;
15794 break;
15795 case V4DFmode:
15796 extract = gen_avx_vextractf128v4df;
15797 move_unaligned = gen_avx_movupd256;
15798 mode = V2DFmode;
15799 break;
15800 }
15801
15802 if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
15803 {
15804 rtx r = gen_reg_rtx (mode);
15805 m = adjust_address (op1, mode, 0);
15806 emit_move_insn (r, m);
15807 m = adjust_address (op1, mode, 16);
15808 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
15809 emit_move_insn (op0, r);
15810 }
15811 else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
15812 {
15813 m = adjust_address (op0, mode, 0);
15814 emit_insn (extract (m, op1, const0_rtx));
15815 m = adjust_address (op0, mode, 16);
15816 emit_insn (extract (m, op1, const1_rtx));
15817 }
15818 else
15819 emit_insn (move_unaligned (op0, op1));
15820 }
15821
15822 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
15823 straight to ix86_expand_vector_move. */
15824 /* Code generation for scalar reg-reg moves of single and double precision data:
15825 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
15826 movaps reg, reg
15827 else
15828 movss reg, reg
15829 if (x86_sse_partial_reg_dependency == true)
15830 movapd reg, reg
15831 else
15832 movsd reg, reg
15833
15834 Code generation for scalar loads of double precision data:
15835 if (x86_sse_split_regs == true)
15836 movlpd mem, reg (gas syntax)
15837 else
15838 movsd mem, reg
15839
15840 Code generation for unaligned packed loads of single precision data
15841 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
15842 if (x86_sse_unaligned_move_optimal)
15843 movups mem, reg
15844
15845 if (x86_sse_partial_reg_dependency == true)
15846 {
15847 xorps reg, reg
15848 movlps mem, reg
15849 movhps mem+8, reg
15850 }
15851 else
15852 {
15853 movlps mem, reg
15854 movhps mem+8, reg
15855 }
15856
15857 Code generation for unaligned packed loads of double precision data
15858 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
15859 if (x86_sse_unaligned_move_optimal)
15860 movupd mem, reg
15861
15862 if (x86_sse_split_regs == true)
15863 {
15864 movlpd mem, reg
15865 movhpd mem+8, reg
15866 }
15867 else
15868 {
15869 movsd mem, reg
15870 movhpd mem+8, reg
15871 }
15872 */
15873
15874 void
15875 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
15876 {
15877 rtx op0, op1, m;
15878
15879 op0 = operands[0];
15880 op1 = operands[1];
15881
15882 if (TARGET_AVX
15883 && GET_MODE_SIZE (mode) == 32)
15884 {
15885 switch (GET_MODE_CLASS (mode))
15886 {
15887 case MODE_VECTOR_INT:
15888 case MODE_INT:
15889 op0 = gen_lowpart (V32QImode, op0);
15890 op1 = gen_lowpart (V32QImode, op1);
15891 /* FALLTHRU */
15892
15893 case MODE_VECTOR_FLOAT:
15894 ix86_avx256_split_vector_move_misalign (op0, op1);
15895 break;
15896
15897 default:
15898 gcc_unreachable ();
15899 }
15900
15901 return;
15902 }
15903
15904 if (MEM_P (op1))
15905 {
15906 /* ??? If we have typed data, then it would appear that using
15907 movdqu is the only way to get unaligned data loaded with
15908 integer type. */
15909 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15910 {
15911 op0 = gen_lowpart (V16QImode, op0);
15912 op1 = gen_lowpart (V16QImode, op1);
15913 /* We will eventually emit movups based on insn attributes. */
15914 emit_insn (gen_sse2_movdqu (op0, op1));
15915 }
15916 else if (TARGET_SSE2 && mode == V2DFmode)
15917 {
15918 rtx zero;
15919
15920 if (TARGET_AVX
15921 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
15922 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
15923 || optimize_function_for_size_p (cfun))
15924 {
15925 /* We will eventually emit movups based on insn attributes. */
15926 emit_insn (gen_sse2_movupd (op0, op1));
15927 return;
15928 }
15929
15930 /* When SSE registers are split into halves, we can avoid
15931 writing to the top half twice. */
15932 if (TARGET_SSE_SPLIT_REGS)
15933 {
15934 emit_clobber (op0);
15935 zero = op0;
15936 }
15937 else
15938 {
15939 /* ??? Not sure about the best option for the Intel chips.
15940 The following would seem to satisfy; the register is
15941 entirely cleared, breaking the dependency chain. We
15942 then store to the upper half, with a dependency depth
15943 of one. A rumor has it that Intel recommends two movsd
15944 followed by an unpacklpd, but this is unconfirmed. And
15945 given that the dependency depth of the unpacklpd would
15946 still be one, I'm not sure why this would be better. */
15947 zero = CONST0_RTX (V2DFmode);
15948 }
15949
15950 m = adjust_address (op1, DFmode, 0);
15951 emit_insn (gen_sse2_loadlpd (op0, zero, m));
15952 m = adjust_address (op1, DFmode, 8);
15953 emit_insn (gen_sse2_loadhpd (op0, op0, m));
15954 }
15955 else
15956 {
15957 if (TARGET_AVX
15958 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
15959 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
15960 || optimize_function_for_size_p (cfun))
15961 {
15962 op0 = gen_lowpart (V4SFmode, op0);
15963 op1 = gen_lowpart (V4SFmode, op1);
15964 emit_insn (gen_sse_movups (op0, op1));
15965 return;
15966 }
15967
15968 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
15969 emit_move_insn (op0, CONST0_RTX (mode));
15970 else
15971 emit_clobber (op0);
15972
15973 if (mode != V4SFmode)
15974 op0 = gen_lowpart (V4SFmode, op0);
15975
15976 m = adjust_address (op1, V2SFmode, 0);
15977 emit_insn (gen_sse_loadlps (op0, op0, m));
15978 m = adjust_address (op1, V2SFmode, 8);
15979 emit_insn (gen_sse_loadhps (op0, op0, m));
15980 }
15981 }
15982 else if (MEM_P (op0))
15983 {
15984 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15985 {
15986 op0 = gen_lowpart (V16QImode, op0);
15987 op1 = gen_lowpart (V16QImode, op1);
15988 /* We will eventually emit movups based on insn attributes. */
15989 emit_insn (gen_sse2_movdqu (op0, op1));
15990 }
15991 else if (TARGET_SSE2 && mode == V2DFmode)
15992 {
15993 if (TARGET_AVX
15994 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
15995 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
15996 || optimize_function_for_size_p (cfun))
15997 /* We will eventually emit movups based on insn attributes. */
15998 emit_insn (gen_sse2_movupd (op0, op1));
15999 else
16000 {
16001 m = adjust_address (op0, DFmode, 0);
16002 emit_insn (gen_sse2_storelpd (m, op1));
16003 m = adjust_address (op0, DFmode, 8);
16004 emit_insn (gen_sse2_storehpd (m, op1));
16005 }
16006 }
16007 else
16008 {
16009 if (mode != V4SFmode)
16010 op1 = gen_lowpart (V4SFmode, op1);
16011
16012 if (TARGET_AVX
16013 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16014 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16015 || optimize_function_for_size_p (cfun))
16016 {
16017 op0 = gen_lowpart (V4SFmode, op0);
16018 emit_insn (gen_sse_movups (op0, op1));
16019 }
16020 else
16021 {
16022 m = adjust_address (op0, V2SFmode, 0);
16023 emit_insn (gen_sse_storelps (m, op1));
16024 m = adjust_address (op0, V2SFmode, 8);
16025 emit_insn (gen_sse_storehps (m, op1));
16026 }
16027 }
16028 }
16029 else
16030 gcc_unreachable ();
16031 }
16032
16033 /* Expand a push in MODE. This is some mode for which we do not support
16034 proper push instructions, at least from the registers that we expect
16035 the value to live in. */
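/* The expansion is an explicit decrement of the stack pointer by the
   size of MODE followed by an ordinary store into the newly allocated
   slot.  */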
16036
16037 void
16038 ix86_expand_push (enum machine_mode mode, rtx x)
16039 {
16040 rtx tmp;
16041
16042 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16043 GEN_INT (-GET_MODE_SIZE (mode)),
16044 stack_pointer_rtx, 1, OPTAB_DIRECT);
16045 if (tmp != stack_pointer_rtx)
16046 emit_move_insn (stack_pointer_rtx, tmp);
16047
16048 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16049
16050 /* When we push an operand onto the stack, it has to be aligned at least
16051 at the function argument boundary.  However, since we don't have
16052 the argument type, we can't determine the actual argument
16053 boundary.  */
16054 emit_move_insn (tmp, x);
16055 }
16056
16057 /* Helper function of ix86_fixup_binary_operands to canonicalize
16058 operand order. Returns true if the operands should be swapped. */
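/* Example: for a commutative dst = src1 + src2 where dst already equals
   src2 but not src1, swapping the sources lets the two-address form
   "dst += src" be used without an extra copy.  */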
16059
16060 static bool
16061 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16062 rtx operands[])
16063 {
16064 rtx dst = operands[0];
16065 rtx src1 = operands[1];
16066 rtx src2 = operands[2];
16067
16068 /* If the operation is not commutative, we can't do anything. */
16069 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16070 return false;
16071
16072 /* Highest priority is that src1 should match dst. */
16073 if (rtx_equal_p (dst, src1))
16074 return false;
16075 if (rtx_equal_p (dst, src2))
16076 return true;
16077
16078 /* Next highest priority is that immediate constants come second. */
16079 if (immediate_operand (src2, mode))
16080 return false;
16081 if (immediate_operand (src1, mode))
16082 return true;
16083
16084 /* Lowest priority is that memory references should come second. */
16085 if (MEM_P (src2))
16086 return false;
16087 if (MEM_P (src1))
16088 return true;
16089
16090 return false;
16091 }
16092
16093
16094 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16095 destination to use for the operation. If different from the true
16096 destination in operands[0], a copy operation will be required. */
16097
16098 rtx
16099 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16100 rtx operands[])
16101 {
16102 rtx dst = operands[0];
16103 rtx src1 = operands[1];
16104 rtx src2 = operands[2];
16105
16106 /* Canonicalize operand order. */
16107 if (ix86_swap_binary_operands_p (code, mode, operands))
16108 {
16109 rtx temp;
16110
16111 /* It is invalid to swap operands of different modes. */
16112 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16113
16114 temp = src1;
16115 src1 = src2;
16116 src2 = temp;
16117 }
16118
16119 /* Both source operands cannot be in memory. */
16120 if (MEM_P (src1) && MEM_P (src2))
16121 {
16122 /* Optimization: Only read from memory once. */
16123 if (rtx_equal_p (src1, src2))
16124 {
16125 src2 = force_reg (mode, src2);
16126 src1 = src2;
16127 }
16128 else
16129 src2 = force_reg (mode, src2);
16130 }
16131
16132 /* If the destination is memory, and we do not have matching source
16133 operands, do things in registers. */
16134 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16135 dst = gen_reg_rtx (mode);
16136
16137 /* Source 1 cannot be a constant. */
16138 if (CONSTANT_P (src1))
16139 src1 = force_reg (mode, src1);
16140
16141 /* Source 1 cannot be a non-matching memory. */
16142 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16143 src1 = force_reg (mode, src1);
16144
16145 /* Improve address combine. */
16146 if (code == PLUS
16147 && GET_MODE_CLASS (mode) == MODE_INT
16148 && MEM_P (src2))
16149 src2 = force_reg (mode, src2);
16150
16151 operands[1] = src1;
16152 operands[2] = src2;
16153 return dst;
16154 }
16155
16156 /* Similarly, but assume that the destination has already been
16157 set up properly. */
16158
16159 void
16160 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16161 enum machine_mode mode, rtx operands[])
16162 {
16163 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16164 gcc_assert (dst == operands[0]);
16165 }
16166
16167 /* Attempt to expand a binary operator.  Make the expansion closer to the
16168 actual machine than just general_operand, which will allow 3 separate
16169 memory references (one output, two inputs) in a single insn.  */
16170
16171 void
16172 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16173 rtx operands[])
16174 {
16175 rtx src1, src2, dst, op, clob;
16176
16177 dst = ix86_fixup_binary_operands (code, mode, operands);
16178 src1 = operands[1];
16179 src2 = operands[2];
16180
16181 /* Emit the instruction. */
16182
16183 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16184 if (reload_in_progress)
16185 {
16186 /* Reload doesn't know about the flags register, and doesn't know that
16187 it doesn't want to clobber it. We can only do this with PLUS. */
16188 gcc_assert (code == PLUS);
16189 emit_insn (op);
16190 }
16191 else if (reload_completed
16192 && code == PLUS
16193 && !rtx_equal_p (dst, src1))
16194 {
16195 /* This is going to be an LEA; avoid splitting it later. */
16196 emit_insn (op);
16197 }
16198 else
16199 {
16200 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16201 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16202 }
16203
16204 /* Fix up the destination if needed. */
16205 if (dst != operands[0])
16206 emit_move_insn (operands[0], dst);
16207 }
16208
16209 /* Return TRUE or FALSE depending on whether the binary operator meets the
16210 appropriate constraints. */
16211
16212 bool
16213 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16214 rtx operands[3])
16215 {
16216 rtx dst = operands[0];
16217 rtx src1 = operands[1];
16218 rtx src2 = operands[2];
16219
16220 /* Both source operands cannot be in memory. */
16221 if (MEM_P (src1) && MEM_P (src2))
16222 return false;
16223
16224 /* Canonicalize operand order for commutative operators. */
16225 if (ix86_swap_binary_operands_p (code, mode, operands))
16226 {
16227 rtx temp = src1;
16228 src1 = src2;
16229 src2 = temp;
16230 }
16231
16232 /* If the destination is memory, we must have a matching source operand. */
16233 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16234 return false;
16235
16236 /* Source 1 cannot be a constant. */
16237 if (CONSTANT_P (src1))
16238 return false;
16239
16240 /* Source 1 cannot be a non-matching memory. */
16241 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16242 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16243 return (code == AND
16244 && (mode == HImode
16245 || mode == SImode
16246 || (TARGET_64BIT && mode == DImode))
16247 && satisfies_constraint_L (src2));
16248
16249 return true;
16250 }
16251
16252 /* Attempt to expand a unary operator. Make the expansion closer to the
16253 actual machine than just general_operand, which would allow 2 separate
16254 memory references (one output, one input) in a single insn. */
16255
16256 void
16257 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16258 rtx operands[])
16259 {
16260 int matching_memory;
16261 rtx src, dst, op, clob;
16262
16263 dst = operands[0];
16264 src = operands[1];
16265
16266 /* If the destination is memory, and we do not have matching source
16267 operands, do things in registers. */
16268 matching_memory = 0;
16269 if (MEM_P (dst))
16270 {
16271 if (rtx_equal_p (dst, src))
16272 matching_memory = 1;
16273 else
16274 dst = gen_reg_rtx (mode);
16275 }
16276
16277 /* When source operand is memory, destination must match. */
16278 if (MEM_P (src) && !matching_memory)
16279 src = force_reg (mode, src);
16280
16281 /* Emit the instruction. */
16282
16283 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16284 if (reload_in_progress || code == NOT)
16285 {
16286 /* Reload doesn't know about the flags register, and doesn't know that
16287 it doesn't want to clobber it. */
16288 gcc_assert (code == NOT);
16289 emit_insn (op);
16290 }
16291 else
16292 {
16293 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16294 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16295 }
16296
16297 /* Fix up the destination if needed. */
16298 if (dst != operands[0])
16299 emit_move_insn (operands[0], dst);
16300 }
16301
16302 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16303 divisor are within the range [0-255]. */
16304
16305 void
16306 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16307 bool signed_p)
16308 {
16309 rtx end_label, qimode_label;
16310 rtx insn, div, mod;
16311 rtx scratch, tmp0, tmp1, tmp2;
16312 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16313 rtx (*gen_zero_extend) (rtx, rtx);
16314 rtx (*gen_test_ccno_1) (rtx, rtx);
16315
16316 switch (mode)
16317 {
16318 case SImode:
16319 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16320 gen_test_ccno_1 = gen_testsi_ccno_1;
16321 gen_zero_extend = gen_zero_extendqisi2;
16322 break;
16323 case DImode:
16324 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16325 gen_test_ccno_1 = gen_testdi_ccno_1;
16326 gen_zero_extend = gen_zero_extendqidi2;
16327 break;
16328 default:
16329 gcc_unreachable ();
16330 }
16331
16332 end_label = gen_label_rtx ();
16333 qimode_label = gen_label_rtx ();
16334
16335 scratch = gen_reg_rtx (mode);
16336
16337 /* Use 8bit unsigned divmod if the dividend and divisor are within
16338 the range [0-255]. */
16339 emit_move_insn (scratch, operands[2]);
16340 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16341 scratch, 1, OPTAB_DIRECT);
16342 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16343 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16344 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16345 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16346 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16347 pc_rtx);
16348 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16349 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16350 JUMP_LABEL (insn) = qimode_label;
16351
16352 /* Generate original signed/unsigned divmod. */
16353 div = gen_divmod4_1 (operands[0], operands[1],
16354 operands[2], operands[3]);
16355 emit_insn (div);
16356
16357 /* Branch to the end. */
16358 emit_jump_insn (gen_jump (end_label));
16359 emit_barrier ();
16360
16361 /* Generate 8bit unsigned divide. */
16362 emit_label (qimode_label);
16363 /* Don't use operands[0] for result of 8bit divide since not all
16364 registers support QImode ZERO_EXTRACT. */
16365 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16366 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16367 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16368 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16369
16370 if (signed_p)
16371 {
16372 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16373 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16374 }
16375 else
16376 {
16377 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16378 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16379 }
16380
16381 /* Extract remainder from AH. */
16382 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16383 if (REG_P (operands[1]))
16384 insn = emit_move_insn (operands[1], tmp1);
16385 else
16386 {
16387 /* Need a new scratch register since the old one has result
16388 of 8bit divide. */
16389 scratch = gen_reg_rtx (mode);
16390 emit_move_insn (scratch, tmp1);
16391 insn = emit_move_insn (operands[1], scratch);
16392 }
16393 set_unique_reg_note (insn, REG_EQUAL, mod);
16394
16395 /* Zero extend quotient from AL. */
16396 tmp1 = gen_lowpart (QImode, tmp0);
16397 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16398 set_unique_reg_note (insn, REG_EQUAL, div);
16399
16400 emit_label (end_label);
16401 }
16402
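/* Illustration (assumption: this C sketch is only an analogue of the split
   above, not code used by the compiler).  When both the dividend A and the
   divisor B are within [0-255], a single unsigned 8-bit divide yields the
   same quotient and remainder, even for the signed expander:

       unsigned int a, b, q, r;
       if (((a | b) & ~0xffU) == 0)      /* both operands in [0-255]?  */
         {
           q = (unsigned char) a / (unsigned char) b;
           r = (unsigned char) a % (unsigned char) b;
         }
       else
         {
           q = a / b;                    /* full-width divide */
           r = a % b;
         }
*/
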
16403 #define LEA_MAX_STALL (3)
16404 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16405
16406 /* Increase given DISTANCE in half-cycles according to
16407 dependencies between PREV and NEXT instructions.
16408 Add 1 half-cycle if there is no dependency and
16409 go to the next cycle if there is some dependency. */
16410
16411 static unsigned int
16412 increase_distance (rtx prev, rtx next, unsigned int distance)
16413 {
16414 df_ref *use_rec;
16415 df_ref *def_rec;
16416
16417 if (!prev || !next)
16418 return distance + (distance & 1) + 2;
16419
16420 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16421 return distance + 1;
16422
16423 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16424 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16425 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16426 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16427 return distance + (distance & 1) + 2;
16428
16429 return distance + 1;
16430 }
16431
16432 /* Check whether instruction INSN defines register number
16433 REGNO1 or REGNO2. */
16434
16435 static bool
16436 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16437 rtx insn)
16438 {
16439 df_ref *def_rec;
16440
16441 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16442 if (DF_REF_REG_DEF_P (*def_rec)
16443 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16444 && (regno1 == DF_REF_REGNO (*def_rec)
16445 || regno2 == DF_REF_REGNO (*def_rec)))
16446 {
16447 return true;
16448 }
16449
16450 return false;
16451 }
16452
16453 /* Check whether instruction INSN uses register number
16454 REGNO as part of an address expression. */
16455
16456 static bool
16457 insn_uses_reg_mem (unsigned int regno, rtx insn)
16458 {
16459 df_ref *use_rec;
16460
16461 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16462 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16463 return true;
16464
16465 return false;
16466 }
16467
16468 /* Search backward for a non-AGU definition of register number REGNO1
16469 or register number REGNO2 in the basic block, starting from instruction
16470 START and stopping at the head of the basic block or at instruction INSN.
16471
16472 Set *FOUND to true if a definition was found and to false
16473 otherwise.
16474
16475 The distance in half-cycles between START and the found instruction
16476 (or the head of the BB) is added to DISTANCE and returned. */
16477
16478 static int
16479 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16480 rtx insn, int distance,
16481 rtx start, bool *found)
16482 {
16483 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16484 rtx prev = start;
16485 rtx next = NULL;
16486
16487 *found = false;
16488
16489 while (prev
16490 && prev != insn
16491 && distance < LEA_SEARCH_THRESHOLD)
16492 {
16493 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16494 {
16495 distance = increase_distance (prev, next, distance);
16496 if (insn_defines_reg (regno1, regno2, prev))
16497 {
16498 if (recog_memoized (prev) < 0
16499 || get_attr_type (prev) != TYPE_LEA)
16500 {
16501 *found = true;
16502 return distance;
16503 }
16504 }
16505
16506 next = prev;
16507 }
16508 if (prev == BB_HEAD (bb))
16509 break;
16510
16511 prev = PREV_INSN (prev);
16512 }
16513
16514 return distance;
16515 }
16516
16517 /* Search backward for a non-AGU definition of register number REGNO1
16518 or register number REGNO2 in INSN's basic block until we
16519 1. pass LEA_SEARCH_THRESHOLD instructions, or
16520 2. reach a neighbouring BB's boundary, or
16521 3. reach an AGU definition.
16522 Return the distance between the non-AGU definition point and INSN.
16523 If there is no definition point, return -1. */
16524
16525 static int
16526 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16527 rtx insn)
16528 {
16529 basic_block bb = BLOCK_FOR_INSN (insn);
16530 int distance = 0;
16531 bool found = false;
16532
16533 if (insn != BB_HEAD (bb))
16534 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16535 distance, PREV_INSN (insn),
16536 &found);
16537
16538 if (!found && distance < LEA_SEARCH_THRESHOLD)
16539 {
16540 edge e;
16541 edge_iterator ei;
16542 bool simple_loop = false;
16543
16544 FOR_EACH_EDGE (e, ei, bb->preds)
16545 if (e->src == bb)
16546 {
16547 simple_loop = true;
16548 break;
16549 }
16550
16551 if (simple_loop)
16552 distance = distance_non_agu_define_in_bb (regno1, regno2,
16553 insn, distance,
16554 BB_END (bb), &found);
16555 else
16556 {
16557 int shortest_dist = -1;
16558 bool found_in_bb = false;
16559
16560 FOR_EACH_EDGE (e, ei, bb->preds)
16561 {
16562 int bb_dist
16563 = distance_non_agu_define_in_bb (regno1, regno2,
16564 insn, distance,
16565 BB_END (e->src),
16566 &found_in_bb);
16567 if (found_in_bb)
16568 {
16569 if (shortest_dist < 0)
16570 shortest_dist = bb_dist;
16571 else if (bb_dist > 0)
16572 shortest_dist = MIN (bb_dist, shortest_dist);
16573
16574 found = true;
16575 }
16576 }
16577
16578 distance = shortest_dist;
16579 }
16580 }
16581
16582 /* get_attr_type may modify recog data. We want to make sure
16583 that recog data is valid for instruction INSN, on which
16584 distance_non_agu_define is called. INSN is unchanged here. */
16585 extract_insn_cached (insn);
16586
16587 if (!found)
16588 return -1;
16589
16590 return distance >> 1;
16591 }
16592
16593 /* Return the distance in half-cycles, added to DISTANCE, between INSN
16594 and the next insn that uses register number REGNO in a memory address.
16595 Return -1 if REGNO is set before such a use.
16596
16597 Set *FOUND to true if a register use was found and to
16598 false otherwise.
16599 Set *REDEFINED to true if a register redefinition was
16600 found and to false otherwise. */
16601
16602 static int
16603 distance_agu_use_in_bb (unsigned int regno,
16604 rtx insn, int distance, rtx start,
16605 bool *found, bool *redefined)
16606 {
16607 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16608 rtx next = start;
16609 rtx prev = NULL;
16610
16611 *found = false;
16612 *redefined = false;
16613
16614 while (next
16615 && next != insn
16616 && distance < LEA_SEARCH_THRESHOLD)
16617 {
16618 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16619 {
16620 distance = increase_distance (prev, next, distance);
16621 if (insn_uses_reg_mem (regno, next))
16622 {
16623 /* Return DISTANCE if OP0 is used in memory
16624 address in NEXT. */
16625 *found = true;
16626 return distance;
16627 }
16628
16629 if (insn_defines_reg (regno, INVALID_REGNUM, next))
16630 {
16631 /* Return -1 if OP0 is set in NEXT. */
16632 *redefined = true;
16633 return -1;
16634 }
16635
16636 prev = next;
16637 }
16638
16639 if (next == BB_END (bb))
16640 break;
16641
16642 next = NEXT_INSN (next);
16643 }
16644
16645 return distance;
16646 }
16647
16648 /* Return the distance between INSN and the next insn that uses
16649 register number REGNO0 in a memory address. Return -1 if no such
16650 use is found within LEA_SEARCH_THRESHOLD or if REGNO0 is set. */
16651
16652 static int
16653 distance_agu_use (unsigned int regno0, rtx insn)
16654 {
16655 basic_block bb = BLOCK_FOR_INSN (insn);
16656 int distance = 0;
16657 bool found = false;
16658 bool redefined = false;
16659
16660 if (insn != BB_END (bb))
16661 distance = distance_agu_use_in_bb (regno0, insn, distance,
16662 NEXT_INSN (insn),
16663 &found, &redefined);
16664
16665 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16666 {
16667 edge e;
16668 edge_iterator ei;
16669 bool simple_loop = false;
16670
16671 FOR_EACH_EDGE (e, ei, bb->succs)
16672 if (e->dest == bb)
16673 {
16674 simple_loop = true;
16675 break;
16676 }
16677
16678 if (simple_loop)
16679 distance = distance_agu_use_in_bb (regno0, insn,
16680 distance, BB_HEAD (bb),
16681 &found, &redefined);
16682 else
16683 {
16684 int shortest_dist = -1;
16685 bool found_in_bb = false;
16686 bool redefined_in_bb = false;
16687
16688 FOR_EACH_EDGE (e, ei, bb->succs)
16689 {
16690 int bb_dist
16691 = distance_agu_use_in_bb (regno0, insn,
16692 distance, BB_HEAD (e->dest),
16693 &found_in_bb, &redefined_in_bb);
16694 if (found_in_bb)
16695 {
16696 if (shortest_dist < 0)
16697 shortest_dist = bb_dist;
16698 else if (bb_dist > 0)
16699 shortest_dist = MIN (bb_dist, shortest_dist);
16700
16701 found = true;
16702 }
16703 }
16704
16705 distance = shortest_dist;
16706 }
16707 }
16708
16709 if (!found || redefined)
16710 return -1;
16711
16712 return distance >> 1;
16713 }
16714
16715 /* Define this macro to tune LEA priority vs ADD; it takes effect when
16716 there is a choice between LEA and ADD.
16717 Negative value: ADD is preferred over LEA
16718 Zero: Neutral
16719 Positive value: LEA is preferred over ADD */
16720 #define IX86_LEA_PRIORITY 0
16721
16722 /* Return true if using the lea INSN has a performance advantage
16723 over a sequence of instructions. The instruction sequence has
16724 SPLIT_COST cycles higher latency than the lea latency. */
16725
16726 bool
16727 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
16728 unsigned int regno2, unsigned int split_cost)
16729 {
16730 int dist_define, dist_use;
16731
16732 dist_define = distance_non_agu_define (regno1, regno2, insn);
16733 dist_use = distance_agu_use (regno0, insn);
16734
16735 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
16736 {
16737 /* If there is no non-AGU operand definition, no AGU
16738 operand usage and the split cost is 0, then both the lea
16739 and non-lea variants have the same priority. Currently
16740 we prefer lea for 64-bit code and non-lea for 32-bit
16741 code. */
16742 if (dist_use < 0 && split_cost == 0)
16743 return TARGET_64BIT || IX86_LEA_PRIORITY;
16744 else
16745 return true;
16746 }
16747
16748 /* With a longer definition distance, lea is preferable.
16749 Adjust the distance here to take the splitting cost and
16750 lea priority into account. */
16751 dist_define += split_cost + IX86_LEA_PRIORITY;
16752
16753 /* If there is no use in a memory address then we just check
16754 that the split cost does not exceed the AGU stall. */
16755 if (dist_use < 0)
16756 return dist_define >= LEA_MAX_STALL;
16757
16758 /* If this insn has both a backward non-AGU dependence and a forward
16759 AGU dependence, the one with the shorter distance takes effect. */
16760 return dist_define >= dist_use;
16761 }
16762
16763 /* Return true if it is legal to clobber flags by INSN and
16764 false otherwise. */
16765
16766 static bool
16767 ix86_ok_to_clobber_flags (rtx insn)
16768 {
16769 basic_block bb = BLOCK_FOR_INSN (insn);
16770 df_ref *use;
16771 bitmap live;
16772
16773 while (insn)
16774 {
16775 if (NONDEBUG_INSN_P (insn))
16776 {
16777 for (use = DF_INSN_USES (insn); *use; use++)
16778 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
16779 return false;
16780
16781 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
16782 return true;
16783 }
16784
16785 if (insn == BB_END (bb))
16786 break;
16787
16788 insn = NEXT_INSN (insn);
16789 }
16790
16791 live = df_get_live_out (bb);
16792 return !REGNO_REG_SET_P (live, FLAGS_REG);
16793 }
16794
16795 /* Return true if we need to split op0 = op1 + op2 into a sequence of
16796 move and add to avoid AGU stalls. */
16797
16798 bool
16799 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
16800 {
16801 unsigned int regno0 = true_regnum (operands[0]);
16802 unsigned int regno1 = true_regnum (operands[1]);
16803 unsigned int regno2 = true_regnum (operands[2]);
16804
16805 /* Check if we need to optimize. */
16806 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16807 return false;
16808
16809 /* Check it is correct to split here. */
16810 if (!ix86_ok_to_clobber_flags (insn))
16811 return false;
16812
16813 /* We need to split only adds with a non-destructive
16814 destination operand. */
16815 if (regno0 == regno1 || regno0 == regno2)
16816 return false;
16817 else
16818 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
16819 }
16820
16821 /* Return true if we should emit lea instruction instead of mov
16822 instruction. */
16823
16824 bool
16825 ix86_use_lea_for_mov (rtx insn, rtx operands[])
16826 {
16827 unsigned int regno0;
16828 unsigned int regno1;
16829
16830 /* Check if we need to optimize. */
16831 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16832 return false;
16833
16834 /* Use lea for reg to reg moves only. */
16835 if (!REG_P (operands[0]) || !REG_P (operands[1]))
16836 return false;
16837
16838 regno0 = true_regnum (operands[0]);
16839 regno1 = true_regnum (operands[1]);
16840
16841 return ix86_lea_outperforms (insn, regno0, regno1, -1, 0);
16842 }
16843
16844 /* Return true if we need to split lea into a sequence of
16845 instructions to avoid AGU stalls. */
16846
16847 bool
16848 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
16849 {
16850 unsigned int regno0 = true_regnum (operands[0]);
16851 unsigned int regno1 = -1;
16852 unsigned int regno2 = -1;
16853 unsigned int split_cost = 0;
16854 struct ix86_address parts;
16855 int ok;
16856
16857 /* Check if we need to optimize. */
16858 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16859 return false;
16860
16861 /* Check it is correct to split here. */
16862 if (!ix86_ok_to_clobber_flags (insn))
16863 return false;
16864
16865 ok = ix86_decompose_address (operands[1], &parts);
16866 gcc_assert (ok);
16867
16868 /* We should not split into an add if a non-legitimate PIC
16869 operand is used as the displacement. */
16870 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
16871 return false;
16872
16873 if (parts.base)
16874 regno1 = true_regnum (parts.base);
16875 if (parts.index)
16876 regno2 = true_regnum (parts.index);
16877
16878 /* Compute how many cycles we will add to the execution time
16879 if we split the lea into a sequence of instructions. */
16880 if (parts.base || parts.index)
16881 {
16882 /* Have to use a mov instruction if the non-destructive
16883 destination form is used. */
16884 if (regno1 != regno0 && regno2 != regno0)
16885 split_cost += 1;
16886
16887 /* Have to add index to base if both exist. */
16888 if (parts.base && parts.index)
16889 split_cost += 1;
16890
16891 /* Have to use shift and adds if scale is 2 or greater. */
16892 if (parts.scale > 1)
16893 {
16894 if (regno0 != regno1)
16895 split_cost += 1;
16896 else if (regno2 == regno0)
16897 split_cost += 4;
16898 else
16899 split_cost += parts.scale;
16900 }
16901
16902 /* Have to use an add instruction with an immediate if
16903 disp is nonzero. */
16904 if (parts.disp && parts.disp != const0_rtx)
16905 split_cost += 1;
16906
16907 /* Subtract the price of lea. */
16908 split_cost -= 1;
16909 }
16910
16911 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
16912 }
16913
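/* Worked example (illustration only, register names chosen arbitrarily):
   for "lea 0x4(%rbx,%rcx,2), %rax" we have a base, an index, scale 2 and a
   nonzero displacement, and the destination matches neither source, so the
   cost above is 1 (mov) + 1 (add the base) + 1 (shift for the scale)
   + 1 (add the displacement) - 1 (the lea itself) = 3 extra cycles.
   The lea is kept only if ix86_lea_outperforms still returns true with
   that penalty.  */
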
16914 /* Emit x86 binary operator CODE in mode MODE, where the first operand
16915 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
16916
16917 static void
16918 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
16919 rtx dst, rtx src)
16920 {
16921 rtx op, clob;
16922
16923 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
16924 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16925
16926 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16927 }
16928
16929 /* Split an lea instruction into a sequence of instructions
16930 which are executed on the ALU to avoid AGU stalls.
16931 It is assumed that the flags register may be clobbered
16932 at the lea position. */
16933
16934 void
16935 ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
16936 {
16937 unsigned int regno0 = true_regnum (operands[0]);
16938 unsigned int regno1 = INVALID_REGNUM;
16939 unsigned int regno2 = INVALID_REGNUM;
16940 struct ix86_address parts;
16941 rtx tmp;
16942 int ok, adds;
16943
16944 ok = ix86_decompose_address (operands[1], &parts);
16945 gcc_assert (ok);
16946
16947 if (parts.base)
16948 {
16949 if (GET_MODE (parts.base) != mode)
16950 parts.base = gen_rtx_SUBREG (mode, parts.base, 0);
16951 regno1 = true_regnum (parts.base);
16952 }
16953
16954 if (parts.index)
16955 {
16956 if (GET_MODE (parts.index) != mode)
16957 parts.index = gen_rtx_SUBREG (mode, parts.index, 0);
16958 regno2 = true_regnum (parts.index);
16959 }
16960
16961 if (parts.scale > 1)
16962 {
16963 /* Case r1 = r1 + ... */
16964 if (regno1 == regno0)
16965 {
16966 /* If we have the case r1 = r1 + C * r1 then we
16967 would need a multiplication, which is very
16968 expensive. Assume the cost model is wrong if we
16969 end up with such a case here. */
16970 gcc_assert (regno2 != regno0);
16971
16972 for (adds = parts.scale; adds > 0; adds--)
16973 ix86_emit_binop (PLUS, mode, operands[0], parts.index);
16974 }
16975 else
16976 {
16977 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
16978 if (regno0 != regno2)
16979 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16980
16981 /* Use shift for scaling. */
16982 ix86_emit_binop (ASHIFT, mode, operands[0],
16983 GEN_INT (exact_log2 (parts.scale)));
16984
16985 if (parts.base)
16986 ix86_emit_binop (PLUS, mode, operands[0], parts.base);
16987
16988 if (parts.disp && parts.disp != const0_rtx)
16989 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16990 }
16991 }
16992 else if (!parts.base && !parts.index)
16993 {
16994 gcc_assert (parts.disp);
16995 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.disp));
16996 }
16997 else
16998 {
16999 if (!parts.base)
17000 {
17001 if (regno0 != regno2)
17002 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
17003 }
17004 else if (!parts.index)
17005 {
17006 if (regno0 != regno1)
17007 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
17008 }
17009 else
17010 {
17011 if (regno0 == regno1)
17012 tmp = parts.index;
17013 else if (regno0 == regno2)
17014 tmp = parts.base;
17015 else
17016 {
17017 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
17018 tmp = parts.index;
17019 }
17020
17021 ix86_emit_binop (PLUS, mode, operands[0], tmp);
17022 }
17023
17024 if (parts.disp && parts.disp != const0_rtx)
17025 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
17026 }
17027 }
17028
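/* Illustration (not part of the original sources; register names are
   arbitrary): for an address with base, index, scale 2 and displacement 4,
   e.g. "lea 0x4(%rbx,%rcx,2), %rax", the splitter above emits roughly

       mov  %rcx, %rax        ; copy the index into the destination
       shl  $1, %rax          ; apply the scale with a shift
       add  %rbx, %rax        ; add the base
       add  $4, %rax          ; add the displacement

   i.e. each component of the address becomes one ALU instruction.  */
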
17029 /* Return true if it is ok to optimize an ADD operation to an LEA
17030 operation to avoid flag register consumption. For most processors,
17031 ADD is faster than LEA. For processors like ATOM, if the
17032 destination register of the LEA holds an actual address which will be
17033 used soon, LEA is better; otherwise ADD is better. */
17034
17035 bool
17036 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17037 {
17038 unsigned int regno0 = true_regnum (operands[0]);
17039 unsigned int regno1 = true_regnum (operands[1]);
17040 unsigned int regno2 = true_regnum (operands[2]);
17041
17042 /* If a = b + c with a != b and a != c, we must use the lea form. */
17043 if (regno0 != regno1 && regno0 != regno2)
17044 return true;
17045
17046 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17047 return false;
17048
17049 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17050 }
17051
17052 /* Return true if destination reg of SET_BODY is shift count of
17053 USE_BODY. */
17054
17055 static bool
17056 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17057 {
17058 rtx set_dest;
17059 rtx shift_rtx;
17060 int i;
17061
17062 /* Retrieve destination of SET_BODY. */
17063 switch (GET_CODE (set_body))
17064 {
17065 case SET:
17066 set_dest = SET_DEST (set_body);
17067 if (!set_dest || !REG_P (set_dest))
17068 return false;
17069 break;
17070 case PARALLEL:
17071 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17072 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17073 use_body))
17074 return true;
17075 default:
17076 return false;
17077 break;
17078 }
17079
17080 /* Retrieve shift count of USE_BODY. */
17081 switch (GET_CODE (use_body))
17082 {
17083 case SET:
17084 shift_rtx = XEXP (use_body, 1);
17085 break;
17086 case PARALLEL:
17087 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17088 if (ix86_dep_by_shift_count_body (set_body,
17089 XVECEXP (use_body, 0, i)))
17090 return true;
17091 default:
17092 return false;
17093 break;
17094 }
17095
17096 if (shift_rtx
17097 && (GET_CODE (shift_rtx) == ASHIFT
17098 || GET_CODE (shift_rtx) == LSHIFTRT
17099 || GET_CODE (shift_rtx) == ASHIFTRT
17100 || GET_CODE (shift_rtx) == ROTATE
17101 || GET_CODE (shift_rtx) == ROTATERT))
17102 {
17103 rtx shift_count = XEXP (shift_rtx, 1);
17104
17105 /* Return true if shift count is dest of SET_BODY. */
17106 if (REG_P (shift_count)
17107 && true_regnum (set_dest) == true_regnum (shift_count))
17108 return true;
17109 }
17110
17111 return false;
17112 }
17113
17114 /* Return true if destination reg of SET_INSN is shift count of
17115 USE_INSN. */
17116
17117 bool
17118 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17119 {
17120 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17121 PATTERN (use_insn));
17122 }
17123
17124 /* Return TRUE or FALSE depending on whether the unary operator meets the
17125 appropriate constraints. */
17126
17127 bool
17128 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17129 enum machine_mode mode ATTRIBUTE_UNUSED,
17130 rtx operands[2] ATTRIBUTE_UNUSED)
17131 {
17132 /* If one of operands is memory, source and destination must match. */
17133 if ((MEM_P (operands[0])
17134 || MEM_P (operands[1]))
17135 && ! rtx_equal_p (operands[0], operands[1]))
17136 return false;
17137 return true;
17138 }
17139
17140 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17141 are ok, keeping in mind the possible movddup alternative. */
17142
17143 bool
17144 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17145 {
17146 if (MEM_P (operands[0]))
17147 return rtx_equal_p (operands[0], operands[1 + high]);
17148 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17149 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17150 return true;
17151 }
17152
17153 /* Post-reload splitter for converting an SF or DFmode value in an
17154 SSE register into an unsigned SImode. */
17155
17156 void
17157 ix86_split_convert_uns_si_sse (rtx operands[])
17158 {
17159 enum machine_mode vecmode;
17160 rtx value, large, zero_or_two31, input, two31, x;
17161
17162 large = operands[1];
17163 zero_or_two31 = operands[2];
17164 input = operands[3];
17165 two31 = operands[4];
17166 vecmode = GET_MODE (large);
17167 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17168
17169 /* Load up the value into the low element. We must ensure that the other
17170 elements are valid floats -- zero is the easiest such value. */
17171 if (MEM_P (input))
17172 {
17173 if (vecmode == V4SFmode)
17174 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17175 else
17176 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17177 }
17178 else
17179 {
17180 input = gen_rtx_REG (vecmode, REGNO (input));
17181 emit_move_insn (value, CONST0_RTX (vecmode));
17182 if (vecmode == V4SFmode)
17183 emit_insn (gen_sse_movss (value, value, input));
17184 else
17185 emit_insn (gen_sse2_movsd (value, value, input));
17186 }
17187
17188 emit_move_insn (large, two31);
17189 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17190
17191 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17192 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17193
17194 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17195 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17196
17197 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17198 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17199
17200 large = gen_rtx_REG (V4SImode, REGNO (large));
17201 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17202
17203 x = gen_rtx_REG (V4SImode, REGNO (value));
17204 if (vecmode == V4SFmode)
17205 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17206 else
17207 emit_insn (gen_sse2_cvttpd2dq (x, value));
17208 value = x;
17209
17210 emit_insn (gen_xorv4si3 (value, value, large));
17211 }
17212
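/* Conceptual scalar analogue of the sequence above (illustration only;
   the real code works on SSE vectors, and out-of-range inputs behave the
   way cvttps2dq/cvttpd2dq define, not as in this sketch):

       unsigned int uns_fix (double v)
       {
         if (v < 2147483648.0)                  /* < 0x1.0p31 */
           return (unsigned int) (int) v;       /* plain signed conversion */
         /* Subtract 2^31, convert signed, then put the top bit back.  */
         return ((unsigned int) (int) (v - 2147483648.0)) ^ 0x80000000u;
       }
*/
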
17213 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17214 Expects the 64-bit DImode to be supplied in a pair of integral
17215 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17216 -mfpmath=sse, !optimize_size only. */
17217
17218 void
17219 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17220 {
17221 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17222 rtx int_xmm, fp_xmm;
17223 rtx biases, exponents;
17224 rtx x;
17225
17226 int_xmm = gen_reg_rtx (V4SImode);
17227 if (TARGET_INTER_UNIT_MOVES)
17228 emit_insn (gen_movdi_to_sse (int_xmm, input));
17229 else if (TARGET_SSE_SPLIT_REGS)
17230 {
17231 emit_clobber (int_xmm);
17232 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17233 }
17234 else
17235 {
17236 x = gen_reg_rtx (V2DImode);
17237 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17238 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17239 }
17240
17241 x = gen_rtx_CONST_VECTOR (V4SImode,
17242 gen_rtvec (4, GEN_INT (0x43300000UL),
17243 GEN_INT (0x45300000UL),
17244 const0_rtx, const0_rtx));
17245 exponents = validize_mem (force_const_mem (V4SImode, x));
17246
17247 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17248 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17249
17250 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17251 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17252 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17253 (0x1.0p84 + double(fp_value_hi_xmm)).
17254 Note these exponents differ by 32. */
17255
17256 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17257
17258 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17259 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17260 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17261 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17262 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17263 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17264 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17265 biases = validize_mem (force_const_mem (V2DFmode, biases));
17266 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17267
17268 /* Add the upper and lower DFmode values together. */
17269 if (TARGET_SSE3)
17270 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17271 else
17272 {
17273 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17274 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17275 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17276 }
17277
17278 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17279 }
17280
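/* Conceptual C analogue of the exponent trick used above (illustration
   only; assumes the usual IEEE-754 representation of double):

       #include <string.h>

       double uns_di_to_df (unsigned long long x)
       {
         unsigned long long lo = 0x4330000000000000ULL | (x & 0xffffffffULL);
         unsigned long long hi = 0x4530000000000000ULL | (x >> 32);
         double d_lo, d_hi;
         memcpy (&d_lo, &lo, sizeof d_lo);   /* 0x1.0p52 + low 32 bits */
         memcpy (&d_hi, &hi, sizeof d_hi);   /* 0x1.0p84 + high 32 bits * 2^32 */
         return (d_lo - 0x1.0p52) + (d_hi - 0x1.0p84);
       }
*/
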
17281 /* Not used, but eases macroization of patterns. */
17282 void
17283 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17284 rtx input ATTRIBUTE_UNUSED)
17285 {
17286 gcc_unreachable ();
17287 }
17288
17289 /* Convert an unsigned SImode value into a DFmode. Only currently used
17290 for SSE, but applicable anywhere. */
17291
17292 void
17293 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17294 {
17295 REAL_VALUE_TYPE TWO31r;
17296 rtx x, fp;
17297
17298 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17299 NULL, 1, OPTAB_DIRECT);
17300
17301 fp = gen_reg_rtx (DFmode);
17302 emit_insn (gen_floatsidf2 (fp, x));
17303
17304 real_ldexp (&TWO31r, &dconst1, 31);
17305 x = const_double_from_real_value (TWO31r, DFmode);
17306
17307 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17308 if (x != target)
17309 emit_move_insn (target, x);
17310 }
17311
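/* Conceptual C analogue (illustration only; assumes two's-complement int,
   where the unsigned-to-int conversion behaves modularly as it does with
   GCC on x86):

       double uns_si_to_df (unsigned int x)
       {
         int biased = (int) (x + 0x80000000u);  /* x - 2^31, wrapped into int */
         return (double) biased + 2147483648.0; /* add 2^31 back as a double */
       }
*/
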
17312 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17313 32-bit mode; otherwise we have a direct convert instruction. */
17314
17315 void
17316 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17317 {
17318 REAL_VALUE_TYPE TWO32r;
17319 rtx fp_lo, fp_hi, x;
17320
17321 fp_lo = gen_reg_rtx (DFmode);
17322 fp_hi = gen_reg_rtx (DFmode);
17323
17324 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17325
17326 real_ldexp (&TWO32r, &dconst1, 32);
17327 x = const_double_from_real_value (TWO32r, DFmode);
17328 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17329
17330 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17331
17332 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17333 0, OPTAB_DIRECT);
17334 if (x != target)
17335 emit_move_insn (target, x);
17336 }
17337
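/* Conceptual C analogue (illustration only; reuses the uns_si_to_df sketch
   above and assumes an arithmetic right shift of negative values, as with
   GCC):

       double sign_di_to_df (long long x)
       {
         double hi = (double) (int) (x >> 32) * 4294967296.0; /* high * 2^32 */
         return hi + uns_si_to_df ((unsigned int) x);         /* + unsigned low */
       }
*/
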
17338 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17339 For x86_32, -mfpmath=sse, !optimize_size only. */
17340 void
17341 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17342 {
17343 REAL_VALUE_TYPE ONE16r;
17344 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17345
17346 real_ldexp (&ONE16r, &dconst1, 16);
17347 x = const_double_from_real_value (ONE16r, SFmode);
17348 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
17349 NULL, 0, OPTAB_DIRECT);
17350 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
17351 NULL, 0, OPTAB_DIRECT);
17352 fp_hi = gen_reg_rtx (SFmode);
17353 fp_lo = gen_reg_rtx (SFmode);
17354 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17355 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17356 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17357 0, OPTAB_DIRECT);
17358 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17359 0, OPTAB_DIRECT);
17360 if (!rtx_equal_p (target, fp_hi))
17361 emit_move_insn (target, fp_hi);
17362 }
17363
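/* Conceptual C analogue (illustration only): split the 32-bit value into
   16-bit halves so that each half converts to float exactly, then
   recombine:

       float uns_si_to_sf (unsigned int x)
       {
         float hi = (float) (x >> 16);
         float lo = (float) (x & 0xffff);
         return hi * 65536.0f + lo;
       }
*/
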
17364 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17365 a vector of unsigned ints VAL to vector of floats TARGET. */
17366
17367 void
17368 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17369 {
17370 rtx tmp[8];
17371 REAL_VALUE_TYPE TWO16r;
17372 enum machine_mode intmode = GET_MODE (val);
17373 enum machine_mode fltmode = GET_MODE (target);
17374 rtx (*cvt) (rtx, rtx);
17375
17376 if (intmode == V4SImode)
17377 cvt = gen_floatv4siv4sf2;
17378 else
17379 cvt = gen_floatv8siv8sf2;
17380 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17381 tmp[0] = force_reg (intmode, tmp[0]);
17382 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17383 OPTAB_DIRECT);
17384 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17385 NULL_RTX, 1, OPTAB_DIRECT);
17386 tmp[3] = gen_reg_rtx (fltmode);
17387 emit_insn (cvt (tmp[3], tmp[1]));
17388 tmp[4] = gen_reg_rtx (fltmode);
17389 emit_insn (cvt (tmp[4], tmp[2]));
17390 real_ldexp (&TWO16r, &dconst1, 16);
17391 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17392 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17393 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17394 OPTAB_DIRECT);
17395 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17396 OPTAB_DIRECT);
17397 if (tmp[7] != target)
17398 emit_move_insn (target, tmp[7]);
17399 }
17400
17401 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17402 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17403 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17404 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
17405
17406 rtx
17407 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17408 {
17409 REAL_VALUE_TYPE TWO31r;
17410 rtx two31r, tmp[4];
17411 enum machine_mode mode = GET_MODE (val);
17412 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17413 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17414 rtx (*cmp) (rtx, rtx, rtx, rtx);
17415 int i;
17416
17417 for (i = 0; i < 3; i++)
17418 tmp[i] = gen_reg_rtx (mode);
17419 real_ldexp (&TWO31r, &dconst1, 31);
17420 two31r = const_double_from_real_value (TWO31r, scalarmode);
17421 two31r = ix86_build_const_vector (mode, 1, two31r);
17422 two31r = force_reg (mode, two31r);
17423 switch (mode)
17424 {
17425 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17426 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17427 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17428 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17429 default: gcc_unreachable ();
17430 }
17431 tmp[3] = gen_rtx_LE (mode, two31r, val);
17432 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17433 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17434 0, OPTAB_DIRECT);
17435 if (intmode == V4SImode || TARGET_AVX2)
17436 *xorp = expand_simple_binop (intmode, ASHIFT,
17437 gen_lowpart (intmode, tmp[0]),
17438 GEN_INT (31), NULL_RTX, 0,
17439 OPTAB_DIRECT);
17440 else
17441 {
17442 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17443 two31 = ix86_build_const_vector (intmode, 1, two31);
17444 *xorp = expand_simple_binop (intmode, AND,
17445 gen_lowpart (intmode, tmp[0]),
17446 two31, NULL_RTX, 0,
17447 OPTAB_DIRECT);
17448 }
17449 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17450 0, OPTAB_DIRECT);
17451 }
17452
17453 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17454 then replicate the value for all elements of the vector
17455 register. */
17456
17457 rtx
17458 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17459 {
17460 int i, n_elt;
17461 rtvec v;
17462 enum machine_mode scalar_mode;
17463
17464 switch (mode)
17465 {
17466 case V32QImode:
17467 case V16QImode:
17468 case V16HImode:
17469 case V8HImode:
17470 case V8SImode:
17471 case V4SImode:
17472 case V4DImode:
17473 case V2DImode:
17474 gcc_assert (vect);
17475 case V8SFmode:
17476 case V4SFmode:
17477 case V4DFmode:
17478 case V2DFmode:
17479 n_elt = GET_MODE_NUNITS (mode);
17480 v = rtvec_alloc (n_elt);
17481 scalar_mode = GET_MODE_INNER (mode);
17482
17483 RTVEC_ELT (v, 0) = value;
17484
17485 for (i = 1; i < n_elt; ++i)
17486 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17487
17488 return gen_rtx_CONST_VECTOR (mode, v);
17489
17490 default:
17491 gcc_unreachable ();
17492 }
17493 }
17494
17495 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17496 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17497 for an SSE register. If VECT is true, then replicate the mask for
17498 all elements of the vector register. If INVERT is true, then create
17499 a mask excluding the sign bit. */
17500
17501 rtx
17502 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17503 {
17504 enum machine_mode vec_mode, imode;
17505 HOST_WIDE_INT hi, lo;
17506 int shift = 63;
17507 rtx v;
17508 rtx mask;
17509
17510 /* Find the sign bit, sign extended to 2*HWI. */
17511 switch (mode)
17512 {
17513 case V8SImode:
17514 case V4SImode:
17515 case V8SFmode:
17516 case V4SFmode:
17517 vec_mode = mode;
17518 mode = GET_MODE_INNER (mode);
17519 imode = SImode;
17520 lo = 0x80000000, hi = lo < 0;
17521 break;
17522
17523 case V4DImode:
17524 case V2DImode:
17525 case V4DFmode:
17526 case V2DFmode:
17527 vec_mode = mode;
17528 mode = GET_MODE_INNER (mode);
17529 imode = DImode;
17530 if (HOST_BITS_PER_WIDE_INT >= 64)
17531 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17532 else
17533 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17534 break;
17535
17536 case TImode:
17537 case TFmode:
17538 vec_mode = VOIDmode;
17539 if (HOST_BITS_PER_WIDE_INT >= 64)
17540 {
17541 imode = TImode;
17542 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17543 }
17544 else
17545 {
17546 rtvec vec;
17547
17548 imode = DImode;
17549 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17550
17551 if (invert)
17552 {
17553 lo = ~lo, hi = ~hi;
17554 v = constm1_rtx;
17555 }
17556 else
17557 v = const0_rtx;
17558
17559 mask = immed_double_const (lo, hi, imode);
17560
17561 vec = gen_rtvec (2, v, mask);
17562 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17563 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17564
17565 return v;
17566 }
17567 break;
17568
17569 default:
17570 gcc_unreachable ();
17571 }
17572
17573 if (invert)
17574 lo = ~lo, hi = ~hi;
17575
17576 /* Force this value into the low part of a fp vector constant. */
17577 mask = immed_double_const (lo, hi, imode);
17578 mask = gen_lowpart (mode, mask);
17579
17580 if (vec_mode == VOIDmode)
17581 return force_reg (mode, mask);
17582
17583 v = ix86_build_const_vector (vec_mode, vect, mask);
17584 return force_reg (vec_mode, v);
17585 }
17586
17587 /* Generate code for floating point ABS or NEG. */
17588
17589 void
17590 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17591 rtx operands[])
17592 {
17593 rtx mask, set, dst, src;
17594 bool use_sse = false;
17595 bool vector_mode = VECTOR_MODE_P (mode);
17596 enum machine_mode vmode = mode;
17597
17598 if (vector_mode)
17599 use_sse = true;
17600 else if (mode == TFmode)
17601 use_sse = true;
17602 else if (TARGET_SSE_MATH)
17603 {
17604 use_sse = SSE_FLOAT_MODE_P (mode);
17605 if (mode == SFmode)
17606 vmode = V4SFmode;
17607 else if (mode == DFmode)
17608 vmode = V2DFmode;
17609 }
17610
17611 /* NEG and ABS performed with SSE use bitwise mask operations.
17612 Create the appropriate mask now. */
17613 if (use_sse)
17614 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17615 else
17616 mask = NULL_RTX;
17617
17618 dst = operands[0];
17619 src = operands[1];
17620
17621 set = gen_rtx_fmt_e (code, mode, src);
17622 set = gen_rtx_SET (VOIDmode, dst, set);
17623
17624 if (mask)
17625 {
17626 rtx use, clob;
17627 rtvec par;
17628
17629 use = gen_rtx_USE (VOIDmode, mask);
17630 if (vector_mode)
17631 par = gen_rtvec (2, set, use);
17632 else
17633 {
17634 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17635 par = gen_rtvec (3, set, use, clob);
17636 }
17637 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
17638 }
17639 else
17640 emit_insn (set);
17641 }
17642
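/* Illustration (not part of the original sources): at the bit level the
   masked SSE operations referenced above amount to

       neg:  x ^ sign_mask        e.g. 0x80000000 per SFmode element
       abs:  x & ~sign_mask       e.g. 0x7fffffff per SFmode element

   where sign_mask is the value produced by ix86_build_signbit_mask and the
   inverted form is what its INVERT argument selects.  */
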
17643 /* Expand a copysign operation. Special case operand 0 being a constant. */
17644
17645 void
17646 ix86_expand_copysign (rtx operands[])
17647 {
17648 enum machine_mode mode, vmode;
17649 rtx dest, op0, op1, mask, nmask;
17650
17651 dest = operands[0];
17652 op0 = operands[1];
17653 op1 = operands[2];
17654
17655 mode = GET_MODE (dest);
17656
17657 if (mode == SFmode)
17658 vmode = V4SFmode;
17659 else if (mode == DFmode)
17660 vmode = V2DFmode;
17661 else
17662 vmode = mode;
17663
17664 if (GET_CODE (op0) == CONST_DOUBLE)
17665 {
17666 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
17667
17668 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
17669 op0 = simplify_unary_operation (ABS, mode, op0, mode);
17670
17671 if (mode == SFmode || mode == DFmode)
17672 {
17673 if (op0 == CONST0_RTX (mode))
17674 op0 = CONST0_RTX (vmode);
17675 else
17676 {
17677 rtx v = ix86_build_const_vector (vmode, false, op0);
17678
17679 op0 = force_reg (vmode, v);
17680 }
17681 }
17682 else if (op0 != CONST0_RTX (mode))
17683 op0 = force_reg (mode, op0);
17684
17685 mask = ix86_build_signbit_mask (vmode, 0, 0);
17686
17687 if (mode == SFmode)
17688 copysign_insn = gen_copysignsf3_const;
17689 else if (mode == DFmode)
17690 copysign_insn = gen_copysigndf3_const;
17691 else
17692 copysign_insn = gen_copysigntf3_const;
17693
17694 emit_insn (copysign_insn (dest, op0, op1, mask));
17695 }
17696 else
17697 {
17698 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
17699
17700 nmask = ix86_build_signbit_mask (vmode, 0, 1);
17701 mask = ix86_build_signbit_mask (vmode, 0, 0);
17702
17703 if (mode == SFmode)
17704 copysign_insn = gen_copysignsf3_var;
17705 else if (mode == DFmode)
17706 copysign_insn = gen_copysigndf3_var;
17707 else
17708 copysign_insn = gen_copysigntf3_var;
17709
17710 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
17711 }
17712 }
17713
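/* Conceptual C analogue of the mask-based copysign expansion above and the
   splitters that follow (illustration only; assumes IEEE single precision
   and uses memcpy for the bit views):

       #include <stdint.h>
       #include <string.h>

       float copysign_sketch (float mag, float sgn)
       {
         uint32_t m, s, r;
         memcpy (&m, &mag, sizeof m);
         memcpy (&s, &sgn, sizeof s);
         r = (m & 0x7fffffffu) | (s & 0x80000000u); /* magnitude bits | sign bit */
         memcpy (&mag, &r, sizeof mag);
         return mag;
       }
*/
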
17714 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
17715 be a constant, and so has already been expanded into a vector constant. */
17716
17717 void
17718 ix86_split_copysign_const (rtx operands[])
17719 {
17720 enum machine_mode mode, vmode;
17721 rtx dest, op0, mask, x;
17722
17723 dest = operands[0];
17724 op0 = operands[1];
17725 mask = operands[3];
17726
17727 mode = GET_MODE (dest);
17728 vmode = GET_MODE (mask);
17729
17730 dest = simplify_gen_subreg (vmode, dest, mode, 0);
17731 x = gen_rtx_AND (vmode, dest, mask);
17732 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17733
17734 if (op0 != CONST0_RTX (vmode))
17735 {
17736 x = gen_rtx_IOR (vmode, dest, op0);
17737 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17738 }
17739 }
17740
17741 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
17742 so we have to do two masks. */
17743
17744 void
17745 ix86_split_copysign_var (rtx operands[])
17746 {
17747 enum machine_mode mode, vmode;
17748 rtx dest, scratch, op0, op1, mask, nmask, x;
17749
17750 dest = operands[0];
17751 scratch = operands[1];
17752 op0 = operands[2];
17753 op1 = operands[3];
17754 nmask = operands[4];
17755 mask = operands[5];
17756
17757 mode = GET_MODE (dest);
17758 vmode = GET_MODE (mask);
17759
17760 if (rtx_equal_p (op0, op1))
17761 {
17762 /* Shouldn't happen often (it's useless, obviously), but when it does
17763 we'd generate incorrect code if we continue below. */
17764 emit_move_insn (dest, op0);
17765 return;
17766 }
17767
17768 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
17769 {
17770 gcc_assert (REGNO (op1) == REGNO (scratch));
17771
17772 x = gen_rtx_AND (vmode, scratch, mask);
17773 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17774
17775 dest = mask;
17776 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17777 x = gen_rtx_NOT (vmode, dest);
17778 x = gen_rtx_AND (vmode, x, op0);
17779 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17780 }
17781 else
17782 {
17783 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
17784 {
17785 x = gen_rtx_AND (vmode, scratch, mask);
17786 }
17787 else /* alternative 2,4 */
17788 {
17789 gcc_assert (REGNO (mask) == REGNO (scratch));
17790 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
17791 x = gen_rtx_AND (vmode, scratch, op1);
17792 }
17793 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17794
17795 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
17796 {
17797 dest = simplify_gen_subreg (vmode, op0, mode, 0);
17798 x = gen_rtx_AND (vmode, dest, nmask);
17799 }
17800 else /* alternative 3,4 */
17801 {
17802 gcc_assert (REGNO (nmask) == REGNO (dest));
17803 dest = nmask;
17804 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17805 x = gen_rtx_AND (vmode, dest, op0);
17806 }
17807 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17808 }
17809
17810 x = gen_rtx_IOR (vmode, dest, scratch);
17811 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17812 }
17813
17814 /* Return TRUE or FALSE depending on whether the first SET in INSN
17815 has source and destination with matching CC modes and whether the
17816 CC mode is at least as constrained as REQ_MODE. */
17817
17818 bool
17819 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
17820 {
17821 rtx set;
17822 enum machine_mode set_mode;
17823
17824 set = PATTERN (insn);
17825 if (GET_CODE (set) == PARALLEL)
17826 set = XVECEXP (set, 0, 0);
17827 gcc_assert (GET_CODE (set) == SET);
17828 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
17829
17830 set_mode = GET_MODE (SET_DEST (set));
17831 switch (set_mode)
17832 {
17833 case CCNOmode:
17834 if (req_mode != CCNOmode
17835 && (req_mode != CCmode
17836 || XEXP (SET_SRC (set), 1) != const0_rtx))
17837 return false;
17838 break;
17839 case CCmode:
17840 if (req_mode == CCGCmode)
17841 return false;
17842 /* FALLTHRU */
17843 case CCGCmode:
17844 if (req_mode == CCGOCmode || req_mode == CCNOmode)
17845 return false;
17846 /* FALLTHRU */
17847 case CCGOCmode:
17848 if (req_mode == CCZmode)
17849 return false;
17850 /* FALLTHRU */
17851 case CCZmode:
17852 break;
17853
17854 case CCAmode:
17855 case CCCmode:
17856 case CCOmode:
17857 case CCSmode:
17858 if (set_mode != req_mode)
17859 return false;
17860 break;
17861
17862 default:
17863 gcc_unreachable ();
17864 }
17865
17866 return GET_MODE (SET_SRC (set)) == set_mode;
17867 }
17868
17869 /* Generate insn patterns to do an integer compare of OPERANDS. */
17870
17871 static rtx
17872 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
17873 {
17874 enum machine_mode cmpmode;
17875 rtx tmp, flags;
17876
17877 cmpmode = SELECT_CC_MODE (code, op0, op1);
17878 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
17879
17880 /* This is very simple, but making the interface the same as in the
17881 FP case makes the rest of the code easier. */
17882 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
17883 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
17884
17885 /* Return the test that should be put into the flags user, i.e.
17886 the bcc, scc, or cmov instruction. */
17887 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
17888 }
17889
17890 /* Figure out whether to use ordered or unordered fp comparisons.
17891 Return the appropriate mode to use. */
17892
17893 enum machine_mode
17894 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
17895 {
17896 /* ??? In order to make all comparisons reversible, we do all comparisons
17897 non-trapping when compiling for IEEE. Once gcc is able to distinguish
17898 between trapping and nontrapping forms of all comparisons, we can make
17899 inequality comparisons trapping again, since that results in better code
17900 when using FCOM based compares. */
17901 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
17902 }
17903
17904 enum machine_mode
17905 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
17906 {
17907 enum machine_mode mode = GET_MODE (op0);
17908
17909 if (SCALAR_FLOAT_MODE_P (mode))
17910 {
17911 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
17912 return ix86_fp_compare_mode (code);
17913 }
17914
17915 switch (code)
17916 {
17917 /* Only zero flag is needed. */
17918 case EQ: /* ZF=0 */
17919 case NE: /* ZF!=0 */
17920 return CCZmode;
17921 /* Codes needing carry flag. */
17922 case GEU: /* CF=0 */
17923 case LTU: /* CF=1 */
17924 /* Detect overflow checks. They need just the carry flag. */
17925 if (GET_CODE (op0) == PLUS
17926 && rtx_equal_p (op1, XEXP (op0, 0)))
17927 return CCCmode;
17928 else
17929 return CCmode;
17930 case GTU: /* CF=0 & ZF=0 */
17931 case LEU: /* CF=1 | ZF=1 */
17932 /* Detect overflow checks. They need just the carry flag. */
17933 if (GET_CODE (op0) == MINUS
17934 && rtx_equal_p (op1, XEXP (op0, 0)))
17935 return CCCmode;
17936 else
17937 return CCmode;
17938 /* Codes possibly doable only with sign flag when
17939 comparing against zero. */
17940 case GE: /* SF=OF or SF=0 */
17941 case LT: /* SF<>OF or SF=1 */
17942 if (op1 == const0_rtx)
17943 return CCGOCmode;
17944 else
17945 /* For other cases Carry flag is not required. */
17946 return CCGCmode;
17947 /* Codes doable only with the sign flag when comparing
17948 against zero, but for which we lack a jump instruction,
17949 so we need to use relational tests against overflow,
17950 which thus needs to be zero. */
17951 case GT: /* ZF=0 & SF=OF */
17952 case LE: /* ZF=1 | SF<>OF */
17953 if (op1 == const0_rtx)
17954 return CCNOmode;
17955 else
17956 return CCGCmode;
17957 /* The strcmp pattern does (use flags) and combine may ask us for a
17958 proper mode. */
17959 case USE:
17960 return CCmode;
17961 default:
17962 gcc_unreachable ();
17963 }
17964 }
17965
17966 /* Return the fixed registers used for condition codes. */
17967
17968 static bool
17969 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
17970 {
17971 *p1 = FLAGS_REG;
17972 *p2 = FPSR_REG;
17973 return true;
17974 }
17975
17976 /* If two condition code modes are compatible, return a condition code
17977 mode which is compatible with both. Otherwise, return
17978 VOIDmode. */
17979
17980 static enum machine_mode
17981 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
17982 {
17983 if (m1 == m2)
17984 return m1;
17985
17986 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
17987 return VOIDmode;
17988
17989 if ((m1 == CCGCmode && m2 == CCGOCmode)
17990 || (m1 == CCGOCmode && m2 == CCGCmode))
17991 return CCGCmode;
17992
17993 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
17994 return m2;
17995 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
17996 return m1;
17997
17998 switch (m1)
17999 {
18000 default:
18001 gcc_unreachable ();
18002
18003 case CCmode:
18004 case CCGCmode:
18005 case CCGOCmode:
18006 case CCNOmode:
18007 case CCAmode:
18008 case CCCmode:
18009 case CCOmode:
18010 case CCSmode:
18011 case CCZmode:
18012 switch (m2)
18013 {
18014 default:
18015 return VOIDmode;
18016
18017 case CCmode:
18018 case CCGCmode:
18019 case CCGOCmode:
18020 case CCNOmode:
18021 case CCAmode:
18022 case CCCmode:
18023 case CCOmode:
18024 case CCSmode:
18025 case CCZmode:
18026 return CCmode;
18027 }
18028
18029 case CCFPmode:
18030 case CCFPUmode:
18031 /* These are only compatible with themselves, which we already
18032 checked above. */
18033 return VOIDmode;
18034 }
18035 }
18036
18037
18038 /* Return a comparison we can do that is equivalent to
18039 swap_condition (code), apart possibly from orderedness.
18040 But never change orderedness if TARGET_IEEE_FP, returning
18041 UNKNOWN in that case if necessary. */
18042
18043 static enum rtx_code
18044 ix86_fp_swap_condition (enum rtx_code code)
18045 {
18046 switch (code)
18047 {
18048 case GT: /* GTU - CF=0 & ZF=0 */
18049 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18050 case GE: /* GEU - CF=0 */
18051 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18052 case UNLT: /* LTU - CF=1 */
18053 return TARGET_IEEE_FP ? UNKNOWN : GT;
18054 case UNLE: /* LEU - CF=1 | ZF=1 */
18055 return TARGET_IEEE_FP ? UNKNOWN : GE;
18056 default:
18057 return swap_condition (code);
18058 }
18059 }
18060
18061 /* Return the cost of comparison CODE using the best strategy for performance.
18062 All of the following functions use the number of instructions as the cost metric.
18063 In the future this should be tweaked to compute bytes for optimize_size and
18064 take into account the performance of various instructions on various CPUs. */
18065
18066 static int
18067 ix86_fp_comparison_cost (enum rtx_code code)
18068 {
18069 int arith_cost;
18070
18071 /* The cost of code using bit-twiddling on %ah. */
18072 switch (code)
18073 {
18074 case UNLE:
18075 case UNLT:
18076 case LTGT:
18077 case GT:
18078 case GE:
18079 case UNORDERED:
18080 case ORDERED:
18081 case UNEQ:
18082 arith_cost = 4;
18083 break;
18084 case LT:
18085 case NE:
18086 case EQ:
18087 case UNGE:
18088 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18089 break;
18090 case LE:
18091 case UNGT:
18092 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18093 break;
18094 default:
18095 gcc_unreachable ();
18096 }
18097
18098 switch (ix86_fp_comparison_strategy (code))
18099 {
18100 case IX86_FPCMP_COMI:
18101 return arith_cost > 4 ? 3 : 2;
18102 case IX86_FPCMP_SAHF:
18103 return arith_cost > 4 ? 4 : 3;
18104 default:
18105 return arith_cost;
18106 }
18107 }
18108
18109 /* Return the strategy to use for floating-point comparisons. We assume that
18110 fcomi is always preferable where available, since that is also true when
18111 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
18112
18113 enum ix86_fpcmp_strategy
18114 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18115 {
18116 /* Do fcomi/sahf based test when profitable. */
18117
18118 if (TARGET_CMOVE)
18119 return IX86_FPCMP_COMI;
18120
18121 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18122 return IX86_FPCMP_SAHF;
18123
18124 return IX86_FPCMP_ARITH;
18125 }
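/* For reference, the instruction sequences behind the three strategies look
   roughly like this (illustrative only, not emitted verbatim here):
     IX86_FPCMP_COMI:   fcomi  %st(1), %st             ; jcc ...
     IX86_FPCMP_SAHF:   fcom ; fnstsw %ax ; sahf       ; jcc ...
     IX86_FPCMP_ARITH:  fcom ; fnstsw %ax ; test $0x45, %ah ; jcc ...
   fcomi sets ZF/PF/CF directly, sahf copies AH into the low EFLAGS byte, and
   the arithmetic variant twiddles the status-word bits in AH (see
   ix86_expand_fp_compare below).  */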
18126
18127 /* Swap, force into registers, or otherwise massage the two operands
18128 to a fp comparison. The operands are updated in place; the new
18129 comparison code is returned. */
18130
18131 static enum rtx_code
18132 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18133 {
18134 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18135 rtx op0 = *pop0, op1 = *pop1;
18136 enum machine_mode op_mode = GET_MODE (op0);
18137 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18138
18139 /* All of the unordered compare instructions only work on registers.
18140 The same is true of the fcomi compare instructions. The XFmode
18141 compare instructions require registers except when comparing
18142 against zero or when converting operand 1 from fixed point to
18143 floating point. */
18144
18145 if (!is_sse
18146 && (fpcmp_mode == CCFPUmode
18147 || (op_mode == XFmode
18148 && ! (standard_80387_constant_p (op0) == 1
18149 || standard_80387_constant_p (op1) == 1)
18150 && GET_CODE (op1) != FLOAT)
18151 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18152 {
18153 op0 = force_reg (op_mode, op0);
18154 op1 = force_reg (op_mode, op1);
18155 }
18156 else
18157 {
18158 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18159 things around if they appear profitable, otherwise force op0
18160 into a register. */
18161
18162 if (standard_80387_constant_p (op0) == 0
18163 || (MEM_P (op0)
18164 && ! (standard_80387_constant_p (op1) == 0
18165 || MEM_P (op1))))
18166 {
18167 enum rtx_code new_code = ix86_fp_swap_condition (code);
18168 if (new_code != UNKNOWN)
18169 {
18170 rtx tmp;
18171 tmp = op0, op0 = op1, op1 = tmp;
18172 code = new_code;
18173 }
18174 }
18175
18176 if (!REG_P (op0))
18177 op0 = force_reg (op_mode, op0);
18178
18179 if (CONSTANT_P (op1))
18180 {
18181 int tmp = standard_80387_constant_p (op1);
18182 if (tmp == 0)
18183 op1 = validize_mem (force_const_mem (op_mode, op1));
18184 else if (tmp == 1)
18185 {
18186 if (TARGET_CMOVE)
18187 op1 = force_reg (op_mode, op1);
18188 }
18189 else
18190 op1 = force_reg (op_mode, op1);
18191 }
18192 }
18193
18194 /* Try to rearrange the comparison to make it cheaper. */
18195 if (ix86_fp_comparison_cost (code)
18196 > ix86_fp_comparison_cost (swap_condition (code))
18197 && (REG_P (op1) || can_create_pseudo_p ()))
18198 {
18199 rtx tmp;
18200 tmp = op0, op0 = op1, op1 = tmp;
18201 code = swap_condition (code);
18202 if (!REG_P (op0))
18203 op0 = force_reg (op_mode, op0);
18204 }
18205
18206 *pop0 = op0;
18207 *pop1 = op1;
18208 return code;
18209 }
18210
18211 /* Convert a comparison code we use to represent an FP comparison to the
18212    integer code that will result in a proper branch.  Return UNKNOWN if no
18213    such code is available.  */
18214
18215 enum rtx_code
18216 ix86_fp_compare_code_to_integer (enum rtx_code code)
18217 {
18218 switch (code)
18219 {
18220 case GT:
18221 return GTU;
18222 case GE:
18223 return GEU;
18224 case ORDERED:
18225 case UNORDERED:
18226 return code;
18227 break;
18228 case UNEQ:
18229 return EQ;
18230 break;
18231 case UNLT:
18232 return LTU;
18233 break;
18234 case UNLE:
18235 return LEU;
18236 break;
18237 case LTGT:
18238 return NE;
18239 break;
18240 default:
18241 return UNKNOWN;
18242 }
18243 }
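/* The mapping above works because fcomi (and fnstsw+sahf) place C0 in CF and
   C3 in ZF, which is exactly the flag pattern an unsigned integer compare
   produces: e.g. FP 'a > b' leaves CF=0 and ZF=0, the same as unsigned
   "above", hence GT maps to GTU.  */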
18244
18245 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18246
18247 static rtx
18248 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18249 {
18250 enum machine_mode fpcmp_mode, intcmp_mode;
18251 rtx tmp, tmp2;
18252
18253 fpcmp_mode = ix86_fp_compare_mode (code);
18254 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18255
18256 /* Do fcomi/sahf based test when profitable. */
18257 switch (ix86_fp_comparison_strategy (code))
18258 {
18259 case IX86_FPCMP_COMI:
18260 intcmp_mode = fpcmp_mode;
18261 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18262 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18263 tmp);
18264 emit_insn (tmp);
18265 break;
18266
18267 case IX86_FPCMP_SAHF:
18268 intcmp_mode = fpcmp_mode;
18269 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18270 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18271 tmp);
18272
18273 if (!scratch)
18274 scratch = gen_reg_rtx (HImode);
18275 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18276 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18277 break;
18278
18279 case IX86_FPCMP_ARITH:
18280 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18281 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18282 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18283 if (!scratch)
18284 scratch = gen_reg_rtx (HImode);
18285 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18286
18287       /* In the unordered case, we have to check C2 for NaNs, which
18288 doesn't happen to work out to anything nice combination-wise.
18289 So do some bit twiddling on the value we've got in AH to come
18290 up with an appropriate set of condition codes. */
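      /* After fnstsw, AH holds the x87 condition bits: C0 in bit 0 (0x01),
	 C2 in bit 2 (0x04) and C3 in bit 6 (0x40).  fcom sets C0 for
	 "below", C3 for "equal" and C3=C2=C0=1 for "unordered", so the
	 masks used below (0x45, 0x44, 0x40, 0x05, 0x04, 0x01) are just
	 combinations of those three bits.  */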
18291
18292 intcmp_mode = CCNOmode;
18293 switch (code)
18294 {
18295 case GT:
18296 case UNGT:
18297 if (code == GT || !TARGET_IEEE_FP)
18298 {
18299 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18300 code = EQ;
18301 }
18302 else
18303 {
18304 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18305 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18306 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18307 intcmp_mode = CCmode;
18308 code = GEU;
18309 }
18310 break;
18311 case LT:
18312 case UNLT:
18313 if (code == LT && TARGET_IEEE_FP)
18314 {
18315 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18316 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18317 intcmp_mode = CCmode;
18318 code = EQ;
18319 }
18320 else
18321 {
18322 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18323 code = NE;
18324 }
18325 break;
18326 case GE:
18327 case UNGE:
18328 if (code == GE || !TARGET_IEEE_FP)
18329 {
18330 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18331 code = EQ;
18332 }
18333 else
18334 {
18335 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18336 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18337 code = NE;
18338 }
18339 break;
18340 case LE:
18341 case UNLE:
18342 if (code == LE && TARGET_IEEE_FP)
18343 {
18344 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18345 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18346 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18347 intcmp_mode = CCmode;
18348 code = LTU;
18349 }
18350 else
18351 {
18352 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18353 code = NE;
18354 }
18355 break;
18356 case EQ:
18357 case UNEQ:
18358 if (code == EQ && TARGET_IEEE_FP)
18359 {
18360 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18361 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18362 intcmp_mode = CCmode;
18363 code = EQ;
18364 }
18365 else
18366 {
18367 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18368 code = NE;
18369 }
18370 break;
18371 case NE:
18372 case LTGT:
18373 if (code == NE && TARGET_IEEE_FP)
18374 {
18375 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18376 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18377 GEN_INT (0x40)));
18378 code = NE;
18379 }
18380 else
18381 {
18382 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18383 code = EQ;
18384 }
18385 break;
18386
18387 case UNORDERED:
18388 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18389 code = NE;
18390 break;
18391 case ORDERED:
18392 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18393 code = EQ;
18394 break;
18395
18396 default:
18397 gcc_unreachable ();
18398 }
18399 break;
18400
18401 default:
18402 gcc_unreachable();
18403 }
18404
18405 /* Return the test that should be put into the flags user, i.e.
18406 the bcc, scc, or cmov instruction. */
18407 return gen_rtx_fmt_ee (code, VOIDmode,
18408 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18409 const0_rtx);
18410 }
18411
18412 static rtx
18413 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18414 {
18415 rtx ret;
18416
18417 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18418 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18419
18420 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18421 {
18422 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18423 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18424 }
18425 else
18426 ret = ix86_expand_int_compare (code, op0, op1);
18427
18428 return ret;
18429 }
18430
18431 void
18432 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18433 {
18434 enum machine_mode mode = GET_MODE (op0);
18435 rtx tmp;
18436
18437 switch (mode)
18438 {
18439 case SFmode:
18440 case DFmode:
18441 case XFmode:
18442 case QImode:
18443 case HImode:
18444 case SImode:
18445 simple:
18446 tmp = ix86_expand_compare (code, op0, op1);
18447 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18448 gen_rtx_LABEL_REF (VOIDmode, label),
18449 pc_rtx);
18450 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18451 return;
18452
18453 case DImode:
18454 if (TARGET_64BIT)
18455 goto simple;
18456 case TImode:
18457 /* Expand DImode branch into multiple compare+branch. */
18458 {
18459 rtx lo[2], hi[2], label2;
18460 enum rtx_code code1, code2, code3;
18461 enum machine_mode submode;
18462
18463 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18464 {
18465 tmp = op0, op0 = op1, op1 = tmp;
18466 code = swap_condition (code);
18467 }
18468
18469 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18470 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18471
18472 submode = mode == DImode ? SImode : DImode;
18473
18474 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18475 avoid two branches. This costs one extra insn, so disable when
18476 optimizing for size. */
18477
18478 if ((code == EQ || code == NE)
18479 && (!optimize_insn_for_size_p ()
18480 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18481 {
18482 rtx xor0, xor1;
18483
18484 xor1 = hi[0];
18485 if (hi[1] != const0_rtx)
18486 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18487 NULL_RTX, 0, OPTAB_WIDEN);
18488
18489 xor0 = lo[0];
18490 if (lo[1] != const0_rtx)
18491 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18492 NULL_RTX, 0, OPTAB_WIDEN);
18493
18494 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18495 NULL_RTX, 0, OPTAB_WIDEN);
18496
18497 ix86_expand_branch (code, tmp, const0_rtx, label);
18498 return;
18499 }
18500
18501 	/* Otherwise, if we are doing a less-than or greater-or-equal comparison,
18502 	   op1 is a constant, and the low word is zero, then we can just
18503 	   examine the high word.  Similarly for a low word of -1 and
18504 	   less-or-equal or greater-than.  */
18505
18506 if (CONST_INT_P (hi[1]))
18507 switch (code)
18508 {
18509 case LT: case LTU: case GE: case GEU:
18510 if (lo[1] == const0_rtx)
18511 {
18512 ix86_expand_branch (code, hi[0], hi[1], label);
18513 return;
18514 }
18515 break;
18516 case LE: case LEU: case GT: case GTU:
18517 if (lo[1] == constm1_rtx)
18518 {
18519 ix86_expand_branch (code, hi[0], hi[1], label);
18520 return;
18521 }
18522 break;
18523 default:
18524 break;
18525 }
18526
18527 /* Otherwise, we need two or three jumps. */
18528
18529 label2 = gen_label_rtx ();
18530
18531 code1 = code;
18532 code2 = swap_condition (code);
18533 code3 = unsigned_condition (code);
18534
18535 switch (code)
18536 {
18537 case LT: case GT: case LTU: case GTU:
18538 break;
18539
18540 case LE: code1 = LT; code2 = GT; break;
18541 case GE: code1 = GT; code2 = LT; break;
18542 case LEU: code1 = LTU; code2 = GTU; break;
18543 case GEU: code1 = GTU; code2 = LTU; break;
18544
18545 case EQ: code1 = UNKNOWN; code2 = NE; break;
18546 case NE: code2 = UNKNOWN; break;
18547
18548 default:
18549 gcc_unreachable ();
18550 }
18551
18552 /*
18553 * a < b =>
18554 * if (hi(a) < hi(b)) goto true;
18555 * if (hi(a) > hi(b)) goto false;
18556 * if (lo(a) < lo(b)) goto true;
18557 * false:
18558 */
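	  /*
	   * E.g. a <= b (code1 = LT, code2 = GT, code3 = LEU) =>
	   *   if (hi(a) < hi(b)) goto true;
	   *   if (hi(a) > hi(b)) goto false;
	   *   if (lo(a) <=u lo(b)) goto true;
	   * false:
	   */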
18559
18560 if (code1 != UNKNOWN)
18561 ix86_expand_branch (code1, hi[0], hi[1], label);
18562 if (code2 != UNKNOWN)
18563 ix86_expand_branch (code2, hi[0], hi[1], label2);
18564
18565 ix86_expand_branch (code3, lo[0], lo[1], label);
18566
18567 if (code2 != UNKNOWN)
18568 emit_label (label2);
18569 return;
18570 }
18571
18572 default:
18573 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18574 goto simple;
18575 }
18576 }
18577
18578 /* Split branch based on floating point condition. */
18579 void
18580 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18581 rtx target1, rtx target2, rtx tmp, rtx pushed)
18582 {
18583 rtx condition;
18584 rtx i;
18585
18586 if (target2 != pc_rtx)
18587 {
18588 rtx tmp = target2;
18589 code = reverse_condition_maybe_unordered (code);
18590 target2 = target1;
18591 target1 = tmp;
18592 }
18593
18594 condition = ix86_expand_fp_compare (code, op1, op2,
18595 tmp);
18596
18597 /* Remove pushed operand from stack. */
18598 if (pushed)
18599 ix86_free_from_memory (GET_MODE (pushed));
18600
18601 i = emit_jump_insn (gen_rtx_SET
18602 (VOIDmode, pc_rtx,
18603 gen_rtx_IF_THEN_ELSE (VOIDmode,
18604 condition, target1, target2)));
18605 if (split_branch_probability >= 0)
18606 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18607 }
18608
18609 void
18610 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18611 {
18612 rtx ret;
18613
18614 gcc_assert (GET_MODE (dest) == QImode);
18615
18616 ret = ix86_expand_compare (code, op0, op1);
18617 PUT_MODE (ret, QImode);
18618 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
18619 }
18620
18621 /* Expand a comparison setting or clearing the carry flag.  Return true when
18622    successful and set *POP to the comparison operation.  */
18623 static bool
18624 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
18625 {
18626 enum machine_mode mode =
18627 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
18628
18629   /* Do not handle double-word compares that go through the special path.  */
18630 if (mode == (TARGET_64BIT ? TImode : DImode))
18631 return false;
18632
18633 if (SCALAR_FLOAT_MODE_P (mode))
18634 {
18635 rtx compare_op, compare_seq;
18636
18637 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18638
18639       /* Shortcut: the following common codes never translate
18640 	 into carry-flag compares.  */
18641 if (code == EQ || code == NE || code == UNEQ || code == LTGT
18642 || code == ORDERED || code == UNORDERED)
18643 return false;
18644
18645       /* These comparisons require the zero flag; swap the operands so they no longer do.  */
18646 if ((code == GT || code == UNLE || code == LE || code == UNGT)
18647 && !TARGET_IEEE_FP)
18648 {
18649 rtx tmp = op0;
18650 op0 = op1;
18651 op1 = tmp;
18652 code = swap_condition (code);
18653 }
18654
18655       /* Try to expand the comparison and verify that we end up with a
18656 	 carry-flag-based comparison.  This fails only when we decide to
18657 	 expand the comparison using arithmetic, which is not a very
18658 	 common scenario.  */
18659 start_sequence ();
18660 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18661 compare_seq = get_insns ();
18662 end_sequence ();
18663
18664 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
18665 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
18666 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
18667 else
18668 code = GET_CODE (compare_op);
18669
18670 if (code != LTU && code != GEU)
18671 return false;
18672
18673 emit_insn (compare_seq);
18674 *pop = compare_op;
18675 return true;
18676 }
18677
18678 if (!INTEGRAL_MODE_P (mode))
18679 return false;
18680
18681 switch (code)
18682 {
18683 case LTU:
18684 case GEU:
18685 break;
18686
18687 /* Convert a==0 into (unsigned)a<1. */
18688 case EQ:
18689 case NE:
18690 if (op1 != const0_rtx)
18691 return false;
18692 op1 = const1_rtx;
18693 code = (code == EQ ? LTU : GEU);
18694 break;
18695
18696     /* Convert a>b into b<a or a>=b+1.  */
18697 case GTU:
18698 case LEU:
18699 if (CONST_INT_P (op1))
18700 {
18701 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
18702 	  /* Bail out on overflow.  We could still swap the operands, but that
18703 	     would force loading the constant into a register.  */
18704 if (op1 == const0_rtx
18705 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
18706 return false;
18707 code = (code == GTU ? GEU : LTU);
18708 }
18709 else
18710 {
18711 rtx tmp = op1;
18712 op1 = op0;
18713 op0 = tmp;
18714 code = (code == GTU ? LTU : GEU);
18715 }
18716 break;
18717
18718 /* Convert a>=0 into (unsigned)a<0x80000000. */
18719 case LT:
18720 case GE:
18721 if (mode == DImode || op1 != const0_rtx)
18722 return false;
18723 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18724 code = (code == LT ? GEU : LTU);
18725 break;
18726 case LE:
18727 case GT:
18728 if (mode == DImode || op1 != constm1_rtx)
18729 return false;
18730 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18731 code = (code == LE ? GEU : LTU);
18732 break;
18733
18734 default:
18735 return false;
18736 }
18737   /* Swapping operands may cause a constant to appear as the first operand.  */
18738 if (!nonimmediate_operand (op0, VOIDmode))
18739 {
18740 if (!can_create_pseudo_p ())
18741 return false;
18742 op0 = force_reg (mode, op0);
18743 }
18744 *pop = ix86_expand_compare (code, op0, op1);
18745 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
18746 return true;
18747 }
18748
18749 bool
18750 ix86_expand_int_movcc (rtx operands[])
18751 {
18752 enum rtx_code code = GET_CODE (operands[1]), compare_code;
18753 rtx compare_seq, compare_op;
18754 enum machine_mode mode = GET_MODE (operands[0]);
18755 bool sign_bit_compare_p = false;
18756 rtx op0 = XEXP (operands[1], 0);
18757 rtx op1 = XEXP (operands[1], 1);
18758
18759 if (GET_MODE (op0) == TImode
18760 || (GET_MODE (op0) == DImode
18761 && !TARGET_64BIT))
18762 return false;
18763
18764 start_sequence ();
18765 compare_op = ix86_expand_compare (code, op0, op1);
18766 compare_seq = get_insns ();
18767 end_sequence ();
18768
18769 compare_code = GET_CODE (compare_op);
18770
18771 if ((op1 == const0_rtx && (code == GE || code == LT))
18772 || (op1 == constm1_rtx && (code == GT || code == LE)))
18773 sign_bit_compare_p = true;
18774
18775 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
18776 HImode insns, we'd be swallowed in word prefix ops. */
18777
18778 if ((mode != HImode || TARGET_FAST_PREFIX)
18779 && (mode != (TARGET_64BIT ? TImode : DImode))
18780 && CONST_INT_P (operands[2])
18781 && CONST_INT_P (operands[3]))
18782 {
18783 rtx out = operands[0];
18784 HOST_WIDE_INT ct = INTVAL (operands[2]);
18785 HOST_WIDE_INT cf = INTVAL (operands[3]);
18786 HOST_WIDE_INT diff;
18787
18788 diff = ct - cf;
18789       /* Sign-bit compares are better done using shifts than by
18790 	 using sbb.  */
18791 if (sign_bit_compare_p
18792 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
18793 {
18794 /* Detect overlap between destination and compare sources. */
18795 rtx tmp = out;
18796
18797 if (!sign_bit_compare_p)
18798 {
18799 rtx flags;
18800 bool fpcmp = false;
18801
18802 compare_code = GET_CODE (compare_op);
18803
18804 flags = XEXP (compare_op, 0);
18805
18806 if (GET_MODE (flags) == CCFPmode
18807 || GET_MODE (flags) == CCFPUmode)
18808 {
18809 fpcmp = true;
18810 compare_code
18811 = ix86_fp_compare_code_to_integer (compare_code);
18812 }
18813
18814 	  /* To simplify the rest of the code, restrict to the GEU case.  */
18815 if (compare_code == LTU)
18816 {
18817 HOST_WIDE_INT tmp = ct;
18818 ct = cf;
18819 cf = tmp;
18820 compare_code = reverse_condition (compare_code);
18821 code = reverse_condition (code);
18822 }
18823 else
18824 {
18825 if (fpcmp)
18826 PUT_CODE (compare_op,
18827 reverse_condition_maybe_unordered
18828 (GET_CODE (compare_op)));
18829 else
18830 PUT_CODE (compare_op,
18831 reverse_condition (GET_CODE (compare_op)));
18832 }
18833 diff = ct - cf;
18834
18835 if (reg_overlap_mentioned_p (out, op0)
18836 || reg_overlap_mentioned_p (out, op1))
18837 tmp = gen_reg_rtx (mode);
18838
18839 if (mode == DImode)
18840 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
18841 else
18842 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
18843 flags, compare_op));
18844 }
18845 else
18846 {
18847 if (code == GT || code == GE)
18848 code = reverse_condition (code);
18849 else
18850 {
18851 HOST_WIDE_INT tmp = ct;
18852 ct = cf;
18853 cf = tmp;
18854 diff = ct - cf;
18855 }
18856 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
18857 }
18858
18859 if (diff == 1)
18860 {
18861 /*
18862 * cmpl op0,op1
18863 * sbbl dest,dest
18864 * [addl dest, ct]
18865 *
18866 * Size 5 - 8.
18867 */
18868 if (ct)
18869 tmp = expand_simple_binop (mode, PLUS,
18870 tmp, GEN_INT (ct),
18871 copy_rtx (tmp), 1, OPTAB_DIRECT);
18872 }
18873 else if (cf == -1)
18874 {
18875 /*
18876 * cmpl op0,op1
18877 * sbbl dest,dest
18878 * orl $ct, dest
18879 *
18880 * Size 8.
18881 */
18882 tmp = expand_simple_binop (mode, IOR,
18883 tmp, GEN_INT (ct),
18884 copy_rtx (tmp), 1, OPTAB_DIRECT);
18885 }
18886 else if (diff == -1 && ct)
18887 {
18888 /*
18889 * cmpl op0,op1
18890 * sbbl dest,dest
18891 * notl dest
18892 * [addl dest, cf]
18893 *
18894 * Size 8 - 11.
18895 */
18896 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18897 if (cf)
18898 tmp = expand_simple_binop (mode, PLUS,
18899 copy_rtx (tmp), GEN_INT (cf),
18900 copy_rtx (tmp), 1, OPTAB_DIRECT);
18901 }
18902 else
18903 {
18904 /*
18905 * cmpl op0,op1
18906 * sbbl dest,dest
18907 * [notl dest]
18908 * andl cf - ct, dest
18909 * [addl dest, ct]
18910 *
18911 * Size 8 - 11.
18912 */
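	      /* E.g. for ct = 5, cf = 2: the sbb result is -1 or 0; ANDing
		 with (cf - ct) = -3 gives -3 or 0, and adding ct = 5 then
		 yields cf = 2 or ct = 5 respectively.  */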
18913
18914 if (cf == 0)
18915 {
18916 cf = ct;
18917 ct = 0;
18918 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18919 }
18920
18921 tmp = expand_simple_binop (mode, AND,
18922 copy_rtx (tmp),
18923 gen_int_mode (cf - ct, mode),
18924 copy_rtx (tmp), 1, OPTAB_DIRECT);
18925 if (ct)
18926 tmp = expand_simple_binop (mode, PLUS,
18927 copy_rtx (tmp), GEN_INT (ct),
18928 copy_rtx (tmp), 1, OPTAB_DIRECT);
18929 }
18930
18931 if (!rtx_equal_p (tmp, out))
18932 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
18933
18934 return true;
18935 }
18936
18937 if (diff < 0)
18938 {
18939 enum machine_mode cmp_mode = GET_MODE (op0);
18940
18941 HOST_WIDE_INT tmp;
18942 tmp = ct, ct = cf, cf = tmp;
18943 diff = -diff;
18944
18945 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18946 {
18947 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18948
18949 	      /* We may be reversing an unordered compare to a normal compare, which
18950 		 is not valid in general (we may convert a non-trapping condition
18951 		 into a trapping one); however, on i386 we currently emit all
18952 		 comparisons unordered.  */
18953 compare_code = reverse_condition_maybe_unordered (compare_code);
18954 code = reverse_condition_maybe_unordered (code);
18955 }
18956 else
18957 {
18958 compare_code = reverse_condition (compare_code);
18959 code = reverse_condition (code);
18960 }
18961 }
18962
18963 compare_code = UNKNOWN;
18964 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
18965 && CONST_INT_P (op1))
18966 {
18967 if (op1 == const0_rtx
18968 && (code == LT || code == GE))
18969 compare_code = code;
18970 else if (op1 == constm1_rtx)
18971 {
18972 if (code == LE)
18973 compare_code = LT;
18974 else if (code == GT)
18975 compare_code = GE;
18976 }
18977 }
18978
18979 /* Optimize dest = (op0 < 0) ? -1 : cf. */
18980 if (compare_code != UNKNOWN
18981 && GET_MODE (op0) == GET_MODE (out)
18982 && (cf == -1 || ct == -1))
18983 {
18984 	    /* If the lea code below could be used, only optimize
18985 	       if it results in a 2-insn sequence.  */
18986
18987 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
18988 || diff == 3 || diff == 5 || diff == 9)
18989 || (compare_code == LT && ct == -1)
18990 || (compare_code == GE && cf == -1))
18991 {
18992 /*
18993 * notl op1 (if necessary)
18994 * sarl $31, op1
18995 * orl cf, op1
18996 */
18997 if (ct != -1)
18998 {
18999 cf = ct;
19000 ct = -1;
19001 code = reverse_condition (code);
19002 }
19003
19004 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19005
19006 out = expand_simple_binop (mode, IOR,
19007 out, GEN_INT (cf),
19008 out, 1, OPTAB_DIRECT);
19009 if (out != operands[0])
19010 emit_move_insn (operands[0], out);
19011
19012 return true;
19013 }
19014 }
19015
19016
19017 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19018 || diff == 3 || diff == 5 || diff == 9)
19019 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19020 && (mode != DImode
19021 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19022 {
19023 /*
19024 * xorl dest,dest
19025 * cmpl op1,op2
19026 * setcc dest
19027 * lea cf(dest*(ct-cf)),dest
19028 *
19029 * Size 14.
19030 *
19031 * This also catches the degenerate setcc-only case.
19032 */
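	  /* E.g. with ct = 7 and cf = 3 (diff = 4), the 0/1 setcc result is
	     mapped by "lea 3(,dest,4), dest" to 3 or 7.  */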
19033
19034 rtx tmp;
19035 int nops;
19036
19037 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19038
19039 nops = 0;
19040 /* On x86_64 the lea instruction operates on Pmode, so we need
19041 	     to get the arithmetic done in the proper mode to match.  */
19042 if (diff == 1)
19043 tmp = copy_rtx (out);
19044 else
19045 {
19046 rtx out1;
19047 out1 = copy_rtx (out);
19048 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19049 nops++;
19050 if (diff & 1)
19051 {
19052 tmp = gen_rtx_PLUS (mode, tmp, out1);
19053 nops++;
19054 }
19055 }
19056 if (cf != 0)
19057 {
19058 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19059 nops++;
19060 }
19061 if (!rtx_equal_p (tmp, out))
19062 {
19063 if (nops == 1)
19064 out = force_operand (tmp, copy_rtx (out));
19065 else
19066 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19067 }
19068 if (!rtx_equal_p (out, operands[0]))
19069 emit_move_insn (operands[0], copy_rtx (out));
19070
19071 return true;
19072 }
19073
19074 /*
19075 * General case: Jumpful:
19076 * xorl dest,dest cmpl op1, op2
19077 * cmpl op1, op2 movl ct, dest
19078 * setcc dest jcc 1f
19079 * decl dest movl cf, dest
19080 * andl (cf-ct),dest 1:
19081 * addl ct,dest
19082 *
19083 * Size 20. Size 14.
19084 *
19085 * This is reasonably steep, but branch mispredict costs are
19086        * high on modern CPUs, so consider failing only if optimizing
19087 * for space.
19088 */
19089
19090 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19091 && BRANCH_COST (optimize_insn_for_speed_p (),
19092 false) >= 2)
19093 {
19094 if (cf == 0)
19095 {
19096 enum machine_mode cmp_mode = GET_MODE (op0);
19097
19098 cf = ct;
19099 ct = 0;
19100
19101 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19102 {
19103 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19104
19105 	      /* We may be reversing an unordered compare to a normal compare,
19106 		 which is not valid in general (we may convert a non-trapping
19107 		 condition into a trapping one); however, on i386 we currently
19108 		 emit all comparisons unordered.  */
19109 code = reverse_condition_maybe_unordered (code);
19110 }
19111 else
19112 {
19113 code = reverse_condition (code);
19114 if (compare_code != UNKNOWN)
19115 compare_code = reverse_condition (compare_code);
19116 }
19117 }
19118
19119 if (compare_code != UNKNOWN)
19120 {
19121 /* notl op1 (if needed)
19122 sarl $31, op1
19123 andl (cf-ct), op1
19124 addl ct, op1
19125
19126 For x < 0 (resp. x <= -1) there will be no notl,
19127 so if possible swap the constants to get rid of the
19128 complement.
19129 True/false will be -1/0 while code below (store flag
19130 followed by decrement) is 0/-1, so the constants need
19131 to be exchanged once more. */
19132
19133 if (compare_code == GE || !cf)
19134 {
19135 code = reverse_condition (code);
19136 compare_code = LT;
19137 }
19138 else
19139 {
19140 HOST_WIDE_INT tmp = cf;
19141 cf = ct;
19142 ct = tmp;
19143 }
19144
19145 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19146 }
19147 else
19148 {
19149 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19150
19151 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19152 constm1_rtx,
19153 copy_rtx (out), 1, OPTAB_DIRECT);
19154 }
19155
19156 out = expand_simple_binop (mode, AND, copy_rtx (out),
19157 gen_int_mode (cf - ct, mode),
19158 copy_rtx (out), 1, OPTAB_DIRECT);
19159 if (ct)
19160 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19161 copy_rtx (out), 1, OPTAB_DIRECT);
19162 if (!rtx_equal_p (out, operands[0]))
19163 emit_move_insn (operands[0], copy_rtx (out));
19164
19165 return true;
19166 }
19167 }
19168
19169 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19170 {
19171 /* Try a few things more with specific constants and a variable. */
19172
19173 optab op;
19174 rtx var, orig_out, out, tmp;
19175
19176 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19177 return false;
19178
19179       /* If one of the two operands is an interesting constant, load a 0/-1
19180 	 constant via the code above and mask the variable in with a logical operation.  */
19181
19182 if (CONST_INT_P (operands[2]))
19183 {
19184 var = operands[3];
19185 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19186 operands[3] = constm1_rtx, op = and_optab;
19187 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19188 operands[3] = const0_rtx, op = ior_optab;
19189 else
19190 return false;
19191 }
19192 else if (CONST_INT_P (operands[3]))
19193 {
19194 var = operands[2];
19195 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19196 operands[2] = constm1_rtx, op = and_optab;
19197 	  else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
19198 operands[2] = const0_rtx, op = ior_optab;
19199 else
19200 return false;
19201 }
19202 else
19203 return false;
19204
19205 orig_out = operands[0];
19206 tmp = gen_reg_rtx (mode);
19207 operands[0] = tmp;
19208
19209 /* Recurse to get the constant loaded. */
19210 if (ix86_expand_int_movcc (operands) == 0)
19211 return false;
19212
19213 /* Mask in the interesting variable. */
19214 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19215 OPTAB_WIDEN);
19216 if (!rtx_equal_p (out, orig_out))
19217 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19218
19219 return true;
19220 }
19221
19222 /*
19223 * For comparison with above,
19224 *
19225 * movl cf,dest
19226 * movl ct,tmp
19227 * cmpl op1,op2
19228 * cmovcc tmp,dest
19229 *
19230 * Size 15.
19231 */
19232
19233 if (! nonimmediate_operand (operands[2], mode))
19234 operands[2] = force_reg (mode, operands[2]);
19235 if (! nonimmediate_operand (operands[3], mode))
19236 operands[3] = force_reg (mode, operands[3]);
19237
19238 if (! register_operand (operands[2], VOIDmode)
19239 && (mode == QImode
19240 || ! register_operand (operands[3], VOIDmode)))
19241 operands[2] = force_reg (mode, operands[2]);
19242
19243 if (mode == QImode
19244 && ! register_operand (operands[3], VOIDmode))
19245 operands[3] = force_reg (mode, operands[3]);
19246
19247 emit_insn (compare_seq);
19248 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19249 gen_rtx_IF_THEN_ELSE (mode,
19250 compare_op, operands[2],
19251 operands[3])));
19252 return true;
19253 }
19254
19255 /* Swap, force into registers, or otherwise massage the two operands
19256 to an sse comparison with a mask result. Thus we differ a bit from
19257 ix86_prepare_fp_compare_args which expects to produce a flags result.
19258
19259 The DEST operand exists to help determine whether to commute commutative
19260 operators. The POP0/POP1 operands are updated in place. The new
19261 comparison code is returned, or UNKNOWN if not implementable. */
19262
19263 static enum rtx_code
19264 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19265 rtx *pop0, rtx *pop1)
19266 {
19267 rtx tmp;
19268
19269 switch (code)
19270 {
19271 case LTGT:
19272 case UNEQ:
19273 /* AVX supports all the needed comparisons. */
19274 if (TARGET_AVX)
19275 break;
19276 /* We have no LTGT as an operator. We could implement it with
19277 NE & ORDERED, but this requires an extra temporary. It's
19278 not clear that it's worth it. */
19279 return UNKNOWN;
19280
19281 case LT:
19282 case LE:
19283 case UNGT:
19284 case UNGE:
19285 /* These are supported directly. */
19286 break;
19287
19288 case EQ:
19289 case NE:
19290 case UNORDERED:
19291 case ORDERED:
19292 /* AVX has 3 operand comparisons, no need to swap anything. */
19293 if (TARGET_AVX)
19294 break;
19295 /* For commutative operators, try to canonicalize the destination
19296 operand to be first in the comparison - this helps reload to
19297 avoid extra moves. */
19298 if (!dest || !rtx_equal_p (dest, *pop1))
19299 break;
19300 /* FALLTHRU */
19301
19302 case GE:
19303 case GT:
19304 case UNLE:
19305 case UNLT:
19306 /* These are not supported directly before AVX, and furthermore
19307 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19308 comparison operands to transform into something that is
19309 supported. */
19310 tmp = *pop0;
19311 *pop0 = *pop1;
19312 *pop1 = tmp;
19313 code = swap_condition (code);
19314 break;
19315
19316 default:
19317 gcc_unreachable ();
19318 }
19319
19320 return code;
19321 }
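/* For example, without AVX the cmpps/cmppd predicates are limited to EQ, LT,
   LE, UNORD, NEQ, NLT, NLE and ORD, so a GT comparison is handled by swapping
   the operands and emitting LT instead.  */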
19322
19323 /* Detect conditional moves that exactly match min/max operational
19324 semantics. Note that this is IEEE safe, as long as we don't
19325 interchange the operands.
19326
19327 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19328 and TRUE if the operation is successful and instructions are emitted. */
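   The reason operand order matters: minps/maxps compute exactly
   'op1 < op2 ? op1 : op2' (resp. '>'), returning the second operand when the
   comparison is unordered or when comparing -0.0 with +0.0, so swapping the
   operands would change the result for NaNs and signed zeros.  */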
19329
19330 static bool
19331 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19332 rtx cmp_op1, rtx if_true, rtx if_false)
19333 {
19334 enum machine_mode mode;
19335 bool is_min;
19336 rtx tmp;
19337
19338 if (code == LT)
19339 ;
19340 else if (code == UNGE)
19341 {
19342 tmp = if_true;
19343 if_true = if_false;
19344 if_false = tmp;
19345 }
19346 else
19347 return false;
19348
19349 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19350 is_min = true;
19351 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19352 is_min = false;
19353 else
19354 return false;
19355
19356 mode = GET_MODE (dest);
19357
19358 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19359 but MODE may be a vector mode and thus not appropriate. */
19360 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19361 {
19362 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19363 rtvec v;
19364
19365 if_true = force_reg (mode, if_true);
19366 v = gen_rtvec (2, if_true, if_false);
19367 tmp = gen_rtx_UNSPEC (mode, v, u);
19368 }
19369 else
19370 {
19371 code = is_min ? SMIN : SMAX;
19372 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19373 }
19374
19375 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19376 return true;
19377 }
19378
19379 /* Expand an sse vector comparison. Return the register with the result. */
19380
19381 static rtx
19382 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19383 rtx op_true, rtx op_false)
19384 {
19385 enum machine_mode mode = GET_MODE (dest);
19386 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19387 rtx x;
19388
19389 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19390 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19391 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19392
19393 if (optimize
19394 || reg_overlap_mentioned_p (dest, op_true)
19395 || reg_overlap_mentioned_p (dest, op_false))
19396 dest = gen_reg_rtx (mode);
19397
19398 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19399 if (cmp_mode != mode)
19400 {
19401 x = force_reg (cmp_mode, x);
19402 convert_move (dest, x, false);
19403 }
19404 else
19405 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19406
19407 return dest;
19408 }
19409
19410 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19411 operations. This is used for both scalar and vector conditional moves. */
19412
19413 static void
19414 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19415 {
19416 enum machine_mode mode = GET_MODE (dest);
19417 rtx t2, t3, x;
19418
19419 if (vector_all_ones_operand (op_true, mode)
19420 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19421 {
19422 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19423 }
19424 else if (op_false == CONST0_RTX (mode))
19425 {
19426 op_true = force_reg (mode, op_true);
19427 x = gen_rtx_AND (mode, cmp, op_true);
19428 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19429 }
19430 else if (op_true == CONST0_RTX (mode))
19431 {
19432 op_false = force_reg (mode, op_false);
19433 x = gen_rtx_NOT (mode, cmp);
19434 x = gen_rtx_AND (mode, x, op_false);
19435 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19436 }
19437 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19438 {
19439 op_false = force_reg (mode, op_false);
19440 x = gen_rtx_IOR (mode, cmp, op_false);
19441 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19442 }
19443 else if (TARGET_XOP)
19444 {
19445 op_true = force_reg (mode, op_true);
19446
19447 if (!nonimmediate_operand (op_false, mode))
19448 op_false = force_reg (mode, op_false);
19449
19450 emit_insn (gen_rtx_SET (mode, dest,
19451 gen_rtx_IF_THEN_ELSE (mode, cmp,
19452 op_true,
19453 op_false)));
19454 }
19455 else
19456 {
19457 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19458
19459 if (!nonimmediate_operand (op_true, mode))
19460 op_true = force_reg (mode, op_true);
19461
19462 op_false = force_reg (mode, op_false);
19463
19464 switch (mode)
19465 {
19466 case V4SFmode:
19467 if (TARGET_SSE4_1)
19468 gen = gen_sse4_1_blendvps;
19469 break;
19470 case V2DFmode:
19471 if (TARGET_SSE4_1)
19472 gen = gen_sse4_1_blendvpd;
19473 break;
19474 case V16QImode:
19475 case V8HImode:
19476 case V4SImode:
19477 case V2DImode:
19478 if (TARGET_SSE4_1)
19479 {
19480 gen = gen_sse4_1_pblendvb;
19481 dest = gen_lowpart (V16QImode, dest);
19482 op_false = gen_lowpart (V16QImode, op_false);
19483 op_true = gen_lowpart (V16QImode, op_true);
19484 cmp = gen_lowpart (V16QImode, cmp);
19485 }
19486 break;
19487 case V8SFmode:
19488 if (TARGET_AVX)
19489 gen = gen_avx_blendvps256;
19490 break;
19491 case V4DFmode:
19492 if (TARGET_AVX)
19493 gen = gen_avx_blendvpd256;
19494 break;
19495 case V32QImode:
19496 case V16HImode:
19497 case V8SImode:
19498 case V4DImode:
19499 if (TARGET_AVX2)
19500 {
19501 gen = gen_avx2_pblendvb;
19502 dest = gen_lowpart (V32QImode, dest);
19503 op_false = gen_lowpart (V32QImode, op_false);
19504 op_true = gen_lowpart (V32QImode, op_true);
19505 cmp = gen_lowpart (V32QImode, cmp);
19506 }
19507 break;
19508 default:
19509 break;
19510 }
19511
19512 if (gen != NULL)
19513 emit_insn (gen (dest, op_false, op_true, cmp));
19514 else
19515 {
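	  /* No blend instruction is available for this mode, so build
	     dest = (cmp & op_true) | (~cmp & op_false) explicitly.  */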
19516 op_true = force_reg (mode, op_true);
19517
19518 t2 = gen_reg_rtx (mode);
19519 if (optimize)
19520 t3 = gen_reg_rtx (mode);
19521 else
19522 t3 = dest;
19523
19524 x = gen_rtx_AND (mode, op_true, cmp);
19525 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19526
19527 x = gen_rtx_NOT (mode, cmp);
19528 x = gen_rtx_AND (mode, x, op_false);
19529 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19530
19531 x = gen_rtx_IOR (mode, t3, t2);
19532 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19533 }
19534 }
19535 }
19536
19537 /* Expand a floating-point conditional move. Return true if successful. */
19538
19539 bool
19540 ix86_expand_fp_movcc (rtx operands[])
19541 {
19542 enum machine_mode mode = GET_MODE (operands[0]);
19543 enum rtx_code code = GET_CODE (operands[1]);
19544 rtx tmp, compare_op;
19545 rtx op0 = XEXP (operands[1], 0);
19546 rtx op1 = XEXP (operands[1], 1);
19547
19548 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19549 {
19550 enum machine_mode cmode;
19551
19552       /* Since we have no cmove for sse registers, don't force bad register
19553 allocation just to gain access to it. Deny movcc when the
19554 comparison mode doesn't match the move mode. */
19555 cmode = GET_MODE (op0);
19556 if (cmode == VOIDmode)
19557 cmode = GET_MODE (op1);
19558 if (cmode != mode)
19559 return false;
19560
19561 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19562 if (code == UNKNOWN)
19563 return false;
19564
19565 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19566 operands[2], operands[3]))
19567 return true;
19568
19569 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19570 operands[2], operands[3]);
19571 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19572 return true;
19573 }
19574
19575 /* The floating point conditional move instructions don't directly
19576 support conditions resulting from a signed integer comparison. */
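  /* (fcmovcc exists only for the B, E, BE and U conditions and their
     negations, i.e. tests of CF, ZF and PF, so a signed condition such as GE
     is first materialized with setcc and retested as NE below.)  */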
19577
19578 compare_op = ix86_expand_compare (code, op0, op1);
19579 if (!fcmov_comparison_operator (compare_op, VOIDmode))
19580 {
19581 tmp = gen_reg_rtx (QImode);
19582 ix86_expand_setcc (tmp, code, op0, op1);
19583
19584 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19585 }
19586
19587 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19588 gen_rtx_IF_THEN_ELSE (mode, compare_op,
19589 operands[2], operands[3])));
19590
19591 return true;
19592 }
19593
19594 /* Expand a floating-point vector conditional move; a vcond operation
19595 rather than a movcc operation. */
19596
19597 bool
19598 ix86_expand_fp_vcond (rtx operands[])
19599 {
19600 enum rtx_code code = GET_CODE (operands[3]);
19601 rtx cmp;
19602
19603 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19604 &operands[4], &operands[5]);
19605 if (code == UNKNOWN)
19606 {
19607 rtx temp;
19608 switch (GET_CODE (operands[3]))
19609 {
19610 case LTGT:
19611 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19612 operands[5], operands[0], operands[0]);
19613 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
19614 operands[5], operands[1], operands[2]);
19615 code = AND;
19616 break;
19617 case UNEQ:
19618 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
19619 operands[5], operands[0], operands[0]);
19620 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
19621 operands[5], operands[1], operands[2]);
19622 code = IOR;
19623 break;
19624 default:
19625 gcc_unreachable ();
19626 }
19627 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
19628 OPTAB_DIRECT);
19629 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19630 return true;
19631 }
19632
19633 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
19634 operands[5], operands[1], operands[2]))
19635 return true;
19636
19637 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
19638 operands[1], operands[2]);
19639 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19640 return true;
19641 }
19642
19643 /* Expand a signed/unsigned integral vector conditional move. */
19644
19645 bool
19646 ix86_expand_int_vcond (rtx operands[])
19647 {
19648 enum machine_mode data_mode = GET_MODE (operands[0]);
19649 enum machine_mode mode = GET_MODE (operands[4]);
19650 enum rtx_code code = GET_CODE (operands[3]);
19651 bool negate = false;
19652 rtx x, cop0, cop1;
19653
19654 cop0 = operands[4];
19655 cop1 = operands[5];
19656
19657 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
19658 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
19659 if ((code == LT || code == GE)
19660 && data_mode == mode
19661 && cop1 == CONST0_RTX (mode)
19662 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
19663 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
19664 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
19665 && (GET_MODE_SIZE (data_mode) == 16
19666 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
19667 {
19668 rtx negop = operands[2 - (code == LT)];
19669 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
19670 if (negop == CONST1_RTX (data_mode))
19671 {
19672 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
19673 operands[0], 1, OPTAB_DIRECT);
19674 if (res != operands[0])
19675 emit_move_insn (operands[0], res);
19676 return true;
19677 }
19678 else if (GET_MODE_INNER (data_mode) != DImode
19679 && vector_all_ones_operand (negop, data_mode))
19680 {
19681 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
19682 operands[0], 0, OPTAB_DIRECT);
19683 if (res != operands[0])
19684 emit_move_insn (operands[0], res);
19685 return true;
19686 }
19687 }
19688
19689 if (!nonimmediate_operand (cop1, mode))
19690 cop1 = force_reg (mode, cop1);
19691 if (!general_operand (operands[1], data_mode))
19692 operands[1] = force_reg (data_mode, operands[1]);
19693 if (!general_operand (operands[2], data_mode))
19694 operands[2] = force_reg (data_mode, operands[2]);
19695
19696 /* XOP supports all of the comparisons on all 128-bit vector int types. */
19697 if (TARGET_XOP
19698 && (mode == V16QImode || mode == V8HImode
19699 || mode == V4SImode || mode == V2DImode))
19700 ;
19701 else
19702 {
19703 /* Canonicalize the comparison to EQ, GT, GTU. */
19704 switch (code)
19705 {
19706 case EQ:
19707 case GT:
19708 case GTU:
19709 break;
19710
19711 case NE:
19712 case LE:
19713 case LEU:
19714 code = reverse_condition (code);
19715 negate = true;
19716 break;
19717
19718 case GE:
19719 case GEU:
19720 code = reverse_condition (code);
19721 negate = true;
19722 /* FALLTHRU */
19723
19724 case LT:
19725 case LTU:
19726 code = swap_condition (code);
19727 x = cop0, cop0 = cop1, cop1 = x;
19728 break;
19729
19730 default:
19731 gcc_unreachable ();
19732 }
19733
19734 /* Only SSE4.1/SSE4.2 supports V2DImode. */
19735 if (mode == V2DImode)
19736 {
19737 switch (code)
19738 {
19739 case EQ:
19740 /* SSE4.1 supports EQ. */
19741 if (!TARGET_SSE4_1)
19742 return false;
19743 break;
19744
19745 case GT:
19746 case GTU:
19747 /* SSE4.2 supports GT/GTU. */
19748 if (!TARGET_SSE4_2)
19749 return false;
19750 break;
19751
19752 default:
19753 gcc_unreachable ();
19754 }
19755 }
19756
19757       /* Unsigned parallel compare is not supported by the hardware.
19758 	 Play some tricks to turn this into a signed comparison or a
19759 	 comparison against zero.  */
19760 if (code == GTU)
19761 {
19762 cop0 = force_reg (mode, cop0);
19763
19764 switch (mode)
19765 {
19766 case V8SImode:
19767 case V4DImode:
19768 case V4SImode:
19769 case V2DImode:
19770 {
19771 rtx t1, t2, mask;
19772 rtx (*gen_sub3) (rtx, rtx, rtx);
19773
19774 switch (mode)
19775 {
19776 case V8SImode: gen_sub3 = gen_subv8si3; break;
19777 case V4DImode: gen_sub3 = gen_subv4di3; break;
19778 case V4SImode: gen_sub3 = gen_subv4si3; break;
19779 case V2DImode: gen_sub3 = gen_subv2di3; break;
19780 default:
19781 gcc_unreachable ();
19782 }
19783 /* Subtract (-(INT MAX) - 1) from both operands to make
19784 them signed. */
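		/* This is the usual bias trick: subtracting 0x80...0 just
		   flips each element's sign bit, and
		   x >u y  <==>  (x ^ 0x80...0) >s (y ^ 0x80...0).  */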
19785 mask = ix86_build_signbit_mask (mode, true, false);
19786 t1 = gen_reg_rtx (mode);
19787 emit_insn (gen_sub3 (t1, cop0, mask));
19788
19789 t2 = gen_reg_rtx (mode);
19790 emit_insn (gen_sub3 (t2, cop1, mask));
19791
19792 cop0 = t1;
19793 cop1 = t2;
19794 code = GT;
19795 }
19796 break;
19797
19798 case V32QImode:
19799 case V16HImode:
19800 case V16QImode:
19801 case V8HImode:
19802 /* Perform a parallel unsigned saturating subtraction. */
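	    /* x >u y iff the saturating difference (x -us y) is nonzero, so
	       compare it against zero with EQ and flip NEGATE.  */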
19803 x = gen_reg_rtx (mode);
19804 emit_insn (gen_rtx_SET (VOIDmode, x,
19805 gen_rtx_US_MINUS (mode, cop0, cop1)));
19806
19807 cop0 = x;
19808 cop1 = CONST0_RTX (mode);
19809 code = EQ;
19810 negate = !negate;
19811 break;
19812
19813 default:
19814 gcc_unreachable ();
19815 }
19816 }
19817 }
19818
19819 /* Allow the comparison to be done in one mode, but the movcc to
19820 happen in another mode. */
19821 if (data_mode == mode)
19822 {
19823 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
19824 operands[1+negate], operands[2-negate]);
19825 }
19826 else
19827 {
19828 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
19829 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
19830 code, cop0, cop1,
19831 operands[1+negate], operands[2-negate]);
19832 x = gen_lowpart (data_mode, x);
19833 }
19834
19835 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
19836 operands[2-negate]);
19837 return true;
19838 }
19839
19840 /* Expand a variable vector permutation. */
19841
19842 void
19843 ix86_expand_vec_perm (rtx operands[])
19844 {
19845 rtx target = operands[0];
19846 rtx op0 = operands[1];
19847 rtx op1 = operands[2];
19848 rtx mask = operands[3];
19849 rtx t1, t2, t3, t4, vt, vt2, vec[32];
19850 enum machine_mode mode = GET_MODE (op0);
19851 enum machine_mode maskmode = GET_MODE (mask);
19852 int w, e, i;
19853 bool one_operand_shuffle = rtx_equal_p (op0, op1);
19854
19855 /* Number of elements in the vector. */
19856 w = GET_MODE_NUNITS (mode);
19857 e = GET_MODE_UNIT_SIZE (mode);
19858 gcc_assert (w <= 32);
19859
19860 if (TARGET_AVX2)
19861 {
19862 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
19863 {
19864 	  /* Unfortunately, the VPERMQ and VPERMPD instructions only support
19865 	     a constant shuffle operand.  With a tiny bit of effort we can
19866 	     use VPERMD instead.  A re-interpretation stall for V4DFmode is
19867 	     unfortunate but there's no avoiding it.
19868 	     Similarly, for V16HImode we don't have instructions for variable
19869 	     shuffling, while for V32QImode we can use vpshufb; vpshufb;
19870 	     vpermq; vpor after preparing suitable masks.  */
19871
19872 if (mode == V16HImode)
19873 {
19874 maskmode = mode = V32QImode;
19875 w = 32;
19876 e = 1;
19877 }
19878 else
19879 {
19880 maskmode = mode = V8SImode;
19881 w = 8;
19882 e = 4;
19883 }
19884 t1 = gen_reg_rtx (maskmode);
19885
19886 /* Replicate the low bits of the V4DImode mask into V8SImode:
19887 mask = { A B C D }
19888 t1 = { A A B B C C D D }. */
19889 for (i = 0; i < w / 2; ++i)
19890 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
19891 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19892 vt = force_reg (maskmode, vt);
19893 mask = gen_lowpart (maskmode, mask);
19894 if (maskmode == V8SImode)
19895 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
19896 else
19897 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
19898
19899 	  /* Multiply the shuffle indices by two.  */
19900 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
19901 OPTAB_DIRECT);
19902
19903 	  /* Add one to the odd shuffle indices:
19904 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
19905 for (i = 0; i < w / 2; ++i)
19906 {
19907 vec[i * 2] = const0_rtx;
19908 vec[i * 2 + 1] = const1_rtx;
19909 }
19910 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19911 vt = force_const_mem (maskmode, vt);
19912 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
19913 OPTAB_DIRECT);
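	  /* E.g. (in the V8SImode case) a V4DImode selector { 3 0 2 1 } has
	     now become the V8SImode selector { 6 7 0 1 4 5 2 3 }, which picks
	     out the same quadwords as pairs of doublewords.  */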
19914
19915 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
19916 operands[3] = mask = t1;
19917 target = gen_lowpart (mode, target);
19918 op0 = gen_lowpart (mode, op0);
19919 op1 = gen_lowpart (mode, op1);
19920 }
19921
19922 switch (mode)
19923 {
19924 case V8SImode:
19925 /* The VPERMD and VPERMPS instructions already properly ignore
19926 the high bits of the shuffle elements. No need for us to
19927 perform an AND ourselves. */
19928 if (one_operand_shuffle)
19929 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
19930 else
19931 {
19932 t1 = gen_reg_rtx (V8SImode);
19933 t2 = gen_reg_rtx (V8SImode);
19934 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
19935 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
19936 goto merge_two;
19937 }
19938 return;
19939
19940 case V8SFmode:
19941 mask = gen_lowpart (V8SFmode, mask);
19942 if (one_operand_shuffle)
19943 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
19944 else
19945 {
19946 t1 = gen_reg_rtx (V8SFmode);
19947 t2 = gen_reg_rtx (V8SFmode);
19948 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
19949 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
19950 goto merge_two;
19951 }
19952 return;
19953
19954 case V4SImode:
19955 /* By combining the two 128-bit input vectors into one 256-bit
19956 input vector, we can use VPERMD and VPERMPS for the full
19957 two-operand shuffle. */
19958 t1 = gen_reg_rtx (V8SImode);
19959 t2 = gen_reg_rtx (V8SImode);
19960 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
19961 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
19962 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
19963 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
19964 return;
19965
19966 case V4SFmode:
19967 t1 = gen_reg_rtx (V8SFmode);
19968 t2 = gen_reg_rtx (V8SImode);
19969 mask = gen_lowpart (V4SImode, mask);
19970 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
19971 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
19972 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
19973 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
19974 return;
19975
19976 case V32QImode:
19977 t1 = gen_reg_rtx (V32QImode);
19978 t2 = gen_reg_rtx (V32QImode);
19979 t3 = gen_reg_rtx (V32QImode);
19980 vt2 = GEN_INT (128);
19981 for (i = 0; i < 32; i++)
19982 vec[i] = vt2;
19983 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19984 vt = force_reg (V32QImode, vt);
19985 for (i = 0; i < 32; i++)
19986 vec[i] = i < 16 ? vt2 : const0_rtx;
19987 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19988 vt2 = force_reg (V32QImode, vt2);
19989 /* From mask create two adjusted masks, which contain the same
19990 bits as mask in the low 7 bits of each vector element.
19991 	 The first mask will have the most significant bit clear
19992 	 if it requests an element from the same 128-bit lane
19993 	 and the MSB set if it requests an element from the other 128-bit lane.
19994 The second mask will have the opposite values of the MSB,
19995 and additionally will have its 128-bit lanes swapped.
19996 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
19997 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
19998 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
19999 stands for other 12 bytes. */
20000       /* The bit that says whether an element is from the same lane or the
20001 	 other lane is bit 4, so shift it up by 3 to the MSB position.  */
20002 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20003 gen_lowpart (V4DImode, mask),
20004 GEN_INT (3)));
20005 /* Clear MSB bits from the mask just in case it had them set. */
20006 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20007 /* After this t1 will have MSB set for elements from other lane. */
20008 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20009 /* Clear bits other than MSB. */
20010 emit_insn (gen_andv32qi3 (t1, t1, vt));
20011 /* Or in the lower bits from mask into t3. */
20012 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20013 /* And invert MSB bits in t1, so MSB is set for elements from the same
20014 lane. */
20015 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20016 /* Swap 128-bit lanes in t3. */
20017 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20018 gen_lowpart (V4DImode, t3),
20019 const2_rtx, GEN_INT (3),
20020 const0_rtx, const1_rtx));
20021 /* And or in the lower bits from mask into t1. */
20022 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20023 if (one_operand_shuffle)
20024 {
20025 /* Each of these shuffles will put 0s in places where
20026 		 an element from the other 128-bit lane is needed, and otherwise
20027 		 will shuffle in the requested value.  */
20028 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20029 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20030 /* For t3 the 128-bit lanes are swapped again. */
20031 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20032 gen_lowpart (V4DImode, t3),
20033 const2_rtx, GEN_INT (3),
20034 const0_rtx, const1_rtx));
20035 /* And oring both together leads to the result. */
20036 emit_insn (gen_iorv32qi3 (target, t1, t3));
20037 return;
20038 }
20039
20040 t4 = gen_reg_rtx (V32QImode);
20041       /* Similar to the one_operand_shuffle code above, but the shuffles
20042 	 are repeated twice, once for each operand.  The merge_two:
20043 	 code below will merge the two results together.  */
20044 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20045 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20046 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20047 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20048 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20049 gen_lowpart (V4DImode, t4),
20050 const2_rtx, GEN_INT (3),
20051 const0_rtx, const1_rtx));
20052 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20053 gen_lowpart (V4DImode, t3),
20054 const2_rtx, GEN_INT (3),
20055 const0_rtx, const1_rtx));
20056 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20057 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20058 t1 = t4;
20059 t2 = t3;
20060 goto merge_two;
20061
20062 default:
20063 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20064 break;
20065 }
20066 }
20067
20068 if (TARGET_XOP)
20069 {
20070 /* The XOP VPPERM insn supports three inputs. By ignoring the
20071 one_operand_shuffle special case, we avoid creating another
20072 set of constant vectors in memory. */
20073 one_operand_shuffle = false;
20074
20075 /* mask = mask & {2*w-1, ...} */
20076 vt = GEN_INT (2*w - 1);
20077 }
20078 else
20079 {
20080 /* mask = mask & {w-1, ...} */
20081 vt = GEN_INT (w - 1);
20082 }
20083
20084 for (i = 0; i < w; i++)
20085 vec[i] = vt;
20086 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20087 mask = expand_simple_binop (maskmode, AND, mask, vt,
20088 NULL_RTX, 0, OPTAB_DIRECT);
20089
20090 /* For non-QImode operations, convert the word permutation control
20091 into a byte permutation control. */
20092 if (mode != V16QImode)
20093 {
20094 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20095 GEN_INT (exact_log2 (e)),
20096 NULL_RTX, 0, OPTAB_DIRECT);
20097
20098 /* Convert mask to vector of chars. */
20099 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20100
20101 /* Replicate each of the input bytes into byte positions:
20102 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20103 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20104 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20105 for (i = 0; i < 16; ++i)
20106 vec[i] = GEN_INT (i/e * e);
20107 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20108 vt = force_const_mem (V16QImode, vt);
20109 if (TARGET_XOP)
20110 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20111 else
20112 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20113
20114 /* Convert it into the byte positions by doing
20115 mask = mask + {0,1,..,e-1, 0,1,..,e-1, ...}. */
20116 for (i = 0; i < 16; ++i)
20117 vec[i] = GEN_INT (i % e);
20118 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20119 vt = force_const_mem (V16QImode, vt);
20120 emit_insn (gen_addv16qi3 (mask, mask, vt));
20121 }
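/* Illustrative sketch of the conversion just performed (not emitted code;
   V4SImode with e == 4 is only an assumed example): a mask element holding
   word index 3 becomes 12 after the shift by log2 (e), is broadcast to
   { 12, 12, 12, 12 } by the pshufb/pperm above, and becomes
   { 12, 13, 14, 15 } after the final add, i.e. exactly the byte offsets
   of source word 3.  */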
20122
20123 /* The actual shuffle operations all operate on V16QImode. */
20124 op0 = gen_lowpart (V16QImode, op0);
20125 op1 = gen_lowpart (V16QImode, op1);
20126 target = gen_lowpart (V16QImode, target);
20127
20128 if (TARGET_XOP)
20129 {
20130 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20131 }
20132 else if (one_operand_shuffle)
20133 {
20134 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20135 }
20136 else
20137 {
20138 rtx xops[6];
20139 bool ok;
20140
20141 /* Shuffle the two input vectors independently. */
20142 t1 = gen_reg_rtx (V16QImode);
20143 t2 = gen_reg_rtx (V16QImode);
20144 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20145 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20146
20147 merge_two:
20148 /* Then merge them together. The key is whether any given control
20149 element contained a bit set that indicates the second word. */
20150 mask = operands[3];
20151 vt = GEN_INT (w);
20152 if (maskmode == V2DImode && !TARGET_SSE4_1)
20153 {
20154 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
20155 more shuffle to convert the V2DI input mask into a V4SI
20156 input mask, at which point the masking done by
20157 expand_int_vcond will work as desired. */
20158 rtx t3 = gen_reg_rtx (V4SImode);
20159 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20160 const0_rtx, const0_rtx,
20161 const2_rtx, const2_rtx));
20162 mask = t3;
20163 maskmode = V4SImode;
20164 e = w = 4;
20165 }
20166
20167 for (i = 0; i < w; i++)
20168 vec[i] = vt;
20169 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20170 vt = force_reg (maskmode, vt);
20171 mask = expand_simple_binop (maskmode, AND, mask, vt,
20172 NULL_RTX, 0, OPTAB_DIRECT);
20173
20174 xops[0] = gen_lowpart (mode, operands[0]);
20175 xops[1] = gen_lowpart (mode, t2);
20176 xops[2] = gen_lowpart (mode, t1);
20177 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20178 xops[4] = mask;
20179 xops[5] = vt;
20180 ok = ix86_expand_int_vcond (xops);
20181 gcc_assert (ok);
20182 }
20183 }
20184
20185 /* Unpack SRC into DEST, widening to the next wider integer vector type.
20186 UNSIGNED_P is true if we should do zero extension, else sign extension.
20187 HIGH_P is true if we want the N/2 high elements, else the low elements. */
20188
20189 void
20190 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
20191 {
20192 enum machine_mode imode = GET_MODE (src);
20193 rtx tmp;
20194
20195 if (TARGET_SSE4_1)
20196 {
20197 rtx (*unpack)(rtx, rtx);
20198 rtx (*extract)(rtx, rtx) = NULL;
20199 enum machine_mode halfmode = BLKmode;
20200
20201 switch (imode)
20202 {
20203 case V32QImode:
20204 if (unsigned_p)
20205 unpack = gen_avx2_zero_extendv16qiv16hi2;
20206 else
20207 unpack = gen_avx2_sign_extendv16qiv16hi2;
20208 halfmode = V16QImode;
20209 extract
20210 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20211 break;
20212 case V16HImode:
20213 if (unsigned_p)
20214 unpack = gen_avx2_zero_extendv8hiv8si2;
20215 else
20216 unpack = gen_avx2_sign_extendv8hiv8si2;
20217 halfmode = V8HImode;
20218 extract
20219 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20220 break;
20221 case V8SImode:
20222 if (unsigned_p)
20223 unpack = gen_avx2_zero_extendv4siv4di2;
20224 else
20225 unpack = gen_avx2_sign_extendv4siv4di2;
20226 halfmode = V4SImode;
20227 extract
20228 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20229 break;
20230 case V16QImode:
20231 if (unsigned_p)
20232 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20233 else
20234 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20235 break;
20236 case V8HImode:
20237 if (unsigned_p)
20238 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20239 else
20240 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20241 break;
20242 case V4SImode:
20243 if (unsigned_p)
20244 unpack = gen_sse4_1_zero_extendv2siv2di2;
20245 else
20246 unpack = gen_sse4_1_sign_extendv2siv2di2;
20247 break;
20248 default:
20249 gcc_unreachable ();
20250 }
20251
20252 if (GET_MODE_SIZE (imode) == 32)
20253 {
20254 tmp = gen_reg_rtx (halfmode);
20255 emit_insn (extract (tmp, src));
20256 }
20257 else if (high_p)
20258 {
20259 /* Shift higher 8 bytes to lower 8 bytes. */
20260 tmp = gen_reg_rtx (imode);
20261 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20262 gen_lowpart (V1TImode, src),
20263 GEN_INT (64)));
20264 }
20265 else
20266 tmp = src;
20267
20268 emit_insn (unpack (dest, tmp));
20269 }
20270 else
20271 {
20272 rtx (*unpack)(rtx, rtx, rtx);
20273
20274 switch (imode)
20275 {
20276 case V16QImode:
20277 if (high_p)
20278 unpack = gen_vec_interleave_highv16qi;
20279 else
20280 unpack = gen_vec_interleave_lowv16qi;
20281 break;
20282 case V8HImode:
20283 if (high_p)
20284 unpack = gen_vec_interleave_highv8hi;
20285 else
20286 unpack = gen_vec_interleave_lowv8hi;
20287 break;
20288 case V4SImode:
20289 if (high_p)
20290 unpack = gen_vec_interleave_highv4si;
20291 else
20292 unpack = gen_vec_interleave_lowv4si;
20293 break;
20294 default:
20295 gcc_unreachable ();
20296 }
20297
20298 if (unsigned_p)
20299 tmp = force_reg (imode, CONST0_RTX (imode));
20300 else
20301 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20302 src, pc_rtx, pc_rtx);
20303
20304 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
20305 }
20306 }
20307
20308 /* Expand conditional increment or decrement using adc/sbb instructions.
20309 The default case, using setcc followed by a conditional move, can be
20310 done by generic code. */
20311 bool
20312 ix86_expand_int_addcc (rtx operands[])
20313 {
20314 enum rtx_code code = GET_CODE (operands[1]);
20315 rtx flags;
20316 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20317 rtx compare_op;
20318 rtx val = const0_rtx;
20319 bool fpcmp = false;
20320 enum machine_mode mode;
20321 rtx op0 = XEXP (operands[1], 0);
20322 rtx op1 = XEXP (operands[1], 1);
20323
20324 if (operands[3] != const1_rtx
20325 && operands[3] != constm1_rtx)
20326 return false;
20327 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20328 return false;
20329 code = GET_CODE (compare_op);
20330
20331 flags = XEXP (compare_op, 0);
20332
20333 if (GET_MODE (flags) == CCFPmode
20334 || GET_MODE (flags) == CCFPUmode)
20335 {
20336 fpcmp = true;
20337 code = ix86_fp_compare_code_to_integer (code);
20338 }
20339
20340 if (code != LTU)
20341 {
20342 val = constm1_rtx;
20343 if (fpcmp)
20344 PUT_CODE (compare_op,
20345 reverse_condition_maybe_unordered
20346 (GET_CODE (compare_op)));
20347 else
20348 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20349 }
20350
20351 mode = GET_MODE (operands[0]);
20352
20353 /* Construct either adc or sbb insn. */
20354 if ((code == LTU) == (operands[3] == constm1_rtx))
20355 {
20356 switch (mode)
20357 {
20358 case QImode:
20359 insn = gen_subqi3_carry;
20360 break;
20361 case HImode:
20362 insn = gen_subhi3_carry;
20363 break;
20364 case SImode:
20365 insn = gen_subsi3_carry;
20366 break;
20367 case DImode:
20368 insn = gen_subdi3_carry;
20369 break;
20370 default:
20371 gcc_unreachable ();
20372 }
20373 }
20374 else
20375 {
20376 switch (mode)
20377 {
20378 case QImode:
20379 insn = gen_addqi3_carry;
20380 break;
20381 case HImode:
20382 insn = gen_addhi3_carry;
20383 break;
20384 case SImode:
20385 insn = gen_addsi3_carry;
20386 break;
20387 case DImode:
20388 insn = gen_adddi3_carry;
20389 break;
20390 default:
20391 gcc_unreachable ();
20392 }
20393 }
20394 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20395
20396 return true;
20397 }
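/* Illustrative sketch of what the expander above arranges (assuming
   unsigned operands and Intel syntax; the exact comparison emitted is
   chosen by ix86_expand_carry_flag_compare, so this is the intent, not
   literal output):

       if (a < b) x++;    =>    cmp  a, b    ; CF = (a < b)
                                adc  x, 0    ; x += CF

       if (a < b) x--;    =>    cmp  a, b
                                sbb  x, 0    ; x -= CF                  */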
20398
20399
20400 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
20401 but works for floating point parameters and non-offsettable memories.
20402 For pushes, it returns just stack offsets; the values will be saved
20403 in the right order. At most four parts are generated. */
20404
20405 static int
20406 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20407 {
20408 int size;
20409
20410 if (!TARGET_64BIT)
20411 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20412 else
20413 size = (GET_MODE_SIZE (mode) + 4) / 8;
20414
20415 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20416 gcc_assert (size >= 2 && size <= 4);
20417
20418 /* Optimize constant pool references to immediates. This is used by fp
20419 moves, which force all constants to memory to allow combining. */
20420 if (MEM_P (operand) && MEM_READONLY_P (operand))
20421 {
20422 rtx tmp = maybe_get_pool_constant (operand);
20423 if (tmp)
20424 operand = tmp;
20425 }
20426
20427 if (MEM_P (operand) && !offsettable_memref_p (operand))
20428 {
20429 /* The only non-offsettable memories we handle are pushes. */
20430 int ok = push_operand (operand, VOIDmode);
20431
20432 gcc_assert (ok);
20433
20434 operand = copy_rtx (operand);
20435 PUT_MODE (operand, word_mode);
20436 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20437 return size;
20438 }
20439
20440 if (GET_CODE (operand) == CONST_VECTOR)
20441 {
20442 enum machine_mode imode = int_mode_for_mode (mode);
20443 /* Caution: if we looked through a constant pool memory above,
20444 the operand may actually have a different mode now. That's
20445 ok, since we want to pun this all the way back to an integer. */
20446 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20447 gcc_assert (operand != NULL);
20448 mode = imode;
20449 }
20450
20451 if (!TARGET_64BIT)
20452 {
20453 if (mode == DImode)
20454 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20455 else
20456 {
20457 int i;
20458
20459 if (REG_P (operand))
20460 {
20461 gcc_assert (reload_completed);
20462 for (i = 0; i < size; i++)
20463 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20464 }
20465 else if (offsettable_memref_p (operand))
20466 {
20467 operand = adjust_address (operand, SImode, 0);
20468 parts[0] = operand;
20469 for (i = 1; i < size; i++)
20470 parts[i] = adjust_address (operand, SImode, 4 * i);
20471 }
20472 else if (GET_CODE (operand) == CONST_DOUBLE)
20473 {
20474 REAL_VALUE_TYPE r;
20475 long l[4];
20476
20477 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20478 switch (mode)
20479 {
20480 case TFmode:
20481 real_to_target (l, &r, mode);
20482 parts[3] = gen_int_mode (l[3], SImode);
20483 parts[2] = gen_int_mode (l[2], SImode);
20484 break;
20485 case XFmode:
20486 REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
20487 parts[2] = gen_int_mode (l[2], SImode);
20488 break;
20489 case DFmode:
20490 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20491 break;
20492 default:
20493 gcc_unreachable ();
20494 }
20495 parts[1] = gen_int_mode (l[1], SImode);
20496 parts[0] = gen_int_mode (l[0], SImode);
20497 }
20498 else
20499 gcc_unreachable ();
20500 }
20501 }
20502 else
20503 {
20504 if (mode == TImode)
20505 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20506 if (mode == XFmode || mode == TFmode)
20507 {
20508 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
20509 if (REG_P (operand))
20510 {
20511 gcc_assert (reload_completed);
20512 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20513 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20514 }
20515 else if (offsettable_memref_p (operand))
20516 {
20517 operand = adjust_address (operand, DImode, 0);
20518 parts[0] = operand;
20519 parts[1] = adjust_address (operand, upper_mode, 8);
20520 }
20521 else if (GET_CODE (operand) == CONST_DOUBLE)
20522 {
20523 REAL_VALUE_TYPE r;
20524 long l[4];
20525
20526 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20527 real_to_target (l, &r, mode);
20528
20529 /* Do not use shift by 32 to avoid warning on 32bit systems. */
20530 if (HOST_BITS_PER_WIDE_INT >= 64)
20531 parts[0]
20532 = gen_int_mode
20533 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20534 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20535 DImode);
20536 else
20537 parts[0] = immed_double_const (l[0], l[1], DImode);
20538
20539 if (upper_mode == SImode)
20540 parts[1] = gen_int_mode (l[2], SImode);
20541 else if (HOST_BITS_PER_WIDE_INT >= 64)
20542 parts[1]
20543 = gen_int_mode
20544 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20545 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20546 DImode);
20547 else
20548 parts[1] = immed_double_const (l[2], l[3], DImode);
20549 }
20550 else
20551 gcc_unreachable ();
20552 }
20553 }
20554
20555 return size;
20556 }
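/* Illustrative example (assuming a little-endian 32-bit target; this is
   not something the function emits): splitting the DFmode constant 1.0,
   whose target image is 0x3ff0000000000000, yields
       parts[0] = 0x00000000 and parts[1] = 0x3ff00000
   as two SImode immediates; an XFmode value would yield three parts and
   a TFmode value four.  */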
20557
20558 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
20559 All required insns are emitted here. Operands 2 onwards receive the
20560 destination parts in the correct order; operands 6 onwards receive
20561 the corresponding source parts. */
20562
20563 void
20564 ix86_split_long_move (rtx operands[])
20565 {
20566 rtx part[2][4];
20567 int nparts, i, j;
20568 int push = 0;
20569 int collisions = 0;
20570 enum machine_mode mode = GET_MODE (operands[0]);
20571 bool collisionparts[4];
20572
20573 /* The DFmode expanders may ask us to move a double.
20574 For a 64-bit target this is a single move. By hiding that fact
20575 here we simplify the i386.md splitters. */
20576 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20577 {
20578 /* Optimize constant pool reference to immediates. This is used by
20579 fp moves, which force all constants to memory to allow combining. */
20580
20581 if (MEM_P (operands[1])
20582 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20583 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20584 operands[1] = get_pool_constant (XEXP (operands[1], 0));
20585 if (push_operand (operands[0], VOIDmode))
20586 {
20587 operands[0] = copy_rtx (operands[0]);
20588 PUT_MODE (operands[0], word_mode);
20589 }
20590 else
20591 operands[0] = gen_lowpart (DImode, operands[0]);
20592 operands[1] = gen_lowpart (DImode, operands[1]);
20593 emit_move_insn (operands[0], operands[1]);
20594 return;
20595 }
20596
20597 /* The only non-offsettable memory we handle is push. */
20598 if (push_operand (operands[0], VOIDmode))
20599 push = 1;
20600 else
20601 gcc_assert (!MEM_P (operands[0])
20602 || offsettable_memref_p (operands[0]));
20603
20604 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20605 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20606
20607 /* When emitting push, take care for source operands on the stack. */
20608 if (push && MEM_P (operands[1])
20609 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20610 {
20611 rtx src_base = XEXP (part[1][nparts - 1], 0);
20612
20613 /* Compensate for the stack decrement by 4. */
20614 if (!TARGET_64BIT && nparts == 3
20615 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
20616 src_base = plus_constant (Pmode, src_base, 4);
20617
20618 /* src_base refers to the stack pointer and is
20619 automatically decreased by emitted push. */
20620 for (i = 0; i < nparts; i++)
20621 part[1][i] = change_address (part[1][i],
20622 GET_MODE (part[1][i]), src_base);
20623 }
20624
20625 /* We need to do copy in the right order in case an address register
20626 of the source overlaps the destination. */
20627 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
20628 {
20629 rtx tmp;
20630
20631 for (i = 0; i < nparts; i++)
20632 {
20633 collisionparts[i]
20634 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
20635 if (collisionparts[i])
20636 collisions++;
20637 }
20638
20639 /* Collision in the middle part can be handled by reordering. */
20640 if (collisions == 1 && nparts == 3 && collisionparts [1])
20641 {
20642 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20643 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20644 }
20645 else if (collisions == 1
20646 && nparts == 4
20647 && (collisionparts [1] || collisionparts [2]))
20648 {
20649 if (collisionparts [1])
20650 {
20651 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20652 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20653 }
20654 else
20655 {
20656 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
20657 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
20658 }
20659 }
20660
20661 /* If there are more collisions, we can't handle it by reordering.
20662 Do an lea to the last part and use only one colliding move. */
20663 else if (collisions > 1)
20664 {
20665 rtx base;
20666
20667 collisions = 1;
20668
20669 base = part[0][nparts - 1];
20670
20671 /* Handle the case when the last part isn't valid for lea.
20672 This happens in 64-bit mode when storing the 12-byte XFmode. */
20673 if (GET_MODE (base) != Pmode)
20674 base = gen_rtx_REG (Pmode, REGNO (base));
20675
20676 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
20677 part[1][0] = replace_equiv_address (part[1][0], base);
20678 for (i = 1; i < nparts; i++)
20679 {
20680 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
20681 part[1][i] = replace_equiv_address (part[1][i], tmp);
20682 }
20683 }
20684 }
20685
20686 if (push)
20687 {
20688 if (!TARGET_64BIT)
20689 {
20690 if (nparts == 3)
20691 {
20692 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
20693 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
20694 stack_pointer_rtx, GEN_INT (-4)));
20695 emit_move_insn (part[0][2], part[1][2]);
20696 }
20697 else if (nparts == 4)
20698 {
20699 emit_move_insn (part[0][3], part[1][3]);
20700 emit_move_insn (part[0][2], part[1][2]);
20701 }
20702 }
20703 else
20704 {
20705 /* In 64-bit mode we don't have a 32-bit push available. If this is a
20706 register, that is OK - we will just use the larger counterpart. We also
20707 retype the memory - this comes from an attempt to avoid a REX prefix
20708 on moving the second half of a TFmode value. */
20709 if (GET_MODE (part[1][1]) == SImode)
20710 {
20711 switch (GET_CODE (part[1][1]))
20712 {
20713 case MEM:
20714 part[1][1] = adjust_address (part[1][1], DImode, 0);
20715 break;
20716
20717 case REG:
20718 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
20719 break;
20720
20721 default:
20722 gcc_unreachable ();
20723 }
20724
20725 if (GET_MODE (part[1][0]) == SImode)
20726 part[1][0] = part[1][1];
20727 }
20728 }
20729 emit_move_insn (part[0][1], part[1][1]);
20730 emit_move_insn (part[0][0], part[1][0]);
20731 return;
20732 }
20733
20734 /* Choose the correct order so we do not overwrite the source before it is copied. */
20735 if ((REG_P (part[0][0])
20736 && REG_P (part[1][1])
20737 && (REGNO (part[0][0]) == REGNO (part[1][1])
20738 || (nparts == 3
20739 && REGNO (part[0][0]) == REGNO (part[1][2]))
20740 || (nparts == 4
20741 && REGNO (part[0][0]) == REGNO (part[1][3]))))
20742 || (collisions > 0
20743 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
20744 {
20745 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
20746 {
20747 operands[2 + i] = part[0][j];
20748 operands[6 + i] = part[1][j];
20749 }
20750 }
20751 else
20752 {
20753 for (i = 0; i < nparts; i++)
20754 {
20755 operands[2 + i] = part[0][i];
20756 operands[6 + i] = part[1][i];
20757 }
20758 }
20759
20760 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
20761 if (optimize_insn_for_size_p ())
20762 {
20763 for (j = 0; j < nparts - 1; j++)
20764 if (CONST_INT_P (operands[6 + j])
20765 && operands[6 + j] != const0_rtx
20766 && REG_P (operands[2 + j]))
20767 for (i = j; i < nparts - 1; i++)
20768 if (CONST_INT_P (operands[7 + i])
20769 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
20770 operands[7 + i] = operands[2 + j];
20771 }
20772
20773 for (i = 0; i < nparts; i++)
20774 emit_move_insn (operands[2 + i], operands[6 + i]);
20775
20776 return;
20777 }
20778
20779 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
20780 left shift by a constant, either using a single shift or
20781 a sequence of add instructions. */
20782
20783 static void
20784 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
20785 {
20786 rtx (*insn)(rtx, rtx, rtx);
20787
20788 if (count == 1
20789 || (count * ix86_cost->add <= ix86_cost->shift_const
20790 && !optimize_insn_for_size_p ()))
20791 {
20792 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
20793 while (count-- > 0)
20794 emit_insn (insn (operand, operand, operand));
20795 }
20796 else
20797 {
20798 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20799 emit_insn (insn (operand, operand, GEN_INT (count)));
20800 }
20801 }
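/* For illustration (a sketch of the resulting assembly, not literal
   output): when two additions are cheaper than a shift by a constant
   and we are not optimizing for size, OPERAND << 2 is emitted as

       add  reg, reg
       add  reg, reg

   instead of a single  sal reg, 2.  */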
20802
20803 void
20804 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
20805 {
20806 rtx (*gen_ashl3)(rtx, rtx, rtx);
20807 rtx (*gen_shld)(rtx, rtx, rtx);
20808 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20809
20810 rtx low[2], high[2];
20811 int count;
20812
20813 if (CONST_INT_P (operands[2]))
20814 {
20815 split_double_mode (mode, operands, 2, low, high);
20816 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20817
20818 if (count >= half_width)
20819 {
20820 emit_move_insn (high[0], low[1]);
20821 emit_move_insn (low[0], const0_rtx);
20822
20823 if (count > half_width)
20824 ix86_expand_ashl_const (high[0], count - half_width, mode);
20825 }
20826 else
20827 {
20828 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20829
20830 if (!rtx_equal_p (operands[0], operands[1]))
20831 emit_move_insn (operands[0], operands[1]);
20832
20833 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
20834 ix86_expand_ashl_const (low[0], count, mode);
20835 }
20836 return;
20837 }
20838
20839 split_double_mode (mode, operands, 1, low, high);
20840
20841 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20842
20843 if (operands[1] == const1_rtx)
20844 {
20845 /* Assuming we've chosen QImode-capable registers, 1 << N
20846 can be done with two 32/64-bit shifts, no branches, no cmoves. */
20847 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
20848 {
20849 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
20850
20851 ix86_expand_clear (low[0]);
20852 ix86_expand_clear (high[0]);
20853 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
20854
20855 d = gen_lowpart (QImode, low[0]);
20856 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20857 s = gen_rtx_EQ (QImode, flags, const0_rtx);
20858 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20859
20860 d = gen_lowpart (QImode, high[0]);
20861 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20862 s = gen_rtx_NE (QImode, flags, const0_rtx);
20863 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20864 }
20865
20866 /* Otherwise, we can get the same results by manually performing
20867 a bit extract operation on bit 5/6, and then performing the two
20868 shifts. The two methods of getting 0/1 into low/high are exactly
20869 the same size. Avoiding the shift in the bit extract case helps
20870 pentium4 a bit; no one else seems to care much either way. */
20871 else
20872 {
20873 enum machine_mode half_mode;
20874 rtx (*gen_lshr3)(rtx, rtx, rtx);
20875 rtx (*gen_and3)(rtx, rtx, rtx);
20876 rtx (*gen_xor3)(rtx, rtx, rtx);
20877 HOST_WIDE_INT bits;
20878 rtx x;
20879
20880 if (mode == DImode)
20881 {
20882 half_mode = SImode;
20883 gen_lshr3 = gen_lshrsi3;
20884 gen_and3 = gen_andsi3;
20885 gen_xor3 = gen_xorsi3;
20886 bits = 5;
20887 }
20888 else
20889 {
20890 half_mode = DImode;
20891 gen_lshr3 = gen_lshrdi3;
20892 gen_and3 = gen_anddi3;
20893 gen_xor3 = gen_xordi3;
20894 bits = 6;
20895 }
20896
20897 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
20898 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
20899 else
20900 x = gen_lowpart (half_mode, operands[2]);
20901 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
20902
20903 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
20904 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
20905 emit_move_insn (low[0], high[0]);
20906 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
20907 }
20908
20909 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20910 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
20911 return;
20912 }
20913
20914 if (operands[1] == constm1_rtx)
20915 {
20916 /* For -1 << N, we can avoid the shld instruction, because we
20917 know that we're shifting 0...31/63 ones into a -1. */
20918 emit_move_insn (low[0], constm1_rtx);
20919 if (optimize_insn_for_size_p ())
20920 emit_move_insn (high[0], low[0]);
20921 else
20922 emit_move_insn (high[0], constm1_rtx);
20923 }
20924 else
20925 {
20926 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20927
20928 if (!rtx_equal_p (operands[0], operands[1]))
20929 emit_move_insn (operands[0], operands[1]);
20930
20931 split_double_mode (mode, operands, 1, low, high);
20932 emit_insn (gen_shld (high[0], low[0], operands[2]));
20933 }
20934
20935 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20936
20937 if (TARGET_CMOVE && scratch)
20938 {
20939 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20940 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20941
20942 ix86_expand_clear (scratch);
20943 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
20944 }
20945 else
20946 {
20947 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20948 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20949
20950 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
20951 }
20952 }
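/* Rough shape of the code emitted above for a variable 64-bit left shift
   on a 32-bit target (an illustrative sketch, Intel syntax, not literal
   output):

       shld high, low, cl    ; high = high << cl | low >> (32 - cl)
       sal  low, cl          ; low <<= cl
       ; then, if bit 5 of cl is set (shift count >= 32), fix up with
       ; either a cmov against a cleared scratch register or a branch:
       ;     high = low; low = 0;
       ; where LOW already holds the value shifted by cl & 31.           */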
20953
20954 void
20955 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
20956 {
20957 rtx (*gen_ashr3)(rtx, rtx, rtx)
20958 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
20959 rtx (*gen_shrd)(rtx, rtx, rtx);
20960 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20961
20962 rtx low[2], high[2];
20963 int count;
20964
20965 if (CONST_INT_P (operands[2]))
20966 {
20967 split_double_mode (mode, operands, 2, low, high);
20968 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20969
20970 if (count == GET_MODE_BITSIZE (mode) - 1)
20971 {
20972 emit_move_insn (high[0], high[1]);
20973 emit_insn (gen_ashr3 (high[0], high[0],
20974 GEN_INT (half_width - 1)));
20975 emit_move_insn (low[0], high[0]);
20976
20977 }
20978 else if (count >= half_width)
20979 {
20980 emit_move_insn (low[0], high[1]);
20981 emit_move_insn (high[0], low[0]);
20982 emit_insn (gen_ashr3 (high[0], high[0],
20983 GEN_INT (half_width - 1)));
20984
20985 if (count > half_width)
20986 emit_insn (gen_ashr3 (low[0], low[0],
20987 GEN_INT (count - half_width)));
20988 }
20989 else
20990 {
20991 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20992
20993 if (!rtx_equal_p (operands[0], operands[1]))
20994 emit_move_insn (operands[0], operands[1]);
20995
20996 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20997 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
20998 }
20999 }
21000 else
21001 {
21002 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21003
21004 if (!rtx_equal_p (operands[0], operands[1]))
21005 emit_move_insn (operands[0], operands[1]);
21006
21007 split_double_mode (mode, operands, 1, low, high);
21008
21009 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21010 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21011
21012 if (TARGET_CMOVE && scratch)
21013 {
21014 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21015 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21016
21017 emit_move_insn (scratch, high[0]);
21018 emit_insn (gen_ashr3 (scratch, scratch,
21019 GEN_INT (half_width - 1)));
21020 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21021 scratch));
21022 }
21023 else
21024 {
21025 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21026 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21027
21028 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
21029 }
21030 }
21031 }
21032
21033 void
21034 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21035 {
21036 rtx (*gen_lshr3)(rtx, rtx, rtx)
21037 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21038 rtx (*gen_shrd)(rtx, rtx, rtx);
21039 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21040
21041 rtx low[2], high[2];
21042 int count;
21043
21044 if (CONST_INT_P (operands[2]))
21045 {
21046 split_double_mode (mode, operands, 2, low, high);
21047 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21048
21049 if (count >= half_width)
21050 {
21051 emit_move_insn (low[0], high[1]);
21052 ix86_expand_clear (high[0]);
21053
21054 if (count > half_width)
21055 emit_insn (gen_lshr3 (low[0], low[0],
21056 GEN_INT (count - half_width)));
21057 }
21058 else
21059 {
21060 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21061
21062 if (!rtx_equal_p (operands[0], operands[1]))
21063 emit_move_insn (operands[0], operands[1]);
21064
21065 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21066 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21067 }
21068 }
21069 else
21070 {
21071 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21072
21073 if (!rtx_equal_p (operands[0], operands[1]))
21074 emit_move_insn (operands[0], operands[1]);
21075
21076 split_double_mode (mode, operands, 1, low, high);
21077
21078 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21079 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21080
21081 if (TARGET_CMOVE && scratch)
21082 {
21083 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21084 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21085
21086 ix86_expand_clear (scratch);
21087 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21088 scratch));
21089 }
21090 else
21091 {
21092 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21093 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21094
21095 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21096 }
21097 }
21098 }
21099
21100 /* Predict that the just-emitted jump instruction will be taken with probability PROB. */
21101 static void
21102 predict_jump (int prob)
21103 {
21104 rtx insn = get_last_insn ();
21105 gcc_assert (JUMP_P (insn));
21106 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21107 }
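/* A note on the scale used in calls to predict_jump: PROB is expressed
   relative to REG_BR_PROB_BASE, so for example
       predict_jump (REG_BR_PROB_BASE * 90 / 100);
   marks the preceding jump as taken with roughly 90% probability.  */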
21108
21109 /* Helper function for the string operations below. Test whether VARIABLE
21110 is aligned to VALUE bytes; if so, jump to the label that is returned. */
21111 static rtx
21112 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21113 {
21114 rtx label = gen_label_rtx ();
21115 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21116 if (GET_MODE (variable) == DImode)
21117 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21118 else
21119 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21120 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21121 1, label);
21122 if (epilogue)
21123 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21124 else
21125 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21126 return label;
21127 }
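/* Typical use (an illustrative sketch following the pattern used later in
   this file):

       rtx label = ix86_expand_aligntest (destptr, 4, false);
       ... code handling the case where bit 2 of DESTPTR is set ...
       emit_label (label);
       LABEL_NUSES (label) = 1;

   i.e. the code between the call and emit_label runs only when the tested
   alignment bit of DESTPTR is set, and is skipped otherwise.  */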
21128
21129 /* Decrease COUNTREG by VALUE. */
21130 static void
21131 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21132 {
21133 rtx (*gen_add)(rtx, rtx, rtx)
21134 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21135
21136 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21137 }
21138
21139 /* Zero-extend EXP, which may be in SImode, into a Pmode register. */
21140 rtx
21141 ix86_zero_extend_to_Pmode (rtx exp)
21142 {
21143 if (GET_MODE (exp) != Pmode)
21144 exp = convert_to_mode (Pmode, exp, 1);
21145 return force_reg (Pmode, exp);
21146 }
21147
21148 /* Divide COUNTREG by SCALE. */
21149 static rtx
21150 scale_counter (rtx countreg, int scale)
21151 {
21152 rtx sc;
21153
21154 if (scale == 1)
21155 return countreg;
21156 if (CONST_INT_P (countreg))
21157 return GEN_INT (INTVAL (countreg) / scale);
21158 gcc_assert (REG_P (countreg));
21159
21160 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21161 GEN_INT (exact_log2 (scale)),
21162 NULL, 1, OPTAB_DIRECT);
21163 return sc;
21164 }
21165
21166 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21167 DImode for constant loop counts. */
21168
21169 static enum machine_mode
21170 counter_mode (rtx count_exp)
21171 {
21172 if (GET_MODE (count_exp) != VOIDmode)
21173 return GET_MODE (count_exp);
21174 if (!CONST_INT_P (count_exp))
21175 return Pmode;
21176 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21177 return DImode;
21178 return SImode;
21179 }
21180
21181 /* When SRCPTR is non-NULL, output a simple loop that copies memory from
21182 SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall
21183 size is COUNT bytes. When SRCPTR is NULL, output an equivalent loop that
21184 sets the memory to VALUE (which is supposed to be in MODE).
21185
21186 The size is rounded down to a whole number of chunks moved at once.
21187 SRCMEM and DESTMEM provide the MEM rtxes to feed proper aliasing info. */
21188
21189
21190 static void
21191 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21192 rtx destptr, rtx srcptr, rtx value,
21193 rtx count, enum machine_mode mode, int unroll,
21194 int expected_size)
21195 {
21196 rtx out_label, top_label, iter, tmp;
21197 enum machine_mode iter_mode = counter_mode (count);
21198 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21199 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21200 rtx size;
21201 rtx x_addr;
21202 rtx y_addr;
21203 int i;
21204
21205 top_label = gen_label_rtx ();
21206 out_label = gen_label_rtx ();
21207 iter = gen_reg_rtx (iter_mode);
21208
21209 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21210 NULL, 1, OPTAB_DIRECT);
21211 /* Those two should combine. */
21212 if (piece_size == const1_rtx)
21213 {
21214 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21215 true, out_label);
21216 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21217 }
21218 emit_move_insn (iter, const0_rtx);
21219
21220 emit_label (top_label);
21221
21222 tmp = convert_modes (Pmode, iter_mode, iter, true);
21223 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21224 destmem = change_address (destmem, mode, x_addr);
21225
21226 if (srcmem)
21227 {
21228 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21229 srcmem = change_address (srcmem, mode, y_addr);
21230
21231 /* When unrolling for chips that reorder memory reads and writes,
21232 we can save registers by using a single temporary.
21233 Using 4 temporaries is also overkill in 32-bit mode. */
21234 if (!TARGET_64BIT && 0)
21235 {
21236 for (i = 0; i < unroll; i++)
21237 {
21238 if (i)
21239 {
21240 destmem =
21241 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21242 srcmem =
21243 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21244 }
21245 emit_move_insn (destmem, srcmem);
21246 }
21247 }
21248 else
21249 {
21250 rtx tmpreg[4];
21251 gcc_assert (unroll <= 4);
21252 for (i = 0; i < unroll; i++)
21253 {
21254 tmpreg[i] = gen_reg_rtx (mode);
21255 if (i)
21256 {
21257 srcmem =
21258 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21259 }
21260 emit_move_insn (tmpreg[i], srcmem);
21261 }
21262 for (i = 0; i < unroll; i++)
21263 {
21264 if (i)
21265 {
21266 destmem =
21267 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21268 }
21269 emit_move_insn (destmem, tmpreg[i]);
21270 }
21271 }
21272 }
21273 else
21274 for (i = 0; i < unroll; i++)
21275 {
21276 if (i)
21277 destmem =
21278 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21279 emit_move_insn (destmem, value);
21280 }
21281
21282 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21283 true, OPTAB_LIB_WIDEN);
21284 if (tmp != iter)
21285 emit_move_insn (iter, tmp);
21286
21287 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21288 true, top_label);
21289 if (expected_size != -1)
21290 {
21291 expected_size /= GET_MODE_SIZE (mode) * unroll;
21292 if (expected_size == 0)
21293 predict_jump (0);
21294 else if (expected_size > REG_BR_PROB_BASE)
21295 predict_jump (REG_BR_PROB_BASE - 1);
21296 else
21297 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21298 }
21299 else
21300 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21301 iter = ix86_zero_extend_to_Pmode (iter);
21302 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21303 true, OPTAB_LIB_WIDEN);
21304 if (tmp != destptr)
21305 emit_move_insn (destptr, tmp);
21306 if (srcptr)
21307 {
21308 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21309 true, OPTAB_LIB_WIDEN);
21310 if (tmp != srcptr)
21311 emit_move_insn (srcptr, tmp);
21312 }
21313 emit_label (out_label);
21314 }
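/* The generated loop has roughly this shape, shown here for a copy with
   MODE == SImode and UNROLL == 4 (an illustrative sketch only; the loads
   are actually staged through temporary registers before the stores):

       size = count & ~15;
       iter = 0;
     top:
       *(int *) (dest + iter)      = *(int *) (src + iter);
       *(int *) (dest + iter + 4)  = *(int *) (src + iter + 4);
       *(int *) (dest + iter + 8)  = *(int *) (src + iter + 8);
       *(int *) (dest + iter + 12) = *(int *) (src + iter + 12);
       iter += 16;
       if (iter < size) goto top;
       dest += iter; src += iter;
     out: ;                                                              */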
21315
21316 /* Output a "rep; mov" instruction.
21317 Arguments have the same meaning as for the previous function. */
21318 static void
21319 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21320 rtx destptr, rtx srcptr,
21321 rtx count,
21322 enum machine_mode mode)
21323 {
21324 rtx destexp;
21325 rtx srcexp;
21326 rtx countreg;
21327 HOST_WIDE_INT rounded_count;
21328
21329 /* If the size is known and a multiple of 4, it is shorter to use 4-byte rep movs. */
21330 if (mode == QImode && CONST_INT_P (count)
21331 && !(INTVAL (count) & 3))
21332 mode = SImode;
21333
21334 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21335 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21336 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21337 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21338 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21339 if (mode != QImode)
21340 {
21341 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21342 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21343 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21344 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21345 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21346 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21347 }
21348 else
21349 {
21350 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21351 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21352 }
21353 if (CONST_INT_P (count))
21354 {
21355 rounded_count = (INTVAL (count)
21356 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21357 destmem = shallow_copy_rtx (destmem);
21358 srcmem = shallow_copy_rtx (srcmem);
21359 set_mem_size (destmem, rounded_count);
21360 set_mem_size (srcmem, rounded_count);
21361 }
21362 else
21363 {
21364 if (MEM_SIZE_KNOWN_P (destmem))
21365 clear_mem_size (destmem);
21366 if (MEM_SIZE_KNOWN_P (srcmem))
21367 clear_mem_size (srcmem);
21368 }
21369 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21370 destexp, srcexp));
21371 }
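/* For illustration (a sketch of the intent, not literal output): with
   MODE == SImode the code above scales the byte count down by 4 into the
   count register and emits a single
       rep movsd
   with the usual implicit source/destination pointer registers; with
   MODE == QImode it emits  rep movsb  on the raw byte count.  */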
21372
21373 /* Output a "rep; stos" instruction.
21374 Arguments have the same meaning as for the previous function. */
21375 static void
21376 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21377 rtx count, enum machine_mode mode,
21378 rtx orig_value)
21379 {
21380 rtx destexp;
21381 rtx countreg;
21382 HOST_WIDE_INT rounded_count;
21383
21384 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21385 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21386 value = force_reg (mode, gen_lowpart (mode, value));
21387 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21388 if (mode != QImode)
21389 {
21390 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21391 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21392 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21393 }
21394 else
21395 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21396 if (orig_value == const0_rtx && CONST_INT_P (count))
21397 {
21398 rounded_count = (INTVAL (count)
21399 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21400 destmem = shallow_copy_rtx (destmem);
21401 set_mem_size (destmem, rounded_count);
21402 }
21403 else if (MEM_SIZE_KNOWN_P (destmem))
21404 clear_mem_size (destmem);
21405 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21406 }
21407
21408 static void
21409 emit_strmov (rtx destmem, rtx srcmem,
21410 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21411 {
21412 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21413 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21414 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21415 }
21416
21417 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
21418 static void
21419 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21420 rtx destptr, rtx srcptr, rtx count, int max_size)
21421 {
21422 rtx src, dest;
21423 if (CONST_INT_P (count))
21424 {
21425 HOST_WIDE_INT countval = INTVAL (count);
21426 int offset = 0;
21427
21428 if ((countval & 0x10) && max_size > 16)
21429 {
21430 if (TARGET_64BIT)
21431 {
21432 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21433 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21434 }
21435 else
21436 gcc_unreachable ();
21437 offset += 16;
21438 }
21439 if ((countval & 0x08) && max_size > 8)
21440 {
21441 if (TARGET_64BIT)
21442 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21443 else
21444 {
21445 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21446 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21447 }
21448 offset += 8;
21449 }
21450 if ((countval & 0x04) && max_size > 4)
21451 {
21452 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21453 offset += 4;
21454 }
21455 if ((countval & 0x02) && max_size > 2)
21456 {
21457 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21458 offset += 2;
21459 }
21460 if ((countval & 0x01) && max_size > 1)
21461 {
21462 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21463 offset += 1;
21464 }
21465 return;
21466 }
21467 if (max_size > 8)
21468 {
21469 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21470 count, 1, OPTAB_DIRECT);
21471 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21472 count, QImode, 1, 4);
21473 return;
21474 }
21475
21476 /* When single stringop instructions are available, we can cheaply advance
21477 the dest and src pointers. Otherwise we save code size by maintaining an
21478 offset (zero is readily available from the preceding rep operation) and
21479 using x86 addressing modes. */
21480 if (TARGET_SINGLE_STRINGOP)
21481 {
21482 if (max_size > 4)
21483 {
21484 rtx label = ix86_expand_aligntest (count, 4, true);
21485 src = change_address (srcmem, SImode, srcptr);
21486 dest = change_address (destmem, SImode, destptr);
21487 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21488 emit_label (label);
21489 LABEL_NUSES (label) = 1;
21490 }
21491 if (max_size > 2)
21492 {
21493 rtx label = ix86_expand_aligntest (count, 2, true);
21494 src = change_address (srcmem, HImode, srcptr);
21495 dest = change_address (destmem, HImode, destptr);
21496 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21497 emit_label (label);
21498 LABEL_NUSES (label) = 1;
21499 }
21500 if (max_size > 1)
21501 {
21502 rtx label = ix86_expand_aligntest (count, 1, true);
21503 src = change_address (srcmem, QImode, srcptr);
21504 dest = change_address (destmem, QImode, destptr);
21505 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21506 emit_label (label);
21507 LABEL_NUSES (label) = 1;
21508 }
21509 }
21510 else
21511 {
21512 rtx offset = force_reg (Pmode, const0_rtx);
21513 rtx tmp;
21514
21515 if (max_size > 4)
21516 {
21517 rtx label = ix86_expand_aligntest (count, 4, true);
21518 src = change_address (srcmem, SImode, srcptr);
21519 dest = change_address (destmem, SImode, destptr);
21520 emit_move_insn (dest, src);
21521 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21522 true, OPTAB_LIB_WIDEN);
21523 if (tmp != offset)
21524 emit_move_insn (offset, tmp);
21525 emit_label (label);
21526 LABEL_NUSES (label) = 1;
21527 }
21528 if (max_size > 2)
21529 {
21530 rtx label = ix86_expand_aligntest (count, 2, true);
21531 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21532 src = change_address (srcmem, HImode, tmp);
21533 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21534 dest = change_address (destmem, HImode, tmp);
21535 emit_move_insn (dest, src);
21536 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21537 true, OPTAB_LIB_WIDEN);
21538 if (tmp != offset)
21539 emit_move_insn (offset, tmp);
21540 emit_label (label);
21541 LABEL_NUSES (label) = 1;
21542 }
21543 if (max_size > 1)
21544 {
21545 rtx label = ix86_expand_aligntest (count, 1, true);
21546 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21547 src = change_address (srcmem, QImode, tmp);
21548 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21549 dest = change_address (destmem, QImode, tmp);
21550 emit_move_insn (dest, src);
21551 emit_label (label);
21552 LABEL_NUSES (label) = 1;
21553 }
21554 }
21555 }
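/* Illustrative example of the constant-count path above (assuming a
   64-bit target): a residual count of 13 (binary 1101) emits one 8-byte
   move, one 4-byte move and one 1-byte move, at offsets 0, 8 and 12
   respectively; no loop or branches are needed.  */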
21556
21557 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21558 static void
21559 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21560 rtx count, int max_size)
21561 {
21562 count =
21563 expand_simple_binop (counter_mode (count), AND, count,
21564 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21565 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21566 gen_lowpart (QImode, value), count, QImode,
21567 1, max_size / 2);
21568 }
21569
21570 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21571 static void
21572 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
21573 {
21574 rtx dest;
21575
21576 if (CONST_INT_P (count))
21577 {
21578 HOST_WIDE_INT countval = INTVAL (count);
21579 int offset = 0;
21580
21581 if ((countval & 0x10) && max_size > 16)
21582 {
21583 if (TARGET_64BIT)
21584 {
21585 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21586 emit_insn (gen_strset (destptr, dest, value));
21587 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
21588 emit_insn (gen_strset (destptr, dest, value));
21589 }
21590 else
21591 gcc_unreachable ();
21592 offset += 16;
21593 }
21594 if ((countval & 0x08) && max_size > 8)
21595 {
21596 if (TARGET_64BIT)
21597 {
21598 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21599 emit_insn (gen_strset (destptr, dest, value));
21600 }
21601 else
21602 {
21603 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21604 emit_insn (gen_strset (destptr, dest, value));
21605 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
21606 emit_insn (gen_strset (destptr, dest, value));
21607 }
21608 offset += 8;
21609 }
21610 if ((countval & 0x04) && max_size > 4)
21611 {
21612 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21613 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21614 offset += 4;
21615 }
21616 if ((countval & 0x02) && max_size > 2)
21617 {
21618 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
21619 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21620 offset += 2;
21621 }
21622 if ((countval & 0x01) && max_size > 1)
21623 {
21624 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
21625 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21626 offset += 1;
21627 }
21628 return;
21629 }
21630 if (max_size > 32)
21631 {
21632 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
21633 return;
21634 }
21635 if (max_size > 16)
21636 {
21637 rtx label = ix86_expand_aligntest (count, 16, true);
21638 if (TARGET_64BIT)
21639 {
21640 dest = change_address (destmem, DImode, destptr);
21641 emit_insn (gen_strset (destptr, dest, value));
21642 emit_insn (gen_strset (destptr, dest, value));
21643 }
21644 else
21645 {
21646 dest = change_address (destmem, SImode, destptr);
21647 emit_insn (gen_strset (destptr, dest, value));
21648 emit_insn (gen_strset (destptr, dest, value));
21649 emit_insn (gen_strset (destptr, dest, value));
21650 emit_insn (gen_strset (destptr, dest, value));
21651 }
21652 emit_label (label);
21653 LABEL_NUSES (label) = 1;
21654 }
21655 if (max_size > 8)
21656 {
21657 rtx label = ix86_expand_aligntest (count, 8, true);
21658 if (TARGET_64BIT)
21659 {
21660 dest = change_address (destmem, DImode, destptr);
21661 emit_insn (gen_strset (destptr, dest, value));
21662 }
21663 else
21664 {
21665 dest = change_address (destmem, SImode, destptr);
21666 emit_insn (gen_strset (destptr, dest, value));
21667 emit_insn (gen_strset (destptr, dest, value));
21668 }
21669 emit_label (label);
21670 LABEL_NUSES (label) = 1;
21671 }
21672 if (max_size > 4)
21673 {
21674 rtx label = ix86_expand_aligntest (count, 4, true);
21675 dest = change_address (destmem, SImode, destptr);
21676 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21677 emit_label (label);
21678 LABEL_NUSES (label) = 1;
21679 }
21680 if (max_size > 2)
21681 {
21682 rtx label = ix86_expand_aligntest (count, 2, true);
21683 dest = change_address (destmem, HImode, destptr);
21684 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21685 emit_label (label);
21686 LABEL_NUSES (label) = 1;
21687 }
21688 if (max_size > 1)
21689 {
21690 rtx label = ix86_expand_aligntest (count, 1, true);
21691 dest = change_address (destmem, QImode, destptr);
21692 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21693 emit_label (label);
21694 LABEL_NUSES (label) = 1;
21695 }
21696 }
21697
21698 /* Copy enough bytes from SRC to DEST to align DEST, known to be aligned
21699 to ALIGN bytes, to DESIRED_ALIGNMENT. */
21700 static void
21701 expand_movmem_prologue (rtx destmem, rtx srcmem,
21702 rtx destptr, rtx srcptr, rtx count,
21703 int align, int desired_alignment)
21704 {
21705 if (align <= 1 && desired_alignment > 1)
21706 {
21707 rtx label = ix86_expand_aligntest (destptr, 1, false);
21708 srcmem = change_address (srcmem, QImode, srcptr);
21709 destmem = change_address (destmem, QImode, destptr);
21710 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21711 ix86_adjust_counter (count, 1);
21712 emit_label (label);
21713 LABEL_NUSES (label) = 1;
21714 }
21715 if (align <= 2 && desired_alignment > 2)
21716 {
21717 rtx label = ix86_expand_aligntest (destptr, 2, false);
21718 srcmem = change_address (srcmem, HImode, srcptr);
21719 destmem = change_address (destmem, HImode, destptr);
21720 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21721 ix86_adjust_counter (count, 2);
21722 emit_label (label);
21723 LABEL_NUSES (label) = 1;
21724 }
21725 if (align <= 4 && desired_alignment > 4)
21726 {
21727 rtx label = ix86_expand_aligntest (destptr, 4, false);
21728 srcmem = change_address (srcmem, SImode, srcptr);
21729 destmem = change_address (destmem, SImode, destptr);
21730 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21731 ix86_adjust_counter (count, 4);
21732 emit_label (label);
21733 LABEL_NUSES (label) = 1;
21734 }
21735 gcc_assert (desired_alignment <= 8);
21736 }
21737
21738 /* Copy enough bytes from SRC to DST to align DST to DESIRED_ALIGN.
21739 ALIGN_BYTES is how many bytes need to be copied to do so. */
21740 static rtx
21741 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
21742 int desired_align, int align_bytes)
21743 {
21744 rtx src = *srcp;
21745 rtx orig_dst = dst;
21746 rtx orig_src = src;
21747 int off = 0;
21748 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
21749 if (src_align_bytes >= 0)
21750 src_align_bytes = desired_align - src_align_bytes;
21751 if (align_bytes & 1)
21752 {
21753 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21754 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
21755 off = 1;
21756 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21757 }
21758 if (align_bytes & 2)
21759 {
21760 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21761 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
21762 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21763 set_mem_align (dst, 2 * BITS_PER_UNIT);
21764 if (src_align_bytes >= 0
21765 && (src_align_bytes & 1) == (align_bytes & 1)
21766 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
21767 set_mem_align (src, 2 * BITS_PER_UNIT);
21768 off = 2;
21769 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21770 }
21771 if (align_bytes & 4)
21772 {
21773 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21774 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
21775 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21776 set_mem_align (dst, 4 * BITS_PER_UNIT);
21777 if (src_align_bytes >= 0)
21778 {
21779 unsigned int src_align = 0;
21780 if ((src_align_bytes & 3) == (align_bytes & 3))
21781 src_align = 4;
21782 else if ((src_align_bytes & 1) == (align_bytes & 1))
21783 src_align = 2;
21784 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21785 set_mem_align (src, src_align * BITS_PER_UNIT);
21786 }
21787 off = 4;
21788 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21789 }
21790 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21791 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
21792 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21793 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21794 if (src_align_bytes >= 0)
21795 {
21796 unsigned int src_align = 0;
21797 if ((src_align_bytes & 7) == (align_bytes & 7))
21798 src_align = 8;
21799 else if ((src_align_bytes & 3) == (align_bytes & 3))
21800 src_align = 4;
21801 else if ((src_align_bytes & 1) == (align_bytes & 1))
21802 src_align = 2;
21803 if (src_align > (unsigned int) desired_align)
21804 src_align = desired_align;
21805 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21806 set_mem_align (src, src_align * BITS_PER_UNIT);
21807 }
21808 if (MEM_SIZE_KNOWN_P (orig_dst))
21809 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21810 if (MEM_SIZE_KNOWN_P (orig_src))
21811 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
21812 *srcp = src;
21813 return dst;
21814 }
21815
21816 /* Store enough bytes at DEST to align DEST, known to be aligned to ALIGN
21817 bytes, to DESIRED_ALIGNMENT. */
21818 static void
21819 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
21820 int align, int desired_alignment)
21821 {
21822 if (align <= 1 && desired_alignment > 1)
21823 {
21824 rtx label = ix86_expand_aligntest (destptr, 1, false);
21825 destmem = change_address (destmem, QImode, destptr);
21826 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
21827 ix86_adjust_counter (count, 1);
21828 emit_label (label);
21829 LABEL_NUSES (label) = 1;
21830 }
21831 if (align <= 2 && desired_alignment > 2)
21832 {
21833 rtx label = ix86_expand_aligntest (destptr, 2, false);
21834 destmem = change_address (destmem, HImode, destptr);
21835 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
21836 ix86_adjust_counter (count, 2);
21837 emit_label (label);
21838 LABEL_NUSES (label) = 1;
21839 }
21840 if (align <= 4 && desired_alignment > 4)
21841 {
21842 rtx label = ix86_expand_aligntest (destptr, 4, false);
21843 destmem = change_address (destmem, SImode, destptr);
21844 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
21845 ix86_adjust_counter (count, 4);
21846 emit_label (label);
21847 LABEL_NUSES (label) = 1;
21848 }
21849 gcc_assert (desired_alignment <= 8);
21850 }
21851
21852 /* Store enough of VALUE into DST, known to be aligned to ALIGN, to make it
21853    aligned to DESIRED_ALIGN.  ALIGN_BYTES is how many bytes need to be stored.  */
21854 static rtx
21855 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
21856 int desired_align, int align_bytes)
21857 {
21858 int off = 0;
21859 rtx orig_dst = dst;
21860 if (align_bytes & 1)
21861 {
21862 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21863 off = 1;
21864 emit_insn (gen_strset (destreg, dst,
21865 gen_lowpart (QImode, value)));
21866 }
21867 if (align_bytes & 2)
21868 {
21869 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21870 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21871 set_mem_align (dst, 2 * BITS_PER_UNIT);
21872 off = 2;
21873 emit_insn (gen_strset (destreg, dst,
21874 gen_lowpart (HImode, value)));
21875 }
21876 if (align_bytes & 4)
21877 {
21878 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21879 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21880 set_mem_align (dst, 4 * BITS_PER_UNIT);
21881 off = 4;
21882 emit_insn (gen_strset (destreg, dst,
21883 gen_lowpart (SImode, value)));
21884 }
21885 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21886 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21887 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21888 if (MEM_SIZE_KNOWN_P (orig_dst))
21889 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21890 return dst;
21891 }
21892
21893 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
21894 static enum stringop_alg
21895 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
21896 int *dynamic_check)
21897 {
21898 const struct stringop_algs * algs;
21899 bool optimize_for_speed;
21900 /* Algorithms using the rep prefix want at least edi and ecx;
21901 additionally, memset wants eax and memcpy wants esi. Don't
21902 consider such algorithms if the user has appropriated those
21903 registers for their own purposes. */
21904 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
21905 || (memset
21906 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
21907
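/* ALG is usable unless it is one of the rep-prefix algorithms and the
   registers those algorithms require have been fixed by the user.  */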
21908 #define ALG_USABLE_P(alg) (rep_prefix_usable \
21909 || (alg != rep_prefix_1_byte \
21910 && alg != rep_prefix_4_byte \
21911 && alg != rep_prefix_8_byte))
21912 const struct processor_costs *cost;
21913
21914 /* Even if the string operation call is cold, we still might spend a lot
21915 of time processing large blocks. */
21916 if (optimize_function_for_size_p (cfun)
21917 || (optimize_insn_for_size_p ()
21918 && expected_size != -1 && expected_size < 256))
21919 optimize_for_speed = false;
21920 else
21921 optimize_for_speed = true;
21922
21923 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
21924
21925 *dynamic_check = -1;
21926 if (memset)
21927 algs = &cost->memset[TARGET_64BIT != 0];
21928 else
21929 algs = &cost->memcpy[TARGET_64BIT != 0];
21930 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
21931 return ix86_stringop_alg;
21932 /* rep; movq or rep; movl is the smallest variant. */
21933 else if (!optimize_for_speed)
21934 {
21935 if (!count || (count & 3))
21936 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
21937 else
21938 return rep_prefix_usable ? rep_prefix_4_byte : loop;
21939 }
21940   /* Very tiny blocks are best handled via the loop; the REP prefix is
21941      expensive to set up.  */
21942 else if (expected_size != -1 && expected_size < 4)
21943 return loop_1_byte;
21944 else if (expected_size != -1)
21945 {
21946 unsigned int i;
21947 enum stringop_alg alg = libcall;
21948 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21949 {
21950 /* We get here if the algorithms that were not libcall-based
21951 were rep-prefix based and we are unable to use rep prefixes
21952 based on global register usage. Break out of the loop and
21953 use the heuristic below. */
21954 if (algs->size[i].max == 0)
21955 break;
21956 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
21957 {
21958 enum stringop_alg candidate = algs->size[i].alg;
21959
21960 if (candidate != libcall && ALG_USABLE_P (candidate))
21961 alg = candidate;
21962 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
21963 last non-libcall inline algorithm. */
21964 if (TARGET_INLINE_ALL_STRINGOPS)
21965 {
21966 	      /* When the current size is best copied by a libcall, but we are
21967 		 still forced to inline, run the heuristic below that picks code
21968 		 for medium-sized blocks.  */
21969 if (alg != libcall)
21970 return alg;
21971 break;
21972 }
21973 else if (ALG_USABLE_P (candidate))
21974 return candidate;
21975 }
21976 }
21977 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
21978 }
21979   /* When asked to inline the call anyway, try to pick a meaningful choice.
21980      We look for the maximal size of a block that is faster to copy by hand
21981      and take blocks of at most that size, guessing that the average size
21982      will be roughly half of the maximal size.
21983
21984      If this turns out to be bad, we might simply specify the preferred
21985      choice in ix86_costs.  */
21986 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21987 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
21988 {
21989 int max = -1;
21990 enum stringop_alg alg;
21991 int i;
21992 bool any_alg_usable_p = true;
21993
21994 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21995 {
21996 enum stringop_alg candidate = algs->size[i].alg;
21997 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
21998
21999 if (candidate != libcall && candidate
22000 && ALG_USABLE_P (candidate))
22001 max = algs->size[i].max;
22002 }
22003 /* If there aren't any usable algorithms, then recursing on
22004 smaller sizes isn't going to find anything. Just return the
22005 simple byte-at-a-time copy loop. */
22006 if (!any_alg_usable_p)
22007 {
22008 /* Pick something reasonable. */
22009 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22010 *dynamic_check = 128;
22011 return loop_1_byte;
22012 }
22013 if (max == -1)
22014 max = 4096;
22015 alg = decide_alg (count, max / 2, memset, dynamic_check);
22016 gcc_assert (*dynamic_check == -1);
22017 gcc_assert (alg != libcall);
22018 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22019 *dynamic_check = max;
22020 return alg;
22021 }
22022 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22023 #undef ALG_USABLE_P
22024 }
22025
22026 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22027 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22028 static int
22029 decide_alignment (int align,
22030 enum stringop_alg alg,
22031 int expected_size)
22032 {
22033 int desired_align = 0;
22034 switch (alg)
22035 {
22036 case no_stringop:
22037 gcc_unreachable ();
22038 case loop:
22039 case unrolled_loop:
22040 desired_align = GET_MODE_SIZE (Pmode);
22041 break;
22042 case rep_prefix_8_byte:
22043 desired_align = 8;
22044 break;
22045 case rep_prefix_4_byte:
22046       /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
22047 	 copying a whole cache line at once.  */
22048 if (TARGET_PENTIUMPRO)
22049 desired_align = 8;
22050 else
22051 desired_align = 4;
22052 break;
22053 case rep_prefix_1_byte:
22054       /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
22055 	 copying a whole cache line at once.  */
22056 if (TARGET_PENTIUMPRO)
22057 desired_align = 8;
22058 else
22059 desired_align = 1;
22060 break;
22061 case loop_1_byte:
22062 desired_align = 1;
22063 break;
22064 case libcall:
22065 return 0;
22066 }
22067
22068 if (optimize_size)
22069 desired_align = 1;
22070 if (desired_align < align)
22071 desired_align = align;
22072 if (expected_size != -1 && expected_size < 4)
22073 desired_align = align;
22074 return desired_align;
22075 }
22076
22077 /* Return the smallest power of 2 greater than VAL. */
22078 static int
22079 smallest_pow2_greater_than (int val)
22080 {
22081 int ret = 1;
22082 while (ret <= val)
22083 ret <<= 1;
22084 return ret;
22085 }
22086
22087 /* Expand string move (memcpy) operation. Use i386 string operations
22088 when profitable. expand_setmem contains similar code. The code
22089 depends upon architecture, block size and alignment, but always has
22090 the same overall structure:
22091
22092 1) Prologue guard: Conditional that jumps up to epilogues for small
22093 blocks that can be handled by epilogue alone. This is faster
22094 	 but also needed for correctness, since the prologue assumes the block
22095 	 is larger than the desired alignment.
22096
22097 Optional dynamic check for size and libcall for large
22098 blocks is emitted here too, with -minline-stringops-dynamically.
22099
22100 2) Prologue: copy first few bytes in order to get destination
22101 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22102 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22103 copied. We emit either a jump tree on power of two sized
22104 blocks, or a byte loop.
22105
22106 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22107 with specified algorithm.
22108
22109 4) Epilogue: code copying tail of the block that is too small to be
22110 handled by main body (or up to size guarded by prologue guard). */
22111
22112 bool
22113 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22114 rtx expected_align_exp, rtx expected_size_exp)
22115 {
22116 rtx destreg;
22117 rtx srcreg;
22118 rtx label = NULL;
22119 rtx tmp;
22120 rtx jump_around_label = NULL;
22121 HOST_WIDE_INT align = 1;
22122 unsigned HOST_WIDE_INT count = 0;
22123 HOST_WIDE_INT expected_size = -1;
22124 int size_needed = 0, epilogue_size_needed;
22125 int desired_align = 0, align_bytes = 0;
22126 enum stringop_alg alg;
22127 int dynamic_check;
22128 bool need_zero_guard = false;
22129
22130 if (CONST_INT_P (align_exp))
22131 align = INTVAL (align_exp);
22132   /* i386 can do misaligned access at a reasonably increased cost.  */
22133 if (CONST_INT_P (expected_align_exp)
22134 && INTVAL (expected_align_exp) > align)
22135 align = INTVAL (expected_align_exp);
22136 /* ALIGN is the minimum of destination and source alignment, but we care here
22137 just about destination alignment. */
22138 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22139 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22140
22141 if (CONST_INT_P (count_exp))
22142 count = expected_size = INTVAL (count_exp);
22143 if (CONST_INT_P (expected_size_exp) && count == 0)
22144 expected_size = INTVAL (expected_size_exp);
22145
22146 /* Make sure we don't need to care about overflow later on. */
22147 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22148 return false;
22149
22150 /* Step 0: Decide on preferred algorithm, desired alignment and
22151 size of chunks to be copied by main loop. */
22152
22153 alg = decide_alg (count, expected_size, false, &dynamic_check);
22154 desired_align = decide_alignment (align, alg, expected_size);
22155
22156 if (!TARGET_ALIGN_STRINGOPS)
22157 align = desired_align;
22158
22159 if (alg == libcall)
22160 return false;
22161 gcc_assert (alg != no_stringop);
22162 if (!count)
22163 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22164 destreg = copy_addr_to_reg (XEXP (dst, 0));
22165 srcreg = copy_addr_to_reg (XEXP (src, 0));
22166 switch (alg)
22167 {
22168 case libcall:
22169 case no_stringop:
22170 gcc_unreachable ();
22171 case loop:
22172 need_zero_guard = true;
22173 size_needed = GET_MODE_SIZE (word_mode);
22174 break;
22175 case unrolled_loop:
22176 need_zero_guard = true;
22177 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22178 break;
22179 case rep_prefix_8_byte:
22180 size_needed = 8;
22181 break;
22182 case rep_prefix_4_byte:
22183 size_needed = 4;
22184 break;
22185 case rep_prefix_1_byte:
22186 size_needed = 1;
22187 break;
22188 case loop_1_byte:
22189 need_zero_guard = true;
22190 size_needed = 1;
22191 break;
22192 }
22193
22194 epilogue_size_needed = size_needed;
22195
22196 /* Step 1: Prologue guard. */
22197
22198   /* The alignment code needs the count to be in a register.  */
22199 if (CONST_INT_P (count_exp) && desired_align > align)
22200 {
22201 if (INTVAL (count_exp) > desired_align
22202 && INTVAL (count_exp) > size_needed)
22203 {
22204 align_bytes
22205 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22206 if (align_bytes <= 0)
22207 align_bytes = 0;
22208 else
22209 align_bytes = desired_align - align_bytes;
22210 }
22211 if (align_bytes == 0)
22212 count_exp = force_reg (counter_mode (count_exp), count_exp);
22213 }
22214 gcc_assert (desired_align >= 1 && align >= 1);
22215
22216 /* Ensure that alignment prologue won't copy past end of block. */
22217 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22218 {
22219 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22220       /* The epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22221 	 bytes.  Make sure it is a power of 2.  */
22222 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22223
22224 if (count)
22225 {
22226 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22227 {
22228 /* If main algorithm works on QImode, no epilogue is needed.
22229 For small sizes just don't align anything. */
22230 if (size_needed == 1)
22231 desired_align = align;
22232 else
22233 goto epilogue;
22234 }
22235 }
22236 else
22237 {
22238 label = gen_label_rtx ();
22239 emit_cmp_and_jump_insns (count_exp,
22240 GEN_INT (epilogue_size_needed),
22241 LTU, 0, counter_mode (count_exp), 1, label);
22242 if (expected_size == -1 || expected_size < epilogue_size_needed)
22243 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22244 else
22245 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22246 }
22247 }
22248
22249 /* Emit code to decide on runtime whether library call or inline should be
22250 used. */
22251 if (dynamic_check != -1)
22252 {
22253 if (CONST_INT_P (count_exp))
22254 {
22255 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22256 {
22257 emit_block_move_via_libcall (dst, src, count_exp, false);
22258 count_exp = const0_rtx;
22259 goto epilogue;
22260 }
22261 }
22262 else
22263 {
22264 rtx hot_label = gen_label_rtx ();
22265 jump_around_label = gen_label_rtx ();
22266 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22267 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22268 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22269 emit_block_move_via_libcall (dst, src, count_exp, false);
22270 emit_jump (jump_around_label);
22271 emit_label (hot_label);
22272 }
22273 }
22274
22275 /* Step 2: Alignment prologue. */
22276
22277 if (desired_align > align)
22278 {
22279 if (align_bytes == 0)
22280 {
22281 	  /* Except for the first move in the epilogue, we no longer know
22282 	     the constant offset in the aliasing info.  It doesn't seem worth
22283 	     the pain to maintain it for the first move, so throw away
22284 	     the info early.  */
22285 src = change_address (src, BLKmode, srcreg);
22286 dst = change_address (dst, BLKmode, destreg);
22287 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22288 desired_align);
22289 }
22290 else
22291 {
22292 /* If we know how many bytes need to be stored before dst is
22293 sufficiently aligned, maintain aliasing info accurately. */
22294 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22295 desired_align, align_bytes);
22296 count_exp = plus_constant (counter_mode (count_exp),
22297 count_exp, -align_bytes);
22298 count -= align_bytes;
22299 }
22300 if (need_zero_guard
22301 && (count < (unsigned HOST_WIDE_INT) size_needed
22302 || (align_bytes == 0
22303 && count < ((unsigned HOST_WIDE_INT) size_needed
22304 + desired_align - align))))
22305 {
22306 /* It is possible that we copied enough so the main loop will not
22307 execute. */
22308 gcc_assert (size_needed > 1);
22309 if (label == NULL_RTX)
22310 label = gen_label_rtx ();
22311 emit_cmp_and_jump_insns (count_exp,
22312 GEN_INT (size_needed),
22313 LTU, 0, counter_mode (count_exp), 1, label);
22314 if (expected_size == -1
22315 || expected_size < (desired_align - align) / 2 + size_needed)
22316 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22317 else
22318 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22319 }
22320 }
22321 if (label && size_needed == 1)
22322 {
22323 emit_label (label);
22324 LABEL_NUSES (label) = 1;
22325 label = NULL;
22326 epilogue_size_needed = 1;
22327 }
22328 else if (label == NULL_RTX)
22329 epilogue_size_needed = size_needed;
22330
22331 /* Step 3: Main loop. */
22332
22333 switch (alg)
22334 {
22335 case libcall:
22336 case no_stringop:
22337 gcc_unreachable ();
22338 case loop_1_byte:
22339 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22340 count_exp, QImode, 1, expected_size);
22341 break;
22342 case loop:
22343 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22344 count_exp, word_mode, 1, expected_size);
22345 break;
22346 case unrolled_loop:
22347       /* Unroll only by a factor of 2 in 32-bit mode, since we don't have
22348 	 enough registers for 4 temporaries anyway.  */
22349 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22350 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22351 expected_size);
22352 break;
22353 case rep_prefix_8_byte:
22354 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22355 DImode);
22356 break;
22357 case rep_prefix_4_byte:
22358 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22359 SImode);
22360 break;
22361 case rep_prefix_1_byte:
22362 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22363 QImode);
22364 break;
22365 }
22366   /* Properly adjust the offsets of the src and dest memory for aliasing.  */
22367 if (CONST_INT_P (count_exp))
22368 {
22369 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22370 (count / size_needed) * size_needed);
22371 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22372 (count / size_needed) * size_needed);
22373 }
22374 else
22375 {
22376 src = change_address (src, BLKmode, srcreg);
22377 dst = change_address (dst, BLKmode, destreg);
22378 }
22379
22380 /* Step 4: Epilogue to copy the remaining bytes. */
22381 epilogue:
22382 if (label)
22383 {
22384       /* When the main loop is done, COUNT_EXP might hold the original count,
22385 	 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22386 	 The epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22387 	 bytes.  Compensate if needed.  */
22388
22389 if (size_needed < epilogue_size_needed)
22390 {
22391 tmp =
22392 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22393 GEN_INT (size_needed - 1), count_exp, 1,
22394 OPTAB_DIRECT);
22395 if (tmp != count_exp)
22396 emit_move_insn (count_exp, tmp);
22397 }
22398 emit_label (label);
22399 LABEL_NUSES (label) = 1;
22400 }
22401
22402 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22403 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22404 epilogue_size_needed);
22405 if (jump_around_label)
22406 emit_label (jump_around_label);
22407 return true;
22408 }
22409
22410 /* Helper function for memset.  For a QImode value 0xXY produce
22411    0xXYXYXYXY of the width specified by MODE.  This is essentially
22412    a * 0x01010101, but we can do slightly better than
22413    synth_mult by unwinding the sequence by hand on CPUs with
22414    slow multiply.  */
22415 static rtx
22416 promote_duplicated_reg (enum machine_mode mode, rtx val)
22417 {
22418 enum machine_mode valmode = GET_MODE (val);
22419 rtx tmp;
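  /* Number of shift/IOR steps in the fallback expansion below: shifts by
     8 and 16, plus one by 32 for DImode.  */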
22420 int nops = mode == DImode ? 3 : 2;
22421
22422 gcc_assert (mode == SImode || mode == DImode);
22423 if (val == const0_rtx)
22424 return copy_to_mode_reg (mode, const0_rtx);
22425 if (CONST_INT_P (val))
22426 {
22427 HOST_WIDE_INT v = INTVAL (val) & 255;
22428
22429 v |= v << 8;
22430 v |= v << 16;
22431 if (mode == DImode)
22432 v |= (v << 16) << 16;
22433 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22434 }
22435
22436 if (valmode == VOIDmode)
22437 valmode = QImode;
22438 if (valmode != QImode)
22439 val = gen_lowpart (QImode, val);
22440 if (mode == QImode)
22441 return val;
22442 if (!TARGET_PARTIAL_REG_STALL)
22443 nops--;
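  /* Use a multiply by the replicated-byte constant when this CPU's multiply
     cost is below that of the shift/IOR sequence expanded below.  */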
22444 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22445 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22446 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22447 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22448 {
22449 rtx reg = convert_modes (mode, QImode, val, true);
22450 tmp = promote_duplicated_reg (mode, const1_rtx);
22451 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22452 OPTAB_DIRECT);
22453 }
22454 else
22455 {
22456 rtx reg = convert_modes (mode, QImode, val, true);
22457
22458 if (!TARGET_PARTIAL_REG_STALL)
22459 if (mode == SImode)
22460 emit_insn (gen_movsi_insv_1 (reg, reg));
22461 else
22462 emit_insn (gen_movdi_insv_1 (reg, reg));
22463 else
22464 {
22465 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22466 NULL, 1, OPTAB_DIRECT);
22467 reg =
22468 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22469 }
22470 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22471 NULL, 1, OPTAB_DIRECT);
22472 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22473 if (mode == SImode)
22474 return reg;
22475 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22476 NULL, 1, OPTAB_DIRECT);
22477 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22478 return reg;
22479 }
22480 }
22481
22482 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
22483    will be needed by the main loop storing SIZE_NEEDED chunks and by the
22484    prologue getting the alignment from ALIGN to DESIRED_ALIGN.  */
22485 static rtx
22486 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22487 {
22488 rtx promoted_val;
22489
22490 if (TARGET_64BIT
22491 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22492 promoted_val = promote_duplicated_reg (DImode, val);
22493 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22494 promoted_val = promote_duplicated_reg (SImode, val);
22495 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22496 promoted_val = promote_duplicated_reg (HImode, val);
22497 else
22498 promoted_val = val;
22499
22500 return promoted_val;
22501 }
22502
22503 /* Expand string set (memset) operation.  Use i386 string operations when
22504    profitable.  See the ix86_expand_movmem comment for an explanation of the
22505    individual steps performed.  */
22506 bool
22507 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22508 rtx expected_align_exp, rtx expected_size_exp)
22509 {
22510 rtx destreg;
22511 rtx label = NULL;
22512 rtx tmp;
22513 rtx jump_around_label = NULL;
22514 HOST_WIDE_INT align = 1;
22515 unsigned HOST_WIDE_INT count = 0;
22516 HOST_WIDE_INT expected_size = -1;
22517 int size_needed = 0, epilogue_size_needed;
22518 int desired_align = 0, align_bytes = 0;
22519 enum stringop_alg alg;
22520 rtx promoted_val = NULL;
22521 bool force_loopy_epilogue = false;
22522 int dynamic_check;
22523 bool need_zero_guard = false;
22524
22525 if (CONST_INT_P (align_exp))
22526 align = INTVAL (align_exp);
22527   /* i386 can do misaligned access at a reasonably increased cost.  */
22528 if (CONST_INT_P (expected_align_exp)
22529 && INTVAL (expected_align_exp) > align)
22530 align = INTVAL (expected_align_exp);
22531 if (CONST_INT_P (count_exp))
22532 count = expected_size = INTVAL (count_exp);
22533 if (CONST_INT_P (expected_size_exp) && count == 0)
22534 expected_size = INTVAL (expected_size_exp);
22535
22536 /* Make sure we don't need to care about overflow later on. */
22537 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22538 return false;
22539
22540 /* Step 0: Decide on preferred algorithm, desired alignment and
22541 size of chunks to be copied by main loop. */
22542
22543 alg = decide_alg (count, expected_size, true, &dynamic_check);
22544 desired_align = decide_alignment (align, alg, expected_size);
22545
22546 if (!TARGET_ALIGN_STRINGOPS)
22547 align = desired_align;
22548
22549 if (alg == libcall)
22550 return false;
22551 gcc_assert (alg != no_stringop);
22552 if (!count)
22553 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22554 destreg = copy_addr_to_reg (XEXP (dst, 0));
22555 switch (alg)
22556 {
22557 case libcall:
22558 case no_stringop:
22559 gcc_unreachable ();
22560 case loop:
22561 need_zero_guard = true;
22562 size_needed = GET_MODE_SIZE (word_mode);
22563 break;
22564 case unrolled_loop:
22565 need_zero_guard = true;
22566 size_needed = GET_MODE_SIZE (word_mode) * 4;
22567 break;
22568 case rep_prefix_8_byte:
22569 size_needed = 8;
22570 break;
22571 case rep_prefix_4_byte:
22572 size_needed = 4;
22573 break;
22574 case rep_prefix_1_byte:
22575 size_needed = 1;
22576 break;
22577 case loop_1_byte:
22578 need_zero_guard = true;
22579 size_needed = 1;
22580 break;
22581 }
22582 epilogue_size_needed = size_needed;
22583
22584 /* Step 1: Prologue guard. */
22585
22586   /* The alignment code needs the count to be in a register.  */
22587 if (CONST_INT_P (count_exp) && desired_align > align)
22588 {
22589 if (INTVAL (count_exp) > desired_align
22590 && INTVAL (count_exp) > size_needed)
22591 {
22592 align_bytes
22593 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22594 if (align_bytes <= 0)
22595 align_bytes = 0;
22596 else
22597 align_bytes = desired_align - align_bytes;
22598 }
22599 if (align_bytes == 0)
22600 {
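	  /* Pick a counter mode wide enough to hold the known count.  */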
22601 enum machine_mode mode = SImode;
22602 if (TARGET_64BIT && (count & ~0xffffffff))
22603 mode = DImode;
22604 count_exp = force_reg (mode, count_exp);
22605 }
22606 }
22607   /* Do the cheap promotion to allow better CSE across the
22608      main loop and epilogue (i.e. one load of the big constant in
22609      front of all the code).  */
22610 if (CONST_INT_P (val_exp))
22611 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22612 desired_align, align);
22613 /* Ensure that alignment prologue won't copy past end of block. */
22614 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22615 {
22616 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22617 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22618 Make sure it is power of 2. */
22619 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22620
22621       /* To improve performance of small blocks, we jump around the VAL
22622 	 promoting code.  This means that if the promoted VAL is not a constant,
22623 	 we might not use it in the epilogue and have to use the byte
22624 	 loop variant.  */
22625 if (epilogue_size_needed > 2 && !promoted_val)
22626 force_loopy_epilogue = true;
22627 if (count)
22628 {
22629 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22630 {
22631 /* If main algorithm works on QImode, no epilogue is needed.
22632 For small sizes just don't align anything. */
22633 if (size_needed == 1)
22634 desired_align = align;
22635 else
22636 goto epilogue;
22637 }
22638 }
22639 else
22640 {
22641 label = gen_label_rtx ();
22642 emit_cmp_and_jump_insns (count_exp,
22643 GEN_INT (epilogue_size_needed),
22644 LTU, 0, counter_mode (count_exp), 1, label);
22645 if (expected_size == -1 || expected_size <= epilogue_size_needed)
22646 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22647 else
22648 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22649 }
22650 }
22651 if (dynamic_check != -1)
22652 {
22653 rtx hot_label = gen_label_rtx ();
22654 jump_around_label = gen_label_rtx ();
22655 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22656 LEU, 0, counter_mode (count_exp), 1, hot_label);
22657 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22658 set_storage_via_libcall (dst, count_exp, val_exp, false);
22659 emit_jump (jump_around_label);
22660 emit_label (hot_label);
22661 }
22662
22663 /* Step 2: Alignment prologue. */
22664
22665 /* Do the expensive promotion once we branched off the small blocks. */
22666 if (!promoted_val)
22667 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22668 desired_align, align);
22669 gcc_assert (desired_align >= 1 && align >= 1);
22670
22671 if (desired_align > align)
22672 {
22673 if (align_bytes == 0)
22674 {
22675 	  /* Except for the first move in the epilogue, we no longer know
22676 	     the constant offset in the aliasing info.  It doesn't seem worth
22677 	     the pain to maintain it for the first move, so throw away
22678 	     the info early.  */
22679 dst = change_address (dst, BLKmode, destreg);
22680 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
22681 desired_align);
22682 }
22683 else
22684 {
22685 /* If we know how many bytes need to be stored before dst is
22686 sufficiently aligned, maintain aliasing info accurately. */
22687 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
22688 desired_align, align_bytes);
22689 count_exp = plus_constant (counter_mode (count_exp),
22690 count_exp, -align_bytes);
22691 count -= align_bytes;
22692 }
22693 if (need_zero_guard
22694 && (count < (unsigned HOST_WIDE_INT) size_needed
22695 || (align_bytes == 0
22696 && count < ((unsigned HOST_WIDE_INT) size_needed
22697 + desired_align - align))))
22698 {
22699 /* It is possible that we copied enough so the main loop will not
22700 execute. */
22701 gcc_assert (size_needed > 1);
22702 if (label == NULL_RTX)
22703 label = gen_label_rtx ();
22704 emit_cmp_and_jump_insns (count_exp,
22705 GEN_INT (size_needed),
22706 LTU, 0, counter_mode (count_exp), 1, label);
22707 if (expected_size == -1
22708 || expected_size < (desired_align - align) / 2 + size_needed)
22709 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22710 else
22711 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22712 }
22713 }
22714 if (label && size_needed == 1)
22715 {
22716 emit_label (label);
22717 LABEL_NUSES (label) = 1;
22718 label = NULL;
22719 promoted_val = val_exp;
22720 epilogue_size_needed = 1;
22721 }
22722 else if (label == NULL_RTX)
22723 epilogue_size_needed = size_needed;
22724
22725 /* Step 3: Main loop. */
22726
22727 switch (alg)
22728 {
22729 case libcall:
22730 case no_stringop:
22731 gcc_unreachable ();
22732 case loop_1_byte:
22733 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22734 count_exp, QImode, 1, expected_size);
22735 break;
22736 case loop:
22737 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22738 count_exp, word_mode, 1, expected_size);
22739 break;
22740 case unrolled_loop:
22741 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22742 count_exp, word_mode, 4, expected_size);
22743 break;
22744 case rep_prefix_8_byte:
22745 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22746 DImode, val_exp);
22747 break;
22748 case rep_prefix_4_byte:
22749 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22750 SImode, val_exp);
22751 break;
22752 case rep_prefix_1_byte:
22753 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22754 QImode, val_exp);
22755 break;
22756 }
22757   /* Properly adjust the offset of the dest memory for aliasing.  */
22758 if (CONST_INT_P (count_exp))
22759 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22760 (count / size_needed) * size_needed);
22761 else
22762 dst = change_address (dst, BLKmode, destreg);
22763
22764 /* Step 4: Epilogue to copy the remaining bytes. */
22765
22766 if (label)
22767 {
22768       /* When the main loop is done, COUNT_EXP might hold the original count,
22769 	 while we want to store only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22770 	 The epilogue code will actually store COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22771 	 bytes.  Compensate if needed.  */
22772
22773 if (size_needed < epilogue_size_needed)
22774 {
22775 tmp =
22776 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22777 GEN_INT (size_needed - 1), count_exp, 1,
22778 OPTAB_DIRECT);
22779 if (tmp != count_exp)
22780 emit_move_insn (count_exp, tmp);
22781 }
22782 emit_label (label);
22783 LABEL_NUSES (label) = 1;
22784 }
22785 epilogue:
22786 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22787 {
22788 if (force_loopy_epilogue)
22789 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
22790 epilogue_size_needed);
22791 else
22792 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
22793 epilogue_size_needed);
22794 }
22795 if (jump_around_label)
22796 emit_label (jump_around_label);
22797 return true;
22798 }
22799
22800 /* Expand the appropriate insns for doing strlen if not just doing
22801 repnz; scasb
22802
22803 out = result, initialized with the start address
22804 align_rtx = alignment of the address.
22805    scratch = scratch register, initialized with the start address when
22806 	     not aligned, otherwise undefined
22807
22808 This is just the body. It needs the initializations mentioned above and
22809 some address computing at the end. These things are done in i386.md. */
22810
22811 static void
22812 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
22813 {
22814 int align;
22815 rtx tmp;
22816 rtx align_2_label = NULL_RTX;
22817 rtx align_3_label = NULL_RTX;
22818 rtx align_4_label = gen_label_rtx ();
22819 rtx end_0_label = gen_label_rtx ();
22820 rtx mem;
22821 rtx tmpreg = gen_reg_rtx (SImode);
22822 rtx scratch = gen_reg_rtx (SImode);
22823 rtx cmp;
22824
22825 align = 0;
22826 if (CONST_INT_P (align_rtx))
22827 align = INTVAL (align_rtx);
22828
22829 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
22830
22831 /* Is there a known alignment and is it less than 4? */
22832 if (align < 4)
22833 {
22834 rtx scratch1 = gen_reg_rtx (Pmode);
22835 emit_move_insn (scratch1, out);
22836 /* Is there a known alignment and is it not 2? */
22837 if (align != 2)
22838 {
22839 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
22840 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
22841
22842 	  /* Leave just the 2 lower bits.  */
22843 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
22844 NULL_RTX, 0, OPTAB_WIDEN);
22845
22846 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22847 Pmode, 1, align_4_label);
22848 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
22849 Pmode, 1, align_2_label);
22850 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
22851 Pmode, 1, align_3_label);
22852 }
22853 else
22854 {
22855 	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
22856 	     check whether it is aligned to a 4-byte boundary.  */
22857
22858 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
22859 NULL_RTX, 0, OPTAB_WIDEN);
22860
22861 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22862 Pmode, 1, align_4_label);
22863 }
22864
22865 mem = change_address (src, QImode, out);
22866
22867 /* Now compare the bytes. */
22868
22869       /* Compare the first n unaligned bytes on a byte-by-byte basis.  */
22870 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
22871 QImode, 1, end_0_label);
22872
22873 /* Increment the address. */
22874 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22875
22876 /* Not needed with an alignment of 2 */
22877 if (align != 2)
22878 {
22879 emit_label (align_2_label);
22880
22881 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22882 end_0_label);
22883
22884 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22885
22886 emit_label (align_3_label);
22887 }
22888
22889 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22890 end_0_label);
22891
22892 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22893 }
22894
22895   /* Generate a loop to check 4 bytes at a time.  It is not a good idea to
22896      align this loop: it only makes the program larger and does not help
22897      it run faster.  */
22898 emit_label (align_4_label);
22899
22900 mem = change_address (src, SImode, out);
22901 emit_move_insn (scratch, mem);
22902 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
22903
22904   /* This formula yields a nonzero result iff one of the bytes is zero.
22905      This saves three branches inside the loop and many cycles.  */
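  /* The computed value is (x - 0x01010101) & ~x & 0x80808080: the subtraction
     turns any zero byte of x into 0xff or 0xfe, setting its sign bit, and the
     masks keep such a bit only where x's own sign bit was clear, so the result
     is nonzero exactly when x contains a zero byte.  */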
22906
22907 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
22908 emit_insn (gen_one_cmplsi2 (scratch, scratch));
22909 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
22910 emit_insn (gen_andsi3 (tmpreg, tmpreg,
22911 gen_int_mode (0x80808080, SImode)));
22912 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
22913 align_4_label);
22914
22915 if (TARGET_CMOVE)
22916 {
22917 rtx reg = gen_reg_rtx (SImode);
22918 rtx reg2 = gen_reg_rtx (Pmode);
22919 emit_move_insn (reg, tmpreg);
22920 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
22921
22922 /* If zero is not in the first two bytes, move two bytes forward. */
22923 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22924 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22925 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22926 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
22927 gen_rtx_IF_THEN_ELSE (SImode, tmp,
22928 reg,
22929 tmpreg)));
22930 /* Emit lea manually to avoid clobbering of flags. */
22931 emit_insn (gen_rtx_SET (SImode, reg2,
22932 gen_rtx_PLUS (Pmode, out, const2_rtx)));
22933
22934 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22935 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22936 emit_insn (gen_rtx_SET (VOIDmode, out,
22937 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
22938 reg2,
22939 out)));
22940 }
22941 else
22942 {
22943 rtx end_2_label = gen_label_rtx ();
22944 /* Is zero in the first two bytes? */
22945
22946 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22947 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22948 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
22949 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22950 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
22951 pc_rtx);
22952 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
22953 JUMP_LABEL (tmp) = end_2_label;
22954
22955 /* Not in the first two. Move two bytes forward. */
22956 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
22957 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
22958
22959 emit_label (end_2_label);
22960
22961 }
22962
22963 /* Avoid branch in fixing the byte. */
22964 tmpreg = gen_lowpart (QImode, tmpreg);
22965 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
22966 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
22967 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
22968 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
22969
22970 emit_label (end_0_label);
22971 }
22972
22973 /* Expand strlen. */
22974
22975 bool
22976 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
22977 {
22978 rtx addr, scratch1, scratch2, scratch3, scratch4;
22979
22980   /* The generic case of the strlen expander is long.  Avoid expanding it
22981      unless TARGET_INLINE_ALL_STRINGOPS.  */
22982
22983 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22984 && !TARGET_INLINE_ALL_STRINGOPS
22985 && !optimize_insn_for_size_p ()
22986 && (!CONST_INT_P (align) || INTVAL (align) < 4))
22987 return false;
22988
22989 addr = force_reg (Pmode, XEXP (src, 0));
22990 scratch1 = gen_reg_rtx (Pmode);
22991
22992 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22993 && !optimize_insn_for_size_p ())
22994 {
22995       /* Well, it seems that some optimizer does not combine a call like
22996 	 foo (strlen (bar), strlen (bar));
22997 	 when the move and the subtraction are done here.  It does calculate
22998 	 the length just once when these instructions are done inside
22999 	 output_strlen_unroll().  But I think that since &bar[strlen (bar)] is
23000 	 often used and I use one fewer register for the lifetime of
23001 	 output_strlen_unroll() this is better.  */
23002
23003 emit_move_insn (out, addr);
23004
23005 ix86_expand_strlensi_unroll_1 (out, src, align);
23006
23007 /* strlensi_unroll_1 returns the address of the zero at the end of
23008 the string, like memchr(), so compute the length by subtracting
23009 the start address. */
23010 emit_insn (ix86_gen_sub3 (out, out, addr));
23011 }
23012 else
23013 {
23014 rtx unspec;
23015
23016 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23017 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23018 return false;
23019
23020 scratch2 = gen_reg_rtx (Pmode);
23021 scratch3 = gen_reg_rtx (Pmode);
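      /* A scan count of -1 makes the repnz scasb effectively unbounded; the
	 scan stops only at the terminator byte.  */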
23022 scratch4 = force_reg (Pmode, constm1_rtx);
23023
23024 emit_move_insn (scratch3, addr);
23025 eoschar = force_reg (QImode, eoschar);
23026
23027 src = replace_equiv_address_nv (src, scratch3);
23028
23029 /* If .md starts supporting :P, this can be done in .md. */
23030 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23031 scratch4), UNSPEC_SCAS);
23032 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23033 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23034 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23035 }
23036 return true;
23037 }
23038
23039 /* For a given symbol (function), construct code to compute the address of its
23040    PLT entry in the large x86-64 PIC model.  */
23041 rtx
23042 construct_plt_address (rtx symbol)
23043 {
23044 rtx tmp, unspec;
23045
23046 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23047 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23048 gcc_assert (Pmode == DImode);
23049
23050 tmp = gen_reg_rtx (Pmode);
23051 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23052
23053 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23054 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23055 return tmp;
23056 }
23057
23058 rtx
23059 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23060 rtx callarg2,
23061 rtx pop, bool sibcall)
23062 {
23063   /* Registers that SysV-ABI callees may clobber but that are call-saved
23064      under the MS ABI, so MS-ABI calls must mark them as clobbered.  */
23065 static int clobbered_registers[] = {
23066 XMM6_REG, XMM7_REG, XMM8_REG,
23067 XMM9_REG, XMM10_REG, XMM11_REG,
23068 XMM12_REG, XMM13_REG, XMM14_REG,
23069 XMM15_REG, SI_REG, DI_REG
23070 };
23071 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23072 rtx use = NULL, call;
23073 unsigned int vec_len;
23074
23075 if (pop == const0_rtx)
23076 pop = NULL;
23077 gcc_assert (!TARGET_64BIT || !pop);
23078
23079 if (TARGET_MACHO && !TARGET_64BIT)
23080 {
23081 #if TARGET_MACHO
23082 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23083 fnaddr = machopic_indirect_call_target (fnaddr);
23084 #endif
23085 }
23086 else
23087 {
23088 /* Static functions and indirect calls don't need the pic register. */
23089 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23090 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23091 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23092 use_reg (&use, pic_offset_table_rtx);
23093 }
23094
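  /* For 64-bit SysV varargs calls, AL must hold the number of SSE registers
     used for the arguments; a non-negative CALLARG2 carries that count.  */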
23095 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23096 {
23097 rtx al = gen_rtx_REG (QImode, AX_REG);
23098 emit_move_insn (al, callarg2);
23099 use_reg (&use, al);
23100 }
23101
23102 if (ix86_cmodel == CM_LARGE_PIC
23103 && MEM_P (fnaddr)
23104 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23105 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23106 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23107 else if (sibcall
23108 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23109 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23110 {
23111 fnaddr = XEXP (fnaddr, 0);
23112 if (GET_MODE (fnaddr) != word_mode)
23113 fnaddr = convert_to_mode (word_mode, fnaddr, 1);
23114 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23115 }
23116
23117 vec_len = 0;
23118 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23119 if (retval)
23120 call = gen_rtx_SET (VOIDmode, retval, call);
23121 vec[vec_len++] = call;
23122
23123 if (pop)
23124 {
23125 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23126 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23127 vec[vec_len++] = pop;
23128 }
23129
23130 if (TARGET_64BIT_MS_ABI
23131 && (!callarg2 || INTVAL (callarg2) != -2))
23132 {
23133 unsigned i;
23134
23135 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23136 UNSPEC_MS_TO_SYSV_CALL);
23137
23138 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23139 vec[vec_len++]
23140 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
23141 ? TImode : DImode,
23142 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23143 ? TImode : DImode,
23144 clobbered_registers[i]));
23145 }
23146
23147 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
23148 if (TARGET_VZEROUPPER)
23149 {
23150 int avx256;
23151 if (cfun->machine->callee_pass_avx256_p)
23152 {
23153 if (cfun->machine->callee_return_avx256_p)
23154 avx256 = callee_return_pass_avx256;
23155 else
23156 avx256 = callee_pass_avx256;
23157 }
23158 else if (cfun->machine->callee_return_avx256_p)
23159 avx256 = callee_return_avx256;
23160 else
23161 avx256 = call_no_avx256;
23162
23163 if (reload_completed)
23164 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
23165 else
23166 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
23167 gen_rtvec (1, GEN_INT (avx256)),
23168 UNSPEC_CALL_NEEDS_VZEROUPPER);
23169 }
23170
23171 if (vec_len > 1)
23172 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23173 call = emit_call_insn (call);
23174 if (use)
23175 CALL_INSN_FUNCTION_USAGE (call) = use;
23176
23177 return call;
23178 }
23179
23180 void
23181 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
23182 {
23183 rtx pat = PATTERN (insn);
23184 rtvec vec = XVEC (pat, 0);
23185 int len = GET_NUM_ELEM (vec) - 1;
23186
23187 /* Strip off the last entry of the parallel. */
23188 gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
23189 gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
23190 if (len == 1)
23191 pat = RTVEC_ELT (vec, 0);
23192 else
23193 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
23194
23195 emit_insn (gen_avx_vzeroupper (vzeroupper));
23196 emit_call_insn (pat);
23197 }
23198
23199 /* Output the assembly for a call instruction. */
23200
23201 const char *
23202 ix86_output_call_insn (rtx insn, rtx call_op)
23203 {
23204 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23205 bool seh_nop_p = false;
23206 const char *xasm;
23207
23208 if (SIBLING_CALL_P (insn))
23209 {
23210 if (direct_p)
23211 xasm = "jmp\t%P0";
23212 /* SEH epilogue detection requires the indirect branch case
23213 to include REX.W. */
23214 else if (TARGET_SEH)
23215 xasm = "rex.W jmp %A0";
23216 else
23217 xasm = "jmp\t%A0";
23218
23219 output_asm_insn (xasm, &call_op);
23220 return "";
23221 }
23222
23223 /* SEH unwinding can require an extra nop to be emitted in several
23224 circumstances. Determine if we have one of those. */
23225 if (TARGET_SEH)
23226 {
23227 rtx i;
23228
23229 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23230 {
23231 /* If we get to another real insn, we don't need the nop. */
23232 if (INSN_P (i))
23233 break;
23234
23235 /* If we get to the epilogue note, prevent a catch region from
23236 	     being adjacent to the standard epilogue sequence.  With non-call
23237 	     exceptions enabled, we'll have done this during epilogue emission.  */
23238 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23239 && !flag_non_call_exceptions
23240 && !can_throw_internal (insn))
23241 {
23242 seh_nop_p = true;
23243 break;
23244 }
23245 }
23246
23247 /* If we didn't find a real insn following the call, prevent the
23248 unwinder from looking into the next function. */
23249 if (i == NULL)
23250 seh_nop_p = true;
23251 }
23252
23253 if (direct_p)
23254 xasm = "call\t%P0";
23255 else
23256 xasm = "call\t%A0";
23257
23258 output_asm_insn (xasm, &call_op);
23259
23260 if (seh_nop_p)
23261 return "nop";
23262
23263 return "";
23264 }
23265 \f
23266 /* Clear stack slot assignments remembered from previous functions.
23267 This is called from INIT_EXPANDERS once before RTL is emitted for each
23268 function. */
23269
23270 static struct machine_function *
23271 ix86_init_machine_status (void)
23272 {
23273 struct machine_function *f;
23274
23275 f = ggc_alloc_cleared_machine_function ();
23276 f->use_fast_prologue_epilogue_nregs = -1;
23277 f->tls_descriptor_call_expanded_p = 0;
23278 f->call_abi = ix86_abi;
23279
23280 return f;
23281 }
23282
23283 /* Return a MEM corresponding to a stack slot with mode MODE.
23284 Allocate a new slot if necessary.
23285
23286 The RTL for a function can have several slots available: N is
23287 which slot to use. */
23288
23289 rtx
23290 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23291 {
23292 struct stack_local_entry *s;
23293
23294 gcc_assert (n < MAX_386_STACK_LOCALS);
23295
23296 /* Virtual slot is valid only before vregs are instantiated. */
23297 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
23298
23299 for (s = ix86_stack_locals; s; s = s->next)
23300 if (s->mode == mode && s->n == n)
23301 return validize_mem (copy_rtx (s->rtl));
23302
23303 s = ggc_alloc_stack_local_entry ();
23304 s->n = n;
23305 s->mode = mode;
23306 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23307
23308 s->next = ix86_stack_locals;
23309 ix86_stack_locals = s;
23310 return validize_mem (s->rtl);
23311 }
23312 \f
23313 /* Calculate the length of the memory address in the instruction encoding.
23314 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23315 or other prefixes. */
23316
23317 int
23318 memory_address_length (rtx addr)
23319 {
23320 struct ix86_address parts;
23321 rtx base, index, disp;
23322 int len;
23323 int ok;
23324
23325 if (GET_CODE (addr) == PRE_DEC
23326 || GET_CODE (addr) == POST_INC
23327 || GET_CODE (addr) == PRE_MODIFY
23328 || GET_CODE (addr) == POST_MODIFY)
23329 return 0;
23330
23331 ok = ix86_decompose_address (addr, &parts);
23332 gcc_assert (ok);
23333
23334 if (parts.base && GET_CODE (parts.base) == SUBREG)
23335 parts.base = SUBREG_REG (parts.base);
23336 if (parts.index && GET_CODE (parts.index) == SUBREG)
23337 parts.index = SUBREG_REG (parts.index);
23338
23339 base = parts.base;
23340 index = parts.index;
23341 disp = parts.disp;
23342
23343 /* Add length of addr32 prefix. */
23344 len = (GET_CODE (addr) == ZERO_EXTEND
23345 || GET_CODE (addr) == AND);
23346
23347 /* Rule of thumb:
23348 - esp as the base always wants an index,
23349 - ebp as the base always wants a displacement,
23350 - r12 as the base always wants an index,
23351 - r13 as the base always wants a displacement. */
23352
23353 /* Register Indirect. */
23354 if (base && !index && !disp)
23355 {
23356 /* esp (for its index) and ebp (for its displacement) need
23357 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23358 code. */
23359 if (REG_P (addr)
23360 && (addr == arg_pointer_rtx
23361 || addr == frame_pointer_rtx
23362 || REGNO (addr) == SP_REG
23363 || REGNO (addr) == BP_REG
23364 || REGNO (addr) == R12_REG
23365 || REGNO (addr) == R13_REG))
23366 len = 1;
23367 }
23368
23369   /* Direct Addressing.  In 64-bit mode mod 00 r/m 5
23370      is not disp32, but disp32(%rip), so for plain disp32 a
23371      SIB byte is needed, unless print_operand_address
23372      optimizes it into disp32(%rip) or (%rip) is implied
23373      by an UNSPEC.  */
23374 else if (disp && !base && !index)
23375 {
23376 len = 4;
23377 if (TARGET_64BIT)
23378 {
23379 rtx symbol = disp;
23380
23381 if (GET_CODE (disp) == CONST)
23382 symbol = XEXP (disp, 0);
23383 if (GET_CODE (symbol) == PLUS
23384 && CONST_INT_P (XEXP (symbol, 1)))
23385 symbol = XEXP (symbol, 0);
23386
23387 if (GET_CODE (symbol) != LABEL_REF
23388 && (GET_CODE (symbol) != SYMBOL_REF
23389 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23390 && (GET_CODE (symbol) != UNSPEC
23391 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23392 && XINT (symbol, 1) != UNSPEC_PCREL
23393 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23394 len += 1;
23395 }
23396 }
23397
23398 else
23399 {
23400 /* Find the length of the displacement constant. */
23401 if (disp)
23402 {
23403 if (base && satisfies_constraint_K (disp))
23404 len = 1;
23405 else
23406 len = 4;
23407 }
23408 /* ebp always wants a displacement. Similarly r13. */
23409 else if (base && REG_P (base)
23410 && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23411 len = 1;
23412
23413 /* An index requires the two-byte modrm form.... */
23414 if (index
23415 /* ...like esp (or r12), which always wants an index. */
23416 || base == arg_pointer_rtx
23417 || base == frame_pointer_rtx
23418 || (base && REG_P (base)
23419 && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23420 len += 1;
23421 }
23422
23423 switch (parts.seg)
23424 {
23425 case SEG_FS:
23426 case SEG_GS:
23427 len += 1;
23428 break;
23429 default:
23430 break;
23431 }
23432
23433 return len;
23434 }
23435
23436 /* Compute default value for "length_immediate" attribute.  When SHORTFORM
23437    is set, expect that the insn has an 8-bit immediate alternative.  */
23438 int
23439 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23440 {
23441 int len = 0;
23442 int i;
23443 extract_insn_cached (insn);
23444 for (i = recog_data.n_operands - 1; i >= 0; --i)
23445 if (CONSTANT_P (recog_data.operand[i]))
23446 {
23447 enum attr_mode mode = get_attr_mode (insn);
23448
23449 gcc_assert (!len);
23450 if (shortform && CONST_INT_P (recog_data.operand[i]))
23451 {
23452 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23453 switch (mode)
23454 {
23455 case MODE_QI:
23456 len = 1;
23457 continue;
23458 case MODE_HI:
23459 ival = trunc_int_for_mode (ival, HImode);
23460 break;
23461 case MODE_SI:
23462 ival = trunc_int_for_mode (ival, SImode);
23463 break;
23464 default:
23465 break;
23466 }
23467 if (IN_RANGE (ival, -128, 127))
23468 {
23469 len = 1;
23470 continue;
23471 }
23472 }
23473 switch (mode)
23474 {
23475 case MODE_QI:
23476 len = 1;
23477 break;
23478 case MODE_HI:
23479 len = 2;
23480 break;
23481 case MODE_SI:
23482 len = 4;
23483 break;
23484 /* Immediates for DImode instructions are encoded as 32bit sign extended values. */
23485 case MODE_DI:
23486 len = 4;
23487 break;
23488 default:
23489 fatal_insn ("unknown insn mode", insn);
23490 }
23491 }
23492 return len;
23493 }
23494 /* Compute default value for "length_address" attribute. */
23495 int
23496 ix86_attr_length_address_default (rtx insn)
23497 {
23498 int i;
23499
23500 if (get_attr_type (insn) == TYPE_LEA)
23501 {
23502 rtx set = PATTERN (insn), addr;
23503
23504 if (GET_CODE (set) == PARALLEL)
23505 set = XVECEXP (set, 0, 0);
23506
23507 gcc_assert (GET_CODE (set) == SET);
23508
23509 addr = SET_SRC (set);
23510 if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI)
23511 {
23512 if (GET_CODE (addr) == ZERO_EXTEND)
23513 addr = XEXP (addr, 0);
23514 if (GET_CODE (addr) == SUBREG)
23515 addr = SUBREG_REG (addr);
23516 }
23517
23518 return memory_address_length (addr);
23519 }
23520
23521 extract_insn_cached (insn);
23522 for (i = recog_data.n_operands - 1; i >= 0; --i)
23523 if (MEM_P (recog_data.operand[i]))
23524 {
23525 constrain_operands_cached (reload_completed);
23526 if (which_alternative != -1)
23527 {
23528 const char *constraints = recog_data.constraints[i];
23529 int alt = which_alternative;
23530
23531 while (*constraints == '=' || *constraints == '+')
23532 constraints++;
23533 while (alt-- > 0)
23534 while (*constraints++ != ',')
23535 ;
23536 /* Skip ignored operands. */
23537 if (*constraints == 'X')
23538 continue;
23539 }
23540 return memory_address_length (XEXP (recog_data.operand[i], 0));
23541 }
23542 return 0;
23543 }
23544
23545 /* Compute default value for "length_vex" attribute. It includes
23546 2 or 3 byte VEX prefix and 1 opcode byte. */
23547
23548 int
23549 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23550 {
23551 int i;
23552
23553   /* Only the 0f opcode can use the 2-byte VEX prefix, and the VEX W bit
23554      requires the 3-byte VEX prefix.  */
23555 if (!has_0f_opcode || has_vex_w)
23556 return 3 + 1;
23557
23558 /* We can always use 2 byte VEX prefix in 32bit. */
23559 if (!TARGET_64BIT)
23560 return 2 + 1;
23561
23562 extract_insn_cached (insn);
23563
23564 for (i = recog_data.n_operands - 1; i >= 0; --i)
23565 if (REG_P (recog_data.operand[i]))
23566 {
23567 /* REX.W bit uses 3 byte VEX prefix. */
23568 if (GET_MODE (recog_data.operand[i]) == DImode
23569 && GENERAL_REG_P (recog_data.operand[i]))
23570 return 3 + 1;
23571 }
23572 else
23573 {
23574 /* REX.X or REX.B bits use 3 byte VEX prefix. */
23575 if (MEM_P (recog_data.operand[i])
23576 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23577 return 3 + 1;
23578 }
23579
23580 return 2 + 1;
23581 }
23582 \f
23583 /* Return the maximum number of instructions a cpu can issue. */
23584
23585 static int
23586 ix86_issue_rate (void)
23587 {
23588 switch (ix86_tune)
23589 {
23590 case PROCESSOR_PENTIUM:
23591 case PROCESSOR_ATOM:
23592 case PROCESSOR_K6:
23593 return 2;
23594
23595 case PROCESSOR_PENTIUMPRO:
23596 case PROCESSOR_PENTIUM4:
23597 case PROCESSOR_CORE2_32:
23598 case PROCESSOR_CORE2_64:
23599 case PROCESSOR_COREI7_32:
23600 case PROCESSOR_COREI7_64:
23601 case PROCESSOR_ATHLON:
23602 case PROCESSOR_K8:
23603 case PROCESSOR_AMDFAM10:
23604 case PROCESSOR_NOCONA:
23605 case PROCESSOR_GENERIC32:
23606 case PROCESSOR_GENERIC64:
23607 case PROCESSOR_BDVER1:
23608 case PROCESSOR_BDVER2:
23609 case PROCESSOR_BTVER1:
23610 return 3;
23611
23612 default:
23613 return 1;
23614 }
23615 }
23616
23617 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
23618 by DEP_INSN and nothing else set by DEP_INSN. */
23619
23620 static bool
23621 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
23622 {
23623 rtx set, set2;
23624
23625 /* Simplify the test for uninteresting insns. */
23626 if (insn_type != TYPE_SETCC
23627 && insn_type != TYPE_ICMOV
23628 && insn_type != TYPE_FCMOV
23629 && insn_type != TYPE_IBR)
23630 return false;
23631
23632 if ((set = single_set (dep_insn)) != 0)
23633 {
23634 set = SET_DEST (set);
23635 set2 = NULL_RTX;
23636 }
23637 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
23638 && XVECLEN (PATTERN (dep_insn), 0) == 2
23639 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
23640 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
23641 {
23642 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
23643 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
23644 }
23645 else
23646 return false;
23647
23648 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
23649 return false;
23650
23651 /* This test is true if the dependent insn reads the flags but
23652 not any other potentially set register. */
23653 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
23654 return false;
23655
23656 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
23657 return false;
23658
23659 return true;
23660 }
23661
23662 /* Return true iff USE_INSN has a memory address with operands set by
23663 SET_INSN. */
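/* Illustrative example (added note): with "addl %esi, %ebx" as SET_INSN and
   "movl (%ebx), %eax" as USE_INSN, the load's address register is modified
   by SET_INSN, so the function below returns true (an AGI dependence).  */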
23664
23665 bool
23666 ix86_agi_dependent (rtx set_insn, rtx use_insn)
23667 {
23668 int i;
23669 extract_insn_cached (use_insn);
23670 for (i = recog_data.n_operands - 1; i >= 0; --i)
23671 if (MEM_P (recog_data.operand[i]))
23672 {
23673 rtx addr = XEXP (recog_data.operand[i], 0);
23674 return modified_in_p (addr, set_insn) != 0;
23675 }
23676 return false;
23677 }
23678
23679 static int
23680 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
23681 {
23682 enum attr_type insn_type, dep_insn_type;
23683 enum attr_memory memory;
23684 rtx set, set2;
23685 int dep_insn_code_number;
23686
23687 /* Anti and output dependencies have zero cost on all CPUs. */
23688 if (REG_NOTE_KIND (link) != 0)
23689 return 0;
23690
23691 dep_insn_code_number = recog_memoized (dep_insn);
23692
23693 /* If we can't recognize the insns, we can't really do anything. */
23694 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
23695 return cost;
23696
23697 insn_type = get_attr_type (insn);
23698 dep_insn_type = get_attr_type (dep_insn);
23699
23700 switch (ix86_tune)
23701 {
23702 case PROCESSOR_PENTIUM:
23703 /* Address Generation Interlock adds a cycle of latency. */
23704 if (insn_type == TYPE_LEA)
23705 {
23706 rtx addr = PATTERN (insn);
23707
23708 if (GET_CODE (addr) == PARALLEL)
23709 addr = XVECEXP (addr, 0, 0);
23710
23711 gcc_assert (GET_CODE (addr) == SET);
23712
23713 addr = SET_SRC (addr);
23714 if (modified_in_p (addr, dep_insn))
23715 cost += 1;
23716 }
23717 else if (ix86_agi_dependent (dep_insn, insn))
23718 cost += 1;
23719
23720 /* ??? Compares pair with jump/setcc. */
23721 if (ix86_flags_dependent (insn, dep_insn, insn_type))
23722 cost = 0;
23723
23724 /* Floating point stores require value to be ready one cycle earlier. */
23725 if (insn_type == TYPE_FMOV
23726 && get_attr_memory (insn) == MEMORY_STORE
23727 && !ix86_agi_dependent (dep_insn, insn))
23728 cost += 1;
23729 break;
23730
23731 case PROCESSOR_PENTIUMPRO:
23732 memory = get_attr_memory (insn);
23733
23734 /* INT->FP conversion is expensive. */
23735 if (get_attr_fp_int_src (dep_insn))
23736 cost += 5;
23737
23738 /* There is one cycle extra latency between an FP op and a store. */
23739 if (insn_type == TYPE_FMOV
23740 && (set = single_set (dep_insn)) != NULL_RTX
23741 && (set2 = single_set (insn)) != NULL_RTX
23742 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
23743 && MEM_P (SET_DEST (set2)))
23744 cost += 1;
23745
23746 /* Show ability of reorder buffer to hide latency of load by executing
23747 in parallel with previous instruction in case
23748 previous instruction is not needed to compute the address. */
23749 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23750 && !ix86_agi_dependent (dep_insn, insn))
23751 {
23752 /* Claim moves to take one cycle, as the core can issue one load
23753 at a time and the next load can start a cycle later. */
23754 if (dep_insn_type == TYPE_IMOV
23755 || dep_insn_type == TYPE_FMOV)
23756 cost = 1;
23757 else if (cost > 1)
23758 cost--;
23759 }
23760 break;
23761
23762 case PROCESSOR_K6:
23763 memory = get_attr_memory (insn);
23764
23765 /* The esp dependency is resolved before the instruction is really
23766 finished. */
23767 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
23768 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
23769 return 1;
23770
23771 /* INT->FP conversion is expensive. */
23772 if (get_attr_fp_int_src (dep_insn))
23773 cost += 5;
23774
23775 /* Show ability of reorder buffer to hide latency of load by executing
23776 in parallel with previous instruction in case
23777 previous instruction is not needed to compute the address. */
23778 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23779 && !ix86_agi_dependent (dep_insn, insn))
23780 {
23781 /* Claim moves to take one cycle, as the core can issue one load
23782 at a time and the next load can start a cycle later. */
23783 if (dep_insn_type == TYPE_IMOV
23784 || dep_insn_type == TYPE_FMOV)
23785 cost = 1;
23786 else if (cost > 2)
23787 cost -= 2;
23788 else
23789 cost = 1;
23790 }
23791 break;
23792
23793 case PROCESSOR_ATHLON:
23794 case PROCESSOR_K8:
23795 case PROCESSOR_AMDFAM10:
23796 case PROCESSOR_BDVER1:
23797 case PROCESSOR_BDVER2:
23798 case PROCESSOR_BTVER1:
23799 case PROCESSOR_ATOM:
23800 case PROCESSOR_GENERIC32:
23801 case PROCESSOR_GENERIC64:
23802 memory = get_attr_memory (insn);
23803
23804 /* Show ability of reorder buffer to hide latency of load by executing
23805 in parallel with previous instruction in case
23806 previous instruction is not needed to compute the address. */
23807 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23808 && !ix86_agi_dependent (dep_insn, insn))
23809 {
23810 enum attr_unit unit = get_attr_unit (insn);
23811 int loadcost = 3;
23812
23813 /* Because of the difference between the length of integer and
23814 floating unit pipeline preparation stages, the memory operands
23815 for floating point are cheaper.
23816
23817 ??? For Athlon the difference is most probably 2. */
23818 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
23819 loadcost = 3;
23820 else
23821 loadcost = TARGET_ATHLON ? 2 : 0;
23822
23823 if (cost >= loadcost)
23824 cost -= loadcost;
23825 else
23826 cost = 0;
23827 }
23828
23829 default:
23830 break;
23831 }
23832
23833 return cost;
23834 }
23835
23836 /* How many alternative schedules to try. This should be as wide as the
23837 scheduling freedom in the DFA, but no wider. Making this value too
23838 large results in extra work for the scheduler. */
23839
23840 static int
23841 ia32_multipass_dfa_lookahead (void)
23842 {
23843 switch (ix86_tune)
23844 {
23845 case PROCESSOR_PENTIUM:
23846 return 2;
23847
23848 case PROCESSOR_PENTIUMPRO:
23849 case PROCESSOR_K6:
23850 return 1;
23851
23852 case PROCESSOR_CORE2_32:
23853 case PROCESSOR_CORE2_64:
23854 case PROCESSOR_COREI7_32:
23855 case PROCESSOR_COREI7_64:
23856 /* Generally, we want haifa-sched:max_issue() to look ahead as far
23857 as many instructions can be executed on a cycle, i.e.,
23858 issue_rate. I wonder why tuning for many CPUs does not do this. */
23859 return ix86_issue_rate ();
23860
23861 default:
23862 return 0;
23863 }
23864 }
23865
23866 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
23867 execution. The reordering is applied if
23868 (1) an IMUL instruction is at the top of the list, and
23869 (2) the ready list contains exactly one producer of an independent IMUL
23870 instruction;
23871 in that case (3) the producer found is moved to the top of the ready list.
23872 Returns the issue rate. */
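/* Illustrative sketch (added note, hypothetical insns): if the top of the
   ready list is "imull %ebx, %eax" and another ready insn
   "movl %edx, %ecx" is the only producer feeding a second, not yet ready
   "imull %esi, %ecx", that move is hoisted to the top so the second IMUL
   can follow the first back to back through Atom's pipelined IMUL unit.  */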
23873
23874 static int
23875 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
23876 int clock_var ATTRIBUTE_UNUSED)
23877 {
23878 static int issue_rate = -1;
23879 int n_ready = *pn_ready;
23880 rtx insn, insn1, insn2;
23881 int i;
23882 sd_iterator_def sd_it;
23883 dep_t dep;
23884 int index = -1;
23885
23886 /* Set up issue rate. */
23887 issue_rate = ix86_issue_rate ();
23888
23889 /* Do reordering for Atom only. */
23890 if (ix86_tune != PROCESSOR_ATOM)
23891 return issue_rate;
23892 /* Nothing to do if ready list contains only 1 instruction. */
23893 if (n_ready <= 1)
23894 return issue_rate;
23895
23896 /* Check that IMUL instruction is on the top of ready list. */
23897 insn = ready[n_ready - 1];
23898 if (!NONDEBUG_INSN_P (insn))
23899 return issue_rate;
23900 insn = PATTERN (insn);
23901 if (GET_CODE (insn) == PARALLEL)
23902 insn = XVECEXP (insn, 0, 0);
23903 if (GET_CODE (insn) != SET)
23904 return issue_rate;
23905 if (!(GET_CODE (SET_SRC (insn)) == MULT
23906 && GET_MODE (SET_SRC (insn)) == SImode))
23907 return issue_rate;
23908
23909 /* Search for producer of independent IMUL instruction. */
23910 for (i = n_ready - 2; i >= 0; i--)
23911 {
23912 insn = ready[i];
23913 if (!NONDEBUG_INSN_P (insn))
23914 continue;
23915 /* Skip IMUL instruction. */
23916 insn2 = PATTERN (insn);
23917 if (GET_CODE (insn2) == PARALLEL)
23918 insn2 = XVECEXP (insn2, 0, 0);
23919 if (GET_CODE (insn2) == SET
23920 && GET_CODE (SET_SRC (insn2)) == MULT
23921 && GET_MODE (SET_SRC (insn2)) == SImode)
23922 continue;
23923
23924 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
23925 {
23926 rtx con;
23927 con = DEP_CON (dep);
23928 if (!NONDEBUG_INSN_P (con))
23929 continue;
23930 insn1 = PATTERN (con);
23931 if (GET_CODE (insn1) == PARALLEL)
23932 insn1 = XVECEXP (insn1, 0, 0);
23933
23934 if (GET_CODE (insn1) == SET
23935 && GET_CODE (SET_SRC (insn1)) == MULT
23936 && GET_MODE (SET_SRC (insn1)) == SImode)
23937 {
23938 sd_iterator_def sd_it1;
23939 dep_t dep1;
23940 /* Check that INSN is the only producer this IMUL depends on. */
23941 index = i;
23942 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
23943 {
23944 rtx pro;
23945 pro = DEP_PRO (dep1);
23946 if (!NONDEBUG_INSN_P (pro))
23947 continue;
23948 if (pro != insn)
23949 index = -1;
23950 }
23951 if (index >= 0)
23952 break;
23953 }
23954 }
23955 if (index >= 0)
23956 break;
23957 }
23958 if (index < 0)
23959 return issue_rate; /* Didn't find IMUL producer. */
23960
23961 if (sched_verbose > 1)
23962 fprintf (dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
23963 INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
23964
23965 /* Put IMUL producer (ready[index]) at the top of ready list. */
23966 insn1 = ready[index];
23967 for (i = index; i < n_ready - 1; i++)
23968 ready[i] = ready[i + 1];
23969 ready[n_ready - 1] = insn1;
23970
23971 return issue_rate;
23972 }
23973
23974 \f
23975
23976 /* Model decoder of Core 2/i7.
23977 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
23978 track the instruction fetch block boundaries and make sure that long
23979 (9+ bytes) instructions are assigned to D0. */
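/* Illustrative note (added): with the parameters set later in
   ix86_sched_init_global (16 byte fetch blocks, at most 6 insns per block,
   8 byte limit for the secondary decoders), a 9 byte insn can only be taken
   by decoder D0, and an insn that would push the current fetch block past
   16 bytes is deferred to the next cycle.  */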
23980
23981 /* Maximum length of an insn that can be handled by
23982 a secondary decoder unit. '8' for Core 2/i7. */
23983 static int core2i7_secondary_decoder_max_insn_size;
23984
23985 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
23986 '16' for Core 2/i7. */
23987 static int core2i7_ifetch_block_size;
23988
23989 /* Maximum number of instructions decoder can handle per cycle.
23990 '6' for Core 2/i7. */
23991 static int core2i7_ifetch_block_max_insns;
23992
23993 typedef struct ix86_first_cycle_multipass_data_ *
23994 ix86_first_cycle_multipass_data_t;
23995 typedef const struct ix86_first_cycle_multipass_data_ *
23996 const_ix86_first_cycle_multipass_data_t;
23997
23998 /* A variable to store target state across calls to max_issue within
23999 one cycle. */
24000 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
24001 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
24002
24003 /* Initialize DATA. */
24004 static void
24005 core2i7_first_cycle_multipass_init (void *_data)
24006 {
24007 ix86_first_cycle_multipass_data_t data
24008 = (ix86_first_cycle_multipass_data_t) _data;
24009
24010 data->ifetch_block_len = 0;
24011 data->ifetch_block_n_insns = 0;
24012 data->ready_try_change = NULL;
24013 data->ready_try_change_size = 0;
24014 }
24015
24016 /* Advancing the cycle; reset ifetch block counts. */
24017 static void
24018 core2i7_dfa_post_advance_cycle (void)
24019 {
24020 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24021
24022 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24023
24024 data->ifetch_block_len = 0;
24025 data->ifetch_block_n_insns = 0;
24026 }
24027
24028 static int min_insn_size (rtx);
24029
24030 /* Filter out insns from ready_try that the core will not be able to issue
24031 on current cycle due to decoder. */
24032 static void
24033 core2i7_first_cycle_multipass_filter_ready_try
24034 (const_ix86_first_cycle_multipass_data_t data,
24035 char *ready_try, int n_ready, bool first_cycle_insn_p)
24036 {
24037 while (n_ready--)
24038 {
24039 rtx insn;
24040 int insn_size;
24041
24042 if (ready_try[n_ready])
24043 continue;
24044
24045 insn = get_ready_element (n_ready);
24046 insn_size = min_insn_size (insn);
24047
24048 if (/* If this is too long an insn for a secondary decoder ... */
24049 (!first_cycle_insn_p
24050 && insn_size > core2i7_secondary_decoder_max_insn_size)
24051 /* ... or it would not fit into the ifetch block ... */
24052 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24053 /* ... or the decoder is full already ... */
24054 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24055 /* ... mask the insn out. */
24056 {
24057 ready_try[n_ready] = 1;
24058
24059 if (data->ready_try_change)
24060 SET_BIT (data->ready_try_change, n_ready);
24061 }
24062 }
24063 }
24064
24065 /* Prepare for a new round of multipass lookahead scheduling. */
24066 static void
24067 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24068 bool first_cycle_insn_p)
24069 {
24070 ix86_first_cycle_multipass_data_t data
24071 = (ix86_first_cycle_multipass_data_t) _data;
24072 const_ix86_first_cycle_multipass_data_t prev_data
24073 = ix86_first_cycle_multipass_data;
24074
24075 /* Restore the state from the end of the previous round. */
24076 data->ifetch_block_len = prev_data->ifetch_block_len;
24077 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24078
24079 /* Filter instructions that cannot be issued on current cycle due to
24080 decoder restrictions. */
24081 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24082 first_cycle_insn_p);
24083 }
24084
24085 /* INSN is being issued in current solution. Account for its impact on
24086 the decoder model. */
24087 static void
24088 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24089 rtx insn, const void *_prev_data)
24090 {
24091 ix86_first_cycle_multipass_data_t data
24092 = (ix86_first_cycle_multipass_data_t) _data;
24093 const_ix86_first_cycle_multipass_data_t prev_data
24094 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24095
24096 int insn_size = min_insn_size (insn);
24097
24098 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24099 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24100 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24101 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24102
24103 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
24104 if (!data->ready_try_change)
24105 {
24106 data->ready_try_change = sbitmap_alloc (n_ready);
24107 data->ready_try_change_size = n_ready;
24108 }
24109 else if (data->ready_try_change_size < n_ready)
24110 {
24111 data->ready_try_change = sbitmap_resize (data->ready_try_change,
24112 n_ready, 0);
24113 data->ready_try_change_size = n_ready;
24114 }
24115 sbitmap_zero (data->ready_try_change);
24116
24117 /* Filter out insns from ready_try that the core will not be able to issue
24118 on current cycle due to decoder. */
24119 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24120 false);
24121 }
24122
24123 /* Revert the effect on ready_try. */
24124 static void
24125 core2i7_first_cycle_multipass_backtrack (const void *_data,
24126 char *ready_try,
24127 int n_ready ATTRIBUTE_UNUSED)
24128 {
24129 const_ix86_first_cycle_multipass_data_t data
24130 = (const_ix86_first_cycle_multipass_data_t) _data;
24131 unsigned int i = 0;
24132 sbitmap_iterator sbi;
24133
24134 gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
24135 EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
24136 {
24137 ready_try[i] = 0;
24138 }
24139 }
24140
24141 /* Save the result of multipass lookahead scheduling for the next round. */
24142 static void
24143 core2i7_first_cycle_multipass_end (const void *_data)
24144 {
24145 const_ix86_first_cycle_multipass_data_t data
24146 = (const_ix86_first_cycle_multipass_data_t) _data;
24147 ix86_first_cycle_multipass_data_t next_data
24148 = ix86_first_cycle_multipass_data;
24149
24150 if (data != NULL)
24151 {
24152 next_data->ifetch_block_len = data->ifetch_block_len;
24153 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24154 }
24155 }
24156
24157 /* Deallocate target data. */
24158 static void
24159 core2i7_first_cycle_multipass_fini (void *_data)
24160 {
24161 ix86_first_cycle_multipass_data_t data
24162 = (ix86_first_cycle_multipass_data_t) _data;
24163
24164 if (data->ready_try_change)
24165 {
24166 sbitmap_free (data->ready_try_change);
24167 data->ready_try_change = NULL;
24168 data->ready_try_change_size = 0;
24169 }
24170 }
24171
24172 /* Prepare for scheduling pass. */
24173 static void
24174 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24175 int verbose ATTRIBUTE_UNUSED,
24176 int max_uid ATTRIBUTE_UNUSED)
24177 {
24178 /* Install scheduling hooks for current CPU. Some of these hooks are used
24179 in time-critical parts of the scheduler, so we only set them up when
24180 they are actually used. */
24181 switch (ix86_tune)
24182 {
24183 case PROCESSOR_CORE2_32:
24184 case PROCESSOR_CORE2_64:
24185 case PROCESSOR_COREI7_32:
24186 case PROCESSOR_COREI7_64:
24187 targetm.sched.dfa_post_advance_cycle
24188 = core2i7_dfa_post_advance_cycle;
24189 targetm.sched.first_cycle_multipass_init
24190 = core2i7_first_cycle_multipass_init;
24191 targetm.sched.first_cycle_multipass_begin
24192 = core2i7_first_cycle_multipass_begin;
24193 targetm.sched.first_cycle_multipass_issue
24194 = core2i7_first_cycle_multipass_issue;
24195 targetm.sched.first_cycle_multipass_backtrack
24196 = core2i7_first_cycle_multipass_backtrack;
24197 targetm.sched.first_cycle_multipass_end
24198 = core2i7_first_cycle_multipass_end;
24199 targetm.sched.first_cycle_multipass_fini
24200 = core2i7_first_cycle_multipass_fini;
24201
24202 /* Set decoder parameters. */
24203 core2i7_secondary_decoder_max_insn_size = 8;
24204 core2i7_ifetch_block_size = 16;
24205 core2i7_ifetch_block_max_insns = 6;
24206 break;
24207
24208 default:
24209 targetm.sched.dfa_post_advance_cycle = NULL;
24210 targetm.sched.first_cycle_multipass_init = NULL;
24211 targetm.sched.first_cycle_multipass_begin = NULL;
24212 targetm.sched.first_cycle_multipass_issue = NULL;
24213 targetm.sched.first_cycle_multipass_backtrack = NULL;
24214 targetm.sched.first_cycle_multipass_end = NULL;
24215 targetm.sched.first_cycle_multipass_fini = NULL;
24216 break;
24217 }
24218 }
24219
24220 \f
24221 /* Compute the alignment given to a constant that is being placed in memory.
24222 EXP is the constant and ALIGN is the alignment that the object would
24223 ordinarily have.
24224 The value of this function is used instead of that alignment to align
24225 the object. */
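/* Illustrative example (added): a DFmode constant that would ordinarily get
   32 bit alignment is returned as 64 below, and a 128 bit vector constant is
   bumped to 128, so loads of the constant pool entry can be aligned.  */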
24226
24227 int
24228 ix86_constant_alignment (tree exp, int align)
24229 {
24230 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24231 || TREE_CODE (exp) == INTEGER_CST)
24232 {
24233 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24234 return 64;
24235 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24236 return 128;
24237 }
24238 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24239 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24240 return BITS_PER_WORD;
24241
24242 return align;
24243 }
24244
24245 /* Compute the alignment for a static variable.
24246 TYPE is the data type, and ALIGN is the alignment that
24247 the object would ordinarily have. The value of this function is used
24248 instead of that alignment to align the object. */
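/* Illustrative example (added): a static "double x[4]", whose 256 bit size
   reaches max_align below, is aligned to max_align; on x86-64 any aggregate
   of 16 bytes or more is additionally given at least 128 bit alignment.  */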
24249
24250 int
24251 ix86_data_alignment (tree type, int align)
24252 {
24253 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24254
24255 if (AGGREGATE_TYPE_P (type)
24256 && TYPE_SIZE (type)
24257 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24258 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24259 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24260 && align < max_align)
24261 align = max_align;
24262
24263 /* x86-64 ABI requires arrays of 16 bytes or larger to be aligned
24264 to a 16byte boundary. */
24265 if (TARGET_64BIT)
24266 {
24267 if (AGGREGATE_TYPE_P (type)
24268 && TYPE_SIZE (type)
24269 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24270 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24271 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24272 return 128;
24273 }
24274
24275 if (TREE_CODE (type) == ARRAY_TYPE)
24276 {
24277 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24278 return 64;
24279 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24280 return 128;
24281 }
24282 else if (TREE_CODE (type) == COMPLEX_TYPE)
24283 {
24284
24285 if (TYPE_MODE (type) == DCmode && align < 64)
24286 return 64;
24287 if ((TYPE_MODE (type) == XCmode
24288 || TYPE_MODE (type) == TCmode) && align < 128)
24289 return 128;
24290 }
24291 else if ((TREE_CODE (type) == RECORD_TYPE
24292 || TREE_CODE (type) == UNION_TYPE
24293 || TREE_CODE (type) == QUAL_UNION_TYPE)
24294 && TYPE_FIELDS (type))
24295 {
24296 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24297 return 64;
24298 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24299 return 128;
24300 }
24301 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24302 || TREE_CODE (type) == INTEGER_TYPE)
24303 {
24304 if (TYPE_MODE (type) == DFmode && align < 64)
24305 return 64;
24306 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24307 return 128;
24308 }
24309
24310 return align;
24311 }
24312
24313 /* Compute the alignment for a local variable or a stack slot. EXP is
24314 the data type or decl itself, MODE is the widest mode available and
24315 ALIGN is the alignment that the object would ordinarily have. The
24316 value of this macro is used instead of that alignment to align the
24317 object. */
24318
24319 unsigned int
24320 ix86_local_alignment (tree exp, enum machine_mode mode,
24321 unsigned int align)
24322 {
24323 tree type, decl;
24324
24325 if (exp && DECL_P (exp))
24326 {
24327 type = TREE_TYPE (exp);
24328 decl = exp;
24329 }
24330 else
24331 {
24332 type = exp;
24333 decl = NULL;
24334 }
24335
24336 /* Don't do dynamic stack realignment for long long objects with
24337 -mpreferred-stack-boundary=2. */
24338 if (!TARGET_64BIT
24339 && align == 64
24340 && ix86_preferred_stack_boundary < 64
24341 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
24342 && (!type || !TYPE_USER_ALIGN (type))
24343 && (!decl || !DECL_USER_ALIGN (decl)))
24344 align = 32;
24345
24346 /* If TYPE is NULL, we are allocating a stack slot for caller-save
24347 register in MODE. We will return the largest alignment of XF
24348 and DF. */
24349 if (!type)
24350 {
24351 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
24352 align = GET_MODE_ALIGNMENT (DFmode);
24353 return align;
24354 }
24355
24356 /* x86-64 ABI requires arrays of 16 bytes or larger to be aligned
24357 to a 16byte boundary. Exact wording is:
24358
24359 An array uses the same alignment as its elements, except that a local or
24360 global array variable of length at least 16 bytes or
24361 a C99 variable-length array variable always has alignment of at least 16 bytes.
24362
24363 This was added to allow use of aligned SSE instructions on arrays. This
24364 rule is meant for static storage (where the compiler cannot do the analysis
24365 by itself). We follow it for automatic variables only when convenient.
24366 We fully control everything in the function being compiled, and functions
24367 from other units cannot rely on the alignment.
24368
24369 Exclude the va_list type. It is the common case of a local array where
24370 we cannot benefit from the alignment. */
24371 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
24372 && TARGET_SSE)
24373 {
24374 if (AGGREGATE_TYPE_P (type)
24375 && (va_list_type_node == NULL_TREE
24376 || (TYPE_MAIN_VARIANT (type)
24377 != TYPE_MAIN_VARIANT (va_list_type_node)))
24378 && TYPE_SIZE (type)
24379 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24380 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
24381 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24382 return 128;
24383 }
24384 if (TREE_CODE (type) == ARRAY_TYPE)
24385 {
24386 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24387 return 64;
24388 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24389 return 128;
24390 }
24391 else if (TREE_CODE (type) == COMPLEX_TYPE)
24392 {
24393 if (TYPE_MODE (type) == DCmode && align < 64)
24394 return 64;
24395 if ((TYPE_MODE (type) == XCmode
24396 || TYPE_MODE (type) == TCmode) && align < 128)
24397 return 128;
24398 }
24399 else if ((TREE_CODE (type) == RECORD_TYPE
24400 || TREE_CODE (type) == UNION_TYPE
24401 || TREE_CODE (type) == QUAL_UNION_TYPE)
24402 && TYPE_FIELDS (type))
24403 {
24404 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24405 return 64;
24406 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24407 return 128;
24408 }
24409 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24410 || TREE_CODE (type) == INTEGER_TYPE)
24411 {
24412
24413 if (TYPE_MODE (type) == DFmode && align < 64)
24414 return 64;
24415 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24416 return 128;
24417 }
24418 return align;
24419 }
24420
24421 /* Compute the minimum required alignment for dynamic stack realignment
24422 purposes for a local variable, parameter or a stack slot. EXP is
24423 the data type or decl itself, MODE is its mode and ALIGN is the
24424 alignment that the object would ordinarily have. */
24425
24426 unsigned int
24427 ix86_minimum_alignment (tree exp, enum machine_mode mode,
24428 unsigned int align)
24429 {
24430 tree type, decl;
24431
24432 if (exp && DECL_P (exp))
24433 {
24434 type = TREE_TYPE (exp);
24435 decl = exp;
24436 }
24437 else
24438 {
24439 type = exp;
24440 decl = NULL;
24441 }
24442
24443 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
24444 return align;
24445
24446 /* Don't do dynamic stack realignment for long long objects with
24447 -mpreferred-stack-boundary=2. */
24448 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
24449 && (!type || !TYPE_USER_ALIGN (type))
24450 && (!decl || !DECL_USER_ALIGN (decl)))
24451 return 32;
24452
24453 return align;
24454 }
24455 \f
24456 /* Find a location for the static chain incoming to a nested function.
24457 This is a register, unless all free registers are used by arguments. */
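/* Illustrative summary (added): in 64-bit mode the chain lives in %r10; in
   32-bit mode it is normally %ecx, %eax for fastcall/thiscall functions, and
   for regparm(3) functions it is passed on the stack via the alternate %esi
   entry point described below.  */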
24458
24459 static rtx
24460 ix86_static_chain (const_tree fndecl, bool incoming_p)
24461 {
24462 unsigned regno;
24463
24464 if (!DECL_STATIC_CHAIN (fndecl))
24465 return NULL;
24466
24467 if (TARGET_64BIT)
24468 {
24469 /* We always use R10 in 64-bit mode. */
24470 regno = R10_REG;
24471 }
24472 else
24473 {
24474 tree fntype;
24475 unsigned int ccvt;
24476
24477 /* By default in 32-bit mode we use ECX to pass the static chain. */
24478 regno = CX_REG;
24479
24480 fntype = TREE_TYPE (fndecl);
24481 ccvt = ix86_get_callcvt (fntype);
24482 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
24483 {
24484 /* Fastcall functions use ecx/edx for arguments, which leaves
24485 us with EAX for the static chain.
24486 Thiscall functions use ecx for arguments, which also
24487 leaves us with EAX for the static chain. */
24488 regno = AX_REG;
24489 }
24490 else if (ix86_function_regparm (fntype, fndecl) == 3)
24491 {
24492 /* For regparm 3, we have no free call-clobbered registers in
24493 which to store the static chain. In order to implement this,
24494 we have the trampoline push the static chain to the stack.
24495 However, we can't push a value below the return address when
24496 we call the nested function directly, so we have to use an
24497 alternate entry point. For this we use ESI, and have the
24498 alternate entry point push ESI, so that things appear the
24499 same once we're executing the nested function. */
24500 if (incoming_p)
24501 {
24502 if (fndecl == current_function_decl)
24503 ix86_static_chain_on_stack = true;
24504 return gen_frame_mem (SImode,
24505 plus_constant (Pmode,
24506 arg_pointer_rtx, -8));
24507 }
24508 regno = SI_REG;
24509 }
24510 }
24511
24512 return gen_rtx_REG (Pmode, regno);
24513 }
24514
24515 /* Emit RTL insns to initialize the variable parts of a trampoline.
24516 FNDECL is the decl of the target address; M_TRAMP is a MEM for
24517 the trampoline, and CHAIN_VALUE is an RTX for the static chain
24518 to be passed to the target function. */
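/* Illustrative byte layout (added note) of the 64-bit trampoline emitted
   below, when movabs is needed for both constants:
       49 bb <8-byte fnaddr>   movabs $fnaddr, %r11
       49 ba <8-byte chain>    movabs $chain,  %r10
       49 ff e3                jmp    *%r11
       90                      nop (padding for the 32-bit store)
   The 32-bit variant instead emits a mov or push of the static chain
   followed by a relative jmp to the target.  */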
24519
24520 static void
24521 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
24522 {
24523 rtx mem, fnaddr;
24524 int opcode;
24525 int offset = 0;
24526
24527 fnaddr = XEXP (DECL_RTL (fndecl), 0);
24528
24529 if (TARGET_64BIT)
24530 {
24531 int size;
24532
24533 /* Load the function address to r11. Try to load address using
24534 the shorter movl instead of movabs. We may want to support
24535 movq for kernel mode, but kernel does not use trampolines at
24536 the moment. FNADDR is a 32bit address and may not be in
24537 DImode when ptr_mode == SImode. Always use movl in this
24538 case. */
24539 if (ptr_mode == SImode
24540 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
24541 {
24542 fnaddr = copy_addr_to_reg (fnaddr);
24543
24544 mem = adjust_address (m_tramp, HImode, offset);
24545 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
24546
24547 mem = adjust_address (m_tramp, SImode, offset + 2);
24548 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
24549 offset += 6;
24550 }
24551 else
24552 {
24553 mem = adjust_address (m_tramp, HImode, offset);
24554 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
24555
24556 mem = adjust_address (m_tramp, DImode, offset + 2);
24557 emit_move_insn (mem, fnaddr);
24558 offset += 10;
24559 }
24560
24561 /* Load static chain using movabs to r10. Use the shorter movl
24562 instead of movabs when ptr_mode == SImode. */
24563 if (ptr_mode == SImode)
24564 {
24565 opcode = 0xba41;
24566 size = 6;
24567 }
24568 else
24569 {
24570 opcode = 0xba49;
24571 size = 10;
24572 }
24573
24574 mem = adjust_address (m_tramp, HImode, offset);
24575 emit_move_insn (mem, gen_int_mode (opcode, HImode));
24576
24577 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
24578 emit_move_insn (mem, chain_value);
24579 offset += size;
24580
24581 /* Jump to r11; the last (unused) byte is a nop, only there to
24582 pad the write out to a single 32-bit store. */
24583 mem = adjust_address (m_tramp, SImode, offset);
24584 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
24585 offset += 4;
24586 }
24587 else
24588 {
24589 rtx disp, chain;
24590
24591 /* Depending on the static chain location, either load a register
24592 with a constant, or push the constant to the stack. All of the
24593 instructions are the same size. */
24594 chain = ix86_static_chain (fndecl, true);
24595 if (REG_P (chain))
24596 {
24597 switch (REGNO (chain))
24598 {
24599 case AX_REG:
24600 opcode = 0xb8; break;
24601 case CX_REG:
24602 opcode = 0xb9; break;
24603 default:
24604 gcc_unreachable ();
24605 }
24606 }
24607 else
24608 opcode = 0x68;
24609
24610 mem = adjust_address (m_tramp, QImode, offset);
24611 emit_move_insn (mem, gen_int_mode (opcode, QImode));
24612
24613 mem = adjust_address (m_tramp, SImode, offset + 1);
24614 emit_move_insn (mem, chain_value);
24615 offset += 5;
24616
24617 mem = adjust_address (m_tramp, QImode, offset);
24618 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
24619
24620 mem = adjust_address (m_tramp, SImode, offset + 1);
24621
24622 /* Compute offset from the end of the jmp to the target function.
24623 In the case in which the trampoline stores the static chain on
24624 the stack, we need to skip the first insn which pushes the
24625 (call-saved) register static chain; this push is 1 byte. */
24626 offset += 5;
24627 disp = expand_binop (SImode, sub_optab, fnaddr,
24628 plus_constant (Pmode, XEXP (m_tramp, 0),
24629 offset - (MEM_P (chain) ? 1 : 0)),
24630 NULL_RTX, 1, OPTAB_DIRECT);
24631 emit_move_insn (mem, disp);
24632 }
24633
24634 gcc_assert (offset <= TRAMPOLINE_SIZE);
24635
24636 #ifdef HAVE_ENABLE_EXECUTE_STACK
24637 #ifdef CHECK_EXECUTE_STACK_ENABLED
24638 if (CHECK_EXECUTE_STACK_ENABLED)
24639 #endif
24640 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
24641 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
24642 #endif
24643 }
24644 \f
24645 /* The following file contains several enumerations and data structures
24646 built from the definitions in i386-builtin-types.def. */
24647
24648 #include "i386-builtin-types.inc"
24649
24650 /* Table for the ix86 builtin non-function types. */
24651 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
24652
24653 /* Retrieve an element from the above table, building some of
24654 the types lazily. */
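/* Illustrative note (added): a vector type code is built on first use from
   its recorded base type and machine mode via build_vector_type_for_mode and
   then cached in the table above; pointer codes are built with
   build_pointer_type, adding a const qualifier for the const-pointer
   entries.  */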
24655
24656 static tree
24657 ix86_get_builtin_type (enum ix86_builtin_type tcode)
24658 {
24659 unsigned int index;
24660 tree type, itype;
24661
24662 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
24663
24664 type = ix86_builtin_type_tab[(int) tcode];
24665 if (type != NULL)
24666 return type;
24667
24668 gcc_assert (tcode > IX86_BT_LAST_PRIM);
24669 if (tcode <= IX86_BT_LAST_VECT)
24670 {
24671 enum machine_mode mode;
24672
24673 index = tcode - IX86_BT_LAST_PRIM - 1;
24674 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
24675 mode = ix86_builtin_type_vect_mode[index];
24676
24677 type = build_vector_type_for_mode (itype, mode);
24678 }
24679 else
24680 {
24681 int quals;
24682
24683 index = tcode - IX86_BT_LAST_VECT - 1;
24684 if (tcode <= IX86_BT_LAST_PTR)
24685 quals = TYPE_UNQUALIFIED;
24686 else
24687 quals = TYPE_QUAL_CONST;
24688
24689 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
24690 if (quals != TYPE_UNQUALIFIED)
24691 itype = build_qualified_type (itype, quals);
24692
24693 type = build_pointer_type (itype);
24694 }
24695
24696 ix86_builtin_type_tab[(int) tcode] = type;
24697 return type;
24698 }
24699
24700 /* Table for the ix86 builtin function types. */
24701 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
24702
24703 /* Retrieve an element from the above table, building some of
24704 the types lazily. */
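/* Illustrative note (added): for the non-alias codes the return type and
   argument types are stored as one run in ix86_builtin_func_args, delimited
   by ix86_builtin_func_start; the loop below walks that run backwards,
   consing the argument list onto void_list_node before calling
   build_function_type.  Alias codes simply reuse the type of their base
   code.  */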
24705
24706 static tree
24707 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
24708 {
24709 tree type;
24710
24711 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
24712
24713 type = ix86_builtin_func_type_tab[(int) tcode];
24714 if (type != NULL)
24715 return type;
24716
24717 if (tcode <= IX86_BT_LAST_FUNC)
24718 {
24719 unsigned start = ix86_builtin_func_start[(int) tcode];
24720 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
24721 tree rtype, atype, args = void_list_node;
24722 unsigned i;
24723
24724 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
24725 for (i = after - 1; i > start; --i)
24726 {
24727 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
24728 args = tree_cons (NULL, atype, args);
24729 }
24730
24731 type = build_function_type (rtype, args);
24732 }
24733 else
24734 {
24735 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
24736 enum ix86_builtin_func_type icode;
24737
24738 icode = ix86_builtin_func_alias_base[index];
24739 type = ix86_get_builtin_func_type (icode);
24740 }
24741
24742 ix86_builtin_func_type_tab[(int) tcode] = type;
24743 return type;
24744 }
24745
24746
24747 /* Codes for all the SSE/MMX builtins. */
24748 enum ix86_builtins
24749 {
24750 IX86_BUILTIN_ADDPS,
24751 IX86_BUILTIN_ADDSS,
24752 IX86_BUILTIN_DIVPS,
24753 IX86_BUILTIN_DIVSS,
24754 IX86_BUILTIN_MULPS,
24755 IX86_BUILTIN_MULSS,
24756 IX86_BUILTIN_SUBPS,
24757 IX86_BUILTIN_SUBSS,
24758
24759 IX86_BUILTIN_CMPEQPS,
24760 IX86_BUILTIN_CMPLTPS,
24761 IX86_BUILTIN_CMPLEPS,
24762 IX86_BUILTIN_CMPGTPS,
24763 IX86_BUILTIN_CMPGEPS,
24764 IX86_BUILTIN_CMPNEQPS,
24765 IX86_BUILTIN_CMPNLTPS,
24766 IX86_BUILTIN_CMPNLEPS,
24767 IX86_BUILTIN_CMPNGTPS,
24768 IX86_BUILTIN_CMPNGEPS,
24769 IX86_BUILTIN_CMPORDPS,
24770 IX86_BUILTIN_CMPUNORDPS,
24771 IX86_BUILTIN_CMPEQSS,
24772 IX86_BUILTIN_CMPLTSS,
24773 IX86_BUILTIN_CMPLESS,
24774 IX86_BUILTIN_CMPNEQSS,
24775 IX86_BUILTIN_CMPNLTSS,
24776 IX86_BUILTIN_CMPNLESS,
24777 IX86_BUILTIN_CMPNGTSS,
24778 IX86_BUILTIN_CMPNGESS,
24779 IX86_BUILTIN_CMPORDSS,
24780 IX86_BUILTIN_CMPUNORDSS,
24781
24782 IX86_BUILTIN_COMIEQSS,
24783 IX86_BUILTIN_COMILTSS,
24784 IX86_BUILTIN_COMILESS,
24785 IX86_BUILTIN_COMIGTSS,
24786 IX86_BUILTIN_COMIGESS,
24787 IX86_BUILTIN_COMINEQSS,
24788 IX86_BUILTIN_UCOMIEQSS,
24789 IX86_BUILTIN_UCOMILTSS,
24790 IX86_BUILTIN_UCOMILESS,
24791 IX86_BUILTIN_UCOMIGTSS,
24792 IX86_BUILTIN_UCOMIGESS,
24793 IX86_BUILTIN_UCOMINEQSS,
24794
24795 IX86_BUILTIN_CVTPI2PS,
24796 IX86_BUILTIN_CVTPS2PI,
24797 IX86_BUILTIN_CVTSI2SS,
24798 IX86_BUILTIN_CVTSI642SS,
24799 IX86_BUILTIN_CVTSS2SI,
24800 IX86_BUILTIN_CVTSS2SI64,
24801 IX86_BUILTIN_CVTTPS2PI,
24802 IX86_BUILTIN_CVTTSS2SI,
24803 IX86_BUILTIN_CVTTSS2SI64,
24804
24805 IX86_BUILTIN_MAXPS,
24806 IX86_BUILTIN_MAXSS,
24807 IX86_BUILTIN_MINPS,
24808 IX86_BUILTIN_MINSS,
24809
24810 IX86_BUILTIN_LOADUPS,
24811 IX86_BUILTIN_STOREUPS,
24812 IX86_BUILTIN_MOVSS,
24813
24814 IX86_BUILTIN_MOVHLPS,
24815 IX86_BUILTIN_MOVLHPS,
24816 IX86_BUILTIN_LOADHPS,
24817 IX86_BUILTIN_LOADLPS,
24818 IX86_BUILTIN_STOREHPS,
24819 IX86_BUILTIN_STORELPS,
24820
24821 IX86_BUILTIN_MASKMOVQ,
24822 IX86_BUILTIN_MOVMSKPS,
24823 IX86_BUILTIN_PMOVMSKB,
24824
24825 IX86_BUILTIN_MOVNTPS,
24826 IX86_BUILTIN_MOVNTQ,
24827
24828 IX86_BUILTIN_LOADDQU,
24829 IX86_BUILTIN_STOREDQU,
24830
24831 IX86_BUILTIN_PACKSSWB,
24832 IX86_BUILTIN_PACKSSDW,
24833 IX86_BUILTIN_PACKUSWB,
24834
24835 IX86_BUILTIN_PADDB,
24836 IX86_BUILTIN_PADDW,
24837 IX86_BUILTIN_PADDD,
24838 IX86_BUILTIN_PADDQ,
24839 IX86_BUILTIN_PADDSB,
24840 IX86_BUILTIN_PADDSW,
24841 IX86_BUILTIN_PADDUSB,
24842 IX86_BUILTIN_PADDUSW,
24843 IX86_BUILTIN_PSUBB,
24844 IX86_BUILTIN_PSUBW,
24845 IX86_BUILTIN_PSUBD,
24846 IX86_BUILTIN_PSUBQ,
24847 IX86_BUILTIN_PSUBSB,
24848 IX86_BUILTIN_PSUBSW,
24849 IX86_BUILTIN_PSUBUSB,
24850 IX86_BUILTIN_PSUBUSW,
24851
24852 IX86_BUILTIN_PAND,
24853 IX86_BUILTIN_PANDN,
24854 IX86_BUILTIN_POR,
24855 IX86_BUILTIN_PXOR,
24856
24857 IX86_BUILTIN_PAVGB,
24858 IX86_BUILTIN_PAVGW,
24859
24860 IX86_BUILTIN_PCMPEQB,
24861 IX86_BUILTIN_PCMPEQW,
24862 IX86_BUILTIN_PCMPEQD,
24863 IX86_BUILTIN_PCMPGTB,
24864 IX86_BUILTIN_PCMPGTW,
24865 IX86_BUILTIN_PCMPGTD,
24866
24867 IX86_BUILTIN_PMADDWD,
24868
24869 IX86_BUILTIN_PMAXSW,
24870 IX86_BUILTIN_PMAXUB,
24871 IX86_BUILTIN_PMINSW,
24872 IX86_BUILTIN_PMINUB,
24873
24874 IX86_BUILTIN_PMULHUW,
24875 IX86_BUILTIN_PMULHW,
24876 IX86_BUILTIN_PMULLW,
24877
24878 IX86_BUILTIN_PSADBW,
24879 IX86_BUILTIN_PSHUFW,
24880
24881 IX86_BUILTIN_PSLLW,
24882 IX86_BUILTIN_PSLLD,
24883 IX86_BUILTIN_PSLLQ,
24884 IX86_BUILTIN_PSRAW,
24885 IX86_BUILTIN_PSRAD,
24886 IX86_BUILTIN_PSRLW,
24887 IX86_BUILTIN_PSRLD,
24888 IX86_BUILTIN_PSRLQ,
24889 IX86_BUILTIN_PSLLWI,
24890 IX86_BUILTIN_PSLLDI,
24891 IX86_BUILTIN_PSLLQI,
24892 IX86_BUILTIN_PSRAWI,
24893 IX86_BUILTIN_PSRADI,
24894 IX86_BUILTIN_PSRLWI,
24895 IX86_BUILTIN_PSRLDI,
24896 IX86_BUILTIN_PSRLQI,
24897
24898 IX86_BUILTIN_PUNPCKHBW,
24899 IX86_BUILTIN_PUNPCKHWD,
24900 IX86_BUILTIN_PUNPCKHDQ,
24901 IX86_BUILTIN_PUNPCKLBW,
24902 IX86_BUILTIN_PUNPCKLWD,
24903 IX86_BUILTIN_PUNPCKLDQ,
24904
24905 IX86_BUILTIN_SHUFPS,
24906
24907 IX86_BUILTIN_RCPPS,
24908 IX86_BUILTIN_RCPSS,
24909 IX86_BUILTIN_RSQRTPS,
24910 IX86_BUILTIN_RSQRTPS_NR,
24911 IX86_BUILTIN_RSQRTSS,
24912 IX86_BUILTIN_RSQRTF,
24913 IX86_BUILTIN_SQRTPS,
24914 IX86_BUILTIN_SQRTPS_NR,
24915 IX86_BUILTIN_SQRTSS,
24916
24917 IX86_BUILTIN_UNPCKHPS,
24918 IX86_BUILTIN_UNPCKLPS,
24919
24920 IX86_BUILTIN_ANDPS,
24921 IX86_BUILTIN_ANDNPS,
24922 IX86_BUILTIN_ORPS,
24923 IX86_BUILTIN_XORPS,
24924
24925 IX86_BUILTIN_EMMS,
24926 IX86_BUILTIN_LDMXCSR,
24927 IX86_BUILTIN_STMXCSR,
24928 IX86_BUILTIN_SFENCE,
24929
24930 /* 3DNow! Original */
24931 IX86_BUILTIN_FEMMS,
24932 IX86_BUILTIN_PAVGUSB,
24933 IX86_BUILTIN_PF2ID,
24934 IX86_BUILTIN_PFACC,
24935 IX86_BUILTIN_PFADD,
24936 IX86_BUILTIN_PFCMPEQ,
24937 IX86_BUILTIN_PFCMPGE,
24938 IX86_BUILTIN_PFCMPGT,
24939 IX86_BUILTIN_PFMAX,
24940 IX86_BUILTIN_PFMIN,
24941 IX86_BUILTIN_PFMUL,
24942 IX86_BUILTIN_PFRCP,
24943 IX86_BUILTIN_PFRCPIT1,
24944 IX86_BUILTIN_PFRCPIT2,
24945 IX86_BUILTIN_PFRSQIT1,
24946 IX86_BUILTIN_PFRSQRT,
24947 IX86_BUILTIN_PFSUB,
24948 IX86_BUILTIN_PFSUBR,
24949 IX86_BUILTIN_PI2FD,
24950 IX86_BUILTIN_PMULHRW,
24951
24952 /* 3DNow! Athlon Extensions */
24953 IX86_BUILTIN_PF2IW,
24954 IX86_BUILTIN_PFNACC,
24955 IX86_BUILTIN_PFPNACC,
24956 IX86_BUILTIN_PI2FW,
24957 IX86_BUILTIN_PSWAPDSI,
24958 IX86_BUILTIN_PSWAPDSF,
24959
24960 /* SSE2 */
24961 IX86_BUILTIN_ADDPD,
24962 IX86_BUILTIN_ADDSD,
24963 IX86_BUILTIN_DIVPD,
24964 IX86_BUILTIN_DIVSD,
24965 IX86_BUILTIN_MULPD,
24966 IX86_BUILTIN_MULSD,
24967 IX86_BUILTIN_SUBPD,
24968 IX86_BUILTIN_SUBSD,
24969
24970 IX86_BUILTIN_CMPEQPD,
24971 IX86_BUILTIN_CMPLTPD,
24972 IX86_BUILTIN_CMPLEPD,
24973 IX86_BUILTIN_CMPGTPD,
24974 IX86_BUILTIN_CMPGEPD,
24975 IX86_BUILTIN_CMPNEQPD,
24976 IX86_BUILTIN_CMPNLTPD,
24977 IX86_BUILTIN_CMPNLEPD,
24978 IX86_BUILTIN_CMPNGTPD,
24979 IX86_BUILTIN_CMPNGEPD,
24980 IX86_BUILTIN_CMPORDPD,
24981 IX86_BUILTIN_CMPUNORDPD,
24982 IX86_BUILTIN_CMPEQSD,
24983 IX86_BUILTIN_CMPLTSD,
24984 IX86_BUILTIN_CMPLESD,
24985 IX86_BUILTIN_CMPNEQSD,
24986 IX86_BUILTIN_CMPNLTSD,
24987 IX86_BUILTIN_CMPNLESD,
24988 IX86_BUILTIN_CMPORDSD,
24989 IX86_BUILTIN_CMPUNORDSD,
24990
24991 IX86_BUILTIN_COMIEQSD,
24992 IX86_BUILTIN_COMILTSD,
24993 IX86_BUILTIN_COMILESD,
24994 IX86_BUILTIN_COMIGTSD,
24995 IX86_BUILTIN_COMIGESD,
24996 IX86_BUILTIN_COMINEQSD,
24997 IX86_BUILTIN_UCOMIEQSD,
24998 IX86_BUILTIN_UCOMILTSD,
24999 IX86_BUILTIN_UCOMILESD,
25000 IX86_BUILTIN_UCOMIGTSD,
25001 IX86_BUILTIN_UCOMIGESD,
25002 IX86_BUILTIN_UCOMINEQSD,
25003
25004 IX86_BUILTIN_MAXPD,
25005 IX86_BUILTIN_MAXSD,
25006 IX86_BUILTIN_MINPD,
25007 IX86_BUILTIN_MINSD,
25008
25009 IX86_BUILTIN_ANDPD,
25010 IX86_BUILTIN_ANDNPD,
25011 IX86_BUILTIN_ORPD,
25012 IX86_BUILTIN_XORPD,
25013
25014 IX86_BUILTIN_SQRTPD,
25015 IX86_BUILTIN_SQRTSD,
25016
25017 IX86_BUILTIN_UNPCKHPD,
25018 IX86_BUILTIN_UNPCKLPD,
25019
25020 IX86_BUILTIN_SHUFPD,
25021
25022 IX86_BUILTIN_LOADUPD,
25023 IX86_BUILTIN_STOREUPD,
25024 IX86_BUILTIN_MOVSD,
25025
25026 IX86_BUILTIN_LOADHPD,
25027 IX86_BUILTIN_LOADLPD,
25028
25029 IX86_BUILTIN_CVTDQ2PD,
25030 IX86_BUILTIN_CVTDQ2PS,
25031
25032 IX86_BUILTIN_CVTPD2DQ,
25033 IX86_BUILTIN_CVTPD2PI,
25034 IX86_BUILTIN_CVTPD2PS,
25035 IX86_BUILTIN_CVTTPD2DQ,
25036 IX86_BUILTIN_CVTTPD2PI,
25037
25038 IX86_BUILTIN_CVTPI2PD,
25039 IX86_BUILTIN_CVTSI2SD,
25040 IX86_BUILTIN_CVTSI642SD,
25041
25042 IX86_BUILTIN_CVTSD2SI,
25043 IX86_BUILTIN_CVTSD2SI64,
25044 IX86_BUILTIN_CVTSD2SS,
25045 IX86_BUILTIN_CVTSS2SD,
25046 IX86_BUILTIN_CVTTSD2SI,
25047 IX86_BUILTIN_CVTTSD2SI64,
25048
25049 IX86_BUILTIN_CVTPS2DQ,
25050 IX86_BUILTIN_CVTPS2PD,
25051 IX86_BUILTIN_CVTTPS2DQ,
25052
25053 IX86_BUILTIN_MOVNTI,
25054 IX86_BUILTIN_MOVNTI64,
25055 IX86_BUILTIN_MOVNTPD,
25056 IX86_BUILTIN_MOVNTDQ,
25057
25058 IX86_BUILTIN_MOVQ128,
25059
25060 /* SSE2 MMX */
25061 IX86_BUILTIN_MASKMOVDQU,
25062 IX86_BUILTIN_MOVMSKPD,
25063 IX86_BUILTIN_PMOVMSKB128,
25064
25065 IX86_BUILTIN_PACKSSWB128,
25066 IX86_BUILTIN_PACKSSDW128,
25067 IX86_BUILTIN_PACKUSWB128,
25068
25069 IX86_BUILTIN_PADDB128,
25070 IX86_BUILTIN_PADDW128,
25071 IX86_BUILTIN_PADDD128,
25072 IX86_BUILTIN_PADDQ128,
25073 IX86_BUILTIN_PADDSB128,
25074 IX86_BUILTIN_PADDSW128,
25075 IX86_BUILTIN_PADDUSB128,
25076 IX86_BUILTIN_PADDUSW128,
25077 IX86_BUILTIN_PSUBB128,
25078 IX86_BUILTIN_PSUBW128,
25079 IX86_BUILTIN_PSUBD128,
25080 IX86_BUILTIN_PSUBQ128,
25081 IX86_BUILTIN_PSUBSB128,
25082 IX86_BUILTIN_PSUBSW128,
25083 IX86_BUILTIN_PSUBUSB128,
25084 IX86_BUILTIN_PSUBUSW128,
25085
25086 IX86_BUILTIN_PAND128,
25087 IX86_BUILTIN_PANDN128,
25088 IX86_BUILTIN_POR128,
25089 IX86_BUILTIN_PXOR128,
25090
25091 IX86_BUILTIN_PAVGB128,
25092 IX86_BUILTIN_PAVGW128,
25093
25094 IX86_BUILTIN_PCMPEQB128,
25095 IX86_BUILTIN_PCMPEQW128,
25096 IX86_BUILTIN_PCMPEQD128,
25097 IX86_BUILTIN_PCMPGTB128,
25098 IX86_BUILTIN_PCMPGTW128,
25099 IX86_BUILTIN_PCMPGTD128,
25100
25101 IX86_BUILTIN_PMADDWD128,
25102
25103 IX86_BUILTIN_PMAXSW128,
25104 IX86_BUILTIN_PMAXUB128,
25105 IX86_BUILTIN_PMINSW128,
25106 IX86_BUILTIN_PMINUB128,
25107
25108 IX86_BUILTIN_PMULUDQ,
25109 IX86_BUILTIN_PMULUDQ128,
25110 IX86_BUILTIN_PMULHUW128,
25111 IX86_BUILTIN_PMULHW128,
25112 IX86_BUILTIN_PMULLW128,
25113
25114 IX86_BUILTIN_PSADBW128,
25115 IX86_BUILTIN_PSHUFHW,
25116 IX86_BUILTIN_PSHUFLW,
25117 IX86_BUILTIN_PSHUFD,
25118
25119 IX86_BUILTIN_PSLLDQI128,
25120 IX86_BUILTIN_PSLLWI128,
25121 IX86_BUILTIN_PSLLDI128,
25122 IX86_BUILTIN_PSLLQI128,
25123 IX86_BUILTIN_PSRAWI128,
25124 IX86_BUILTIN_PSRADI128,
25125 IX86_BUILTIN_PSRLDQI128,
25126 IX86_BUILTIN_PSRLWI128,
25127 IX86_BUILTIN_PSRLDI128,
25128 IX86_BUILTIN_PSRLQI128,
25129
25130 IX86_BUILTIN_PSLLDQ128,
25131 IX86_BUILTIN_PSLLW128,
25132 IX86_BUILTIN_PSLLD128,
25133 IX86_BUILTIN_PSLLQ128,
25134 IX86_BUILTIN_PSRAW128,
25135 IX86_BUILTIN_PSRAD128,
25136 IX86_BUILTIN_PSRLW128,
25137 IX86_BUILTIN_PSRLD128,
25138 IX86_BUILTIN_PSRLQ128,
25139
25140 IX86_BUILTIN_PUNPCKHBW128,
25141 IX86_BUILTIN_PUNPCKHWD128,
25142 IX86_BUILTIN_PUNPCKHDQ128,
25143 IX86_BUILTIN_PUNPCKHQDQ128,
25144 IX86_BUILTIN_PUNPCKLBW128,
25145 IX86_BUILTIN_PUNPCKLWD128,
25146 IX86_BUILTIN_PUNPCKLDQ128,
25147 IX86_BUILTIN_PUNPCKLQDQ128,
25148
25149 IX86_BUILTIN_CLFLUSH,
25150 IX86_BUILTIN_MFENCE,
25151 IX86_BUILTIN_LFENCE,
25152 IX86_BUILTIN_PAUSE,
25153
25154 IX86_BUILTIN_BSRSI,
25155 IX86_BUILTIN_BSRDI,
25156 IX86_BUILTIN_RDPMC,
25157 IX86_BUILTIN_RDTSC,
25158 IX86_BUILTIN_RDTSCP,
25159 IX86_BUILTIN_ROLQI,
25160 IX86_BUILTIN_ROLHI,
25161 IX86_BUILTIN_RORQI,
25162 IX86_BUILTIN_RORHI,
25163
25164 /* SSE3. */
25165 IX86_BUILTIN_ADDSUBPS,
25166 IX86_BUILTIN_HADDPS,
25167 IX86_BUILTIN_HSUBPS,
25168 IX86_BUILTIN_MOVSHDUP,
25169 IX86_BUILTIN_MOVSLDUP,
25170 IX86_BUILTIN_ADDSUBPD,
25171 IX86_BUILTIN_HADDPD,
25172 IX86_BUILTIN_HSUBPD,
25173 IX86_BUILTIN_LDDQU,
25174
25175 IX86_BUILTIN_MONITOR,
25176 IX86_BUILTIN_MWAIT,
25177
25178 /* SSSE3. */
25179 IX86_BUILTIN_PHADDW,
25180 IX86_BUILTIN_PHADDD,
25181 IX86_BUILTIN_PHADDSW,
25182 IX86_BUILTIN_PHSUBW,
25183 IX86_BUILTIN_PHSUBD,
25184 IX86_BUILTIN_PHSUBSW,
25185 IX86_BUILTIN_PMADDUBSW,
25186 IX86_BUILTIN_PMULHRSW,
25187 IX86_BUILTIN_PSHUFB,
25188 IX86_BUILTIN_PSIGNB,
25189 IX86_BUILTIN_PSIGNW,
25190 IX86_BUILTIN_PSIGND,
25191 IX86_BUILTIN_PALIGNR,
25192 IX86_BUILTIN_PABSB,
25193 IX86_BUILTIN_PABSW,
25194 IX86_BUILTIN_PABSD,
25195
25196 IX86_BUILTIN_PHADDW128,
25197 IX86_BUILTIN_PHADDD128,
25198 IX86_BUILTIN_PHADDSW128,
25199 IX86_BUILTIN_PHSUBW128,
25200 IX86_BUILTIN_PHSUBD128,
25201 IX86_BUILTIN_PHSUBSW128,
25202 IX86_BUILTIN_PMADDUBSW128,
25203 IX86_BUILTIN_PMULHRSW128,
25204 IX86_BUILTIN_PSHUFB128,
25205 IX86_BUILTIN_PSIGNB128,
25206 IX86_BUILTIN_PSIGNW128,
25207 IX86_BUILTIN_PSIGND128,
25208 IX86_BUILTIN_PALIGNR128,
25209 IX86_BUILTIN_PABSB128,
25210 IX86_BUILTIN_PABSW128,
25211 IX86_BUILTIN_PABSD128,
25212
25213 /* AMDFAM10 - SSE4A New Instructions. */
25214 IX86_BUILTIN_MOVNTSD,
25215 IX86_BUILTIN_MOVNTSS,
25216 IX86_BUILTIN_EXTRQI,
25217 IX86_BUILTIN_EXTRQ,
25218 IX86_BUILTIN_INSERTQI,
25219 IX86_BUILTIN_INSERTQ,
25220
25221 /* SSE4.1. */
25222 IX86_BUILTIN_BLENDPD,
25223 IX86_BUILTIN_BLENDPS,
25224 IX86_BUILTIN_BLENDVPD,
25225 IX86_BUILTIN_BLENDVPS,
25226 IX86_BUILTIN_PBLENDVB128,
25227 IX86_BUILTIN_PBLENDW128,
25228
25229 IX86_BUILTIN_DPPD,
25230 IX86_BUILTIN_DPPS,
25231
25232 IX86_BUILTIN_INSERTPS128,
25233
25234 IX86_BUILTIN_MOVNTDQA,
25235 IX86_BUILTIN_MPSADBW128,
25236 IX86_BUILTIN_PACKUSDW128,
25237 IX86_BUILTIN_PCMPEQQ,
25238 IX86_BUILTIN_PHMINPOSUW128,
25239
25240 IX86_BUILTIN_PMAXSB128,
25241 IX86_BUILTIN_PMAXSD128,
25242 IX86_BUILTIN_PMAXUD128,
25243 IX86_BUILTIN_PMAXUW128,
25244
25245 IX86_BUILTIN_PMINSB128,
25246 IX86_BUILTIN_PMINSD128,
25247 IX86_BUILTIN_PMINUD128,
25248 IX86_BUILTIN_PMINUW128,
25249
25250 IX86_BUILTIN_PMOVSXBW128,
25251 IX86_BUILTIN_PMOVSXBD128,
25252 IX86_BUILTIN_PMOVSXBQ128,
25253 IX86_BUILTIN_PMOVSXWD128,
25254 IX86_BUILTIN_PMOVSXWQ128,
25255 IX86_BUILTIN_PMOVSXDQ128,
25256
25257 IX86_BUILTIN_PMOVZXBW128,
25258 IX86_BUILTIN_PMOVZXBD128,
25259 IX86_BUILTIN_PMOVZXBQ128,
25260 IX86_BUILTIN_PMOVZXWD128,
25261 IX86_BUILTIN_PMOVZXWQ128,
25262 IX86_BUILTIN_PMOVZXDQ128,
25263
25264 IX86_BUILTIN_PMULDQ128,
25265 IX86_BUILTIN_PMULLD128,
25266
25267 IX86_BUILTIN_ROUNDSD,
25268 IX86_BUILTIN_ROUNDSS,
25269
25270 IX86_BUILTIN_ROUNDPD,
25271 IX86_BUILTIN_ROUNDPS,
25272
25273 IX86_BUILTIN_FLOORPD,
25274 IX86_BUILTIN_CEILPD,
25275 IX86_BUILTIN_TRUNCPD,
25276 IX86_BUILTIN_RINTPD,
25277 IX86_BUILTIN_ROUNDPD_AZ,
25278
25279 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
25280 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
25281 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
25282
25283 IX86_BUILTIN_FLOORPS,
25284 IX86_BUILTIN_CEILPS,
25285 IX86_BUILTIN_TRUNCPS,
25286 IX86_BUILTIN_RINTPS,
25287 IX86_BUILTIN_ROUNDPS_AZ,
25288
25289 IX86_BUILTIN_FLOORPS_SFIX,
25290 IX86_BUILTIN_CEILPS_SFIX,
25291 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
25292
25293 IX86_BUILTIN_PTESTZ,
25294 IX86_BUILTIN_PTESTC,
25295 IX86_BUILTIN_PTESTNZC,
25296
25297 IX86_BUILTIN_VEC_INIT_V2SI,
25298 IX86_BUILTIN_VEC_INIT_V4HI,
25299 IX86_BUILTIN_VEC_INIT_V8QI,
25300 IX86_BUILTIN_VEC_EXT_V2DF,
25301 IX86_BUILTIN_VEC_EXT_V2DI,
25302 IX86_BUILTIN_VEC_EXT_V4SF,
25303 IX86_BUILTIN_VEC_EXT_V4SI,
25304 IX86_BUILTIN_VEC_EXT_V8HI,
25305 IX86_BUILTIN_VEC_EXT_V2SI,
25306 IX86_BUILTIN_VEC_EXT_V4HI,
25307 IX86_BUILTIN_VEC_EXT_V16QI,
25308 IX86_BUILTIN_VEC_SET_V2DI,
25309 IX86_BUILTIN_VEC_SET_V4SF,
25310 IX86_BUILTIN_VEC_SET_V4SI,
25311 IX86_BUILTIN_VEC_SET_V8HI,
25312 IX86_BUILTIN_VEC_SET_V4HI,
25313 IX86_BUILTIN_VEC_SET_V16QI,
25314
25315 IX86_BUILTIN_VEC_PACK_SFIX,
25316 IX86_BUILTIN_VEC_PACK_SFIX256,
25317
25318 /* SSE4.2. */
25319 IX86_BUILTIN_CRC32QI,
25320 IX86_BUILTIN_CRC32HI,
25321 IX86_BUILTIN_CRC32SI,
25322 IX86_BUILTIN_CRC32DI,
25323
25324 IX86_BUILTIN_PCMPESTRI128,
25325 IX86_BUILTIN_PCMPESTRM128,
25326 IX86_BUILTIN_PCMPESTRA128,
25327 IX86_BUILTIN_PCMPESTRC128,
25328 IX86_BUILTIN_PCMPESTRO128,
25329 IX86_BUILTIN_PCMPESTRS128,
25330 IX86_BUILTIN_PCMPESTRZ128,
25331 IX86_BUILTIN_PCMPISTRI128,
25332 IX86_BUILTIN_PCMPISTRM128,
25333 IX86_BUILTIN_PCMPISTRA128,
25334 IX86_BUILTIN_PCMPISTRC128,
25335 IX86_BUILTIN_PCMPISTRO128,
25336 IX86_BUILTIN_PCMPISTRS128,
25337 IX86_BUILTIN_PCMPISTRZ128,
25338
25339 IX86_BUILTIN_PCMPGTQ,
25340
25341 /* AES instructions */
25342 IX86_BUILTIN_AESENC128,
25343 IX86_BUILTIN_AESENCLAST128,
25344 IX86_BUILTIN_AESDEC128,
25345 IX86_BUILTIN_AESDECLAST128,
25346 IX86_BUILTIN_AESIMC128,
25347 IX86_BUILTIN_AESKEYGENASSIST128,
25348
25349 /* PCLMUL instruction */
25350 IX86_BUILTIN_PCLMULQDQ128,
25351
25352 /* AVX */
25353 IX86_BUILTIN_ADDPD256,
25354 IX86_BUILTIN_ADDPS256,
25355 IX86_BUILTIN_ADDSUBPD256,
25356 IX86_BUILTIN_ADDSUBPS256,
25357 IX86_BUILTIN_ANDPD256,
25358 IX86_BUILTIN_ANDPS256,
25359 IX86_BUILTIN_ANDNPD256,
25360 IX86_BUILTIN_ANDNPS256,
25361 IX86_BUILTIN_BLENDPD256,
25362 IX86_BUILTIN_BLENDPS256,
25363 IX86_BUILTIN_BLENDVPD256,
25364 IX86_BUILTIN_BLENDVPS256,
25365 IX86_BUILTIN_DIVPD256,
25366 IX86_BUILTIN_DIVPS256,
25367 IX86_BUILTIN_DPPS256,
25368 IX86_BUILTIN_HADDPD256,
25369 IX86_BUILTIN_HADDPS256,
25370 IX86_BUILTIN_HSUBPD256,
25371 IX86_BUILTIN_HSUBPS256,
25372 IX86_BUILTIN_MAXPD256,
25373 IX86_BUILTIN_MAXPS256,
25374 IX86_BUILTIN_MINPD256,
25375 IX86_BUILTIN_MINPS256,
25376 IX86_BUILTIN_MULPD256,
25377 IX86_BUILTIN_MULPS256,
25378 IX86_BUILTIN_ORPD256,
25379 IX86_BUILTIN_ORPS256,
25380 IX86_BUILTIN_SHUFPD256,
25381 IX86_BUILTIN_SHUFPS256,
25382 IX86_BUILTIN_SUBPD256,
25383 IX86_BUILTIN_SUBPS256,
25384 IX86_BUILTIN_XORPD256,
25385 IX86_BUILTIN_XORPS256,
25386 IX86_BUILTIN_CMPSD,
25387 IX86_BUILTIN_CMPSS,
25388 IX86_BUILTIN_CMPPD,
25389 IX86_BUILTIN_CMPPS,
25390 IX86_BUILTIN_CMPPD256,
25391 IX86_BUILTIN_CMPPS256,
25392 IX86_BUILTIN_CVTDQ2PD256,
25393 IX86_BUILTIN_CVTDQ2PS256,
25394 IX86_BUILTIN_CVTPD2PS256,
25395 IX86_BUILTIN_CVTPS2DQ256,
25396 IX86_BUILTIN_CVTPS2PD256,
25397 IX86_BUILTIN_CVTTPD2DQ256,
25398 IX86_BUILTIN_CVTPD2DQ256,
25399 IX86_BUILTIN_CVTTPS2DQ256,
25400 IX86_BUILTIN_EXTRACTF128PD256,
25401 IX86_BUILTIN_EXTRACTF128PS256,
25402 IX86_BUILTIN_EXTRACTF128SI256,
25403 IX86_BUILTIN_VZEROALL,
25404 IX86_BUILTIN_VZEROUPPER,
25405 IX86_BUILTIN_VPERMILVARPD,
25406 IX86_BUILTIN_VPERMILVARPS,
25407 IX86_BUILTIN_VPERMILVARPD256,
25408 IX86_BUILTIN_VPERMILVARPS256,
25409 IX86_BUILTIN_VPERMILPD,
25410 IX86_BUILTIN_VPERMILPS,
25411 IX86_BUILTIN_VPERMILPD256,
25412 IX86_BUILTIN_VPERMILPS256,
25413 IX86_BUILTIN_VPERMIL2PD,
25414 IX86_BUILTIN_VPERMIL2PS,
25415 IX86_BUILTIN_VPERMIL2PD256,
25416 IX86_BUILTIN_VPERMIL2PS256,
25417 IX86_BUILTIN_VPERM2F128PD256,
25418 IX86_BUILTIN_VPERM2F128PS256,
25419 IX86_BUILTIN_VPERM2F128SI256,
25420 IX86_BUILTIN_VBROADCASTSS,
25421 IX86_BUILTIN_VBROADCASTSD256,
25422 IX86_BUILTIN_VBROADCASTSS256,
25423 IX86_BUILTIN_VBROADCASTPD256,
25424 IX86_BUILTIN_VBROADCASTPS256,
25425 IX86_BUILTIN_VINSERTF128PD256,
25426 IX86_BUILTIN_VINSERTF128PS256,
25427 IX86_BUILTIN_VINSERTF128SI256,
25428 IX86_BUILTIN_LOADUPD256,
25429 IX86_BUILTIN_LOADUPS256,
25430 IX86_BUILTIN_STOREUPD256,
25431 IX86_BUILTIN_STOREUPS256,
25432 IX86_BUILTIN_LDDQU256,
25433 IX86_BUILTIN_MOVNTDQ256,
25434 IX86_BUILTIN_MOVNTPD256,
25435 IX86_BUILTIN_MOVNTPS256,
25436 IX86_BUILTIN_LOADDQU256,
25437 IX86_BUILTIN_STOREDQU256,
25438 IX86_BUILTIN_MASKLOADPD,
25439 IX86_BUILTIN_MASKLOADPS,
25440 IX86_BUILTIN_MASKSTOREPD,
25441 IX86_BUILTIN_MASKSTOREPS,
25442 IX86_BUILTIN_MASKLOADPD256,
25443 IX86_BUILTIN_MASKLOADPS256,
25444 IX86_BUILTIN_MASKSTOREPD256,
25445 IX86_BUILTIN_MASKSTOREPS256,
25446 IX86_BUILTIN_MOVSHDUP256,
25447 IX86_BUILTIN_MOVSLDUP256,
25448 IX86_BUILTIN_MOVDDUP256,
25449
25450 IX86_BUILTIN_SQRTPD256,
25451 IX86_BUILTIN_SQRTPS256,
25452 IX86_BUILTIN_SQRTPS_NR256,
25453 IX86_BUILTIN_RSQRTPS256,
25454 IX86_BUILTIN_RSQRTPS_NR256,
25455
25456 IX86_BUILTIN_RCPPS256,
25457
25458 IX86_BUILTIN_ROUNDPD256,
25459 IX86_BUILTIN_ROUNDPS256,
25460
25461 IX86_BUILTIN_FLOORPD256,
25462 IX86_BUILTIN_CEILPD256,
25463 IX86_BUILTIN_TRUNCPD256,
25464 IX86_BUILTIN_RINTPD256,
25465 IX86_BUILTIN_ROUNDPD_AZ256,
25466
25467 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
25468 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
25469 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
25470
25471 IX86_BUILTIN_FLOORPS256,
25472 IX86_BUILTIN_CEILPS256,
25473 IX86_BUILTIN_TRUNCPS256,
25474 IX86_BUILTIN_RINTPS256,
25475 IX86_BUILTIN_ROUNDPS_AZ256,
25476
25477 IX86_BUILTIN_FLOORPS_SFIX256,
25478 IX86_BUILTIN_CEILPS_SFIX256,
25479 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
25480
25481 IX86_BUILTIN_UNPCKHPD256,
25482 IX86_BUILTIN_UNPCKLPD256,
25483 IX86_BUILTIN_UNPCKHPS256,
25484 IX86_BUILTIN_UNPCKLPS256,
25485
25486 IX86_BUILTIN_SI256_SI,
25487 IX86_BUILTIN_PS256_PS,
25488 IX86_BUILTIN_PD256_PD,
25489 IX86_BUILTIN_SI_SI256,
25490 IX86_BUILTIN_PS_PS256,
25491 IX86_BUILTIN_PD_PD256,
25492
25493 IX86_BUILTIN_VTESTZPD,
25494 IX86_BUILTIN_VTESTCPD,
25495 IX86_BUILTIN_VTESTNZCPD,
25496 IX86_BUILTIN_VTESTZPS,
25497 IX86_BUILTIN_VTESTCPS,
25498 IX86_BUILTIN_VTESTNZCPS,
25499 IX86_BUILTIN_VTESTZPD256,
25500 IX86_BUILTIN_VTESTCPD256,
25501 IX86_BUILTIN_VTESTNZCPD256,
25502 IX86_BUILTIN_VTESTZPS256,
25503 IX86_BUILTIN_VTESTCPS256,
25504 IX86_BUILTIN_VTESTNZCPS256,
25505 IX86_BUILTIN_PTESTZ256,
25506 IX86_BUILTIN_PTESTC256,
25507 IX86_BUILTIN_PTESTNZC256,
25508
25509 IX86_BUILTIN_MOVMSKPD256,
25510 IX86_BUILTIN_MOVMSKPS256,
25511
25512 /* AVX2 */
25513 IX86_BUILTIN_MPSADBW256,
25514 IX86_BUILTIN_PABSB256,
25515 IX86_BUILTIN_PABSW256,
25516 IX86_BUILTIN_PABSD256,
25517 IX86_BUILTIN_PACKSSDW256,
25518 IX86_BUILTIN_PACKSSWB256,
25519 IX86_BUILTIN_PACKUSDW256,
25520 IX86_BUILTIN_PACKUSWB256,
25521 IX86_BUILTIN_PADDB256,
25522 IX86_BUILTIN_PADDW256,
25523 IX86_BUILTIN_PADDD256,
25524 IX86_BUILTIN_PADDQ256,
25525 IX86_BUILTIN_PADDSB256,
25526 IX86_BUILTIN_PADDSW256,
25527 IX86_BUILTIN_PADDUSB256,
25528 IX86_BUILTIN_PADDUSW256,
25529 IX86_BUILTIN_PALIGNR256,
25530 IX86_BUILTIN_AND256I,
25531 IX86_BUILTIN_ANDNOT256I,
25532 IX86_BUILTIN_PAVGB256,
25533 IX86_BUILTIN_PAVGW256,
25534 IX86_BUILTIN_PBLENDVB256,
25535 IX86_BUILTIN_PBLENDVW256,
25536 IX86_BUILTIN_PCMPEQB256,
25537 IX86_BUILTIN_PCMPEQW256,
25538 IX86_BUILTIN_PCMPEQD256,
25539 IX86_BUILTIN_PCMPEQQ256,
25540 IX86_BUILTIN_PCMPGTB256,
25541 IX86_BUILTIN_PCMPGTW256,
25542 IX86_BUILTIN_PCMPGTD256,
25543 IX86_BUILTIN_PCMPGTQ256,
25544 IX86_BUILTIN_PHADDW256,
25545 IX86_BUILTIN_PHADDD256,
25546 IX86_BUILTIN_PHADDSW256,
25547 IX86_BUILTIN_PHSUBW256,
25548 IX86_BUILTIN_PHSUBD256,
25549 IX86_BUILTIN_PHSUBSW256,
25550 IX86_BUILTIN_PMADDUBSW256,
25551 IX86_BUILTIN_PMADDWD256,
25552 IX86_BUILTIN_PMAXSB256,
25553 IX86_BUILTIN_PMAXSW256,
25554 IX86_BUILTIN_PMAXSD256,
25555 IX86_BUILTIN_PMAXUB256,
25556 IX86_BUILTIN_PMAXUW256,
25557 IX86_BUILTIN_PMAXUD256,
25558 IX86_BUILTIN_PMINSB256,
25559 IX86_BUILTIN_PMINSW256,
25560 IX86_BUILTIN_PMINSD256,
25561 IX86_BUILTIN_PMINUB256,
25562 IX86_BUILTIN_PMINUW256,
25563 IX86_BUILTIN_PMINUD256,
25564 IX86_BUILTIN_PMOVMSKB256,
25565 IX86_BUILTIN_PMOVSXBW256,
25566 IX86_BUILTIN_PMOVSXBD256,
25567 IX86_BUILTIN_PMOVSXBQ256,
25568 IX86_BUILTIN_PMOVSXWD256,
25569 IX86_BUILTIN_PMOVSXWQ256,
25570 IX86_BUILTIN_PMOVSXDQ256,
25571 IX86_BUILTIN_PMOVZXBW256,
25572 IX86_BUILTIN_PMOVZXBD256,
25573 IX86_BUILTIN_PMOVZXBQ256,
25574 IX86_BUILTIN_PMOVZXWD256,
25575 IX86_BUILTIN_PMOVZXWQ256,
25576 IX86_BUILTIN_PMOVZXDQ256,
25577 IX86_BUILTIN_PMULDQ256,
25578 IX86_BUILTIN_PMULHRSW256,
25579 IX86_BUILTIN_PMULHUW256,
25580 IX86_BUILTIN_PMULHW256,
25581 IX86_BUILTIN_PMULLW256,
25582 IX86_BUILTIN_PMULLD256,
25583 IX86_BUILTIN_PMULUDQ256,
25584 IX86_BUILTIN_POR256,
25585 IX86_BUILTIN_PSADBW256,
25586 IX86_BUILTIN_PSHUFB256,
25587 IX86_BUILTIN_PSHUFD256,
25588 IX86_BUILTIN_PSHUFHW256,
25589 IX86_BUILTIN_PSHUFLW256,
25590 IX86_BUILTIN_PSIGNB256,
25591 IX86_BUILTIN_PSIGNW256,
25592 IX86_BUILTIN_PSIGND256,
25593 IX86_BUILTIN_PSLLDQI256,
25594 IX86_BUILTIN_PSLLWI256,
25595 IX86_BUILTIN_PSLLW256,
25596 IX86_BUILTIN_PSLLDI256,
25597 IX86_BUILTIN_PSLLD256,
25598 IX86_BUILTIN_PSLLQI256,
25599 IX86_BUILTIN_PSLLQ256,
25600 IX86_BUILTIN_PSRAWI256,
25601 IX86_BUILTIN_PSRAW256,
25602 IX86_BUILTIN_PSRADI256,
25603 IX86_BUILTIN_PSRAD256,
25604 IX86_BUILTIN_PSRLDQI256,
25605 IX86_BUILTIN_PSRLWI256,
25606 IX86_BUILTIN_PSRLW256,
25607 IX86_BUILTIN_PSRLDI256,
25608 IX86_BUILTIN_PSRLD256,
25609 IX86_BUILTIN_PSRLQI256,
25610 IX86_BUILTIN_PSRLQ256,
25611 IX86_BUILTIN_PSUBB256,
25612 IX86_BUILTIN_PSUBW256,
25613 IX86_BUILTIN_PSUBD256,
25614 IX86_BUILTIN_PSUBQ256,
25615 IX86_BUILTIN_PSUBSB256,
25616 IX86_BUILTIN_PSUBSW256,
25617 IX86_BUILTIN_PSUBUSB256,
25618 IX86_BUILTIN_PSUBUSW256,
25619 IX86_BUILTIN_PUNPCKHBW256,
25620 IX86_BUILTIN_PUNPCKHWD256,
25621 IX86_BUILTIN_PUNPCKHDQ256,
25622 IX86_BUILTIN_PUNPCKHQDQ256,
25623 IX86_BUILTIN_PUNPCKLBW256,
25624 IX86_BUILTIN_PUNPCKLWD256,
25625 IX86_BUILTIN_PUNPCKLDQ256,
25626 IX86_BUILTIN_PUNPCKLQDQ256,
25627 IX86_BUILTIN_PXOR256,
25628 IX86_BUILTIN_MOVNTDQA256,
25629 IX86_BUILTIN_VBROADCASTSS_PS,
25630 IX86_BUILTIN_VBROADCASTSS_PS256,
25631 IX86_BUILTIN_VBROADCASTSD_PD256,
25632 IX86_BUILTIN_VBROADCASTSI256,
25633 IX86_BUILTIN_PBLENDD256,
25634 IX86_BUILTIN_PBLENDD128,
25635 IX86_BUILTIN_PBROADCASTB256,
25636 IX86_BUILTIN_PBROADCASTW256,
25637 IX86_BUILTIN_PBROADCASTD256,
25638 IX86_BUILTIN_PBROADCASTQ256,
25639 IX86_BUILTIN_PBROADCASTB128,
25640 IX86_BUILTIN_PBROADCASTW128,
25641 IX86_BUILTIN_PBROADCASTD128,
25642 IX86_BUILTIN_PBROADCASTQ128,
25643 IX86_BUILTIN_VPERMVARSI256,
25644 IX86_BUILTIN_VPERMDF256,
25645 IX86_BUILTIN_VPERMVARSF256,
25646 IX86_BUILTIN_VPERMDI256,
25647 IX86_BUILTIN_VPERMTI256,
25648 IX86_BUILTIN_VEXTRACT128I256,
25649 IX86_BUILTIN_VINSERT128I256,
25650 IX86_BUILTIN_MASKLOADD,
25651 IX86_BUILTIN_MASKLOADQ,
25652 IX86_BUILTIN_MASKLOADD256,
25653 IX86_BUILTIN_MASKLOADQ256,
25654 IX86_BUILTIN_MASKSTORED,
25655 IX86_BUILTIN_MASKSTOREQ,
25656 IX86_BUILTIN_MASKSTORED256,
25657 IX86_BUILTIN_MASKSTOREQ256,
25658 IX86_BUILTIN_PSLLVV4DI,
25659 IX86_BUILTIN_PSLLVV2DI,
25660 IX86_BUILTIN_PSLLVV8SI,
25661 IX86_BUILTIN_PSLLVV4SI,
25662 IX86_BUILTIN_PSRAVV8SI,
25663 IX86_BUILTIN_PSRAVV4SI,
25664 IX86_BUILTIN_PSRLVV4DI,
25665 IX86_BUILTIN_PSRLVV2DI,
25666 IX86_BUILTIN_PSRLVV8SI,
25667 IX86_BUILTIN_PSRLVV4SI,
25668
25669 IX86_BUILTIN_GATHERSIV2DF,
25670 IX86_BUILTIN_GATHERSIV4DF,
25671 IX86_BUILTIN_GATHERDIV2DF,
25672 IX86_BUILTIN_GATHERDIV4DF,
25673 IX86_BUILTIN_GATHERSIV4SF,
25674 IX86_BUILTIN_GATHERSIV8SF,
25675 IX86_BUILTIN_GATHERDIV4SF,
25676 IX86_BUILTIN_GATHERDIV8SF,
25677 IX86_BUILTIN_GATHERSIV2DI,
25678 IX86_BUILTIN_GATHERSIV4DI,
25679 IX86_BUILTIN_GATHERDIV2DI,
25680 IX86_BUILTIN_GATHERDIV4DI,
25681 IX86_BUILTIN_GATHERSIV4SI,
25682 IX86_BUILTIN_GATHERSIV8SI,
25683 IX86_BUILTIN_GATHERDIV4SI,
25684 IX86_BUILTIN_GATHERDIV8SI,
25685
25686 /* Alternate 4-element gather builtins for the vectorizer, where
25687 all operands are 32 bytes wide. */
25688 IX86_BUILTIN_GATHERALTSIV4DF,
25689 IX86_BUILTIN_GATHERALTDIV8SF,
25690 IX86_BUILTIN_GATHERALTSIV4DI,
25691 IX86_BUILTIN_GATHERALTDIV8SI,
25692
25693 /* TFmode support builtins. */
25694 IX86_BUILTIN_INFQ,
25695 IX86_BUILTIN_HUGE_VALQ,
25696 IX86_BUILTIN_FABSQ,
25697 IX86_BUILTIN_COPYSIGNQ,
25698
25699 /* Vectorizer support builtins. */
25700 IX86_BUILTIN_CPYSGNPS,
25701 IX86_BUILTIN_CPYSGNPD,
25702 IX86_BUILTIN_CPYSGNPS256,
25703 IX86_BUILTIN_CPYSGNPD256,
25704
25705 /* FMA4 instructions. */
25706 IX86_BUILTIN_VFMADDSS,
25707 IX86_BUILTIN_VFMADDSD,
25708 IX86_BUILTIN_VFMADDPS,
25709 IX86_BUILTIN_VFMADDPD,
25710 IX86_BUILTIN_VFMADDPS256,
25711 IX86_BUILTIN_VFMADDPD256,
25712 IX86_BUILTIN_VFMADDSUBPS,
25713 IX86_BUILTIN_VFMADDSUBPD,
25714 IX86_BUILTIN_VFMADDSUBPS256,
25715 IX86_BUILTIN_VFMADDSUBPD256,
25716
25717 /* FMA3 instructions. */
25718 IX86_BUILTIN_VFMADDSS3,
25719 IX86_BUILTIN_VFMADDSD3,
25720
25721 /* XOP instructions. */
25722 IX86_BUILTIN_VPCMOV,
25723 IX86_BUILTIN_VPCMOV_V2DI,
25724 IX86_BUILTIN_VPCMOV_V4SI,
25725 IX86_BUILTIN_VPCMOV_V8HI,
25726 IX86_BUILTIN_VPCMOV_V16QI,
25727 IX86_BUILTIN_VPCMOV_V4SF,
25728 IX86_BUILTIN_VPCMOV_V2DF,
25729 IX86_BUILTIN_VPCMOV256,
25730 IX86_BUILTIN_VPCMOV_V4DI256,
25731 IX86_BUILTIN_VPCMOV_V8SI256,
25732 IX86_BUILTIN_VPCMOV_V16HI256,
25733 IX86_BUILTIN_VPCMOV_V32QI256,
25734 IX86_BUILTIN_VPCMOV_V8SF256,
25735 IX86_BUILTIN_VPCMOV_V4DF256,
25736
25737 IX86_BUILTIN_VPPERM,
25738
25739 IX86_BUILTIN_VPMACSSWW,
25740 IX86_BUILTIN_VPMACSWW,
25741 IX86_BUILTIN_VPMACSSWD,
25742 IX86_BUILTIN_VPMACSWD,
25743 IX86_BUILTIN_VPMACSSDD,
25744 IX86_BUILTIN_VPMACSDD,
25745 IX86_BUILTIN_VPMACSSDQL,
25746 IX86_BUILTIN_VPMACSSDQH,
25747 IX86_BUILTIN_VPMACSDQL,
25748 IX86_BUILTIN_VPMACSDQH,
25749 IX86_BUILTIN_VPMADCSSWD,
25750 IX86_BUILTIN_VPMADCSWD,
25751
25752 IX86_BUILTIN_VPHADDBW,
25753 IX86_BUILTIN_VPHADDBD,
25754 IX86_BUILTIN_VPHADDBQ,
25755 IX86_BUILTIN_VPHADDWD,
25756 IX86_BUILTIN_VPHADDWQ,
25757 IX86_BUILTIN_VPHADDDQ,
25758 IX86_BUILTIN_VPHADDUBW,
25759 IX86_BUILTIN_VPHADDUBD,
25760 IX86_BUILTIN_VPHADDUBQ,
25761 IX86_BUILTIN_VPHADDUWD,
25762 IX86_BUILTIN_VPHADDUWQ,
25763 IX86_BUILTIN_VPHADDUDQ,
25764 IX86_BUILTIN_VPHSUBBW,
25765 IX86_BUILTIN_VPHSUBWD,
25766 IX86_BUILTIN_VPHSUBDQ,
25767
25768 IX86_BUILTIN_VPROTB,
25769 IX86_BUILTIN_VPROTW,
25770 IX86_BUILTIN_VPROTD,
25771 IX86_BUILTIN_VPROTQ,
25772 IX86_BUILTIN_VPROTB_IMM,
25773 IX86_BUILTIN_VPROTW_IMM,
25774 IX86_BUILTIN_VPROTD_IMM,
25775 IX86_BUILTIN_VPROTQ_IMM,
25776
25777 IX86_BUILTIN_VPSHLB,
25778 IX86_BUILTIN_VPSHLW,
25779 IX86_BUILTIN_VPSHLD,
25780 IX86_BUILTIN_VPSHLQ,
25781 IX86_BUILTIN_VPSHAB,
25782 IX86_BUILTIN_VPSHAW,
25783 IX86_BUILTIN_VPSHAD,
25784 IX86_BUILTIN_VPSHAQ,
25785
25786 IX86_BUILTIN_VFRCZSS,
25787 IX86_BUILTIN_VFRCZSD,
25788 IX86_BUILTIN_VFRCZPS,
25789 IX86_BUILTIN_VFRCZPD,
25790 IX86_BUILTIN_VFRCZPS256,
25791 IX86_BUILTIN_VFRCZPD256,
25792
25793 IX86_BUILTIN_VPCOMEQUB,
25794 IX86_BUILTIN_VPCOMNEUB,
25795 IX86_BUILTIN_VPCOMLTUB,
25796 IX86_BUILTIN_VPCOMLEUB,
25797 IX86_BUILTIN_VPCOMGTUB,
25798 IX86_BUILTIN_VPCOMGEUB,
25799 IX86_BUILTIN_VPCOMFALSEUB,
25800 IX86_BUILTIN_VPCOMTRUEUB,
25801
25802 IX86_BUILTIN_VPCOMEQUW,
25803 IX86_BUILTIN_VPCOMNEUW,
25804 IX86_BUILTIN_VPCOMLTUW,
25805 IX86_BUILTIN_VPCOMLEUW,
25806 IX86_BUILTIN_VPCOMGTUW,
25807 IX86_BUILTIN_VPCOMGEUW,
25808 IX86_BUILTIN_VPCOMFALSEUW,
25809 IX86_BUILTIN_VPCOMTRUEUW,
25810
25811 IX86_BUILTIN_VPCOMEQUD,
25812 IX86_BUILTIN_VPCOMNEUD,
25813 IX86_BUILTIN_VPCOMLTUD,
25814 IX86_BUILTIN_VPCOMLEUD,
25815 IX86_BUILTIN_VPCOMGTUD,
25816 IX86_BUILTIN_VPCOMGEUD,
25817 IX86_BUILTIN_VPCOMFALSEUD,
25818 IX86_BUILTIN_VPCOMTRUEUD,
25819
25820 IX86_BUILTIN_VPCOMEQUQ,
25821 IX86_BUILTIN_VPCOMNEUQ,
25822 IX86_BUILTIN_VPCOMLTUQ,
25823 IX86_BUILTIN_VPCOMLEUQ,
25824 IX86_BUILTIN_VPCOMGTUQ,
25825 IX86_BUILTIN_VPCOMGEUQ,
25826 IX86_BUILTIN_VPCOMFALSEUQ,
25827 IX86_BUILTIN_VPCOMTRUEUQ,
25828
25829 IX86_BUILTIN_VPCOMEQB,
25830 IX86_BUILTIN_VPCOMNEB,
25831 IX86_BUILTIN_VPCOMLTB,
25832 IX86_BUILTIN_VPCOMLEB,
25833 IX86_BUILTIN_VPCOMGTB,
25834 IX86_BUILTIN_VPCOMGEB,
25835 IX86_BUILTIN_VPCOMFALSEB,
25836 IX86_BUILTIN_VPCOMTRUEB,
25837
25838 IX86_BUILTIN_VPCOMEQW,
25839 IX86_BUILTIN_VPCOMNEW,
25840 IX86_BUILTIN_VPCOMLTW,
25841 IX86_BUILTIN_VPCOMLEW,
25842 IX86_BUILTIN_VPCOMGTW,
25843 IX86_BUILTIN_VPCOMGEW,
25844 IX86_BUILTIN_VPCOMFALSEW,
25845 IX86_BUILTIN_VPCOMTRUEW,
25846
25847 IX86_BUILTIN_VPCOMEQD,
25848 IX86_BUILTIN_VPCOMNED,
25849 IX86_BUILTIN_VPCOMLTD,
25850 IX86_BUILTIN_VPCOMLED,
25851 IX86_BUILTIN_VPCOMGTD,
25852 IX86_BUILTIN_VPCOMGED,
25853 IX86_BUILTIN_VPCOMFALSED,
25854 IX86_BUILTIN_VPCOMTRUED,
25855
25856 IX86_BUILTIN_VPCOMEQQ,
25857 IX86_BUILTIN_VPCOMNEQ,
25858 IX86_BUILTIN_VPCOMLTQ,
25859 IX86_BUILTIN_VPCOMLEQ,
25860 IX86_BUILTIN_VPCOMGTQ,
25861 IX86_BUILTIN_VPCOMGEQ,
25862 IX86_BUILTIN_VPCOMFALSEQ,
25863 IX86_BUILTIN_VPCOMTRUEQ,
25864
25865 /* LWP instructions. */
25866 IX86_BUILTIN_LLWPCB,
25867 IX86_BUILTIN_SLWPCB,
25868 IX86_BUILTIN_LWPVAL32,
25869 IX86_BUILTIN_LWPVAL64,
25870 IX86_BUILTIN_LWPINS32,
25871 IX86_BUILTIN_LWPINS64,
25872
25873 IX86_BUILTIN_CLZS,
25874
25875 /* RTM */
25876 IX86_BUILTIN_XBEGIN,
25877 IX86_BUILTIN_XEND,
25878 IX86_BUILTIN_XABORT,
25879 IX86_BUILTIN_XTEST,
25880
25881 /* BMI instructions. */
25882 IX86_BUILTIN_BEXTR32,
25883 IX86_BUILTIN_BEXTR64,
25884 IX86_BUILTIN_CTZS,
25885
25886 /* TBM instructions. */
25887 IX86_BUILTIN_BEXTRI32,
25888 IX86_BUILTIN_BEXTRI64,
25889
25890 /* BMI2 instructions. */
25891 IX86_BUILTIN_BZHI32,
25892 IX86_BUILTIN_BZHI64,
25893 IX86_BUILTIN_PDEP32,
25894 IX86_BUILTIN_PDEP64,
25895 IX86_BUILTIN_PEXT32,
25896 IX86_BUILTIN_PEXT64,
25897
25898 /* FSGSBASE instructions. */
25899 IX86_BUILTIN_RDFSBASE32,
25900 IX86_BUILTIN_RDFSBASE64,
25901 IX86_BUILTIN_RDGSBASE32,
25902 IX86_BUILTIN_RDGSBASE64,
25903 IX86_BUILTIN_WRFSBASE32,
25904 IX86_BUILTIN_WRFSBASE64,
25905 IX86_BUILTIN_WRGSBASE32,
25906 IX86_BUILTIN_WRGSBASE64,
25907
25908 /* RDRND instructions. */
25909 IX86_BUILTIN_RDRAND16_STEP,
25910 IX86_BUILTIN_RDRAND32_STEP,
25911 IX86_BUILTIN_RDRAND64_STEP,
25912
25913 /* F16C instructions. */
25914 IX86_BUILTIN_CVTPH2PS,
25915 IX86_BUILTIN_CVTPH2PS256,
25916 IX86_BUILTIN_CVTPS2PH,
25917 IX86_BUILTIN_CVTPS2PH256,
25918
25919 /* CFString built-in for Darwin. */
25920 IX86_BUILTIN_CFSTRING,
25921
25922 /* Builtins to get CPU type and supported features. */
25923 IX86_BUILTIN_CPU_INIT,
25924 IX86_BUILTIN_CPU_IS,
25925 IX86_BUILTIN_CPU_SUPPORTS,
25926
25927 IX86_BUILTIN_MAX
25928 };
25929
25930 /* Table for the ix86 builtin decls. */
25931 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
25932
25933 /* Table of all of the builtin functions that are possible with different ISAs
25934 but are waiting to be built until a function is declared to use that
25935 ISA. */
25936 struct builtin_isa {
25937 const char *name; /* function name */
25938 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
25939 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
25940 bool const_p; /* true if the declaration is constant */
25941 bool set_and_not_built_p; /* true if the builtin was recorded but its decl has not been built yet */
25942 };
25943
25944 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
25945
25946
25947 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
25948 of isa_flags to use in the ix86_builtins_isa array. Store the
25949 function decl in the ix86_builtins array. Return the function decl, or
25950 NULL_TREE if the builtin was not added.
25951
25952 If the front end has a special hook for builtin functions, delay adding
25953 builtin functions that aren't in the current ISA until the ISA is changed
25954 with function-specific optimization. Doing so can save about 300K for the
25955 default compiler. When the builtin is expanded, check at that time whether
25956 it is valid.
25957
25958 If the front end doesn't have a special hook, record all builtins, even
25959 those whose instruction set isn't in the current ISA, in case the user uses
25960 function-specific options for a different ISA; this avoids scope
25961 errors if a builtin is added in the middle of a function scope. */
25962
25963 static inline tree
25964 def_builtin (HOST_WIDE_INT mask, const char *name,
25965 enum ix86_builtin_func_type tcode,
25966 enum ix86_builtins code)
25967 {
25968 tree decl = NULL_TREE;
25969
25970 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
25971 {
25972 ix86_builtins_isa[(int) code].isa = mask;
25973
25974 mask &= ~OPTION_MASK_ISA_64BIT;
25975 if (mask == 0
25976 || (mask & ix86_isa_flags) != 0
25977 || (lang_hooks.builtin_function
25978 == lang_hooks.builtin_function_ext_scope))
25980 {
25981 tree type = ix86_get_builtin_func_type (tcode);
25982 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
25983 NULL, NULL_TREE);
25984 ix86_builtins[(int) code] = decl;
25985 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
25986 }
25987 else
25988 {
25989 ix86_builtins[(int) code] = NULL_TREE;
25990 ix86_builtins_isa[(int) code].tcode = tcode;
25991 ix86_builtins_isa[(int) code].name = name;
25992 ix86_builtins_isa[(int) code].const_p = false;
25993 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
25994 }
25995 }
25996
25997 return decl;
25998 }
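/* Illustrative sketch only (not part of the original source); the builtin
   name and enum value below are hypothetical placeholders.  A typical call
   looks like:

     def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_example",
		  V4SF_FTYPE_V4SF_V4SF, IX86_BUILTIN_EXAMPLE);

   If the ISA bit is not in ix86_isa_flags and the front end's
   builtin_function hook differs from builtin_function_ext_scope, no decl is
   built yet: only the name, type code and ISA mask are recorded in
   ix86_builtins_isa with set_and_not_built_p set, and ix86_add_new_builtins
   creates the decl later, once the ISA becomes available.  */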
25999
26000 /* Like def_builtin, but also marks the function decl "const". */
26001
26002 static inline tree
26003 def_builtin_const (HOST_WIDE_INT mask, const char *name,
26004 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
26005 {
26006 tree decl = def_builtin (mask, name, tcode, code);
26007 if (decl)
26008 TREE_READONLY (decl) = 1;
26009 else
26010 ix86_builtins_isa[(int) code].const_p = true;
26011
26012 return decl;
26013 }
26014
26015 /* Add any new builtin functions for a given ISA that may not have been
26016 declared. This saves a bit of space compared to eagerly adding all of the
26017 declarations to the tree, whether or not they are ever used. */
26018
26019 static void
26020 ix86_add_new_builtins (HOST_WIDE_INT isa)
26021 {
26022 int i;
26023
26024 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
26025 {
26026 if ((ix86_builtins_isa[i].isa & isa) != 0
26027 && ix86_builtins_isa[i].set_and_not_built_p)
26028 {
26029 tree decl, type;
26030
26031 /* Don't define the builtin again. */
26032 ix86_builtins_isa[i].set_and_not_built_p = false;
26033
26034 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26035 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26036 type, i, BUILT_IN_MD, NULL,
26037 NULL_TREE);
26038
26039 ix86_builtins[i] = decl;
26040 if (ix86_builtins_isa[i].const_p)
26041 TREE_READONLY (decl) = 1;
26042 }
26043 }
26044 }
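/* Sketch of the deferred-builtin flow (illustrative, with a hypothetical
   builtin; the actual registration calls are elsewhere in this file):

     def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_hypothetical", ...);
	-> compiled without -mavx2: decl deferred, set_and_not_built_p = true
     a target ("avx2") attribute or pragma later enables AVX2
	-> ix86_add_new_builtins (ix86_isa_flags) builds the deferred decl at
	   extern scope and marks it TREE_READONLY if const_p was recorded.  */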
26045
26046 /* Bits for builtin_description.flag. */
26047
26048 /* Set when we don't support the comparison natively, and should
26049 swap the comparison operands in order to support it. */
26050 #define BUILTIN_DESC_SWAP_OPERANDS 1
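/* For instance, CMPPS has no native "greater than" predicate; the argument
   tables below describe __builtin_ia32_cmpgtps as the LT comparison with its
   operands swapped (the ..._SWAP function types), which is the same idea
   this flag encodes for the descriptors that use it.  */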
26051
26052 struct builtin_description
26053 {
26054 const HOST_WIDE_INT mask;
26055 const enum insn_code icode;
26056 const char *const name;
26057 const enum ix86_builtins code;
26058 const enum rtx_code comparison;
26059 const int flag;
26060 };
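/* Reading a row of these tables (using the first bdesc_comi entry below as
   an example): the builtin is available when OPTION_MASK_ISA_SSE is enabled,
   expands through the CODE_FOR_sse_comi insn pattern, is exposed to C as
   __builtin_ia32_comieq, is identified internally as IX86_BUILTIN_COMIEQSS,
   uses the UNEQ comparison code, and has a zero flag (no operand swap).  */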
26061
26062 static const struct builtin_description bdesc_comi[] =
26063 {
26064 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
26065 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
26066 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
26067 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
26068 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
26069 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
26070 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
26071 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
26072 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
26073 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
26074 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
26075 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
26076 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
26077 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
26078 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
26079 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
26080 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
26081 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26082 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26083 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26084 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26085 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26086 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26087 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
26088 };
26089
26090 static const struct builtin_description bdesc_pcmpestr[] =
26091 {
26092 /* SSE4.2 */
26093 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26094 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26095 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26096 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26097 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26098 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26099 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
26100 };
26101
26102 static const struct builtin_description bdesc_pcmpistr[] =
26103 {
26104 /* SSE4.2 */
26105 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26106 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26107 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26108 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26109 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26110 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26111 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
26112 };
26113
26114 /* Special builtins with variable number of arguments. */
26115 static const struct builtin_description bdesc_special_args[] =
26116 {
26117 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26118 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26119 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26120
26121 /* MMX */
26122 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26123
26124 /* 3DNow! */
26125 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26126
26127 /* SSE */
26128 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26129 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26130 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26131
26132 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26133 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26134 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26135 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26136
26137 /* SSE or 3DNow!A */
26138 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26139 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26140
26141 /* SSE2 */
26142 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26143 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26144 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26145 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26146 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26147 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26148 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26149 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26150 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26151 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26152
26153 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26154 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26155
26156 /* SSE3 */
26157 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26158
26159 /* SSE4.1 */
26160 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26161
26162 /* SSE4A */
26163 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26164 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26165
26166 /* AVX */
26167 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26168 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26169
26170 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26171 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26172 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26173 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26174 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26175
26176 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26177 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26178 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26179 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26180 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26181 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26182 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26183
26184 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26185 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26186 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26187
26188 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26189 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26190 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26191 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26192 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26193 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26194 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26195 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26196
26197 /* AVX2 */
26198 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26199 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26200 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26201 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26202 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26203 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26204 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26205 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26206 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26207
26208 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26209 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26210 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26211 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26212 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26213 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26214
26215 /* FSGSBASE */
26216 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26217 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26218 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26219 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26220 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26221 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26222 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26223 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26224
26225 /* RTM */
26226 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26227 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26228 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
26229 };
26230
26231 /* Builtins with variable number of arguments. */
26232 static const struct builtin_description bdesc_args[] =
26233 {
26234 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26235 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26236 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26237 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26238 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26239 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26240 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26241
26242 /* MMX */
26243 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26244 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26245 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26246 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26247 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26248 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26249
26250 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26251 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26252 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26253 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26254 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26255 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26256 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26257 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26258
26259 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26260 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26261
26262 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26263 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26264 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26265 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26266
26267 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26268 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26269 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26270 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26271 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26272 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26273
26274 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26275 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26276 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26277 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26278 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
26279 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
26280
26281 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26282 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
26283 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26284
26285 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
26286
26287 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26288 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26289 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26290 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26291 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26292 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26293
26294 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26295 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26296 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26297 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26298 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26299 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26300
26301 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26302 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26303 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26304 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26305
26306 /* 3DNow! */
26307 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26308 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26309 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26310 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26311
26312 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26313 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26314 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26315 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26316 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26317 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26318 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26319 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26320 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26321 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26322 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26323 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26324 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26325 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26326 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26327
26328 /* 3DNow!A */
26329 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26330 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26331 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26332 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26333 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26334 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26335
26336 /* SSE */
26337 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
26338 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26339 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26340 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26341 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26342 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26343 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26344 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26345 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26346 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26347 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26348 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26349
26350 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26351
26352 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26353 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26354 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26355 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26356 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26357 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26358 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26359 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26360
26361 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26362 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26363 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26364 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26365 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26366 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26367 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26368 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26369 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26370 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26371 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
26372 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26373 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26374 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26375 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26376 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26377 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26378 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26379 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26380 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26381 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26382 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26383
26384 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26385 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26386 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26387 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26388
26389 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26390 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26391 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26392 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26393
26394 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26395
26396 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26397 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26398 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26399 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26400 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26401
26402 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
26403 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
26404 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
26405
26406 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
26407
26408 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26409 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26410 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26411
26412 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
26413 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
26414
26415 /* SSE MMX or 3DNow!A */
26416 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26417 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26418 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26419
26420 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26421 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26422 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26423 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26424
26425 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
26426 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
26427
26428 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
26429
26430 /* SSE2 */
26431 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26432
26433 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
26434 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
26435 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26436 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
26437 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
26438
26439 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26440 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26441 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
26442 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26443 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26444
26445 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
26446
26447 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26448 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26449 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26450 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26451
26452 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26453 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
26454 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26455
26456 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26457 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26458 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26459 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26460 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26461 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26462 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26463 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26464
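/* For the cmp* builtins the comparison column carries the rtx code that is
   expanded through the single maskcmp pattern; the *_SWAP variants
   (cmpgt*, cmpge*) exchange their operands so that LT/LE can be reused.  */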
26465 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26466 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26467 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26468 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26469   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26470 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26471 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26472 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26473 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26474 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26475 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26476 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26477 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26478 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26479 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26480 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26481 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26482 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26483 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26484 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26485
26486 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26487 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26488 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26489 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26490
26491 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26492 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26493 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26494 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26495
26496 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26497
26498 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26499 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26500 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26501
26502 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26503
26504 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26505 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26506 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26507 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26508 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26509 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26510 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26511 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26512
26513 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26514 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26515 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26516 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26517 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26518 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26519 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26520 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26521
26522 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26523   { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26524
26525 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26526 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26527 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26528 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26529
26530 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26531 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26532
26533 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26534 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26535 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26536 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26537 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26538 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26539
26540 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26541 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26542 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26543 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26544
26545 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26546 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26547 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26548 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26549 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26550 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26551 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26552 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26553
26554 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26555 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26556 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26557
26558 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26559 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
26560
26561 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
26562 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26563
26564 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
26565
26566 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
26567 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
26568 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
26569 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
26570
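/* Shift builtins: the *_SI_COUNT rows take the shift count as a scalar,
   the *_V*_COUNT rows take it from a vector operand, and the _INT_CONVERT
   rows (pslldqi128/psrldqi128) have their V2DI argument reinterpreted in
   the V1TI mode of the underlying whole-register byte-shift insn.  */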
26571 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26572 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26573 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26574 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26575 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26576 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26577 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26578
26579 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26580 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26581 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26582 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26583 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26584 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26585 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26586
26587 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26588 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26589 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26590 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26591
26592 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
26593 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26594 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26595
26596 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
26597
26598 { OPTION_MASK_ISA_SSE, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26599
26600 /* SSE2 MMX */
26601 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26602 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26603
26604 /* SSE3 */
26605   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26606 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26607
26608 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26609 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26610 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26611 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26612 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26613 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26614
26615 /* SSSE3 */
26616 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26617 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
26618 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26619 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
26620 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26621 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26622
26623 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26624 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26625 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26626 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26627 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26628 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26629 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26630 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26631 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26632 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26633 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26634 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26635 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
26636 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
26637 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26638 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26639 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26640 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26641 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26642 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26643 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26644 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26645 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26646 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26647
26648 /* SSSE3. */
26649 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
26650 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
26651
26652 /* SSE4.1 */
26653 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26654 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26655 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
26656 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
26657 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26658 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26659 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26660 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
26661 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
26662 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
26663
26664 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26665 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26666 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26667 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26668 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26669 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26670 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26671 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26672 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26673 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26674 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26675 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26676 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26677
26678 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26679 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26680 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26681 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26682 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26683 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26684 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26685 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26686 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26687 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26688 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26689 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26690
26691 /* SSE4.1 */
26692 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26693 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26694 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26695 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26696
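/* For the floor/ceil/trunc/rint entries below, the comparison column is
   reused to carry the ROUND_* immediate handed to the roundpd/roundps
   insn rather than an rtx comparison code.  */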
26697 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
26698 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
26699 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
26700 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
26701
26702 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26703 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26704
26705 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26706 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26707
26708 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
26709 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
26710 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
26711 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
26712
26713 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
26714 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
26715
26716 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26717 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26718
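/* The ptest (and, later, vtest) builtins encode which flag is examined in
   the comparison column: EQ tests ZF (*z), LTU tests CF (*c), and GTU
   tests the "neither flag set" condition (*nzc).  */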
26719 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26720 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26721 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26722
26723 /* SSE4.2 */
26724 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26725 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
26726 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
26727 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26728 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26729
26730 /* SSE4A */
26731 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
26732 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
26733 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
26734 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26735
26736 /* AES */
26737 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
26738 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26739
26740 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26741 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26742 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26743 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26744
26745 /* PCLMUL */
26746 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
26747
26748 /* AVX */
26749 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26750 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26751 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26752 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26753 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26754 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26755 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26756 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26757 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26758 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26759 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26760 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26761 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26762 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26763 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26764 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26765 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26766 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26767 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26768 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26769 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26770 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26771 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26772 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26773 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26774 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26775
26776 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
26777 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
26778 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
26779 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
26780
26781 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26782 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26783 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
26784 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
26785 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26786 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26787 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26788 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26789 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26790 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26791 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26792 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26793 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26794 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
26795 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
26796 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
26797 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
26798 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
26799 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
26800 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26801 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
26802 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26803 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26804 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26805 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26806 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26807 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26808 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26809 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26810 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26811 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26812 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
26813 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
26814 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
26815
26816 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26817 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26818 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26819
26820 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26821 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26822 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26823 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26824 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26825
26826 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26827
26828 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26829 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26830
26831 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
26832 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
26833 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
26834 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
26835
26836 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26837 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26838
26839 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26840 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26841
26842 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
26843 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
26844 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
26845 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
26846
26847 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
26848 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
26849
26850 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26851 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26852
26853 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26854 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26855 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26856 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26857
26858 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26859 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26860 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26861 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
26862 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
26863 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
26864
26865 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26866 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26867 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26868 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26869 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26870 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26871 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26872 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26873 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26874 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26875 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26876 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26877 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26878 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26879 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26880
26881 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
26882 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
26883
26884 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26885 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26886
26887   { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26888
26889 /* AVX2 */
26890 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
26891 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
26892 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
26893 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
26894 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26895 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26896 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26897 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26898 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26899 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26900 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26901 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26902 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26903 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26904 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26905 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26906 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
26907 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26908 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26909 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26910 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26911 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
26912 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
26913 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26914 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26915 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26916 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26917 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26918 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26919 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26920 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26921 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26922 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26923 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26924 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26925 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26926 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26927 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26928 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
26929 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26930 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26931 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26932 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26933 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26934 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26935 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26936 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26937 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26938 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26939 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26940 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26941 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
26942 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26943 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26944 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26945 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26946 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26947 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26948 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26949 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26950 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26951 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26952 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26953 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26954 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3 , "__builtin_ia32_pmuldq256" , IX86_BUILTIN_PMULDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26955 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26956 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26957 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26958 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26959 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26960 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3 , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26961 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26962 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26963 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26964 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
26965 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26966 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26967 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26968 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26969 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26970 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26971 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26972 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26973 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26974 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26975 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26976 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26977 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26978 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26979 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26980 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26981 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26982 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26983 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26984 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26985 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26986 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26987 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26988 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26989 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26990 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26991 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26992 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26993 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26994 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26995 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26996 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26997 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26998 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26999 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27000 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27001 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27002 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27003 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27004 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27005 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27006 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27007 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27008 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27009 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27010 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27011 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27012 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27013 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27014 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27015 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27016 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27017 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27018 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27019 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27020 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27021 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27022 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27023 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27024 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27025 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27026 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27027 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27028 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27029 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27030 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27031 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27032 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27033 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27034 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27035 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27036
27037 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27038
27039 /* BMI */
27040 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27041 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27042 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27043
27044 /* TBM */
27045 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27046 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27047
27048 /* F16C */
27049 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
27050 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
27051 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
27052 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
27053
27054 /* BMI2 */
27055 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27056 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27057 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27058 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27059 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27060 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27061 };
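/* Illustrative note, not part of the original table: each entry above binds an
   insn pattern to a builtin name, an IX86_BUILTIN_* code and a prototype, and
   ix86_init_mmx_sse_builtins below registers it via def_builtin_const.  For
   example, the AVX2 entry for CODE_FOR_addv8si3 exposes
   __builtin_ia32_paddd256 with the V8SI_FTYPE_V8SI_V8SI prototype, which the
   intrinsics header wraps along these lines (sketch only; the exact header
   text may differ):

     extern __inline __m256i
     __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     _mm256_add_epi32 (__m256i __A, __m256i __B)
     {
       return (__m256i) __builtin_ia32_paddd256 ((__v8si) __A, (__v8si) __B);
     }
*/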
27062
27063 /* FMA4 and XOP. */
27064 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
27065 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
27066 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
27067 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
27068 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
27069 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
27070 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
27071 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
27072 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
27073 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
27074 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
27075 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
27076 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
27077 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
27078 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
27079 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
27080 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
27081 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
27082 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
27083 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
27084 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
27085 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
27086 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
27087 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
27088 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
27089 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
27090 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
27091 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
27092 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
27093 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
27094 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
27095 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
27096 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
27097 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
27098 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
27099 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
27100 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
27101 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
27102 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
27103 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
27104 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
27105 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
27106 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
27107 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
27108 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
27109 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
27110 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
27111 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
27112 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
27113 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
27114 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
27115 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
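/* Informal reading of the aliases above (informative only): the "2" suffix
   marks the 256-bit counterpart of a 128-bit shape (MULTI_ARG_3_SF is the
   V4SF form, MULTI_ARG_3_SF2 the V8SF form), the _IMM shapes take an
   immediate count as their last operand, and the _CMP/_TF shapes are used
   together with a comparison code in the table entries below.  */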
27116
27117 static const struct builtin_description bdesc_multi_arg[] =
27118 {
27119 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27120 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27121 UNKNOWN, (int)MULTI_ARG_3_SF },
27122 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27123 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27124 UNKNOWN, (int)MULTI_ARG_3_DF },
27125
27126 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27127 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27128 UNKNOWN, (int)MULTI_ARG_3_SF },
27129 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27130 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27131 UNKNOWN, (int)MULTI_ARG_3_DF },
27132
27133 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27134 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27135 UNKNOWN, (int)MULTI_ARG_3_SF },
27136 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27137 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27138 UNKNOWN, (int)MULTI_ARG_3_DF },
27139 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27140 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27141 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27142 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27143 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27144 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27145
27146 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27147 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27148 UNKNOWN, (int)MULTI_ARG_3_SF },
27149 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27150 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27151 UNKNOWN, (int)MULTI_ARG_3_DF },
27152 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27153 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27154 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27155 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27156 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27157 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27158
27159 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
27160 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
27161 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
27162 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
27163 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
27164 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
27165 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
27166
27167 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27168 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27169 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
27170 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
27171 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
27172 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
27173 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27174
27175 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27176
27177 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27178 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27179 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27180 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27181 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27182 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27183 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27184 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27185 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27186 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27187 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27188 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27189
27190 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27191 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27192 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27193 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27194 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27195 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27196 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27197 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27198 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27199 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27200 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27201 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27202 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27203 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27204 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27205 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27206
27207 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
27208 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
27209 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27210 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27211 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27212 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27213
27214 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27215 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27216 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27217 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27218 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27219 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27220 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27221 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27222 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27223 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27224 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27225 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27226 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27227 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27228 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27229
27230 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27231 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27232 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27233 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27234 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27235 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27236 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
27237
27238 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
27239 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27240 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27241 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27242 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
27243 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
27244 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
27245
27246 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
27247 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27248 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27249 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
27250 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
27251 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
27252 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
27253
27254 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27255 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27256 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27257 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
27258 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
27259 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
27260 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
27261
27262 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
27263 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27264 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27265 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
27266 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
27267 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
27268 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
27269
27270 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
27271 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27272 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27273 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
27274 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
27275 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
27276 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
27277
27278 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
27279 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27280 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27281 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
27282 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
27283 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
27284 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
27285
27286 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27287 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27288 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27289 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
27290 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
27291 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
27292 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
27293
27294 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27295 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27296 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27297 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27298 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27299 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27300 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27301 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27302
27303 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27304 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27305 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27306 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27307 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27308 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27309 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27310 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27311
27312 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
27313 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
27314 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
27315 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
27316
27317 };
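/* Note, informative only: several XOP comparison builtins above are listed
   twice on purpose, e.g. "__builtin_ia32_vpcomneb" and
   "__builtin_ia32_vpcomneqb" both map to IX86_BUILTIN_VPCOMNEB with the NE
   code, so either spelling expands to the same pattern.  */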
27318 \f
27319 /* TM vector builtins. */
27320
27321 /* Reuse the existing x86-specific `struct builtin_description' because it
27322 is close enough; add casts to make the TM builtin codes fit. */
27323 static const struct builtin_description bdesc_tm[] =
27324 {
27325 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27326 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27327 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27328 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27329 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27330 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27331 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27332
27333 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27334 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27335 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27336 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27337 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27338 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27339 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27340
27341 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27342 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27343 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27344 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27345 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27346 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27347 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27348
27349 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
27350 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
27351 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
27352 };
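/* Worked example, informative only: with flag_tm set and the MMX ISA bit
   enabled, ix86_init_tm_builtins below turns the first entry above into a
   builtin named "__builtin__ITM_WM64" with a void (V2SI *, V2SI) prototype,
   reusing the attributes of the normal BUILT_IN_TM_STORE_1 builtin.  */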
27353
27354 /* TM callbacks. */
27355
27356 /* Return the builtin decl needed to load a vector of TYPE. */
27357
27358 static tree
27359 ix86_builtin_tm_load (tree type)
27360 {
27361 if (TREE_CODE (type) == VECTOR_TYPE)
27362 {
27363 switch (tree_low_cst (TYPE_SIZE (type), 1))
27364 {
27365 case 64:
27366 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
27367 case 128:
27368 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
27369 case 256:
27370 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
27371 }
27372 }
27373 return NULL_TREE;
27374 }
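/* For instance, a 128-bit vector type such as V4SF yields
   builtin_decl_explicit (BUILT_IN_TM_LOAD_M128), i.e. the
   "__builtin__ITM_RM128" decl set up from bdesc_tm above; types whose size
   is not 64, 128 or 256 bits fall through and return NULL_TREE.  */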
27375
27376 /* Return the builtin decl needed to store a vector of TYPE. */
27377
27378 static tree
27379 ix86_builtin_tm_store (tree type)
27380 {
27381 if (TREE_CODE (type) == VECTOR_TYPE)
27382 {
27383 switch (tree_low_cst (TYPE_SIZE (type), 1))
27384 {
27385 case 64:
27386 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
27387 case 128:
27388 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
27389 case 256:
27390 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
27391 }
27392 }
27393 return NULL_TREE;
27394 }
27395 \f
27396 /* Initialize the transactional memory vector load/store/log builtins. */
27397
27398 static void
27399 ix86_init_tm_builtins (void)
27400 {
27401 enum ix86_builtin_func_type ftype;
27402 const struct builtin_description *d;
27403 size_t i;
27404 tree decl;
27405 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
27406 tree attrs_log, attrs_type_log;
27407
27408 if (!flag_tm)
27409 return;
27410
27411 /* If there are no builtins defined, we must be compiling in a
27412 language without trans-mem support. */
27413 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
27414 return;
27415
27416 /* Use whatever attributes a normal TM load has. */
27417 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
27418 attrs_load = DECL_ATTRIBUTES (decl);
27419 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27420 /* Use whatever attributes a normal TM store has. */
27421 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
27422 attrs_store = DECL_ATTRIBUTES (decl);
27423 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27424 /* Use whatever attributes a normal TM log has. */
27425 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
27426 attrs_log = DECL_ATTRIBUTES (decl);
27427 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27428
27429 for (i = 0, d = bdesc_tm;
27430 i < ARRAY_SIZE (bdesc_tm);
27431 i++, d++)
27432 {
27433 if ((d->mask & ix86_isa_flags) != 0
27434 || (lang_hooks.builtin_function
27435 == lang_hooks.builtin_function_ext_scope))
27436 {
27437 tree type, attrs, attrs_type;
27438 enum built_in_function code = (enum built_in_function) d->code;
27439
27440 ftype = (enum ix86_builtin_func_type) d->flag;
27441 type = ix86_get_builtin_func_type (ftype);
27442
27443 if (BUILTIN_TM_LOAD_P (code))
27444 {
27445 attrs = attrs_load;
27446 attrs_type = attrs_type_load;
27447 }
27448 else if (BUILTIN_TM_STORE_P (code))
27449 {
27450 attrs = attrs_store;
27451 attrs_type = attrs_type_store;
27452 }
27453 else
27454 {
27455 attrs = attrs_log;
27456 attrs_type = attrs_type_log;
27457 }
27458 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
27459 /* The name without the "__builtin_"
27460 prefix, used for calling it directly. */
27461 d->name + strlen ("__builtin_"),
27462 attrs);
27463 /* add_builtin_function () has set the DECL_ATTRIBUTES; now
27464 set the TYPE_ATTRIBUTES. */
27465 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
27466
27467 set_builtin_decl (code, decl, false);
27468 }
27469 }
27470 }
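/* Sketch of the effect of the loop above: for the SSE entry
   "__builtin__ITM_RM128", add_builtin_function is passed "_ITM_RM128"
   (d->name + strlen ("__builtin_")) as the library name, so a call to the
   builtin ends up referencing the _ITM_RM128 TM runtime entry point.  */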
27471
27472 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
27473 not in the current target ISA, so that the user can compile particular
27474 modules with target-specific options that differ from the command-line
27475 options. */
27476 static void
27477 ix86_init_mmx_sse_builtins (void)
27478 {
27479 const struct builtin_description * d;
27480 enum ix86_builtin_func_type ftype;
27481 size_t i;
27482
27483 /* Add all special builtins with a variable number of operands. */
27484 for (i = 0, d = bdesc_special_args;
27485 i < ARRAY_SIZE (bdesc_special_args);
27486 i++, d++)
27487 {
27488 if (d->name == 0)
27489 continue;
27490
27491 ftype = (enum ix86_builtin_func_type) d->flag;
27492 def_builtin (d->mask, d->name, ftype, d->code);
27493 }
27494
27495 /* Add all builtins with a variable number of operands. */
27496 for (i = 0, d = bdesc_args;
27497 i < ARRAY_SIZE (bdesc_args);
27498 i++, d++)
27499 {
27500 if (d->name == 0)
27501 continue;
27502
27503 ftype = (enum ix86_builtin_func_type) d->flag;
27504 def_builtin_const (d->mask, d->name, ftype, d->code);
27505 }
27506
27507 /* pcmpestr[im] insns. */
27508 for (i = 0, d = bdesc_pcmpestr;
27509 i < ARRAY_SIZE (bdesc_pcmpestr);
27510 i++, d++)
27511 {
27512 if (d->code == IX86_BUILTIN_PCMPESTRM128)
27513 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
27514 else
27515 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
27516 def_builtin_const (d->mask, d->name, ftype, d->code);
27517 }
27518
27519 /* pcmpistr[im] insns. */
27520 for (i = 0, d = bdesc_pcmpistr;
27521 i < ARRAY_SIZE (bdesc_pcmpistr);
27522 i++, d++)
27523 {
27524 if (d->code == IX86_BUILTIN_PCMPISTRM128)
27525 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
27526 else
27527 ftype = INT_FTYPE_V16QI_V16QI_INT;
27528 def_builtin_const (d->mask, d->name, ftype, d->code);
27529 }
27530
27531 /* comi/ucomi insns. */
27532 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
27533 {
27534 if (d->mask == OPTION_MASK_ISA_SSE2)
27535 ftype = INT_FTYPE_V2DF_V2DF;
27536 else
27537 ftype = INT_FTYPE_V4SF_V4SF;
27538 def_builtin_const (d->mask, d->name, ftype, d->code);
27539 }
27540
27541 /* SSE */
27542 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
27543 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
27544 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
27545 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
27546
27547 /* SSE or 3DNow!A */
27548 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27549 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
27550 IX86_BUILTIN_MASKMOVQ);
27551
27552 /* SSE2 */
27553 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
27554 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
27555
27556 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
27557 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
27558 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
27559 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
27560
27561 /* SSE3. */
27562 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
27563 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
27564 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
27565 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
27566
27567 /* AES */
27568 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
27569 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
27570 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
27571 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
27572 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
27573 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
27574 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
27575 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
27576 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
27577 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
27578 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
27579 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
27580
27581 /* PCLMUL */
27582 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
27583 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
27584
27585 /* RDRND */
27586 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
27587 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
27588 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
27589 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
27590 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
27591 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
27592 IX86_BUILTIN_RDRAND64_STEP);
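 /* Illustrative note (user code, not part of this file): each *_step
 builtin above stores a hardware random value through its pointer
 argument and returns nonzero on success, so callers typically retry:

	unsigned int r;
	while (!__builtin_ia32_rdrand32_step (&r))
	  ;

 The variable R here is purely hypothetical. */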
27593
27594 /* AVX2 */
27595 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
27596 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
27597 IX86_BUILTIN_GATHERSIV2DF);
27598
27599 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
27600 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
27601 IX86_BUILTIN_GATHERSIV4DF);
27602
27603 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
27604 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
27605 IX86_BUILTIN_GATHERDIV2DF);
27606
27607 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
27608 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
27609 IX86_BUILTIN_GATHERDIV4DF);
27610
27611 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
27612 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
27613 IX86_BUILTIN_GATHERSIV4SF);
27614
27615 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
27616 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
27617 IX86_BUILTIN_GATHERSIV8SF);
27618
27619 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
27620 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
27621 IX86_BUILTIN_GATHERDIV4SF);
27622
27623 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
27624 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
27625 IX86_BUILTIN_GATHERDIV8SF);
27626
27627 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
27628 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
27629 IX86_BUILTIN_GATHERSIV2DI);
27630
27631 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
27632 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
27633 IX86_BUILTIN_GATHERSIV4DI);
27634
27635 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
27636 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
27637 IX86_BUILTIN_GATHERDIV2DI);
27638
27639 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
27640 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
27641 IX86_BUILTIN_GATHERDIV4DI);
27642
27643 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
27644 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
27645 IX86_BUILTIN_GATHERSIV4SI);
27646
27647 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
27648 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
27649 IX86_BUILTIN_GATHERSIV8SI);
27650
27651 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
27652 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
27653 IX86_BUILTIN_GATHERDIV4SI);
27654
27655 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
27656 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
27657 IX86_BUILTIN_GATHERDIV8SI);
27658
27659 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
27660 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
27661 IX86_BUILTIN_GATHERALTSIV4DF);
27662
27663 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
27664 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
27665 IX86_BUILTIN_GATHERALTDIV8SF);
27666
27667 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
27668 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
27669 IX86_BUILTIN_GATHERALTSIV4DI);
27670
27671 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
27672 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
27673 IX86_BUILTIN_GATHERALTDIV8SI);
27674
27675 /* RTM. */
27676 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
27677 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
27678
27679 /* MMX access to the vec_init patterns. */
27680 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
27681 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
27682
27683 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
27684 V4HI_FTYPE_HI_HI_HI_HI,
27685 IX86_BUILTIN_VEC_INIT_V4HI);
27686
27687 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
27688 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
27689 IX86_BUILTIN_VEC_INIT_V8QI);
27690
27691 /* Access to the vec_extract patterns. */
27692 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
27693 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
27694 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
27695 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
27696 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
27697 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
27698 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
27699 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
27700 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
27701 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
27702
27703 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27704 "__builtin_ia32_vec_ext_v4hi",
27705 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
27706
27707 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
27708 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
27709
27710 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
27711 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
27712
27713 /* Access to the vec_set patterns. */
27714 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
27715 "__builtin_ia32_vec_set_v2di",
27716 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
27717
27718 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
27719 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
27720
27721 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
27722 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
27723
27724 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
27725 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
27726
27727 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27728 "__builtin_ia32_vec_set_v4hi",
27729 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
27730
27731 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
27732 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
27733
27734 /* Add FMA4 multi-arg instructions. */
27735 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
27736 {
27737 if (d->name == 0)
27738 continue;
27739
27740 ftype = (enum ix86_builtin_func_type) d->flag;
27741 def_builtin_const (d->mask, d->name, ftype, d->code);
27742 }
27743 }
27744
27745 /* This builds the processor_model struct type defined in
27746 libgcc/config/i386/cpuinfo.c */
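 /* For reference, the structure mirrored here looks roughly like this
 (see libgcc/config/i386/cpuinfo.c for the authoritative definition):

	struct __processor_model
	{
	  unsigned int __cpu_vendor;
	  unsigned int __cpu_type;
	  unsigned int __cpu_subtype;
	  unsigned int __cpu_features[1];
	}; */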
27747
27748 static tree
27749 build_processor_model_struct (void)
27750 {
27751 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
27752 "__cpu_features"};
27753 tree field = NULL_TREE, field_chain = NULL_TREE;
27754 int i;
27755 tree type = make_node (RECORD_TYPE);
27756
27757 /* The first 3 fields are unsigned int. */
27758 for (i = 0; i < 3; ++i)
27759 {
27760 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
27761 get_identifier (field_name[i]), unsigned_type_node);
27762 if (field_chain != NULL_TREE)
27763 DECL_CHAIN (field) = field_chain;
27764 field_chain = field;
27765 }
27766
27767 /* The last field is an array of unsigned integers of size one. */
27768 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
27769 get_identifier (field_name[3]),
27770 build_array_type (unsigned_type_node,
27771 build_index_type (size_one_node)));
27772 if (field_chain != NULL_TREE)
27773 DECL_CHAIN (field) = field_chain;
27774 field_chain = field;
27775
27776 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
27777 return type;
27778 }
27779
27780 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
27781
27782 static tree
27783 make_var_decl (tree type, const char *name)
27784 {
27785 tree new_decl;
27786
27787 new_decl = build_decl (UNKNOWN_LOCATION,
27788 VAR_DECL,
27789 get_identifier(name),
27790 type);
27791
27792 DECL_EXTERNAL (new_decl) = 1;
27793 TREE_STATIC (new_decl) = 1;
27794 TREE_PUBLIC (new_decl) = 1;
27795 DECL_INITIAL (new_decl) = 0;
27796 DECL_ARTIFICIAL (new_decl) = 0;
27797 DECL_PRESERVE_P (new_decl) = 1;
27798
27799 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
27800 assemble_variable (new_decl, 0, 0, 0);
27801
27802 return new_decl;
27803 }
27804
27805 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
27806 into a test of the __cpu_model variable defined in libgcc/config/i386/cpuinfo.c. */
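 /* For example, __builtin_cpu_is ("intel") is folded to the expression
 __cpu_model.__cpu_vendor == M_INTEL, and __builtin_cpu_supports ("avx")
 to __cpu_model.__cpu_features[0] & (1 << F_AVX). */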
27807
27808 static tree
27809 fold_builtin_cpu (tree fndecl, tree *args)
27810 {
27811 unsigned int i;
27812 enum ix86_builtins fn_code = (enum ix86_builtins)
27813 DECL_FUNCTION_CODE (fndecl);
27814 tree param_string_cst = NULL;
27815
27816 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
27817 enum processor_features
27818 {
27819 F_CMOV = 0,
27820 F_MMX,
27821 F_POPCNT,
27822 F_SSE,
27823 F_SSE2,
27824 F_SSE3,
27825 F_SSSE3,
27826 F_SSE4_1,
27827 F_SSE4_2,
27828 F_AVX,
27829 F_AVX2,
27830 F_MAX
27831 };
27832
27833 /* These are the values for vendor types, cpu types and subtypes
27834 in cpuinfo.c. Cpu types and subtypes are relative to the
27835 corresponding start value, which must be subtracted first. */
27836 enum processor_model
27837 {
27838 M_INTEL = 1,
27839 M_AMD,
27840 M_CPU_TYPE_START,
27841 M_INTEL_ATOM,
27842 M_INTEL_CORE2,
27843 M_INTEL_COREI7,
27844 M_AMDFAM10H,
27845 M_AMDFAM15H,
27846 M_CPU_SUBTYPE_START,
27847 M_INTEL_COREI7_NEHALEM,
27848 M_INTEL_COREI7_WESTMERE,
27849 M_INTEL_COREI7_SANDYBRIDGE,
27850 M_AMDFAM10H_BARCELONA,
27851 M_AMDFAM10H_SHANGHAI,
27852 M_AMDFAM10H_ISTANBUL,
27853 M_AMDFAM15H_BDVER1,
27854 M_AMDFAM15H_BDVER2
27855 };
27856
27857 static struct _arch_names_table
27858 {
27859 const char *const name;
27860 const enum processor_model model;
27861 }
27862 const arch_names_table[] =
27863 {
27864 {"amd", M_AMD},
27865 {"intel", M_INTEL},
27866 {"atom", M_INTEL_ATOM},
27867 {"core2", M_INTEL_CORE2},
27868 {"corei7", M_INTEL_COREI7},
27869 {"nehalem", M_INTEL_COREI7_NEHALEM},
27870 {"westmere", M_INTEL_COREI7_WESTMERE},
27871 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
27872 {"amdfam10h", M_AMDFAM10H},
27873 {"barcelona", M_AMDFAM10H_BARCELONA},
27874 {"shanghai", M_AMDFAM10H_SHANGHAI},
27875 {"istanbul", M_AMDFAM10H_ISTANBUL},
27876 {"amdfam15h", M_AMDFAM15H},
27877 {"bdver1", M_AMDFAM15H_BDVER1},
27878 {"bdver2", M_AMDFAM15H_BDVER2},
27879 };
27880
27881 static struct _isa_names_table
27882 {
27883 const char *const name;
27884 const enum processor_features feature;
27885 }
27886 const isa_names_table[] =
27887 {
27888 {"cmov", F_CMOV},
27889 {"mmx", F_MMX},
27890 {"popcnt", F_POPCNT},
27891 {"sse", F_SSE},
27892 {"sse2", F_SSE2},
27893 {"sse3", F_SSE3},
27894 {"ssse3", F_SSSE3},
27895 {"sse4.1", F_SSE4_1},
27896 {"sse4.2", F_SSE4_2},
27897 {"avx", F_AVX},
27898 {"avx2", F_AVX2}
27899 };
27900
27901 static tree __processor_model_type = NULL_TREE;
27902 static tree __cpu_model_var = NULL_TREE;
27903
27904 if (__processor_model_type == NULL_TREE)
27905 __processor_model_type = build_processor_model_struct ();
27906
27907 if (__cpu_model_var == NULL_TREE)
27908 __cpu_model_var = make_var_decl (__processor_model_type,
27909 "__cpu_model");
27910
27911 gcc_assert ((args != NULL) && (*args != NULL));
27912
27913 param_string_cst = *args;
27914 while (param_string_cst
27915 && TREE_CODE (param_string_cst) != STRING_CST)
27916 {
27917 /* *args must be an expr that can contain other EXPRs leading to a
27918 STRING_CST. */
27919 if (!EXPR_P (param_string_cst))
27920 {
27921 error ("Parameter to builtin must be a string constant or literal");
27922 return integer_zero_node;
27923 }
27924 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
27925 }
27926
27927 gcc_assert (param_string_cst);
27928
27929 if (fn_code == IX86_BUILTIN_CPU_IS)
27930 {
27931 tree ref;
27932 tree field;
27933 unsigned int field_val = 0;
27934 unsigned int NUM_ARCH_NAMES
27935 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
27936
27937 for (i = 0; i < NUM_ARCH_NAMES; i++)
27938 if (strcmp (arch_names_table[i].name,
27939 TREE_STRING_POINTER (param_string_cst)) == 0)
27940 break;
27941
27942 if (i == NUM_ARCH_NAMES)
27943 {
27944 error ("Parameter to builtin not valid: %s",
27945 TREE_STRING_POINTER (param_string_cst));
27946 return integer_zero_node;
27947 }
27948
27949 field = TYPE_FIELDS (__processor_model_type);
27950 field_val = arch_names_table[i].model;
27951
27952 /* CPU types are stored in the next field. */
27953 if (field_val > M_CPU_TYPE_START
27954 && field_val < M_CPU_SUBTYPE_START)
27955 {
27956 field = DECL_CHAIN (field);
27957 field_val -= M_CPU_TYPE_START;
27958 }
27959
27960 /* CPU subtypes are stored in the next field. */
27961 if (field_val > M_CPU_SUBTYPE_START)
27962 {
27963 field = DECL_CHAIN (DECL_CHAIN (field));
27964 field_val -= M_CPU_SUBTYPE_START;
27965 }
27966
27967 /* Get the appropriate field in __cpu_model. */
27968 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
27969 field, NULL_TREE);
27970
27971 /* Check the value. */
27972 return build2 (EQ_EXPR, unsigned_type_node, ref,
27973 build_int_cstu (unsigned_type_node, field_val));
27974 }
27975 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
27976 {
27977 tree ref;
27978 tree array_elt;
27979 tree field;
27980 unsigned int field_val = 0;
27981 unsigned int NUM_ISA_NAMES
27982 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
27983
27984 for (i = 0; i < NUM_ISA_NAMES; i++)
27985 if (strcmp (isa_names_table[i].name,
27986 TREE_STRING_POINTER (param_string_cst)) == 0)
27987 break;
27988
27989 if (i == NUM_ISA_NAMES)
27990 {
27991 error ("Parameter to builtin not valid: %s",
27992 TREE_STRING_POINTER (param_string_cst));
27993 return integer_zero_node;
27994 }
27995
27996 field = TYPE_FIELDS (__processor_model_type);
27997 /* Get the last field, which is __cpu_features. */
27998 while (DECL_CHAIN (field))
27999 field = DECL_CHAIN (field);
28000
28001 /* Get the appropriate field: __cpu_model.__cpu_features */
28002 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
28003 field, NULL_TREE);
28004
28005 /* Access the 0th element of __cpu_features array. */
28006 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
28007 integer_zero_node, NULL_TREE, NULL_TREE);
28008
28009 field_val = (1 << isa_names_table[i].feature);
28010 /* Return __cpu_model.__cpu_features[0] & field_val */
28011 return build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
28012 build_int_cstu (unsigned_type_node, field_val));
28013 }
28014 gcc_unreachable ();
28015 }
28016
28017 static tree
28018 ix86_fold_builtin (tree fndecl, int n_args,
28019 tree *args, bool ignore ATTRIBUTE_UNUSED)
28020 {
28021 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
28022 {
28023 enum ix86_builtins fn_code = (enum ix86_builtins)
28024 DECL_FUNCTION_CODE (fndecl);
28025 if (fn_code == IX86_BUILTIN_CPU_IS
28026 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
28027 {
28028 gcc_assert (n_args == 1);
28029 return fold_builtin_cpu (fndecl, args);
28030 }
28031 }
28032
28033 return NULL_TREE;
28034 }
28035
28036 /* Make builtins to detect cpu type and features supported. NAME is
28037 the builtin name, CODE is the builtin code, and FTYPE is the function
28038 type of the builtin. */
28039
28040 static void
28041 make_cpu_type_builtin (const char* name, int code,
28042 enum ix86_builtin_func_type ftype, bool is_const)
28043 {
28044 tree decl;
28045 tree type;
28046
28047 type = ix86_get_builtin_func_type (ftype);
28048 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
28049 NULL, NULL_TREE);
28050 gcc_assert (decl != NULL_TREE);
28051 ix86_builtins[(int) code] = decl;
28052 TREE_READONLY (decl) = is_const;
28053 }
28054
28055 /* Make builtins to get CPU type and features supported. The created
28056 builtins are:
28057
28058 __builtin_cpu_init (), to detect cpu type and features,
28059 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
28060 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
28061 */
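 /* Illustrative use (user code, not part of GCC): a caller might select a
 code path at run time with these builtins, e.g.

	__builtin_cpu_init ();
	if (__builtin_cpu_is ("corei7") && __builtin_cpu_supports ("avx"))
	  use_avx_version ();
	else
	  use_generic_version ();

 use_avx_version and use_generic_version are hypothetical user functions. */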
28062
28063 static void
28064 ix86_init_platform_type_builtins (void)
28065 {
28066 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
28067 INT_FTYPE_VOID, false);
28068 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
28069 INT_FTYPE_PCCHAR, true);
28070 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
28071 INT_FTYPE_PCCHAR, true);
28072 }
28073
28074 /* Internal method for ix86_init_builtins. */
28075
28076 static void
28077 ix86_init_builtins_va_builtins_abi (void)
28078 {
28079 tree ms_va_ref, sysv_va_ref;
28080 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
28081 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
28082 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
28083 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
28084
28085 if (!TARGET_64BIT)
28086 return;
28087 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
28088 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
28089 ms_va_ref = build_reference_type (ms_va_list_type_node);
28090 sysv_va_ref =
28091 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
28092
28093 fnvoid_va_end_ms =
28094 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
28095 fnvoid_va_start_ms =
28096 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
28097 fnvoid_va_end_sysv =
28098 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
28099 fnvoid_va_start_sysv =
28100 build_varargs_function_type_list (void_type_node, sysv_va_ref,
28101 NULL_TREE);
28102 fnvoid_va_copy_ms =
28103 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
28104 NULL_TREE);
28105 fnvoid_va_copy_sysv =
28106 build_function_type_list (void_type_node, sysv_va_ref,
28107 sysv_va_ref, NULL_TREE);
28108
28109 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
28110 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
28111 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
28112 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
28113 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
28114 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
28115 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
28116 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28117 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
28118 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28119 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
28120 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28121 }
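 /* Illustrative sketch (user code, not part of GCC): inside a varargs
 function declared with __attribute__((ms_abi)), the MS-ABI variants are
 used much like the standard ones, e.g.

	__builtin_ms_va_list ap;
	__builtin_ms_va_start (ap, last_named_arg);
	... arguments are then fetched with __builtin_va_arg (ap, TYPE) ...
	__builtin_ms_va_end (ap);

 LAST_NAMED_ARG and TYPE are hypothetical placeholders. */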
28122
28123 static void
28124 ix86_init_builtin_types (void)
28125 {
28126 tree float128_type_node, float80_type_node;
28127
28128 /* The __float80 type. */
28129 float80_type_node = long_double_type_node;
28130 if (TYPE_MODE (float80_type_node) != XFmode)
28131 {
28132 /* The __float80 type. */
28133 float80_type_node = make_node (REAL_TYPE);
28134
28135 TYPE_PRECISION (float80_type_node) = 80;
28136 layout_type (float80_type_node);
28137 }
28138 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
28139
28140 /* The __float128 type. */
28141 float128_type_node = make_node (REAL_TYPE);
28142 TYPE_PRECISION (float128_type_node) = 128;
28143 layout_type (float128_type_node);
28144 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
28145
28146 /* This macro is built by i386-builtin-types.awk. */
28147 DEFINE_BUILTIN_PRIMITIVE_TYPES;
28148 }
28149
28150 static void
28151 ix86_init_builtins (void)
28152 {
28153 tree t;
28154
28155 ix86_init_builtin_types ();
28156
28157 /* Builtins to get CPU type and features. */
28158 ix86_init_platform_type_builtins ();
28159
28160 /* TFmode support builtins. */
28161 def_builtin_const (0, "__builtin_infq",
28162 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
28163 def_builtin_const (0, "__builtin_huge_valq",
28164 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
28165
28166 /* We will expand them to a normal call if SSE isn't available, since
28167 they are used by libgcc. */
28168 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
28169 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
28170 BUILT_IN_MD, "__fabstf2", NULL_TREE);
28171 TREE_READONLY (t) = 1;
28172 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
28173
28174 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
28175 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
28176 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
28177 TREE_READONLY (t) = 1;
28178 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
28179
28180 ix86_init_tm_builtins ();
28181 ix86_init_mmx_sse_builtins ();
28182
28183 if (TARGET_LP64)
28184 ix86_init_builtins_va_builtins_abi ();
28185
28186 #ifdef SUBTARGET_INIT_BUILTINS
28187 SUBTARGET_INIT_BUILTINS;
28188 #endif
28189 }
28190
28191 /* Return the ix86 builtin for CODE. */
28192
28193 static tree
28194 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
28195 {
28196 if (code >= IX86_BUILTIN_MAX)
28197 return error_mark_node;
28198
28199 return ix86_builtins[code];
28200 }
28201
28202 /* Errors in the source file can cause expand_expr to return const0_rtx
28203 where we expect a vector. To avoid crashing, use one of the vector
28204 clear instructions. */
28205 static rtx
28206 safe_vector_operand (rtx x, enum machine_mode mode)
28207 {
28208 if (x == const0_rtx)
28209 x = CONST0_RTX (mode);
28210 return x;
28211 }
28212
28213 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
28214
28215 static rtx
28216 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
28217 {
28218 rtx pat;
28219 tree arg0 = CALL_EXPR_ARG (exp, 0);
28220 tree arg1 = CALL_EXPR_ARG (exp, 1);
28221 rtx op0 = expand_normal (arg0);
28222 rtx op1 = expand_normal (arg1);
28223 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28224 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
28225 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
28226
28227 if (VECTOR_MODE_P (mode0))
28228 op0 = safe_vector_operand (op0, mode0);
28229 if (VECTOR_MODE_P (mode1))
28230 op1 = safe_vector_operand (op1, mode1);
28231
28232 if (optimize || !target
28233 || GET_MODE (target) != tmode
28234 || !insn_data[icode].operand[0].predicate (target, tmode))
28235 target = gen_reg_rtx (tmode);
28236
28237 if (GET_MODE (op1) == SImode && mode1 == TImode)
28238 {
28239 rtx x = gen_reg_rtx (V4SImode);
28240 emit_insn (gen_sse2_loadd (x, op1));
28241 op1 = gen_lowpart (TImode, x);
28242 }
28243
28244 if (!insn_data[icode].operand[1].predicate (op0, mode0))
28245 op0 = copy_to_mode_reg (mode0, op0);
28246 if (!insn_data[icode].operand[2].predicate (op1, mode1))
28247 op1 = copy_to_mode_reg (mode1, op1);
28248
28249 pat = GEN_FCN (icode) (target, op0, op1);
28250 if (! pat)
28251 return 0;
28252
28253 emit_insn (pat);
28254
28255 return target;
28256 }
28257
28258 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
28259
28260 static rtx
28261 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
28262 enum ix86_builtin_func_type m_type,
28263 enum rtx_code sub_code)
28264 {
28265 rtx pat;
28266 int i;
28267 int nargs;
28268 bool comparison_p = false;
28269 bool tf_p = false;
28270 bool last_arg_constant = false;
28271 int num_memory = 0;
28272 struct {
28273 rtx op;
28274 enum machine_mode mode;
28275 } args[4];
28276
28277 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28278
28279 switch (m_type)
28280 {
28281 case MULTI_ARG_4_DF2_DI_I:
28282 case MULTI_ARG_4_DF2_DI_I1:
28283 case MULTI_ARG_4_SF2_SI_I:
28284 case MULTI_ARG_4_SF2_SI_I1:
28285 nargs = 4;
28286 last_arg_constant = true;
28287 break;
28288
28289 case MULTI_ARG_3_SF:
28290 case MULTI_ARG_3_DF:
28291 case MULTI_ARG_3_SF2:
28292 case MULTI_ARG_3_DF2:
28293 case MULTI_ARG_3_DI:
28294 case MULTI_ARG_3_SI:
28295 case MULTI_ARG_3_SI_DI:
28296 case MULTI_ARG_3_HI:
28297 case MULTI_ARG_3_HI_SI:
28298 case MULTI_ARG_3_QI:
28299 case MULTI_ARG_3_DI2:
28300 case MULTI_ARG_3_SI2:
28301 case MULTI_ARG_3_HI2:
28302 case MULTI_ARG_3_QI2:
28303 nargs = 3;
28304 break;
28305
28306 case MULTI_ARG_2_SF:
28307 case MULTI_ARG_2_DF:
28308 case MULTI_ARG_2_DI:
28309 case MULTI_ARG_2_SI:
28310 case MULTI_ARG_2_HI:
28311 case MULTI_ARG_2_QI:
28312 nargs = 2;
28313 break;
28314
28315 case MULTI_ARG_2_DI_IMM:
28316 case MULTI_ARG_2_SI_IMM:
28317 case MULTI_ARG_2_HI_IMM:
28318 case MULTI_ARG_2_QI_IMM:
28319 nargs = 2;
28320 last_arg_constant = true;
28321 break;
28322
28323 case MULTI_ARG_1_SF:
28324 case MULTI_ARG_1_DF:
28325 case MULTI_ARG_1_SF2:
28326 case MULTI_ARG_1_DF2:
28327 case MULTI_ARG_1_DI:
28328 case MULTI_ARG_1_SI:
28329 case MULTI_ARG_1_HI:
28330 case MULTI_ARG_1_QI:
28331 case MULTI_ARG_1_SI_DI:
28332 case MULTI_ARG_1_HI_DI:
28333 case MULTI_ARG_1_HI_SI:
28334 case MULTI_ARG_1_QI_DI:
28335 case MULTI_ARG_1_QI_SI:
28336 case MULTI_ARG_1_QI_HI:
28337 nargs = 1;
28338 break;
28339
28340 case MULTI_ARG_2_DI_CMP:
28341 case MULTI_ARG_2_SI_CMP:
28342 case MULTI_ARG_2_HI_CMP:
28343 case MULTI_ARG_2_QI_CMP:
28344 nargs = 2;
28345 comparison_p = true;
28346 break;
28347
28348 case MULTI_ARG_2_SF_TF:
28349 case MULTI_ARG_2_DF_TF:
28350 case MULTI_ARG_2_DI_TF:
28351 case MULTI_ARG_2_SI_TF:
28352 case MULTI_ARG_2_HI_TF:
28353 case MULTI_ARG_2_QI_TF:
28354 nargs = 2;
28355 tf_p = true;
28356 break;
28357
28358 default:
28359 gcc_unreachable ();
28360 }
28361
28362 if (optimize || !target
28363 || GET_MODE (target) != tmode
28364 || !insn_data[icode].operand[0].predicate (target, tmode))
28365 target = gen_reg_rtx (tmode);
28366
28367 gcc_assert (nargs <= 4);
28368
28369 for (i = 0; i < nargs; i++)
28370 {
28371 tree arg = CALL_EXPR_ARG (exp, i);
28372 rtx op = expand_normal (arg);
28373 int adjust = (comparison_p) ? 1 : 0;
28374 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
28375
28376 if (last_arg_constant && i == nargs - 1)
28377 {
28378 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
28379 {
28380 enum insn_code new_icode = icode;
28381 switch (icode)
28382 {
28383 case CODE_FOR_xop_vpermil2v2df3:
28384 case CODE_FOR_xop_vpermil2v4sf3:
28385 case CODE_FOR_xop_vpermil2v4df3:
28386 case CODE_FOR_xop_vpermil2v8sf3:
28387 error ("the last argument must be a 2-bit immediate");
28388 return gen_reg_rtx (tmode);
28389 case CODE_FOR_xop_rotlv2di3:
28390 new_icode = CODE_FOR_rotlv2di3;
28391 goto xop_rotl;
28392 case CODE_FOR_xop_rotlv4si3:
28393 new_icode = CODE_FOR_rotlv4si3;
28394 goto xop_rotl;
28395 case CODE_FOR_xop_rotlv8hi3:
28396 new_icode = CODE_FOR_rotlv8hi3;
28397 goto xop_rotl;
28398 case CODE_FOR_xop_rotlv16qi3:
28399 new_icode = CODE_FOR_rotlv16qi3;
28400 xop_rotl:
28401 if (CONST_INT_P (op))
28402 {
28403 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
28404 op = GEN_INT (INTVAL (op) & mask);
28405 gcc_checking_assert
28406 (insn_data[icode].operand[i + 1].predicate (op, mode));
28407 }
28408 else
28409 {
28410 gcc_checking_assert
28411 (nargs == 2
28412 && insn_data[new_icode].operand[0].mode == tmode
28413 && insn_data[new_icode].operand[1].mode == tmode
28414 && insn_data[new_icode].operand[2].mode == mode
28415 && insn_data[new_icode].operand[0].predicate
28416 == insn_data[icode].operand[0].predicate
28417 && insn_data[new_icode].operand[1].predicate
28418 == insn_data[icode].operand[1].predicate);
28419 icode = new_icode;
28420 goto non_constant;
28421 }
28422 break;
28423 default:
28424 gcc_unreachable ();
28425 }
28426 }
28427 }
28428 else
28429 {
28430 non_constant:
28431 if (VECTOR_MODE_P (mode))
28432 op = safe_vector_operand (op, mode);
28433
28434 /* If we aren't optimizing, only allow one memory operand to be
28435 generated. */
28436 if (memory_operand (op, mode))
28437 num_memory++;
28438
28439 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
28440
28441 if (optimize
28442 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
28443 || num_memory > 1)
28444 op = force_reg (mode, op);
28445 }
28446
28447 args[i].op = op;
28448 args[i].mode = mode;
28449 }
28450
28451 switch (nargs)
28452 {
28453 case 1:
28454 pat = GEN_FCN (icode) (target, args[0].op);
28455 break;
28456
28457 case 2:
28458 if (tf_p)
28459 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
28460 GEN_INT ((int)sub_code));
28461 else if (! comparison_p)
28462 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
28463 else
28464 {
28465 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
28466 args[0].op,
28467 args[1].op);
28468
28469 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
28470 }
28471 break;
28472
28473 case 3:
28474 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
28475 break;
28476
28477 case 4:
28478 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
28479 break;
28480
28481 default:
28482 gcc_unreachable ();
28483 }
28484
28485 if (! pat)
28486 return 0;
28487
28488 emit_insn (pat);
28489 return target;
28490 }
28491
28492 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
28493 insns with vec_merge. */
28494
28495 static rtx
28496 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
28497 rtx target)
28498 {
28499 rtx pat;
28500 tree arg0 = CALL_EXPR_ARG (exp, 0);
28501 rtx op1, op0 = expand_normal (arg0);
28502 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28503 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
28504
28505 if (optimize || !target
28506 || GET_MODE (target) != tmode
28507 || !insn_data[icode].operand[0].predicate (target, tmode))
28508 target = gen_reg_rtx (tmode);
28509
28510 if (VECTOR_MODE_P (mode0))
28511 op0 = safe_vector_operand (op0, mode0);
28512
28513 if ((optimize && !register_operand (op0, mode0))
28514 || !insn_data[icode].operand[1].predicate (op0, mode0))
28515 op0 = copy_to_mode_reg (mode0, op0);
28516
28517 op1 = op0;
28518 if (!insn_data[icode].operand[2].predicate (op1, mode0))
28519 op1 = copy_to_mode_reg (mode0, op1);
28520
28521 pat = GEN_FCN (icode) (target, op0, op1);
28522 if (! pat)
28523 return 0;
28524 emit_insn (pat);
28525 return target;
28526 }
28527
28528 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
28529
28530 static rtx
28531 ix86_expand_sse_compare (const struct builtin_description *d,
28532 tree exp, rtx target, bool swap)
28533 {
28534 rtx pat;
28535 tree arg0 = CALL_EXPR_ARG (exp, 0);
28536 tree arg1 = CALL_EXPR_ARG (exp, 1);
28537 rtx op0 = expand_normal (arg0);
28538 rtx op1 = expand_normal (arg1);
28539 rtx op2;
28540 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28541 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28542 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28543 enum rtx_code comparison = d->comparison;
28544
28545 if (VECTOR_MODE_P (mode0))
28546 op0 = safe_vector_operand (op0, mode0);
28547 if (VECTOR_MODE_P (mode1))
28548 op1 = safe_vector_operand (op1, mode1);
28549
28550 /* Swap operands if we have a comparison that isn't available in
28551 hardware. */
28552 if (swap)
28553 {
28554 rtx tmp = gen_reg_rtx (mode1);
28555 emit_move_insn (tmp, op1);
28556 op1 = op0;
28557 op0 = tmp;
28558 }
28559
28560 if (optimize || !target
28561 || GET_MODE (target) != tmode
28562 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28563 target = gen_reg_rtx (tmode);
28564
28565 if ((optimize && !register_operand (op0, mode0))
28566 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
28567 op0 = copy_to_mode_reg (mode0, op0);
28568 if ((optimize && !register_operand (op1, mode1))
28569 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
28570 op1 = copy_to_mode_reg (mode1, op1);
28571
28572 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
28573 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28574 if (! pat)
28575 return 0;
28576 emit_insn (pat);
28577 return target;
28578 }
28579
28580 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
28581
28582 static rtx
28583 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
28584 rtx target)
28585 {
28586 rtx pat;
28587 tree arg0 = CALL_EXPR_ARG (exp, 0);
28588 tree arg1 = CALL_EXPR_ARG (exp, 1);
28589 rtx op0 = expand_normal (arg0);
28590 rtx op1 = expand_normal (arg1);
28591 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28592 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28593 enum rtx_code comparison = d->comparison;
28594
28595 if (VECTOR_MODE_P (mode0))
28596 op0 = safe_vector_operand (op0, mode0);
28597 if (VECTOR_MODE_P (mode1))
28598 op1 = safe_vector_operand (op1, mode1);
28599
28600 /* Swap operands if we have a comparison that isn't available in
28601 hardware. */
28602 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
28603 {
28604 rtx tmp = op1;
28605 op1 = op0;
28606 op0 = tmp;
28607 }
28608
28609 target = gen_reg_rtx (SImode);
28610 emit_move_insn (target, const0_rtx);
28611 target = gen_rtx_SUBREG (QImode, target, 0);
28612
28613 if ((optimize && !register_operand (op0, mode0))
28614 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28615 op0 = copy_to_mode_reg (mode0, op0);
28616 if ((optimize && !register_operand (op1, mode1))
28617 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28618 op1 = copy_to_mode_reg (mode1, op1);
28619
28620 pat = GEN_FCN (d->icode) (op0, op1);
28621 if (! pat)
28622 return 0;
28623 emit_insn (pat);
28624 emit_insn (gen_rtx_SET (VOIDmode,
28625 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28626 gen_rtx_fmt_ee (comparison, QImode,
28627 SET_DEST (pat),
28628 const0_rtx)));
28629
28630 return SUBREG_REG (target);
28631 }
28632
28633 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
28634
28635 static rtx
28636 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
28637 rtx target)
28638 {
28639 rtx pat;
28640 tree arg0 = CALL_EXPR_ARG (exp, 0);
28641 rtx op1, op0 = expand_normal (arg0);
28642 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28643 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28644
28645 if (optimize || target == 0
28646 || GET_MODE (target) != tmode
28647 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28648 target = gen_reg_rtx (tmode);
28649
28650 if (VECTOR_MODE_P (mode0))
28651 op0 = safe_vector_operand (op0, mode0);
28652
28653 if ((optimize && !register_operand (op0, mode0))
28654 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28655 op0 = copy_to_mode_reg (mode0, op0);
28656
28657 op1 = GEN_INT (d->comparison);
28658
28659 pat = GEN_FCN (d->icode) (target, op0, op1);
28660 if (! pat)
28661 return 0;
28662 emit_insn (pat);
28663 return target;
28664 }
28665
28666 static rtx
28667 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
28668 tree exp, rtx target)
28669 {
28670 rtx pat;
28671 tree arg0 = CALL_EXPR_ARG (exp, 0);
28672 tree arg1 = CALL_EXPR_ARG (exp, 1);
28673 rtx op0 = expand_normal (arg0);
28674 rtx op1 = expand_normal (arg1);
28675 rtx op2;
28676 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28677 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28678 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28679
28680 if (optimize || target == 0
28681 || GET_MODE (target) != tmode
28682 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28683 target = gen_reg_rtx (tmode);
28684
28685 op0 = safe_vector_operand (op0, mode0);
28686 op1 = safe_vector_operand (op1, mode1);
28687
28688 if ((optimize && !register_operand (op0, mode0))
28689 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28690 op0 = copy_to_mode_reg (mode0, op0);
28691 if ((optimize && !register_operand (op1, mode1))
28692 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28693 op1 = copy_to_mode_reg (mode1, op1);
28694
28695 op2 = GEN_INT (d->comparison);
28696
28697 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28698 if (! pat)
28699 return 0;
28700 emit_insn (pat);
28701 return target;
28702 }
28703
28704 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
28705
28706 static rtx
28707 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
28708 rtx target)
28709 {
28710 rtx pat;
28711 tree arg0 = CALL_EXPR_ARG (exp, 0);
28712 tree arg1 = CALL_EXPR_ARG (exp, 1);
28713 rtx op0 = expand_normal (arg0);
28714 rtx op1 = expand_normal (arg1);
28715 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28716 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28717 enum rtx_code comparison = d->comparison;
28718
28719 if (VECTOR_MODE_P (mode0))
28720 op0 = safe_vector_operand (op0, mode0);
28721 if (VECTOR_MODE_P (mode1))
28722 op1 = safe_vector_operand (op1, mode1);
28723
28724 target = gen_reg_rtx (SImode);
28725 emit_move_insn (target, const0_rtx);
28726 target = gen_rtx_SUBREG (QImode, target, 0);
28727
28728 if ((optimize && !register_operand (op0, mode0))
28729 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28730 op0 = copy_to_mode_reg (mode0, op0);
28731 if ((optimize && !register_operand (op1, mode1))
28732 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28733 op1 = copy_to_mode_reg (mode1, op1);
28734
28735 pat = GEN_FCN (d->icode) (op0, op1);
28736 if (! pat)
28737 return 0;
28738 emit_insn (pat);
28739 emit_insn (gen_rtx_SET (VOIDmode,
28740 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28741 gen_rtx_fmt_ee (comparison, QImode,
28742 SET_DEST (pat),
28743 const0_rtx)));
28744
28745 return SUBREG_REG (target);
28746 }
28747
28748 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
28749
28750 static rtx
28751 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
28752 tree exp, rtx target)
28753 {
28754 rtx pat;
28755 tree arg0 = CALL_EXPR_ARG (exp, 0);
28756 tree arg1 = CALL_EXPR_ARG (exp, 1);
28757 tree arg2 = CALL_EXPR_ARG (exp, 2);
28758 tree arg3 = CALL_EXPR_ARG (exp, 3);
28759 tree arg4 = CALL_EXPR_ARG (exp, 4);
28760 rtx scratch0, scratch1;
28761 rtx op0 = expand_normal (arg0);
28762 rtx op1 = expand_normal (arg1);
28763 rtx op2 = expand_normal (arg2);
28764 rtx op3 = expand_normal (arg3);
28765 rtx op4 = expand_normal (arg4);
28766 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
28767
28768 tmode0 = insn_data[d->icode].operand[0].mode;
28769 tmode1 = insn_data[d->icode].operand[1].mode;
28770 modev2 = insn_data[d->icode].operand[2].mode;
28771 modei3 = insn_data[d->icode].operand[3].mode;
28772 modev4 = insn_data[d->icode].operand[4].mode;
28773 modei5 = insn_data[d->icode].operand[5].mode;
28774 modeimm = insn_data[d->icode].operand[6].mode;
28775
28776 if (VECTOR_MODE_P (modev2))
28777 op0 = safe_vector_operand (op0, modev2);
28778 if (VECTOR_MODE_P (modev4))
28779 op2 = safe_vector_operand (op2, modev4);
28780
28781 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28782 op0 = copy_to_mode_reg (modev2, op0);
28783 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
28784 op1 = copy_to_mode_reg (modei3, op1);
28785 if ((optimize && !register_operand (op2, modev4))
28786 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
28787 op2 = copy_to_mode_reg (modev4, op2);
28788 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
28789 op3 = copy_to_mode_reg (modei5, op3);
28790
28791 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
28792 {
28793 error ("the fifth argument must be an 8-bit immediate");
28794 return const0_rtx;
28795 }
28796
28797 if (d->code == IX86_BUILTIN_PCMPESTRI128)
28798 {
28799 if (optimize || !target
28800 || GET_MODE (target) != tmode0
28801 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28802 target = gen_reg_rtx (tmode0);
28803
28804 scratch1 = gen_reg_rtx (tmode1);
28805
28806 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
28807 }
28808 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
28809 {
28810 if (optimize || !target
28811 || GET_MODE (target) != tmode1
28812 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28813 target = gen_reg_rtx (tmode1);
28814
28815 scratch0 = gen_reg_rtx (tmode0);
28816
28817 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
28818 }
28819 else
28820 {
28821 gcc_assert (d->flag);
28822
28823 scratch0 = gen_reg_rtx (tmode0);
28824 scratch1 = gen_reg_rtx (tmode1);
28825
28826 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
28827 }
28828
28829 if (! pat)
28830 return 0;
28831
28832 emit_insn (pat);
28833
28834 if (d->flag)
28835 {
28836 target = gen_reg_rtx (SImode);
28837 emit_move_insn (target, const0_rtx);
28838 target = gen_rtx_SUBREG (QImode, target, 0);
28839
28840 emit_insn
28841 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28842 gen_rtx_fmt_ee (EQ, QImode,
28843 gen_rtx_REG ((enum machine_mode) d->flag,
28844 FLAGS_REG),
28845 const0_rtx)));
28846 return SUBREG_REG (target);
28847 }
28848 else
28849 return target;
28850 }
28851
28852
28853 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
28854
28855 static rtx
28856 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
28857 tree exp, rtx target)
28858 {
28859 rtx pat;
28860 tree arg0 = CALL_EXPR_ARG (exp, 0);
28861 tree arg1 = CALL_EXPR_ARG (exp, 1);
28862 tree arg2 = CALL_EXPR_ARG (exp, 2);
28863 rtx scratch0, scratch1;
28864 rtx op0 = expand_normal (arg0);
28865 rtx op1 = expand_normal (arg1);
28866 rtx op2 = expand_normal (arg2);
28867 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
28868
28869 tmode0 = insn_data[d->icode].operand[0].mode;
28870 tmode1 = insn_data[d->icode].operand[1].mode;
28871 modev2 = insn_data[d->icode].operand[2].mode;
28872 modev3 = insn_data[d->icode].operand[3].mode;
28873 modeimm = insn_data[d->icode].operand[4].mode;
28874
28875 if (VECTOR_MODE_P (modev2))
28876 op0 = safe_vector_operand (op0, modev2);
28877 if (VECTOR_MODE_P (modev3))
28878 op1 = safe_vector_operand (op1, modev3);
28879
28880 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28881 op0 = copy_to_mode_reg (modev2, op0);
28882 if ((optimize && !register_operand (op1, modev3))
28883 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
28884 op1 = copy_to_mode_reg (modev3, op1);
28885
28886 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
28887 {
28888 error ("the third argument must be an 8-bit immediate");
28889 return const0_rtx;
28890 }
28891
28892 if (d->code == IX86_BUILTIN_PCMPISTRI128)
28893 {
28894 if (optimize || !target
28895 || GET_MODE (target) != tmode0
28896 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28897 target = gen_reg_rtx (tmode0);
28898
28899 scratch1 = gen_reg_rtx (tmode1);
28900
28901 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
28902 }
28903 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
28904 {
28905 if (optimize || !target
28906 || GET_MODE (target) != tmode1
28907 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28908 target = gen_reg_rtx (tmode1);
28909
28910 scratch0 = gen_reg_rtx (tmode0);
28911
28912 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
28913 }
28914 else
28915 {
28916 gcc_assert (d->flag);
28917
28918 scratch0 = gen_reg_rtx (tmode0);
28919 scratch1 = gen_reg_rtx (tmode1);
28920
28921 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
28922 }
28923
28924 if (! pat)
28925 return 0;
28926
28927 emit_insn (pat);
28928
28929 if (d->flag)
28930 {
28931 target = gen_reg_rtx (SImode);
28932 emit_move_insn (target, const0_rtx);
28933 target = gen_rtx_SUBREG (QImode, target, 0);
28934
28935 emit_insn
28936 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28937 gen_rtx_fmt_ee (EQ, QImode,
28938 gen_rtx_REG ((enum machine_mode) d->flag,
28939 FLAGS_REG),
28940 const0_rtx)));
28941 return SUBREG_REG (target);
28942 }
28943 else
28944 return target;
28945 }
28946
28947 /* Subroutine of ix86_expand_builtin to take care of insns with a
28948 variable number of operands. */
28949
28950 static rtx
28951 ix86_expand_args_builtin (const struct builtin_description *d,
28952 tree exp, rtx target)
28953 {
28954 rtx pat, real_target;
28955 unsigned int i, nargs;
28956 unsigned int nargs_constant = 0;
28957 int num_memory = 0;
28958 struct
28959 {
28960 rtx op;
28961 enum machine_mode mode;
28962 } args[4];
28963 bool last_arg_count = false;
28964 enum insn_code icode = d->icode;
28965 const struct insn_data_d *insn_p = &insn_data[icode];
28966 enum machine_mode tmode = insn_p->operand[0].mode;
28967 enum machine_mode rmode = VOIDmode;
28968 bool swap = false;
28969 enum rtx_code comparison = d->comparison;
28970
28971 switch ((enum ix86_builtin_func_type) d->flag)
28972 {
28973 case V2DF_FTYPE_V2DF_ROUND:
28974 case V4DF_FTYPE_V4DF_ROUND:
28975 case V4SF_FTYPE_V4SF_ROUND:
28976 case V8SF_FTYPE_V8SF_ROUND:
28977 case V4SI_FTYPE_V4SF_ROUND:
28978 case V8SI_FTYPE_V8SF_ROUND:
28979 return ix86_expand_sse_round (d, exp, target);
28980 case V4SI_FTYPE_V2DF_V2DF_ROUND:
28981 case V8SI_FTYPE_V4DF_V4DF_ROUND:
28982 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
28983 case INT_FTYPE_V8SF_V8SF_PTEST:
28984 case INT_FTYPE_V4DI_V4DI_PTEST:
28985 case INT_FTYPE_V4DF_V4DF_PTEST:
28986 case INT_FTYPE_V4SF_V4SF_PTEST:
28987 case INT_FTYPE_V2DI_V2DI_PTEST:
28988 case INT_FTYPE_V2DF_V2DF_PTEST:
28989 return ix86_expand_sse_ptest (d, exp, target);
28990 case FLOAT128_FTYPE_FLOAT128:
28991 case FLOAT_FTYPE_FLOAT:
28992 case INT_FTYPE_INT:
28993 case UINT64_FTYPE_INT:
28994 case UINT16_FTYPE_UINT16:
28995 case INT64_FTYPE_INT64:
28996 case INT64_FTYPE_V4SF:
28997 case INT64_FTYPE_V2DF:
28998 case INT_FTYPE_V16QI:
28999 case INT_FTYPE_V8QI:
29000 case INT_FTYPE_V8SF:
29001 case INT_FTYPE_V4DF:
29002 case INT_FTYPE_V4SF:
29003 case INT_FTYPE_V2DF:
29004 case INT_FTYPE_V32QI:
29005 case V16QI_FTYPE_V16QI:
29006 case V8SI_FTYPE_V8SF:
29007 case V8SI_FTYPE_V4SI:
29008 case V8HI_FTYPE_V8HI:
29009 case V8HI_FTYPE_V16QI:
29010 case V8QI_FTYPE_V8QI:
29011 case V8SF_FTYPE_V8SF:
29012 case V8SF_FTYPE_V8SI:
29013 case V8SF_FTYPE_V4SF:
29014 case V8SF_FTYPE_V8HI:
29015 case V4SI_FTYPE_V4SI:
29016 case V4SI_FTYPE_V16QI:
29017 case V4SI_FTYPE_V4SF:
29018 case V4SI_FTYPE_V8SI:
29019 case V4SI_FTYPE_V8HI:
29020 case V4SI_FTYPE_V4DF:
29021 case V4SI_FTYPE_V2DF:
29022 case V4HI_FTYPE_V4HI:
29023 case V4DF_FTYPE_V4DF:
29024 case V4DF_FTYPE_V4SI:
29025 case V4DF_FTYPE_V4SF:
29026 case V4DF_FTYPE_V2DF:
29027 case V4SF_FTYPE_V4SF:
29028 case V4SF_FTYPE_V4SI:
29029 case V4SF_FTYPE_V8SF:
29030 case V4SF_FTYPE_V4DF:
29031 case V4SF_FTYPE_V8HI:
29032 case V4SF_FTYPE_V2DF:
29033 case V2DI_FTYPE_V2DI:
29034 case V2DI_FTYPE_V16QI:
29035 case V2DI_FTYPE_V8HI:
29036 case V2DI_FTYPE_V4SI:
29037 case V2DF_FTYPE_V2DF:
29038 case V2DF_FTYPE_V4SI:
29039 case V2DF_FTYPE_V4DF:
29040 case V2DF_FTYPE_V4SF:
29041 case V2DF_FTYPE_V2SI:
29042 case V2SI_FTYPE_V2SI:
29043 case V2SI_FTYPE_V4SF:
29044 case V2SI_FTYPE_V2SF:
29045 case V2SI_FTYPE_V2DF:
29046 case V2SF_FTYPE_V2SF:
29047 case V2SF_FTYPE_V2SI:
29048 case V32QI_FTYPE_V32QI:
29049 case V32QI_FTYPE_V16QI:
29050 case V16HI_FTYPE_V16HI:
29051 case V16HI_FTYPE_V8HI:
29052 case V8SI_FTYPE_V8SI:
29053 case V16HI_FTYPE_V16QI:
29054 case V8SI_FTYPE_V16QI:
29055 case V4DI_FTYPE_V16QI:
29056 case V8SI_FTYPE_V8HI:
29057 case V4DI_FTYPE_V8HI:
29058 case V4DI_FTYPE_V4SI:
29059 case V4DI_FTYPE_V2DI:
29060 nargs = 1;
29061 break;
29062 case V4SF_FTYPE_V4SF_VEC_MERGE:
29063 case V2DF_FTYPE_V2DF_VEC_MERGE:
29064 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
29065 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
29066 case V16QI_FTYPE_V16QI_V16QI:
29067 case V16QI_FTYPE_V8HI_V8HI:
29068 case V8QI_FTYPE_V8QI_V8QI:
29069 case V8QI_FTYPE_V4HI_V4HI:
29070 case V8HI_FTYPE_V8HI_V8HI:
29071 case V8HI_FTYPE_V16QI_V16QI:
29072 case V8HI_FTYPE_V4SI_V4SI:
29073 case V8SF_FTYPE_V8SF_V8SF:
29074 case V8SF_FTYPE_V8SF_V8SI:
29075 case V4SI_FTYPE_V4SI_V4SI:
29076 case V4SI_FTYPE_V8HI_V8HI:
29077 case V4SI_FTYPE_V4SF_V4SF:
29078 case V4SI_FTYPE_V2DF_V2DF:
29079 case V4HI_FTYPE_V4HI_V4HI:
29080 case V4HI_FTYPE_V8QI_V8QI:
29081 case V4HI_FTYPE_V2SI_V2SI:
29082 case V4DF_FTYPE_V4DF_V4DF:
29083 case V4DF_FTYPE_V4DF_V4DI:
29084 case V4SF_FTYPE_V4SF_V4SF:
29085 case V4SF_FTYPE_V4SF_V4SI:
29086 case V4SF_FTYPE_V4SF_V2SI:
29087 case V4SF_FTYPE_V4SF_V2DF:
29088 case V4SF_FTYPE_V4SF_DI:
29089 case V4SF_FTYPE_V4SF_SI:
29090 case V2DI_FTYPE_V2DI_V2DI:
29091 case V2DI_FTYPE_V16QI_V16QI:
29092 case V2DI_FTYPE_V4SI_V4SI:
29093 case V2DI_FTYPE_V2DI_V16QI:
29094 case V2DI_FTYPE_V2DF_V2DF:
29095 case V2SI_FTYPE_V2SI_V2SI:
29096 case V2SI_FTYPE_V4HI_V4HI:
29097 case V2SI_FTYPE_V2SF_V2SF:
29098 case V2DF_FTYPE_V2DF_V2DF:
29099 case V2DF_FTYPE_V2DF_V4SF:
29100 case V2DF_FTYPE_V2DF_V2DI:
29101 case V2DF_FTYPE_V2DF_DI:
29102 case V2DF_FTYPE_V2DF_SI:
29103 case V2SF_FTYPE_V2SF_V2SF:
29104 case V1DI_FTYPE_V1DI_V1DI:
29105 case V1DI_FTYPE_V8QI_V8QI:
29106 case V1DI_FTYPE_V2SI_V2SI:
29107 case V32QI_FTYPE_V16HI_V16HI:
29108 case V16HI_FTYPE_V8SI_V8SI:
29109 case V32QI_FTYPE_V32QI_V32QI:
29110 case V16HI_FTYPE_V32QI_V32QI:
29111 case V16HI_FTYPE_V16HI_V16HI:
29112 case V8SI_FTYPE_V4DF_V4DF:
29113 case V8SI_FTYPE_V8SI_V8SI:
29114 case V8SI_FTYPE_V16HI_V16HI:
29115 case V4DI_FTYPE_V4DI_V4DI:
29116 case V4DI_FTYPE_V8SI_V8SI:
29117 if (comparison == UNKNOWN)
29118 return ix86_expand_binop_builtin (icode, exp, target);
29119 nargs = 2;
29120 break;
29121 case V4SF_FTYPE_V4SF_V4SF_SWAP:
29122 case V2DF_FTYPE_V2DF_V2DF_SWAP:
29123 gcc_assert (comparison != UNKNOWN);
29124 nargs = 2;
29125 swap = true;
29126 break;
29127 case V16HI_FTYPE_V16HI_V8HI_COUNT:
29128 case V16HI_FTYPE_V16HI_SI_COUNT:
29129 case V8SI_FTYPE_V8SI_V4SI_COUNT:
29130 case V8SI_FTYPE_V8SI_SI_COUNT:
29131 case V4DI_FTYPE_V4DI_V2DI_COUNT:
29132 case V4DI_FTYPE_V4DI_INT_COUNT:
29133 case V8HI_FTYPE_V8HI_V8HI_COUNT:
29134 case V8HI_FTYPE_V8HI_SI_COUNT:
29135 case V4SI_FTYPE_V4SI_V4SI_COUNT:
29136 case V4SI_FTYPE_V4SI_SI_COUNT:
29137 case V4HI_FTYPE_V4HI_V4HI_COUNT:
29138 case V4HI_FTYPE_V4HI_SI_COUNT:
29139 case V2DI_FTYPE_V2DI_V2DI_COUNT:
29140 case V2DI_FTYPE_V2DI_SI_COUNT:
29141 case V2SI_FTYPE_V2SI_V2SI_COUNT:
29142 case V2SI_FTYPE_V2SI_SI_COUNT:
29143 case V1DI_FTYPE_V1DI_V1DI_COUNT:
29144 case V1DI_FTYPE_V1DI_SI_COUNT:
29145 nargs = 2;
29146 last_arg_count = true;
29147 break;
29148 case UINT64_FTYPE_UINT64_UINT64:
29149 case UINT_FTYPE_UINT_UINT:
29150 case UINT_FTYPE_UINT_USHORT:
29151 case UINT_FTYPE_UINT_UCHAR:
29152 case UINT16_FTYPE_UINT16_INT:
29153 case UINT8_FTYPE_UINT8_INT:
29154 nargs = 2;
29155 break;
29156 case V2DI_FTYPE_V2DI_INT_CONVERT:
29157 nargs = 2;
29158 rmode = V1TImode;
29159 nargs_constant = 1;
29160 break;
29161 case V4DI_FTYPE_V4DI_INT_CONVERT:
29162 nargs = 2;
29163 rmode = V2TImode;
29164 nargs_constant = 1;
29165 break;
29166 case V8HI_FTYPE_V8HI_INT:
29167 case V8HI_FTYPE_V8SF_INT:
29168 case V8HI_FTYPE_V4SF_INT:
29169 case V8SF_FTYPE_V8SF_INT:
29170 case V4SI_FTYPE_V4SI_INT:
29171 case V4SI_FTYPE_V8SI_INT:
29172 case V4HI_FTYPE_V4HI_INT:
29173 case V4DF_FTYPE_V4DF_INT:
29174 case V4SF_FTYPE_V4SF_INT:
29175 case V4SF_FTYPE_V8SF_INT:
29176 case V2DI_FTYPE_V2DI_INT:
29177 case V2DF_FTYPE_V2DF_INT:
29178 case V2DF_FTYPE_V4DF_INT:
29179 case V16HI_FTYPE_V16HI_INT:
29180 case V8SI_FTYPE_V8SI_INT:
29181 case V4DI_FTYPE_V4DI_INT:
29182 case V2DI_FTYPE_V4DI_INT:
29183 nargs = 2;
29184 nargs_constant = 1;
29185 break;
29186 case V16QI_FTYPE_V16QI_V16QI_V16QI:
29187 case V8SF_FTYPE_V8SF_V8SF_V8SF:
29188 case V4DF_FTYPE_V4DF_V4DF_V4DF:
29189 case V4SF_FTYPE_V4SF_V4SF_V4SF:
29190 case V2DF_FTYPE_V2DF_V2DF_V2DF:
29191 case V32QI_FTYPE_V32QI_V32QI_V32QI:
29192 nargs = 3;
29193 break;
29194 case V32QI_FTYPE_V32QI_V32QI_INT:
29195 case V16HI_FTYPE_V16HI_V16HI_INT:
29196 case V16QI_FTYPE_V16QI_V16QI_INT:
29197 case V4DI_FTYPE_V4DI_V4DI_INT:
29198 case V8HI_FTYPE_V8HI_V8HI_INT:
29199 case V8SI_FTYPE_V8SI_V8SI_INT:
29200 case V8SI_FTYPE_V8SI_V4SI_INT:
29201 case V8SF_FTYPE_V8SF_V8SF_INT:
29202 case V8SF_FTYPE_V8SF_V4SF_INT:
29203 case V4SI_FTYPE_V4SI_V4SI_INT:
29204 case V4DF_FTYPE_V4DF_V4DF_INT:
29205 case V4DF_FTYPE_V4DF_V2DF_INT:
29206 case V4SF_FTYPE_V4SF_V4SF_INT:
29207 case V2DI_FTYPE_V2DI_V2DI_INT:
29208 case V4DI_FTYPE_V4DI_V2DI_INT:
29209 case V2DF_FTYPE_V2DF_V2DF_INT:
29210 nargs = 3;
29211 nargs_constant = 1;
29212 break;
29213 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
29214 nargs = 3;
29215 rmode = V4DImode;
29216 nargs_constant = 1;
29217 break;
29218 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
29219 nargs = 3;
29220 rmode = V2DImode;
29221 nargs_constant = 1;
29222 break;
29223 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
29224 nargs = 3;
29225 rmode = DImode;
29226 nargs_constant = 1;
29227 break;
29228 case V2DI_FTYPE_V2DI_UINT_UINT:
29229 nargs = 3;
29230 nargs_constant = 2;
29231 break;
29232 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
29233 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
29234 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
29235 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
29236 nargs = 4;
29237 nargs_constant = 1;
29238 break;
29239 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
29240 nargs = 4;
29241 nargs_constant = 2;
29242 break;
29243 default:
29244 gcc_unreachable ();
29245 }
29246
29247 gcc_assert (nargs <= ARRAY_SIZE (args));
29248
29249 if (comparison != UNKNOWN)
29250 {
29251 gcc_assert (nargs == 2);
29252 return ix86_expand_sse_compare (d, exp, target, swap);
29253 }
29254
29255 if (rmode == VOIDmode || rmode == tmode)
29256 {
29257 if (optimize
29258 || target == 0
29259 || GET_MODE (target) != tmode
29260 || !insn_p->operand[0].predicate (target, tmode))
29261 target = gen_reg_rtx (tmode);
29262 real_target = target;
29263 }
29264 else
29265 {
29266 target = gen_reg_rtx (rmode);
29267 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
29268 }
29269
29270 for (i = 0; i < nargs; i++)
29271 {
29272 tree arg = CALL_EXPR_ARG (exp, i);
29273 rtx op = expand_normal (arg);
29274 enum machine_mode mode = insn_p->operand[i + 1].mode;
29275 bool match = insn_p->operand[i + 1].predicate (op, mode);
29276
29277 if (last_arg_count && (i + 1) == nargs)
29278 {
29279 /* SIMD shift insns take either an 8-bit immediate or a
29280 register as count, but the builtin functions take an int as
29281 count. If the count doesn't match, we put it in a register. */
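	      /* Illustrative sketch only (not part of this file; assumes the
		 usual emmintrin.h wrappers): _mm_slli_epi32 (x, 3) can use the
		 immediate form of the shift insn directly, while
		 _mm_slli_epi32 (x, n) with a run-time N reaches this point and
		 the count is copied into a register.  */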
29282 if (!match)
29283 {
29284 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
29285 if (!insn_p->operand[i + 1].predicate (op, mode))
29286 op = copy_to_reg (op);
29287 }
29288 }
29289 else if ((nargs - i) <= nargs_constant)
29290 {
29291 if (!match)
29292 switch (icode)
29293 {
29294 case CODE_FOR_avx2_inserti128:
29295 case CODE_FOR_avx2_extracti128:
29296 error ("the last argument must be an 1-bit immediate");
29297 return const0_rtx;
29298
29299 case CODE_FOR_sse4_1_roundsd:
29300 case CODE_FOR_sse4_1_roundss:
29301
29302 case CODE_FOR_sse4_1_roundpd:
29303 case CODE_FOR_sse4_1_roundps:
29304 case CODE_FOR_avx_roundpd256:
29305 case CODE_FOR_avx_roundps256:
29306
29307 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
29308 case CODE_FOR_sse4_1_roundps_sfix:
29309 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
29310 case CODE_FOR_avx_roundps_sfix256:
29311
29312 case CODE_FOR_sse4_1_blendps:
29313 case CODE_FOR_avx_blendpd256:
29314 case CODE_FOR_avx_vpermilv4df:
29315 error ("the last argument must be a 4-bit immediate");
29316 return const0_rtx;
29317
29318 case CODE_FOR_sse4_1_blendpd:
29319 case CODE_FOR_avx_vpermilv2df:
29320 case CODE_FOR_xop_vpermil2v2df3:
29321 case CODE_FOR_xop_vpermil2v4sf3:
29322 case CODE_FOR_xop_vpermil2v4df3:
29323 case CODE_FOR_xop_vpermil2v8sf3:
29324 error ("the last argument must be a 2-bit immediate");
29325 return const0_rtx;
29326
29327 case CODE_FOR_avx_vextractf128v4df:
29328 case CODE_FOR_avx_vextractf128v8sf:
29329 case CODE_FOR_avx_vextractf128v8si:
29330 case CODE_FOR_avx_vinsertf128v4df:
29331 case CODE_FOR_avx_vinsertf128v8sf:
29332 case CODE_FOR_avx_vinsertf128v8si:
29333 error ("the last argument must be a 1-bit immediate");
29334 return const0_rtx;
29335
29336 case CODE_FOR_avx_vmcmpv2df3:
29337 case CODE_FOR_avx_vmcmpv4sf3:
29338 case CODE_FOR_avx_cmpv2df3:
29339 case CODE_FOR_avx_cmpv4sf3:
29340 case CODE_FOR_avx_cmpv4df3:
29341 case CODE_FOR_avx_cmpv8sf3:
29342 error ("the last argument must be a 5-bit immediate");
29343 return const0_rtx;
29344
29345 default:
29346 switch (nargs_constant)
29347 {
29348 case 2:
29349 if ((nargs - i) == nargs_constant)
29350 {
29351 error ("the next to last argument must be an 8-bit immediate");
29352 break;
29353 }
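		      /* FALLTHRU */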
29354 case 1:
29355 error ("the last argument must be an 8-bit immediate");
29356 break;
29357 default:
29358 gcc_unreachable ();
29359 }
29360 return const0_rtx;
29361 }
29362 }
29363 else
29364 {
29365 if (VECTOR_MODE_P (mode))
29366 op = safe_vector_operand (op, mode);
29367
29368 /* If we aren't optimizing, only allow one memory operand to
29369 be generated. */
29370 if (memory_operand (op, mode))
29371 num_memory++;
29372
29373 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
29374 {
29375 if (optimize || !match || num_memory > 1)
29376 op = copy_to_mode_reg (mode, op);
29377 }
29378 else
29379 {
29380 op = copy_to_reg (op);
29381 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
29382 }
29383 }
29384
29385 args[i].op = op;
29386 args[i].mode = mode;
29387 }
29388
29389 switch (nargs)
29390 {
29391 case 1:
29392 pat = GEN_FCN (icode) (real_target, args[0].op);
29393 break;
29394 case 2:
29395 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
29396 break;
29397 case 3:
29398 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
29399 args[2].op);
29400 break;
29401 case 4:
29402 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
29403 args[2].op, args[3].op);
29404 break;
29405 default:
29406 gcc_unreachable ();
29407 }
29408
29409 if (! pat)
29410 return 0;
29411
29412 emit_insn (pat);
29413 return target;
29414 }
29415
29416 /* Subroutine of ix86_expand_builtin to take care of special insns
29417 with variable number of operands. */
29418
29419 static rtx
29420 ix86_expand_special_args_builtin (const struct builtin_description *d,
29421 tree exp, rtx target)
29422 {
29423 tree arg;
29424 rtx pat, op;
29425 unsigned int i, nargs, arg_adjust, memory;
29426 struct
29427 {
29428 rtx op;
29429 enum machine_mode mode;
29430 } args[3];
29431 enum insn_code icode = d->icode;
29432 bool last_arg_constant = false;
29433 const struct insn_data_d *insn_p = &insn_data[icode];
29434 enum machine_mode tmode = insn_p->operand[0].mode;
29435 enum { load, store } klass;
29436
29437 switch ((enum ix86_builtin_func_type) d->flag)
29438 {
29439 case VOID_FTYPE_VOID:
29440 if (icode == CODE_FOR_avx_vzeroupper)
29441 target = GEN_INT (vzeroupper_intrinsic);
29442 emit_insn (GEN_FCN (icode) (target));
29443 return 0;
29444 case VOID_FTYPE_UINT64:
29445 case VOID_FTYPE_UNSIGNED:
29446 nargs = 0;
29447 klass = store;
29448 memory = 0;
29449 break;
29450
29451 case INT_FTYPE_VOID:
29452 case UINT64_FTYPE_VOID:
29453 case UNSIGNED_FTYPE_VOID:
29454 nargs = 0;
29455 klass = load;
29456 memory = 0;
29457 break;
29458 case UINT64_FTYPE_PUNSIGNED:
29459 case V2DI_FTYPE_PV2DI:
29460 case V4DI_FTYPE_PV4DI:
29461 case V32QI_FTYPE_PCCHAR:
29462 case V16QI_FTYPE_PCCHAR:
29463 case V8SF_FTYPE_PCV4SF:
29464 case V8SF_FTYPE_PCFLOAT:
29465 case V4SF_FTYPE_PCFLOAT:
29466 case V4DF_FTYPE_PCV2DF:
29467 case V4DF_FTYPE_PCDOUBLE:
29468 case V2DF_FTYPE_PCDOUBLE:
29469 case VOID_FTYPE_PVOID:
29470 nargs = 1;
29471 klass = load;
29472 memory = 0;
29473 break;
29474 case VOID_FTYPE_PV2SF_V4SF:
29475 case VOID_FTYPE_PV4DI_V4DI:
29476 case VOID_FTYPE_PV2DI_V2DI:
29477 case VOID_FTYPE_PCHAR_V32QI:
29478 case VOID_FTYPE_PCHAR_V16QI:
29479 case VOID_FTYPE_PFLOAT_V8SF:
29480 case VOID_FTYPE_PFLOAT_V4SF:
29481 case VOID_FTYPE_PDOUBLE_V4DF:
29482 case VOID_FTYPE_PDOUBLE_V2DF:
29483 case VOID_FTYPE_PLONGLONG_LONGLONG:
29484 case VOID_FTYPE_PULONGLONG_ULONGLONG:
29485 case VOID_FTYPE_PINT_INT:
29486 nargs = 1;
29487 klass = store;
29488 /* Reserve memory operand for target. */
29489 memory = ARRAY_SIZE (args);
29490 break;
29491 case V4SF_FTYPE_V4SF_PCV2SF:
29492 case V2DF_FTYPE_V2DF_PCDOUBLE:
29493 nargs = 2;
29494 klass = load;
29495 memory = 1;
29496 break;
29497 case V8SF_FTYPE_PCV8SF_V8SI:
29498 case V4DF_FTYPE_PCV4DF_V4DI:
29499 case V4SF_FTYPE_PCV4SF_V4SI:
29500 case V2DF_FTYPE_PCV2DF_V2DI:
29501 case V8SI_FTYPE_PCV8SI_V8SI:
29502 case V4DI_FTYPE_PCV4DI_V4DI:
29503 case V4SI_FTYPE_PCV4SI_V4SI:
29504 case V2DI_FTYPE_PCV2DI_V2DI:
29505 nargs = 2;
29506 klass = load;
29507 memory = 0;
29508 break;
29509 case VOID_FTYPE_PV8SF_V8SI_V8SF:
29510 case VOID_FTYPE_PV4DF_V4DI_V4DF:
29511 case VOID_FTYPE_PV4SF_V4SI_V4SF:
29512 case VOID_FTYPE_PV2DF_V2DI_V2DF:
29513 case VOID_FTYPE_PV8SI_V8SI_V8SI:
29514 case VOID_FTYPE_PV4DI_V4DI_V4DI:
29515 case VOID_FTYPE_PV4SI_V4SI_V4SI:
29516 case VOID_FTYPE_PV2DI_V2DI_V2DI:
29517 nargs = 2;
29518 klass = store;
29519 /* Reserve memory operand for target. */
29520 memory = ARRAY_SIZE (args);
29521 break;
29522 case VOID_FTYPE_UINT_UINT_UINT:
29523 case VOID_FTYPE_UINT64_UINT_UINT:
29524 case UCHAR_FTYPE_UINT_UINT_UINT:
29525 case UCHAR_FTYPE_UINT64_UINT_UINT:
29526 nargs = 3;
29527 klass = load;
29528 memory = ARRAY_SIZE (args);
29529 last_arg_constant = true;
29530 break;
29531 default:
29532 gcc_unreachable ();
29533 }
29534
29535 gcc_assert (nargs <= ARRAY_SIZE (args));
29536
29537 if (klass == store)
29538 {
29539 arg = CALL_EXPR_ARG (exp, 0);
29540 op = expand_normal (arg);
29541 gcc_assert (target == 0);
29542 if (memory)
29543 {
29544 if (GET_MODE (op) != Pmode)
29545 op = convert_to_mode (Pmode, op, 1);
29546 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
29547 }
29548 else
29549 target = force_reg (tmode, op);
29550 arg_adjust = 1;
29551 }
29552 else
29553 {
29554 arg_adjust = 0;
29555 if (optimize
29556 || target == 0
29557 || !register_operand (target, tmode)
29558 || GET_MODE (target) != tmode)
29559 target = gen_reg_rtx (tmode);
29560 }
29561
29562 for (i = 0; i < nargs; i++)
29563 {
29564 enum machine_mode mode = insn_p->operand[i + 1].mode;
29565 bool match;
29566
29567 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
29568 op = expand_normal (arg);
29569 match = insn_p->operand[i + 1].predicate (op, mode);
29570
29571 if (last_arg_constant && (i + 1) == nargs)
29572 {
29573 if (!match)
29574 {
29575 if (icode == CODE_FOR_lwp_lwpvalsi3
29576 || icode == CODE_FOR_lwp_lwpinssi3
29577 || icode == CODE_FOR_lwp_lwpvaldi3
29578 || icode == CODE_FOR_lwp_lwpinsdi3)
29579 error ("the last argument must be a 32-bit immediate");
29580 else
29581 error ("the last argument must be an 8-bit immediate");
29582 return const0_rtx;
29583 }
29584 }
29585 else
29586 {
29587 if (i == memory)
29588 {
29589 /* This must be the memory operand. */
29590 if (GET_MODE (op) != Pmode)
29591 op = convert_to_mode (Pmode, op, 1);
29592 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
29593 gcc_assert (GET_MODE (op) == mode
29594 || GET_MODE (op) == VOIDmode);
29595 }
29596 else
29597 {
29598 /* This must be a register. */
29599 if (VECTOR_MODE_P (mode))
29600 op = safe_vector_operand (op, mode);
29601
29602 gcc_assert (GET_MODE (op) == mode
29603 || GET_MODE (op) == VOIDmode);
29604 op = copy_to_mode_reg (mode, op);
29605 }
29606 }
29607
29608 args[i].op = op;
29609 args[i].mode = mode;
29610 }
29611
29612 switch (nargs)
29613 {
29614 case 0:
29615 pat = GEN_FCN (icode) (target);
29616 break;
29617 case 1:
29618 pat = GEN_FCN (icode) (target, args[0].op);
29619 break;
29620 case 2:
29621 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
29622 break;
29623 case 3:
29624 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
29625 break;
29626 default:
29627 gcc_unreachable ();
29628 }
29629
29630 if (! pat)
29631 return 0;
29632 emit_insn (pat);
29633 return klass == store ? 0 : target;
29634 }
29635
29636 /* Return the integer constant in ARG. Constrain it to be in the range
29637 of the subparts of VEC_TYPE; issue an error if not. */
29638
29639 static int
29640 get_element_number (tree vec_type, tree arg)
29641 {
29642 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
29643
29644 if (!host_integerp (arg, 1)
29645 || (elt = tree_low_cst (arg, 1), elt > max))
29646 {
29647 error ("selector must be an integer constant in the range 0..%wi", max);
29648 return 0;
29649 }
29650
29651 return elt;
29652 }
29653
29654 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29655 ix86_expand_vector_init. We DO have language-level syntax for this, in
29656 the form of (type){ init-list }. Except that since we can't place emms
29657 instructions from inside the compiler, we can't allow the use of MMX
29658 registers unless the user explicitly asks for it. So we do *not* define
29659 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
29660 we have builtins invoked by mmintrin.h that give us license to emit
29661 these sorts of instructions. */
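/* For illustration only (this mirrors how the mmintrin.h wrappers are
   expected to reach this code; it is not code from this file):

     __m64 v = _mm_set_pi32 (hi, lo);

   expands to roughly

     (__m64) __builtin_ia32_vec_init_v2si (lo, hi);

   which is routed through ix86_expand_vec_init_builtin below.  */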
29662
29663 static rtx
29664 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
29665 {
29666 enum machine_mode tmode = TYPE_MODE (type);
29667 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
29668 int i, n_elt = GET_MODE_NUNITS (tmode);
29669 rtvec v = rtvec_alloc (n_elt);
29670
29671 gcc_assert (VECTOR_MODE_P (tmode));
29672 gcc_assert (call_expr_nargs (exp) == n_elt);
29673
29674 for (i = 0; i < n_elt; ++i)
29675 {
29676 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
29677 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
29678 }
29679
29680 if (!target || !register_operand (target, tmode))
29681 target = gen_reg_rtx (tmode);
29682
29683 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
29684 return target;
29685 }
29686
29687 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29688 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
29689 had a language-level syntax for referencing vector elements. */
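/* Illustrative only (assuming the usual emmintrin.h wrapper, not code
   from this file): _mm_extract_epi16 (v, n) expands to roughly
   (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi) v, n), which
   lands in ix86_expand_vec_ext_builtin below.  */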
29690
29691 static rtx
29692 ix86_expand_vec_ext_builtin (tree exp, rtx target)
29693 {
29694 enum machine_mode tmode, mode0;
29695 tree arg0, arg1;
29696 int elt;
29697 rtx op0;
29698
29699 arg0 = CALL_EXPR_ARG (exp, 0);
29700 arg1 = CALL_EXPR_ARG (exp, 1);
29701
29702 op0 = expand_normal (arg0);
29703 elt = get_element_number (TREE_TYPE (arg0), arg1);
29704
29705 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29706 mode0 = TYPE_MODE (TREE_TYPE (arg0));
29707 gcc_assert (VECTOR_MODE_P (mode0));
29708
29709 op0 = force_reg (mode0, op0);
29710
29711 if (optimize || !target || !register_operand (target, tmode))
29712 target = gen_reg_rtx (tmode);
29713
29714 ix86_expand_vector_extract (true, target, op0, elt);
29715
29716 return target;
29717 }
29718
29719 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29720 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
29721 a language-level syntax for referencing vector elements. */
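/* Illustrative only (assuming the usual emmintrin.h wrapper, not code
   from this file): _mm_insert_epi16 (v, x, n) expands to roughly
   (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi) v, x, n), which is
   handled by ix86_expand_vec_set_builtin below.  */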
29722
29723 static rtx
29724 ix86_expand_vec_set_builtin (tree exp)
29725 {
29726 enum machine_mode tmode, mode1;
29727 tree arg0, arg1, arg2;
29728 int elt;
29729 rtx op0, op1, target;
29730
29731 arg0 = CALL_EXPR_ARG (exp, 0);
29732 arg1 = CALL_EXPR_ARG (exp, 1);
29733 arg2 = CALL_EXPR_ARG (exp, 2);
29734
29735 tmode = TYPE_MODE (TREE_TYPE (arg0));
29736 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29737 gcc_assert (VECTOR_MODE_P (tmode));
29738
29739 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
29740 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
29741 elt = get_element_number (TREE_TYPE (arg0), arg2);
29742
29743 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
29744 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
29745
29746 op0 = force_reg (tmode, op0);
29747 op1 = force_reg (mode1, op1);
29748
29749 /* OP0 is the source of these builtin functions and shouldn't be
29750 modified. Create a copy, use it and return it as target. */
29751 target = gen_reg_rtx (tmode);
29752 emit_move_insn (target, op0);
29753 ix86_expand_vector_set (true, target, op1, elt);
29754
29755 return target;
29756 }
29757
29758 /* Expand an expression EXP that calls a built-in function,
29759 with result going to TARGET if that's convenient
29760 (and in mode MODE if that's convenient).
29761 SUBTARGET may be used as the target for computing one of EXP's operands.
29762 IGNORE is nonzero if the value is to be ignored. */
29763
29764 static rtx
29765 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
29766 enum machine_mode mode ATTRIBUTE_UNUSED,
29767 int ignore ATTRIBUTE_UNUSED)
29768 {
29769 const struct builtin_description *d;
29770 size_t i;
29771 enum insn_code icode;
29772 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
29773 tree arg0, arg1, arg2, arg3, arg4;
29774 rtx op0, op1, op2, op3, op4, pat;
29775 enum machine_mode mode0, mode1, mode2, mode3, mode4;
29776 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
29777
29778 /* For CPU builtins that can be folded, fold first and expand the fold. */
29779 switch (fcode)
29780 {
29781 case IX86_BUILTIN_CPU_INIT:
29782 {
29783 /* Make it call __cpu_indicator_init in libgcc. */
29784 tree call_expr, fndecl, type;
29785 type = build_function_type_list (integer_type_node, NULL_TREE);
29786 fndecl = build_fn_decl ("__cpu_indicator_init", type);
29787 call_expr = build_call_expr (fndecl, 0);
29788 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
29789 }
29790 case IX86_BUILTIN_CPU_IS:
29791 case IX86_BUILTIN_CPU_SUPPORTS:
29792 {
29793 tree arg0 = CALL_EXPR_ARG (exp, 0);
29794 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
29795 gcc_assert (fold_expr != NULL_TREE);
29796 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
29797 }
29798 }
29799
29800 /* Determine whether the builtin function is available under the current ISA.
29801 Originally the builtin was not created if it wasn't applicable to the
29802 current ISA based on the command line switches. With function specific
29803 options, we need to check in the context of the function making the call
29804 whether it is supported. */
29805 if (ix86_builtins_isa[fcode].isa
29806 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
29807 {
29808 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
29809 NULL, (enum fpmath_unit) 0, false);
29810
29811 if (!opts)
29812 error ("%qE needs unknown isa option", fndecl);
29813 else
29814 {
29815 gcc_assert (opts != NULL);
29816 error ("%qE needs isa option %s", fndecl, opts);
29817 free (opts);
29818 }
29819 return const0_rtx;
29820 }
29821
29822 switch (fcode)
29823 {
29824 case IX86_BUILTIN_MASKMOVQ:
29825 case IX86_BUILTIN_MASKMOVDQU:
29826 icode = (fcode == IX86_BUILTIN_MASKMOVQ
29827 ? CODE_FOR_mmx_maskmovq
29828 : CODE_FOR_sse2_maskmovdqu);
29829 /* Note the arg order is different from the operand order. */
29830 arg1 = CALL_EXPR_ARG (exp, 0);
29831 arg2 = CALL_EXPR_ARG (exp, 1);
29832 arg0 = CALL_EXPR_ARG (exp, 2);
29833 op0 = expand_normal (arg0);
29834 op1 = expand_normal (arg1);
29835 op2 = expand_normal (arg2);
29836 mode0 = insn_data[icode].operand[0].mode;
29837 mode1 = insn_data[icode].operand[1].mode;
29838 mode2 = insn_data[icode].operand[2].mode;
29839
29840 if (GET_MODE (op0) != Pmode)
29841 op0 = convert_to_mode (Pmode, op0, 1);
29842 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
29843
29844 if (!insn_data[icode].operand[0].predicate (op0, mode0))
29845 op0 = copy_to_mode_reg (mode0, op0);
29846 if (!insn_data[icode].operand[1].predicate (op1, mode1))
29847 op1 = copy_to_mode_reg (mode1, op1);
29848 if (!insn_data[icode].operand[2].predicate (op2, mode2))
29849 op2 = copy_to_mode_reg (mode2, op2);
29850 pat = GEN_FCN (icode) (op0, op1, op2);
29851 if (! pat)
29852 return 0;
29853 emit_insn (pat);
29854 return 0;
29855
29856 case IX86_BUILTIN_LDMXCSR:
29857 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
29858 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29859 emit_move_insn (target, op0);
29860 emit_insn (gen_sse_ldmxcsr (target));
29861 return 0;
29862
29863 case IX86_BUILTIN_STMXCSR:
29864 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29865 emit_insn (gen_sse_stmxcsr (target));
29866 return copy_to_mode_reg (SImode, target);
29867
29868 case IX86_BUILTIN_CLFLUSH:
29869 arg0 = CALL_EXPR_ARG (exp, 0);
29870 op0 = expand_normal (arg0);
29871 icode = CODE_FOR_sse2_clflush;
29872 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29873 {
29874 if (GET_MODE (op0) != Pmode)
29875 op0 = convert_to_mode (Pmode, op0, 1);
29876 op0 = force_reg (Pmode, op0);
29877 }
29878
29879 emit_insn (gen_sse2_clflush (op0));
29880 return 0;
29881
29882 case IX86_BUILTIN_MONITOR:
29883 arg0 = CALL_EXPR_ARG (exp, 0);
29884 arg1 = CALL_EXPR_ARG (exp, 1);
29885 arg2 = CALL_EXPR_ARG (exp, 2);
29886 op0 = expand_normal (arg0);
29887 op1 = expand_normal (arg1);
29888 op2 = expand_normal (arg2);
29889 if (!REG_P (op0))
29890 {
29891 if (GET_MODE (op0) != Pmode)
29892 op0 = convert_to_mode (Pmode, op0, 1);
29893 op0 = force_reg (Pmode, op0);
29894 }
29895 if (!REG_P (op1))
29896 op1 = copy_to_mode_reg (SImode, op1);
29897 if (!REG_P (op2))
29898 op2 = copy_to_mode_reg (SImode, op2);
29899 emit_insn (ix86_gen_monitor (op0, op1, op2));
29900 return 0;
29901
29902 case IX86_BUILTIN_MWAIT:
29903 arg0 = CALL_EXPR_ARG (exp, 0);
29904 arg1 = CALL_EXPR_ARG (exp, 1);
29905 op0 = expand_normal (arg0);
29906 op1 = expand_normal (arg1);
29907 if (!REG_P (op0))
29908 op0 = copy_to_mode_reg (SImode, op0);
29909 if (!REG_P (op1))
29910 op1 = copy_to_mode_reg (SImode, op1);
29911 emit_insn (gen_sse3_mwait (op0, op1));
29912 return 0;
29913
29914 case IX86_BUILTIN_VEC_INIT_V2SI:
29915 case IX86_BUILTIN_VEC_INIT_V4HI:
29916 case IX86_BUILTIN_VEC_INIT_V8QI:
29917 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
29918
29919 case IX86_BUILTIN_VEC_EXT_V2DF:
29920 case IX86_BUILTIN_VEC_EXT_V2DI:
29921 case IX86_BUILTIN_VEC_EXT_V4SF:
29922 case IX86_BUILTIN_VEC_EXT_V4SI:
29923 case IX86_BUILTIN_VEC_EXT_V8HI:
29924 case IX86_BUILTIN_VEC_EXT_V2SI:
29925 case IX86_BUILTIN_VEC_EXT_V4HI:
29926 case IX86_BUILTIN_VEC_EXT_V16QI:
29927 return ix86_expand_vec_ext_builtin (exp, target);
29928
29929 case IX86_BUILTIN_VEC_SET_V2DI:
29930 case IX86_BUILTIN_VEC_SET_V4SF:
29931 case IX86_BUILTIN_VEC_SET_V4SI:
29932 case IX86_BUILTIN_VEC_SET_V8HI:
29933 case IX86_BUILTIN_VEC_SET_V4HI:
29934 case IX86_BUILTIN_VEC_SET_V16QI:
29935 return ix86_expand_vec_set_builtin (exp);
29936
29937 case IX86_BUILTIN_INFQ:
29938 case IX86_BUILTIN_HUGE_VALQ:
29939 {
29940 REAL_VALUE_TYPE inf;
29941 rtx tmp;
29942
29943 real_inf (&inf);
29944 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
29945
29946 tmp = validize_mem (force_const_mem (mode, tmp));
29947
29948 if (target == 0)
29949 target = gen_reg_rtx (mode);
29950
29951 emit_move_insn (target, tmp);
29952 return target;
29953 }
29954
29955 case IX86_BUILTIN_LLWPCB:
29956 arg0 = CALL_EXPR_ARG (exp, 0);
29957 op0 = expand_normal (arg0);
29958 icode = CODE_FOR_lwp_llwpcb;
29959 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29960 {
29961 if (GET_MODE (op0) != Pmode)
29962 op0 = convert_to_mode (Pmode, op0, 1);
29963 op0 = force_reg (Pmode, op0);
29964 }
29965 emit_insn (gen_lwp_llwpcb (op0));
29966 return 0;
29967
29968 case IX86_BUILTIN_SLWPCB:
29969 icode = CODE_FOR_lwp_slwpcb;
29970 if (!target
29971 || !insn_data[icode].operand[0].predicate (target, Pmode))
29972 target = gen_reg_rtx (Pmode);
29973 emit_insn (gen_lwp_slwpcb (target));
29974 return target;
29975
29976 case IX86_BUILTIN_BEXTRI32:
29977 case IX86_BUILTIN_BEXTRI64:
29978 arg0 = CALL_EXPR_ARG (exp, 0);
29979 arg1 = CALL_EXPR_ARG (exp, 1);
29980 op0 = expand_normal (arg0);
29981 op1 = expand_normal (arg1);
29982 icode = (fcode == IX86_BUILTIN_BEXTRI32
29983 ? CODE_FOR_tbm_bextri_si
29984 : CODE_FOR_tbm_bextri_di);
29985 if (!CONST_INT_P (op1))
29986 {
29987 error ("last argument must be an immediate");
29988 return const0_rtx;
29989 }
29990 else
29991 {
29992 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
29993 unsigned char lsb_index = INTVAL (op1) & 0xFF;
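	  /* The TBM BEXTRI control word packs the field length in bits 15:8
	     and the starting bit in bits 7:0; e.g. (illustrative) a control
	     value of 0x0804 extracts 8 bits starting at bit 4.  */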
29994 op1 = GEN_INT (length);
29995 op2 = GEN_INT (lsb_index);
29996 pat = GEN_FCN (icode) (target, op0, op1, op2);
29997 if (pat)
29998 emit_insn (pat);
29999 return target;
30000 }
30001
30002 case IX86_BUILTIN_RDRAND16_STEP:
30003 icode = CODE_FOR_rdrandhi_1;
30004 mode0 = HImode;
30005 goto rdrand_step;
30006
30007 case IX86_BUILTIN_RDRAND32_STEP:
30008 icode = CODE_FOR_rdrandsi_1;
30009 mode0 = SImode;
30010 goto rdrand_step;
30011
30012 case IX86_BUILTIN_RDRAND64_STEP:
30013 icode = CODE_FOR_rdranddi_1;
30014 mode0 = DImode;
30015
30016 rdrand_step:
30017 op0 = gen_reg_rtx (mode0);
30018 emit_insn (GEN_FCN (icode) (op0));
30019
30020 arg0 = CALL_EXPR_ARG (exp, 0);
30021 op1 = expand_normal (arg0);
30022 if (!address_operand (op1, VOIDmode))
30023 {
30024 op1 = convert_memory_address (Pmode, op1);
30025 op1 = copy_addr_to_reg (op1);
30026 }
30027 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
30028
30029 op1 = gen_reg_rtx (SImode);
30030 emit_move_insn (op1, CONST1_RTX (SImode));
30031
30032 /* Emit SImode conditional move. */
30033 if (mode0 == HImode)
30034 {
30035 op2 = gen_reg_rtx (SImode);
30036 emit_insn (gen_zero_extendhisi2 (op2, op0));
30037 }
30038 else if (mode0 == SImode)
30039 op2 = op0;
30040 else
30041 op2 = gen_rtx_SUBREG (SImode, op0, 0);
30042
30043 if (target == 0)
30044 target = gen_reg_rtx (SImode);
30045
30046 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
30047 const0_rtx);
30048 emit_insn (gen_rtx_SET (VOIDmode, target,
30049 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
30050 return target;
30051
30052 case IX86_BUILTIN_GATHERSIV2DF:
30053 icode = CODE_FOR_avx2_gathersiv2df;
30054 goto gather_gen;
30055 case IX86_BUILTIN_GATHERSIV4DF:
30056 icode = CODE_FOR_avx2_gathersiv4df;
30057 goto gather_gen;
30058 case IX86_BUILTIN_GATHERDIV2DF:
30059 icode = CODE_FOR_avx2_gatherdiv2df;
30060 goto gather_gen;
30061 case IX86_BUILTIN_GATHERDIV4DF:
30062 icode = CODE_FOR_avx2_gatherdiv4df;
30063 goto gather_gen;
30064 case IX86_BUILTIN_GATHERSIV4SF:
30065 icode = CODE_FOR_avx2_gathersiv4sf;
30066 goto gather_gen;
30067 case IX86_BUILTIN_GATHERSIV8SF:
30068 icode = CODE_FOR_avx2_gathersiv8sf;
30069 goto gather_gen;
30070 case IX86_BUILTIN_GATHERDIV4SF:
30071 icode = CODE_FOR_avx2_gatherdiv4sf;
30072 goto gather_gen;
30073 case IX86_BUILTIN_GATHERDIV8SF:
30074 icode = CODE_FOR_avx2_gatherdiv8sf;
30075 goto gather_gen;
30076 case IX86_BUILTIN_GATHERSIV2DI:
30077 icode = CODE_FOR_avx2_gathersiv2di;
30078 goto gather_gen;
30079 case IX86_BUILTIN_GATHERSIV4DI:
30080 icode = CODE_FOR_avx2_gathersiv4di;
30081 goto gather_gen;
30082 case IX86_BUILTIN_GATHERDIV2DI:
30083 icode = CODE_FOR_avx2_gatherdiv2di;
30084 goto gather_gen;
30085 case IX86_BUILTIN_GATHERDIV4DI:
30086 icode = CODE_FOR_avx2_gatherdiv4di;
30087 goto gather_gen;
30088 case IX86_BUILTIN_GATHERSIV4SI:
30089 icode = CODE_FOR_avx2_gathersiv4si;
30090 goto gather_gen;
30091 case IX86_BUILTIN_GATHERSIV8SI:
30092 icode = CODE_FOR_avx2_gathersiv8si;
30093 goto gather_gen;
30094 case IX86_BUILTIN_GATHERDIV4SI:
30095 icode = CODE_FOR_avx2_gatherdiv4si;
30096 goto gather_gen;
30097 case IX86_BUILTIN_GATHERDIV8SI:
30098 icode = CODE_FOR_avx2_gatherdiv8si;
30099 goto gather_gen;
30100 case IX86_BUILTIN_GATHERALTSIV4DF:
30101 icode = CODE_FOR_avx2_gathersiv4df;
30102 goto gather_gen;
30103 case IX86_BUILTIN_GATHERALTDIV8SF:
30104 icode = CODE_FOR_avx2_gatherdiv8sf;
30105 goto gather_gen;
30106 case IX86_BUILTIN_GATHERALTSIV4DI:
30107 icode = CODE_FOR_avx2_gathersiv4di;
30108 goto gather_gen;
30109 case IX86_BUILTIN_GATHERALTDIV8SI:
30110 icode = CODE_FOR_avx2_gatherdiv8si;
30111 goto gather_gen;
30112
30113 gather_gen:
30114 arg0 = CALL_EXPR_ARG (exp, 0);
30115 arg1 = CALL_EXPR_ARG (exp, 1);
30116 arg2 = CALL_EXPR_ARG (exp, 2);
30117 arg3 = CALL_EXPR_ARG (exp, 3);
30118 arg4 = CALL_EXPR_ARG (exp, 4);
30119 op0 = expand_normal (arg0);
30120 op1 = expand_normal (arg1);
30121 op2 = expand_normal (arg2);
30122 op3 = expand_normal (arg3);
30123 op4 = expand_normal (arg4);
30124 /* Note the arg order is different from the operand order. */
30125 mode0 = insn_data[icode].operand[1].mode;
30126 mode2 = insn_data[icode].operand[3].mode;
30127 mode3 = insn_data[icode].operand[4].mode;
30128 mode4 = insn_data[icode].operand[5].mode;
30129
30130 if (target == NULL_RTX
30131 || GET_MODE (target) != insn_data[icode].operand[0].mode)
30132 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
30133 else
30134 subtarget = target;
30135
30136 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
30137 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
30138 {
30139 rtx half = gen_reg_rtx (V4SImode);
30140 if (!nonimmediate_operand (op2, V8SImode))
30141 op2 = copy_to_mode_reg (V8SImode, op2);
30142 emit_insn (gen_vec_extract_lo_v8si (half, op2));
30143 op2 = half;
30144 }
30145 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
30146 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
30147 {
30148 rtx (*gen) (rtx, rtx);
30149 rtx half = gen_reg_rtx (mode0);
30150 if (mode0 == V4SFmode)
30151 gen = gen_vec_extract_lo_v8sf;
30152 else
30153 gen = gen_vec_extract_lo_v8si;
30154 if (!nonimmediate_operand (op0, GET_MODE (op0)))
30155 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
30156 emit_insn (gen (half, op0));
30157 op0 = half;
30158 if (!nonimmediate_operand (op3, GET_MODE (op3)))
30159 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
30160 emit_insn (gen (half, op3));
30161 op3 = half;
30162 }
30163
30164 /* Force the memory operand to use only a base register here. We
30165 don't want to do this to the memory operands of other builtin
30166 functions. */
30167 if (GET_MODE (op1) != Pmode)
30168 op1 = convert_to_mode (Pmode, op1, 1);
30169 op1 = force_reg (Pmode, op1);
30170
30171 if (!insn_data[icode].operand[1].predicate (op0, mode0))
30172 op0 = copy_to_mode_reg (mode0, op0);
30173 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
30174 op1 = copy_to_mode_reg (Pmode, op1);
30175 if (!insn_data[icode].operand[3].predicate (op2, mode2))
30176 op2 = copy_to_mode_reg (mode2, op2);
30177 if (!insn_data[icode].operand[4].predicate (op3, mode3))
30178 op3 = copy_to_mode_reg (mode3, op3);
30179 if (!insn_data[icode].operand[5].predicate (op4, mode4))
30180 {
30181 error ("last argument must be scale 1, 2, 4, 8");
30182 return const0_rtx;
30183 }
30184
30185 /* Optimize. If mask is known to have all high bits set,
30186 replace op0 with pc_rtx to signal that the instruction
30187 overwrites the whole destination and doesn't use its
30188 previous contents. */
30189 if (optimize)
30190 {
30191 if (TREE_CODE (arg3) == VECTOR_CST)
30192 {
30193 unsigned int negative = 0;
30194 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
30195 {
30196 tree cst = VECTOR_CST_ELT (arg3, i);
30197 if (TREE_CODE (cst) == INTEGER_CST
30198 && tree_int_cst_sign_bit (cst))
30199 negative++;
30200 else if (TREE_CODE (cst) == REAL_CST
30201 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
30202 negative++;
30203 }
30204 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
30205 op0 = pc_rtx;
30206 }
30207 else if (TREE_CODE (arg3) == SSA_NAME)
30208 {
30209 /* Recognize also when mask is like:
30210 __v2df src = _mm_setzero_pd ();
30211 __v2df mask = _mm_cmpeq_pd (src, src);
30212 or
30213 __v8sf src = _mm256_setzero_ps ();
30214 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
30215 as that is a cheaper way to load all ones into
30216 a register than having to load a constant from
30217 memory. */
30218 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
30219 if (is_gimple_call (def_stmt))
30220 {
30221 tree fndecl = gimple_call_fndecl (def_stmt);
30222 if (fndecl
30223 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
30224 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
30225 {
30226 case IX86_BUILTIN_CMPPD:
30227 case IX86_BUILTIN_CMPPS:
30228 case IX86_BUILTIN_CMPPD256:
30229 case IX86_BUILTIN_CMPPS256:
30230 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
30231 break;
30232 /* FALLTHRU */
30233 case IX86_BUILTIN_CMPEQPD:
30234 case IX86_BUILTIN_CMPEQPS:
30235 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
30236 && initializer_zerop (gimple_call_arg (def_stmt,
30237 1)))
30238 op0 = pc_rtx;
30239 break;
30240 default:
30241 break;
30242 }
30243 }
30244 }
30245 }
30246
30247 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
30248 if (! pat)
30249 return const0_rtx;
30250 emit_insn (pat);
30251
30252 if (fcode == IX86_BUILTIN_GATHERDIV8SF
30253 || fcode == IX86_BUILTIN_GATHERDIV8SI)
30254 {
30255 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
30256 ? V4SFmode : V4SImode;
30257 if (target == NULL_RTX)
30258 target = gen_reg_rtx (tmode);
30259 if (tmode == V4SFmode)
30260 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
30261 else
30262 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
30263 }
30264 else
30265 target = subtarget;
30266
30267 return target;
30268
30269 case IX86_BUILTIN_XABORT:
30270 icode = CODE_FOR_xabort;
30271 arg0 = CALL_EXPR_ARG (exp, 0);
30272 op0 = expand_normal (arg0);
30273 mode0 = insn_data[icode].operand[0].mode;
30274 if (!insn_data[icode].operand[0].predicate (op0, mode0))
30275 {
30276 error ("the xabort's argument must be an 8-bit immediate");
30277 return const0_rtx;
30278 }
30279 emit_insn (gen_xabort (op0));
30280 return 0;
30281
30282 default:
30283 break;
30284 }
30285
30286 for (i = 0, d = bdesc_special_args;
30287 i < ARRAY_SIZE (bdesc_special_args);
30288 i++, d++)
30289 if (d->code == fcode)
30290 return ix86_expand_special_args_builtin (d, exp, target);
30291
30292 for (i = 0, d = bdesc_args;
30293 i < ARRAY_SIZE (bdesc_args);
30294 i++, d++)
30295 if (d->code == fcode)
30296 switch (fcode)
30297 {
30298 case IX86_BUILTIN_FABSQ:
30299 case IX86_BUILTIN_COPYSIGNQ:
30300 if (!TARGET_SSE)
30301 /* Emit a normal call if SSE isn't available. */
30302 return expand_call (exp, target, ignore);
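	    /* FALLTHRU */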
30303 default:
30304 return ix86_expand_args_builtin (d, exp, target);
30305 }
30306
30307 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
30308 if (d->code == fcode)
30309 return ix86_expand_sse_comi (d, exp, target);
30310
30311 for (i = 0, d = bdesc_pcmpestr;
30312 i < ARRAY_SIZE (bdesc_pcmpestr);
30313 i++, d++)
30314 if (d->code == fcode)
30315 return ix86_expand_sse_pcmpestr (d, exp, target);
30316
30317 for (i = 0, d = bdesc_pcmpistr;
30318 i < ARRAY_SIZE (bdesc_pcmpistr);
30319 i++, d++)
30320 if (d->code == fcode)
30321 return ix86_expand_sse_pcmpistr (d, exp, target);
30322
30323 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
30324 if (d->code == fcode)
30325 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
30326 (enum ix86_builtin_func_type)
30327 d->flag, d->comparison);
30328
30329 gcc_unreachable ();
30330 }
30331
30332 /* Returns a function decl for a vectorized version of the builtin function
30333 with builtin function code FN and the result vector type TYPE, or NULL_TREE
30334 if it is not available. */
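/* For example (illustrative), when the vectorizer asks for BUILT_IN_SQRT
   with a V2DF result and input type, this returns the decl for
   IX86_BUILTIN_SQRTPD; the 4-element AVX case returns
   IX86_BUILTIN_SQRTPD256 (see the BUILT_IN_SQRT case below).  */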
30335
30336 static tree
30337 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
30338 tree type_in)
30339 {
30340 enum machine_mode in_mode, out_mode;
30341 int in_n, out_n;
30342 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
30343
30344 if (TREE_CODE (type_out) != VECTOR_TYPE
30345 || TREE_CODE (type_in) != VECTOR_TYPE
30346 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
30347 return NULL_TREE;
30348
30349 out_mode = TYPE_MODE (TREE_TYPE (type_out));
30350 out_n = TYPE_VECTOR_SUBPARTS (type_out);
30351 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30352 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30353
30354 switch (fn)
30355 {
30356 case BUILT_IN_SQRT:
30357 if (out_mode == DFmode && in_mode == DFmode)
30358 {
30359 if (out_n == 2 && in_n == 2)
30360 return ix86_builtins[IX86_BUILTIN_SQRTPD];
30361 else if (out_n == 4 && in_n == 4)
30362 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
30363 }
30364 break;
30365
30366 case BUILT_IN_SQRTF:
30367 if (out_mode == SFmode && in_mode == SFmode)
30368 {
30369 if (out_n == 4 && in_n == 4)
30370 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
30371 else if (out_n == 8 && in_n == 8)
30372 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
30373 }
30374 break;
30375
30376 case BUILT_IN_IFLOOR:
30377 case BUILT_IN_LFLOOR:
30378 case BUILT_IN_LLFLOOR:
30379 /* The round insn does not trap on denormals. */
30380 if (flag_trapping_math || !TARGET_ROUND)
30381 break;
30382
30383 if (out_mode == SImode && in_mode == DFmode)
30384 {
30385 if (out_n == 4 && in_n == 2)
30386 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
30387 else if (out_n == 8 && in_n == 4)
30388 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
30389 }
30390 break;
30391
30392 case BUILT_IN_IFLOORF:
30393 case BUILT_IN_LFLOORF:
30394 case BUILT_IN_LLFLOORF:
30395 /* The round insn does not trap on denormals. */
30396 if (flag_trapping_math || !TARGET_ROUND)
30397 break;
30398
30399 if (out_mode == SImode && in_mode == SFmode)
30400 {
30401 if (out_n == 4 && in_n == 4)
30402 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
30403 else if (out_n == 8 && in_n == 8)
30404 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
30405 }
30406 break;
30407
30408 case BUILT_IN_ICEIL:
30409 case BUILT_IN_LCEIL:
30410 case BUILT_IN_LLCEIL:
30411 /* The round insn does not trap on denormals. */
30412 if (flag_trapping_math || !TARGET_ROUND)
30413 break;
30414
30415 if (out_mode == SImode && in_mode == DFmode)
30416 {
30417 if (out_n == 4 && in_n == 2)
30418 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
30419 else if (out_n == 8 && in_n == 4)
30420 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
30421 }
30422 break;
30423
30424 case BUILT_IN_ICEILF:
30425 case BUILT_IN_LCEILF:
30426 case BUILT_IN_LLCEILF:
30427 /* The round insn does not trap on denormals. */
30428 if (flag_trapping_math || !TARGET_ROUND)
30429 break;
30430
30431 if (out_mode == SImode && in_mode == SFmode)
30432 {
30433 if (out_n == 4 && in_n == 4)
30434 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
30435 else if (out_n == 8 && in_n == 8)
30436 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
30437 }
30438 break;
30439
30440 case BUILT_IN_IRINT:
30441 case BUILT_IN_LRINT:
30442 case BUILT_IN_LLRINT:
30443 if (out_mode == SImode && in_mode == DFmode)
30444 {
30445 if (out_n == 4 && in_n == 2)
30446 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
30447 else if (out_n == 8 && in_n == 4)
30448 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
30449 }
30450 break;
30451
30452 case BUILT_IN_IRINTF:
30453 case BUILT_IN_LRINTF:
30454 case BUILT_IN_LLRINTF:
30455 if (out_mode == SImode && in_mode == SFmode)
30456 {
30457 if (out_n == 4 && in_n == 4)
30458 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
30459 else if (out_n == 8 && in_n == 8)
30460 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
30461 }
30462 break;
30463
30464 case BUILT_IN_IROUND:
30465 case BUILT_IN_LROUND:
30466 case BUILT_IN_LLROUND:
30467 /* The round insn does not trap on denormals. */
30468 if (flag_trapping_math || !TARGET_ROUND)
30469 break;
30470
30471 if (out_mode == SImode && in_mode == DFmode)
30472 {
30473 if (out_n == 4 && in_n == 2)
30474 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
30475 else if (out_n == 8 && in_n == 4)
30476 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
30477 }
30478 break;
30479
30480 case BUILT_IN_IROUNDF:
30481 case BUILT_IN_LROUNDF:
30482 case BUILT_IN_LLROUNDF:
30483 /* The round insn does not trap on denormals. */
30484 if (flag_trapping_math || !TARGET_ROUND)
30485 break;
30486
30487 if (out_mode == SImode && in_mode == SFmode)
30488 {
30489 if (out_n == 4 && in_n == 4)
30490 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
30491 else if (out_n == 8 && in_n == 8)
30492 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
30493 }
30494 break;
30495
30496 case BUILT_IN_COPYSIGN:
30497 if (out_mode == DFmode && in_mode == DFmode)
30498 {
30499 if (out_n == 2 && in_n == 2)
30500 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
30501 else if (out_n == 4 && in_n == 4)
30502 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
30503 }
30504 break;
30505
30506 case BUILT_IN_COPYSIGNF:
30507 if (out_mode == SFmode && in_mode == SFmode)
30508 {
30509 if (out_n == 4 && in_n == 4)
30510 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
30511 else if (out_n == 8 && in_n == 8)
30512 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
30513 }
30514 break;
30515
30516 case BUILT_IN_FLOOR:
30517 /* The round insn does not trap on denormals. */
30518 if (flag_trapping_math || !TARGET_ROUND)
30519 break;
30520
30521 if (out_mode == DFmode && in_mode == DFmode)
30522 {
30523 if (out_n == 2 && in_n == 2)
30524 return ix86_builtins[IX86_BUILTIN_FLOORPD];
30525 else if (out_n == 4 && in_n == 4)
30526 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
30527 }
30528 break;
30529
30530 case BUILT_IN_FLOORF:
30531 /* The round insn does not trap on denormals. */
30532 if (flag_trapping_math || !TARGET_ROUND)
30533 break;
30534
30535 if (out_mode == SFmode && in_mode == SFmode)
30536 {
30537 if (out_n == 4 && in_n == 4)
30538 return ix86_builtins[IX86_BUILTIN_FLOORPS];
30539 else if (out_n == 8 && in_n == 8)
30540 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
30541 }
30542 break;
30543
30544 case BUILT_IN_CEIL:
30545 /* The round insn does not trap on denormals. */
30546 if (flag_trapping_math || !TARGET_ROUND)
30547 break;
30548
30549 if (out_mode == DFmode && in_mode == DFmode)
30550 {
30551 if (out_n == 2 && in_n == 2)
30552 return ix86_builtins[IX86_BUILTIN_CEILPD];
30553 else if (out_n == 4 && in_n == 4)
30554 return ix86_builtins[IX86_BUILTIN_CEILPD256];
30555 }
30556 break;
30557
30558 case BUILT_IN_CEILF:
30559 /* The round insn does not trap on denormals. */
30560 if (flag_trapping_math || !TARGET_ROUND)
30561 break;
30562
30563 if (out_mode == SFmode && in_mode == SFmode)
30564 {
30565 if (out_n == 4 && in_n == 4)
30566 return ix86_builtins[IX86_BUILTIN_CEILPS];
30567 else if (out_n == 8 && in_n == 8)
30568 return ix86_builtins[IX86_BUILTIN_CEILPS256];
30569 }
30570 break;
30571
30572 case BUILT_IN_TRUNC:
30573 /* The round insn does not trap on denormals. */
30574 if (flag_trapping_math || !TARGET_ROUND)
30575 break;
30576
30577 if (out_mode == DFmode && in_mode == DFmode)
30578 {
30579 if (out_n == 2 && in_n == 2)
30580 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
30581 else if (out_n == 4 && in_n == 4)
30582 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
30583 }
30584 break;
30585
30586 case BUILT_IN_TRUNCF:
30587 /* The round insn does not trap on denormals. */
30588 if (flag_trapping_math || !TARGET_ROUND)
30589 break;
30590
30591 if (out_mode == SFmode && in_mode == SFmode)
30592 {
30593 if (out_n == 4 && in_n == 4)
30594 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
30595 else if (out_n == 8 && in_n == 8)
30596 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
30597 }
30598 break;
30599
30600 case BUILT_IN_RINT:
30601 /* The round insn does not trap on denormals. */
30602 if (flag_trapping_math || !TARGET_ROUND)
30603 break;
30604
30605 if (out_mode == DFmode && in_mode == DFmode)
30606 {
30607 if (out_n == 2 && in_n == 2)
30608 return ix86_builtins[IX86_BUILTIN_RINTPD];
30609 else if (out_n == 4 && in_n == 4)
30610 return ix86_builtins[IX86_BUILTIN_RINTPD256];
30611 }
30612 break;
30613
30614 case BUILT_IN_RINTF:
30615 /* The round insn does not trap on denormals. */
30616 if (flag_trapping_math || !TARGET_ROUND)
30617 break;
30618
30619 if (out_mode == SFmode && in_mode == SFmode)
30620 {
30621 if (out_n == 4 && in_n == 4)
30622 return ix86_builtins[IX86_BUILTIN_RINTPS];
30623 else if (out_n == 8 && in_n == 8)
30624 return ix86_builtins[IX86_BUILTIN_RINTPS256];
30625 }
30626 break;
30627
30628 case BUILT_IN_ROUND:
30629 /* The round insn does not trap on denormals. */
30630 if (flag_trapping_math || !TARGET_ROUND)
30631 break;
30632
30633 if (out_mode == DFmode && in_mode == DFmode)
30634 {
30635 if (out_n == 2 && in_n == 2)
30636 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
30637 else if (out_n == 4 && in_n == 4)
30638 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
30639 }
30640 break;
30641
30642 case BUILT_IN_ROUNDF:
30643 /* The round insn does not trap on denormals. */
30644 if (flag_trapping_math || !TARGET_ROUND)
30645 break;
30646
30647 if (out_mode == SFmode && in_mode == SFmode)
30648 {
30649 if (out_n == 4 && in_n == 4)
30650 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
30651 else if (out_n == 8 && in_n == 8)
30652 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
30653 }
30654 break;
30655
30656 case BUILT_IN_FMA:
30657 if (out_mode == DFmode && in_mode == DFmode)
30658 {
30659 if (out_n == 2 && in_n == 2)
30660 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
30661 if (out_n == 4 && in_n == 4)
30662 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
30663 }
30664 break;
30665
30666 case BUILT_IN_FMAF:
30667 if (out_mode == SFmode && in_mode == SFmode)
30668 {
30669 if (out_n == 4 && in_n == 4)
30670 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
30671 if (out_n == 8 && in_n == 8)
30672 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
30673 }
30674 break;
30675
30676 default:
30677 break;
30678 }
30679
30680 /* Dispatch to a handler for a vectorization library. */
30681 if (ix86_veclib_handler)
30682 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
30683 type_in);
30684
30685 return NULL_TREE;
30686 }
30687
30688 /* Handler for an SVML-style interface to
30689 a library with vectorized intrinsics. */
30690
30691 static tree
30692 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
30693 {
30694 char name[20];
30695 tree fntype, new_fndecl, args;
30696 unsigned arity;
30697 const char *bname;
30698 enum machine_mode el_mode, in_mode;
30699 int n, in_n;
30700
30701 /* The SVML is suitable for unsafe math only. */
30702 if (!flag_unsafe_math_optimizations)
30703 return NULL_TREE;
30704
30705 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30706 n = TYPE_VECTOR_SUBPARTS (type_out);
30707 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30708 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30709 if (el_mode != in_mode
30710 || n != in_n)
30711 return NULL_TREE;
30712
30713 switch (fn)
30714 {
30715 case BUILT_IN_EXP:
30716 case BUILT_IN_LOG:
30717 case BUILT_IN_LOG10:
30718 case BUILT_IN_POW:
30719 case BUILT_IN_TANH:
30720 case BUILT_IN_TAN:
30721 case BUILT_IN_ATAN:
30722 case BUILT_IN_ATAN2:
30723 case BUILT_IN_ATANH:
30724 case BUILT_IN_CBRT:
30725 case BUILT_IN_SINH:
30726 case BUILT_IN_SIN:
30727 case BUILT_IN_ASINH:
30728 case BUILT_IN_ASIN:
30729 case BUILT_IN_COSH:
30730 case BUILT_IN_COS:
30731 case BUILT_IN_ACOSH:
30732 case BUILT_IN_ACOS:
30733 if (el_mode != DFmode || n != 2)
30734 return NULL_TREE;
30735 break;
30736
30737 case BUILT_IN_EXPF:
30738 case BUILT_IN_LOGF:
30739 case BUILT_IN_LOG10F:
30740 case BUILT_IN_POWF:
30741 case BUILT_IN_TANHF:
30742 case BUILT_IN_TANF:
30743 case BUILT_IN_ATANF:
30744 case BUILT_IN_ATAN2F:
30745 case BUILT_IN_ATANHF:
30746 case BUILT_IN_CBRTF:
30747 case BUILT_IN_SINHF:
30748 case BUILT_IN_SINF:
30749 case BUILT_IN_ASINHF:
30750 case BUILT_IN_ASINF:
30751 case BUILT_IN_COSHF:
30752 case BUILT_IN_COSF:
30753 case BUILT_IN_ACOSHF:
30754 case BUILT_IN_ACOSF:
30755 if (el_mode != SFmode || n != 4)
30756 return NULL_TREE;
30757 break;
30758
30759 default:
30760 return NULL_TREE;
30761 }
30762
30763 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30764
30765 if (fn == BUILT_IN_LOGF)
30766 strcpy (name, "vmlsLn4");
30767 else if (fn == BUILT_IN_LOG)
30768 strcpy (name, "vmldLn2");
30769 else if (n == 4)
30770 {
30771 sprintf (name, "vmls%s", bname+10);
30772 name[strlen (name)-1] = '4';
30773 }
30774 else
30775 sprintf (name, "vmld%s2", bname+10);
30776
30777 /* Convert the first letter of the math function name to uppercase. */
30778 name[4] &= ~0x20;
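  /* For example (illustrative): BUILT_IN_SINF ("__builtin_sinf") yields
     "vmlsSin4" and BUILT_IN_SIN yields "vmldSin2"; log is special-cased
     above as "vmlsLn4" / "vmldLn2".  */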
30779
30780 arity = 0;
30781 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30782 args;
30783 args = TREE_CHAIN (args))
30784 arity++;
30785
30786 if (arity == 1)
30787 fntype = build_function_type_list (type_out, type_in, NULL);
30788 else
30789 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30790
30791 /* Build a function declaration for the vectorized function. */
30792 new_fndecl = build_decl (BUILTINS_LOCATION,
30793 FUNCTION_DECL, get_identifier (name), fntype);
30794 TREE_PUBLIC (new_fndecl) = 1;
30795 DECL_EXTERNAL (new_fndecl) = 1;
30796 DECL_IS_NOVOPS (new_fndecl) = 1;
30797 TREE_READONLY (new_fndecl) = 1;
30798
30799 return new_fndecl;
30800 }
30801
30802 /* Handler for an ACML-style interface to
30803 a library with vectorized intrinsics. */
30804
30805 static tree
30806 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
30807 {
30808 char name[20] = "__vr.._";
30809 tree fntype, new_fndecl, args;
30810 unsigned arity;
30811 const char *bname;
30812 enum machine_mode el_mode, in_mode;
30813 int n, in_n;
30814
30815 /* The ACML is 64-bit only and suitable for unsafe math only, as
30816 it does not correctly support parts of IEEE (such as denormals)
30817 with the required precision. */
30818 if (!TARGET_64BIT
30819 || !flag_unsafe_math_optimizations)
30820 return NULL_TREE;
30821
30822 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30823 n = TYPE_VECTOR_SUBPARTS (type_out);
30824 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30825 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30826 if (el_mode != in_mode
30827 || n != in_n)
30828 return NULL_TREE;
30829
30830 switch (fn)
30831 {
30832 case BUILT_IN_SIN:
30833 case BUILT_IN_COS:
30834 case BUILT_IN_EXP:
30835 case BUILT_IN_LOG:
30836 case BUILT_IN_LOG2:
30837 case BUILT_IN_LOG10:
30838 name[4] = 'd';
30839 name[5] = '2';
30840 if (el_mode != DFmode
30841 || n != 2)
30842 return NULL_TREE;
30843 break;
30844
30845 case BUILT_IN_SINF:
30846 case BUILT_IN_COSF:
30847 case BUILT_IN_EXPF:
30848 case BUILT_IN_POWF:
30849 case BUILT_IN_LOGF:
30850 case BUILT_IN_LOG2F:
30851 case BUILT_IN_LOG10F:
30852 name[4] = 's';
30853 name[5] = '4';
30854 if (el_mode != SFmode
30855 || n != 4)
30856 return NULL_TREE;
30857 break;
30858
30859 default:
30860 return NULL_TREE;
30861 }
30862
30863 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30864 sprintf (name + 7, "%s", bname+10);
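  /* For example (illustrative): BUILT_IN_SIN becomes "__vrd2_sin" and
     BUILT_IN_SINF becomes "__vrs4_sinf".  */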
30865
30866 arity = 0;
30867 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30868 args;
30869 args = TREE_CHAIN (args))
30870 arity++;
30871
30872 if (arity == 1)
30873 fntype = build_function_type_list (type_out, type_in, NULL);
30874 else
30875 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30876
30877 /* Build a function declaration for the vectorized function. */
30878 new_fndecl = build_decl (BUILTINS_LOCATION,
30879 FUNCTION_DECL, get_identifier (name), fntype);
30880 TREE_PUBLIC (new_fndecl) = 1;
30881 DECL_EXTERNAL (new_fndecl) = 1;
30882 DECL_IS_NOVOPS (new_fndecl) = 1;
30883 TREE_READONLY (new_fndecl) = 1;
30884
30885 return new_fndecl;
30886 }
30887
30888 /* Returns a decl of a function that implements gather load with
30889 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
30890 Return NULL_TREE if it is not available. */
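/* For example (illustrative): a V4DF gather indexed by a vector with
   SImode elements maps to IX86_BUILTIN_GATHERALTSIV4DF, while a DImode
   index maps to IX86_BUILTIN_GATHERDIV4DF (see the switch below).  */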
30891
30892 static tree
30893 ix86_vectorize_builtin_gather (const_tree mem_vectype,
30894 const_tree index_type, int scale)
30895 {
30896 bool si;
30897 enum ix86_builtins code;
30898
30899 if (! TARGET_AVX2)
30900 return NULL_TREE;
30901
30902 if ((TREE_CODE (index_type) != INTEGER_TYPE
30903 && !POINTER_TYPE_P (index_type))
30904 || (TYPE_MODE (index_type) != SImode
30905 && TYPE_MODE (index_type) != DImode))
30906 return NULL_TREE;
30907
30908 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
30909 return NULL_TREE;
30910
30911 /* v*gather* insn sign extends index to pointer mode. */
30912 if (TYPE_PRECISION (index_type) < POINTER_SIZE
30913 && TYPE_UNSIGNED (index_type))
30914 return NULL_TREE;
30915
30916 if (scale <= 0
30917 || scale > 8
30918 || (scale & (scale - 1)) != 0)
30919 return NULL_TREE;
30920
30921 si = TYPE_MODE (index_type) == SImode;
30922 switch (TYPE_MODE (mem_vectype))
30923 {
30924 case V2DFmode:
30925 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
30926 break;
30927 case V4DFmode:
30928 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
30929 break;
30930 case V2DImode:
30931 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
30932 break;
30933 case V4DImode:
30934 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
30935 break;
30936 case V4SFmode:
30937 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
30938 break;
30939 case V8SFmode:
30940 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
30941 break;
30942 case V4SImode:
30943 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
30944 break;
30945 case V8SImode:
30946 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
30947 break;
30948 default:
30949 return NULL_TREE;
30950 }
30951
30952 return ix86_builtins[code];
30953 }
30954
30955 /* Returns a decl of a target-specific builtin that implements the
30956 reciprocal of the function, or NULL_TREE if not available. */
30957
30958 static tree
30959 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
30960 bool sqrt ATTRIBUTE_UNUSED)
30961 {
30962 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
30963 && flag_finite_math_only && !flag_trapping_math
30964 && flag_unsafe_math_optimizations))
30965 return NULL_TREE;
30966
30967 if (md_fn)
30968 /* Machine dependent builtins. */
30969 switch (fn)
30970 {
30971 /* Vectorized version of sqrt to rsqrt conversion. */
30972 case IX86_BUILTIN_SQRTPS_NR:
30973 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
30974
30975 case IX86_BUILTIN_SQRTPS_NR256:
30976 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
30977
30978 default:
30979 return NULL_TREE;
30980 }
30981 else
30982 /* Normal builtins. */
30983 switch (fn)
30984 {
30985 /* Sqrt to rsqrt conversion. */
30986 case BUILT_IN_SQRTF:
30987 return ix86_builtins[IX86_BUILTIN_RSQRTF];
30988
30989 default:
30990 return NULL_TREE;
30991 }
30992 }
30993 \f
30994 /* Helper for avx_vpermilps256_operand et al. This is also used by
30995 the expansion functions to turn the parallel back into a mask.
30996 The return value is 0 for no match and the imm8+1 for a match. */
30997
30998 int
30999 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
31000 {
31001 unsigned i, nelt = GET_MODE_NUNITS (mode);
31002 unsigned mask = 0;
31003 unsigned char ipar[8];
31004
31005 if (XVECLEN (par, 0) != (int) nelt)
31006 return 0;
31007
31008 /* Validate that all of the elements are constants, and not totally
31009 out of range. Copy the data into an integral array to make the
31010 subsequent checks easier. */
31011 for (i = 0; i < nelt; ++i)
31012 {
31013 rtx er = XVECEXP (par, 0, i);
31014 unsigned HOST_WIDE_INT ei;
31015
31016 if (!CONST_INT_P (er))
31017 return 0;
31018 ei = INTVAL (er);
31019 if (ei >= nelt)
31020 return 0;
31021 ipar[i] = ei;
31022 }
31023
31024 switch (mode)
31025 {
31026 case V4DFmode:
31027 /* In the 256-bit DFmode case, we can only move elements within
31028 a 128-bit lane. */
31029 for (i = 0; i < 2; ++i)
31030 {
31031 if (ipar[i] >= 2)
31032 return 0;
31033 mask |= ipar[i] << i;
31034 }
31035 for (i = 2; i < 4; ++i)
31036 {
31037 if (ipar[i] < 2)
31038 return 0;
31039 mask |= (ipar[i] - 2) << i;
31040 }
31041 break;
31042
31043 case V8SFmode:
31044 /* In the 256-bit SFmode case, we have full freedom of movement
31045 within the low 128-bit lane, but the high 128-bit lane must
31046 mirror the exact same pattern. */
31047 for (i = 0; i < 4; ++i)
31048 if (ipar[i] + 4 != ipar[i + 4])
31049 return 0;
31050 nelt = 4;
31051 /* FALLTHRU */
31052
31053 case V2DFmode:
31054 case V4SFmode:
31055 /* In the 128-bit case, we've full freedom in the placement of
31056 the elements from the source operand. */
31057 for (i = 0; i < nelt; ++i)
31058 mask |= ipar[i] << (i * (nelt / 2));
31059 break;
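      /* For example (illustrative): in V4SFmode the parallel (2 3 0 1)
	 produces mask 0x4e (2 | 3<<2 | 0<<4 | 1<<6), so the function
	 returns 0x4f.  */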
31060
31061 default:
31062 gcc_unreachable ();
31063 }
31064
31065 /* Make sure success has a non-zero value by adding one. */
31066 return mask + 1;
31067 }
31068
31069 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
31070 the expansion functions to turn the parallel back into a mask.
31071 The return value is 0 for no match and the imm8+1 for a match. */
31072
31073 int
31074 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
31075 {
31076 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
31077 unsigned mask = 0;
31078 unsigned char ipar[8];
31079
31080 if (XVECLEN (par, 0) != (int) nelt)
31081 return 0;
31082
31083 /* Validate that all of the elements are constants, and not totally
31084 out of range. Copy the data into an integral array to make the
31085 subsequent checks easier. */
31086 for (i = 0; i < nelt; ++i)
31087 {
31088 rtx er = XVECEXP (par, 0, i);
31089 unsigned HOST_WIDE_INT ei;
31090
31091 if (!CONST_INT_P (er))
31092 return 0;
31093 ei = INTVAL (er);
31094 if (ei >= 2 * nelt)
31095 return 0;
31096 ipar[i] = ei;
31097 }
31098
31099 /* Validate that each half of the permute selects consecutive elements. */
31100 for (i = 0; i < nelt2 - 1; ++i)
31101 if (ipar[i] + 1 != ipar[i + 1])
31102 return 0;
31103 for (i = nelt2; i < nelt - 1; ++i)
31104 if (ipar[i] + 1 != ipar[i + 1])
31105 return 0;
31106
31107 /* Reconstruct the mask. */
31108 for (i = 0; i < 2; ++i)
31109 {
31110 unsigned e = ipar[i * nelt2];
31111 if (e % nelt2)
31112 return 0;
31113 e /= nelt2;
31114 mask |= e << (i * 4);
31115 }
31116
31117 /* Make sure success has a non-zero value by adding one. */
31118 return mask + 1;
31119 }
31120 \f
31121 /* Store OPERAND to memory after reload is completed.  This means
31122    that we can't easily use assign_stack_local.  */
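/* A minimal usage sketch (the surrounding insns are hypothetical; only
   the two ix86_* routines are from this file):

     rtx mem = ix86_force_to_memory (DImode, operand);
     ... emit insns that want OPERAND as a MEM ...
     ix86_free_from_memory (DImode);

   ix86_free_from_memory, defined below, releases the stack space again
   when the red zone was not used.  */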
31123 rtx
31124 ix86_force_to_memory (enum machine_mode mode, rtx operand)
31125 {
31126 rtx result;
31127
31128 gcc_assert (reload_completed);
31129 if (ix86_using_red_zone ())
31130 {
31131 result = gen_rtx_MEM (mode,
31132 gen_rtx_PLUS (Pmode,
31133 stack_pointer_rtx,
31134 GEN_INT (-RED_ZONE_SIZE)));
31135 emit_move_insn (result, operand);
31136 }
31137 else if (TARGET_64BIT)
31138 {
31139 switch (mode)
31140 {
31141 case HImode:
31142 case SImode:
31143 operand = gen_lowpart (DImode, operand);
31144 /* FALLTHRU */
31145 case DImode:
31146 emit_insn (
31147 gen_rtx_SET (VOIDmode,
31148 gen_rtx_MEM (DImode,
31149 gen_rtx_PRE_DEC (DImode,
31150 stack_pointer_rtx)),
31151 operand));
31152 break;
31153 default:
31154 gcc_unreachable ();
31155 }
31156 result = gen_rtx_MEM (mode, stack_pointer_rtx);
31157 }
31158 else
31159 {
31160 switch (mode)
31161 {
31162 case DImode:
31163 {
31164 rtx operands[2];
31165 split_double_mode (mode, &operand, 1, operands, operands + 1);
31166 emit_insn (
31167 gen_rtx_SET (VOIDmode,
31168 gen_rtx_MEM (SImode,
31169 gen_rtx_PRE_DEC (Pmode,
31170 stack_pointer_rtx)),
31171 operands[1]));
31172 emit_insn (
31173 gen_rtx_SET (VOIDmode,
31174 gen_rtx_MEM (SImode,
31175 gen_rtx_PRE_DEC (Pmode,
31176 stack_pointer_rtx)),
31177 operands[0]));
31178 }
31179 break;
31180 case HImode:
31181 /* Store HImodes as SImodes. */
31182 operand = gen_lowpart (SImode, operand);
31183 /* FALLTHRU */
31184 case SImode:
31185 emit_insn (
31186 gen_rtx_SET (VOIDmode,
31187 gen_rtx_MEM (GET_MODE (operand),
31188 gen_rtx_PRE_DEC (SImode,
31189 stack_pointer_rtx)),
31190 operand));
31191 break;
31192 default:
31193 gcc_unreachable ();
31194 }
31195 result = gen_rtx_MEM (mode, stack_pointer_rtx);
31196 }
31197 return result;
31198 }
31199
31200 /* Free the operand from memory; this undoes ix86_force_to_memory.  */
31201 void
31202 ix86_free_from_memory (enum machine_mode mode)
31203 {
31204 if (!ix86_using_red_zone ())
31205 {
31206 int size;
31207
31208 if (mode == DImode || TARGET_64BIT)
31209 size = 8;
31210 else
31211 size = 4;
31212 /* Use LEA to deallocate stack space. In peephole2 it will be converted
31213 	 to a pop or add instruction if registers are available.  */
31214 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
31215 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
31216 GEN_INT (size))));
31217 }
31218 }
31219
31220 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
31221
31222 Put float CONST_DOUBLE in the constant pool instead of fp regs.
31223 QImode must go into class Q_REGS.
31224 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
31225 movdf to do mem-to-mem moves through integer regs. */
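/* For example: loading a nonzero DFmode CONST_DOUBLE with
   REGCLASS == SSE_REGS hits the CONSTANT_P check below and returns
   NO_REGS, so the constant is placed in the constant pool and loaded
   from memory, while CONST0_RTX (DFmode) keeps SSE_REGS because every
   class can materialize zero directly.  */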
31226
31227 static reg_class_t
31228 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
31229 {
31230 enum machine_mode mode = GET_MODE (x);
31231
31232 /* We're only allowed to return a subclass of CLASS. Many of the
31233 following checks fail for NO_REGS, so eliminate that early. */
31234 if (regclass == NO_REGS)
31235 return NO_REGS;
31236
31237 /* All classes can load zeros. */
31238 if (x == CONST0_RTX (mode))
31239 return regclass;
31240
31241 /* Force constants into memory if we are loading a (nonzero) constant into
31242 an MMX or SSE register. This is because there are no MMX/SSE instructions
31243 to load from a constant. */
31244 if (CONSTANT_P (x)
31245 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
31246 return NO_REGS;
31247
31248 /* Prefer SSE regs only, if we can use them for math. */
31249 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
31250 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
31251
31252 /* Floating-point constants need more complex checks. */
31253 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
31254 {
31255 /* General regs can load everything. */
31256 if (reg_class_subset_p (regclass, GENERAL_REGS))
31257 return regclass;
31258
31259 /* Floats can load 0 and 1 plus some others. Note that we eliminated
31260 zero above. We only want to wind up preferring 80387 registers if
31261 we plan on doing computation with them. */
31262 if (TARGET_80387
31263 && standard_80387_constant_p (x) > 0)
31264 {
31265 /* Limit class to non-sse. */
31266 if (regclass == FLOAT_SSE_REGS)
31267 return FLOAT_REGS;
31268 if (regclass == FP_TOP_SSE_REGS)
31269 return FP_TOP_REG;
31270 if (regclass == FP_SECOND_SSE_REGS)
31271 return FP_SECOND_REG;
31272 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
31273 return regclass;
31274 }
31275
31276 return NO_REGS;
31277 }
31278
31279 /* Generally when we see PLUS here, it's the function invariant
31280 (plus soft-fp const_int). Which can only be computed into general
31281 regs. */
31282 if (GET_CODE (x) == PLUS)
31283 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
31284
31285 /* QImode constants are easy to load, but non-constant QImode data
31286 must go into Q_REGS. */
31287 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
31288 {
31289 if (reg_class_subset_p (regclass, Q_REGS))
31290 return regclass;
31291 if (reg_class_subset_p (Q_REGS, regclass))
31292 return Q_REGS;
31293 return NO_REGS;
31294 }
31295
31296 return regclass;
31297 }
31298
31299 /* Discourage putting floating-point values in SSE registers unless
31300 SSE math is being used, and likewise for the 387 registers. */
31301 static reg_class_t
31302 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
31303 {
31304 enum machine_mode mode = GET_MODE (x);
31305
31306   /* Restrict the output reload class to the register bank that we are doing
31307      math on.  If we would need to return a class that is not a subset of
31308      REGCLASS, return NO_REGS to reject this alternative; reload will then
31309      fall back to its own choice.  */
31310 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
31311 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
31312
31313 if (X87_FLOAT_MODE_P (mode))
31314 {
31315 if (regclass == FP_TOP_SSE_REGS)
31316 return FP_TOP_REG;
31317 else if (regclass == FP_SECOND_SSE_REGS)
31318 return FP_SECOND_REG;
31319 else
31320 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
31321 }
31322
31323 return regclass;
31324 }
31325
31326 static reg_class_t
31327 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
31328 enum machine_mode mode, secondary_reload_info *sri)
31329 {
31330 /* Double-word spills from general registers to non-offsettable memory
31331 references (zero-extended addresses) require special handling. */
31332 if (TARGET_64BIT
31333 && MEM_P (x)
31334 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
31335 && rclass == GENERAL_REGS
31336 && !offsettable_memref_p (x))
31337 {
31338 sri->icode = (in_p
31339 ? CODE_FOR_reload_noff_load
31340 : CODE_FOR_reload_noff_store);
31341 /* Add the cost of moving address to a temporary. */
31342 sri->extra_cost = 1;
31343
31344 return NO_REGS;
31345 }
31346
31347 /* QImode spills from non-QI registers require
31348      an intermediate register on 32-bit targets.  */
31349 if (!TARGET_64BIT
31350 && !in_p && mode == QImode
31351 && (rclass == GENERAL_REGS
31352 || rclass == LEGACY_REGS
31353 || rclass == INDEX_REGS))
31354 {
31355 int regno;
31356
31357 if (REG_P (x))
31358 regno = REGNO (x);
31359 else
31360 regno = -1;
31361
31362 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
31363 regno = true_regnum (x);
31364
31365 /* Return Q_REGS if the operand is in memory. */
31366 if (regno == -1)
31367 return Q_REGS;
31368 }
31369
31370 /* This condition handles corner case where an expression involving
31371 pointers gets vectorized. We're trying to use the address of a
31372 stack slot as a vector initializer.
31373
31374 (set (reg:V2DI 74 [ vect_cst_.2 ])
31375 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
31376
31377 Eventually frame gets turned into sp+offset like this:
31378
31379 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31380 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
31381 (const_int 392 [0x188]))))
31382
31383 That later gets turned into:
31384
31385 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31386 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
31387 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
31388
31389 We'll have the following reload recorded:
31390
31391 Reload 0: reload_in (DI) =
31392 (plus:DI (reg/f:DI 7 sp)
31393 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
31394 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31395 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
31396 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
31397 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31398 reload_reg_rtx: (reg:V2DI 22 xmm1)
31399
31400 Which isn't going to work since SSE instructions can't handle scalar
31401 additions. Returning GENERAL_REGS forces the addition into integer
31402 register and reload can handle subsequent reloads without problems. */
31403
31404 if (in_p && GET_CODE (x) == PLUS
31405 && SSE_CLASS_P (rclass)
31406 && SCALAR_INT_MODE_P (mode))
31407 return GENERAL_REGS;
31408
31409 return NO_REGS;
31410 }
31411
31412 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
31413
31414 static bool
31415 ix86_class_likely_spilled_p (reg_class_t rclass)
31416 {
31417 switch (rclass)
31418 {
31419 case AREG:
31420 case DREG:
31421 case CREG:
31422 case BREG:
31423 case AD_REGS:
31424 case SIREG:
31425 case DIREG:
31426 case SSE_FIRST_REG:
31427 case FP_TOP_REG:
31428 case FP_SECOND_REG:
31429 return true;
31430
31431 default:
31432 break;
31433 }
31434
31435 return false;
31436 }
31437
31438 /* If we are copying between general and FP registers, we need a memory
31439 location. The same is true for SSE and MMX registers.
31440
31441    To optimize register_move_cost performance, provide an inline variant.
31442
31443    The macro can't work reliably when one of the CLASSES is a class containing
31444    registers from multiple units (SSE, MMX, integer).  We avoid this by never
31445    combining those units in a single alternative in the machine description.
31446 Ensure that this constraint holds to avoid unexpected surprises.
31447
31448 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
31449 enforce these sanity checks. */
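/* Worked example: moving a DFmode value between SSE_REGS and
   GENERAL_REGS on a 32-bit target needs secondary memory, because even
   with SSE2 and inter-unit moves enabled the 8-byte mode exceeds
   UNITS_PER_WORD; on x86-64 with inter-unit moves enabled the size
   check passes and the function returns false, so a direct movq is
   possible.  */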
31450
31451 static inline bool
31452 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
31453 enum machine_mode mode, int strict)
31454 {
31455 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
31456 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
31457 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
31458 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
31459 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
31460 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
31461 {
31462 gcc_assert (!strict);
31463 return true;
31464 }
31465
31466 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
31467 return true;
31468
31469   /* ??? This is a lie.  We do have moves between mmx/general, and between
31470      mmx/sse2.  But by saying we need secondary memory we discourage the
31471 register allocator from using the mmx registers unless needed. */
31472 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
31473 return true;
31474
31475 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31476 {
31477 /* SSE1 doesn't have any direct moves from other classes. */
31478 if (!TARGET_SSE2)
31479 return true;
31480
31481 /* If the target says that inter-unit moves are more expensive
31482 than moving through memory, then don't generate them. */
31483 if (!TARGET_INTER_UNIT_MOVES)
31484 return true;
31485
31486 /* Between SSE and general, we have moves no larger than word size. */
31487 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
31488 return true;
31489 }
31490
31491 return false;
31492 }
31493
31494 bool
31495 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
31496 enum machine_mode mode, int strict)
31497 {
31498 return inline_secondary_memory_needed (class1, class2, mode, strict);
31499 }
31500
31501 /* Implement the TARGET_CLASS_MAX_NREGS hook.
31502
31503 On the 80386, this is the size of MODE in words,
31504 except in the FP regs, where a single reg is always enough. */
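/* For example: an XFmode value needs 3 general registers on a 32-bit
   target but only 2 on a 64-bit one, while any non-complex mode held in
   x87 or SSE registers needs just a single register regardless of its
   size.  */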
31505
31506 static unsigned char
31507 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
31508 {
31509 if (MAYBE_INTEGER_CLASS_P (rclass))
31510 {
31511 if (mode == XFmode)
31512 return (TARGET_64BIT ? 2 : 3);
31513 else if (mode == XCmode)
31514 return (TARGET_64BIT ? 4 : 6);
31515 else
31516 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
31517 }
31518 else
31519 {
31520 if (COMPLEX_MODE_P (mode))
31521 return 2;
31522 else
31523 return 1;
31524 }
31525 }
31526
31527 /* Return true if the registers in CLASS cannot represent the change from
31528 modes FROM to TO. */
31529
31530 bool
31531 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
31532 enum reg_class regclass)
31533 {
31534 if (from == to)
31535 return false;
31536
31537 /* x87 registers can't do subreg at all, as all values are reformatted
31538 to extended precision. */
31539 if (MAYBE_FLOAT_CLASS_P (regclass))
31540 return true;
31541
31542 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
31543 {
31544 /* Vector registers do not support QI or HImode loads. If we don't
31545 disallow a change to these modes, reload will assume it's ok to
31546 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
31547 the vec_dupv4hi pattern. */
31548 if (GET_MODE_SIZE (from) < 4)
31549 return true;
31550
31551 /* Vector registers do not support subreg with nonzero offsets, which
31552 are otherwise valid for integer registers. Since we can't see
31553 whether we have a nonzero offset from here, prohibit all
31554 nonparadoxical subregs changing size. */
31555 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
31556 return true;
31557 }
31558
31559 return false;
31560 }
31561
31562 /* Return the cost of moving data of mode M between a
31563 register and memory. A value of 2 is the default; this cost is
31564 relative to those in `REGISTER_MOVE_COST'.
31565
31566    This function is used extensively by register_move_cost, which is used to
31567    build tables at startup.  Make it inline in this case.
31568    When IN is 2, return the maximum of the in and out move costs.
31569
31570 If moving between registers and memory is more expensive than
31571 between two registers, you should define this macro to express the
31572 relative cost.
31573
31574    Also model the increased cost of moving QImode registers in non-Q_REGS
31575    classes.
31576 */
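/* For instance: with IN == 1 an SFmode value in FLOAT_REGS costs
   ix86_cost->fp_load[0], with IN == 0 it costs ix86_cost->fp_store[0],
   and with IN == 2 the maximum of the two is returned, which is what
   ix86_register_move_cost uses when it models a store followed by a
   load through memory.  */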
31577 static inline int
31578 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
31579 int in)
31580 {
31581 int cost;
31582 if (FLOAT_CLASS_P (regclass))
31583 {
31584 int index;
31585 switch (mode)
31586 {
31587 case SFmode:
31588 index = 0;
31589 break;
31590 case DFmode:
31591 index = 1;
31592 break;
31593 case XFmode:
31594 index = 2;
31595 break;
31596 default:
31597 return 100;
31598 }
31599 if (in == 2)
31600 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
31601 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
31602 }
31603 if (SSE_CLASS_P (regclass))
31604 {
31605 int index;
31606 switch (GET_MODE_SIZE (mode))
31607 {
31608 case 4:
31609 index = 0;
31610 break;
31611 case 8:
31612 index = 1;
31613 break;
31614 case 16:
31615 index = 2;
31616 break;
31617 default:
31618 return 100;
31619 }
31620 if (in == 2)
31621 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
31622 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
31623 }
31624 if (MMX_CLASS_P (regclass))
31625 {
31626 int index;
31627 switch (GET_MODE_SIZE (mode))
31628 {
31629 case 4:
31630 index = 0;
31631 break;
31632 case 8:
31633 index = 1;
31634 break;
31635 default:
31636 return 100;
31637 }
31638       if (in == 2)
31639 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
31640 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
31641 }
31642 switch (GET_MODE_SIZE (mode))
31643 {
31644 case 1:
31645 if (Q_CLASS_P (regclass) || TARGET_64BIT)
31646 {
31647 if (!in)
31648 return ix86_cost->int_store[0];
31649 if (TARGET_PARTIAL_REG_DEPENDENCY
31650 && optimize_function_for_speed_p (cfun))
31651 cost = ix86_cost->movzbl_load;
31652 else
31653 cost = ix86_cost->int_load[0];
31654 if (in == 2)
31655 return MAX (cost, ix86_cost->int_store[0]);
31656 return cost;
31657 }
31658 else
31659 {
31660 if (in == 2)
31661 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
31662 if (in)
31663 return ix86_cost->movzbl_load;
31664 else
31665 return ix86_cost->int_store[0] + 4;
31666 }
31667 break;
31668 case 2:
31669 if (in == 2)
31670 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
31671 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
31672 default:
31673 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
31674 if (mode == TFmode)
31675 mode = XFmode;
31676 if (in == 2)
31677 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
31678 else if (in)
31679 cost = ix86_cost->int_load[2];
31680 else
31681 cost = ix86_cost->int_store[2];
31682 return (cost * (((int) GET_MODE_SIZE (mode)
31683 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
31684 }
31685 }
31686
31687 static int
31688 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
31689 bool in)
31690 {
31691 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
31692 }
31693
31694
31695 /* Return the cost of moving data from a register in class CLASS1 to
31696 one in class CLASS2.
31697
31698 It is not required that the cost always equal 2 when FROM is the same as TO;
31699 on some machines it is expensive to move between registers if they are not
31700 general registers. */
31701
31702 static int
31703 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
31704 reg_class_t class2_i)
31705 {
31706 enum reg_class class1 = (enum reg_class) class1_i;
31707 enum reg_class class2 = (enum reg_class) class2_i;
31708
31709   /* In case we require secondary memory, compute the cost of the store followed
31710      by the load.  To avoid bad register allocation choices, this needs to be
31711      *at least* as high as the symmetric MEMORY_MOVE_COST.  */
31712
31713 if (inline_secondary_memory_needed (class1, class2, mode, 0))
31714 {
31715 int cost = 1;
31716
31717 cost += inline_memory_move_cost (mode, class1, 2);
31718 cost += inline_memory_move_cost (mode, class2, 2);
31719
31720       /* When copying from a general purpose register we may emit multiple
31721 	 stores followed by a single load, causing a memory size mismatch stall.
31722 	 Count this as an arbitrarily high cost of 20.  */
31723 if (targetm.class_max_nregs (class1, mode)
31724 > targetm.class_max_nregs (class2, mode))
31725 cost += 20;
31726
31727 /* In the case of FP/MMX moves, the registers actually overlap, and we
31728 have to switch modes in order to treat them differently. */
31729 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
31730 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
31731 cost += 20;
31732
31733 return cost;
31734 }
31735
31736 /* Moves between SSE/MMX and integer unit are expensive. */
31737 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
31738 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31739
31740     /* ??? By keeping the returned value relatively high, we limit the number
31741        of moves between integer and MMX/SSE registers for all targets.
31742        Additionally, a high value prevents problems with x86_modes_tieable_p (),
31743        where integer modes in MMX/SSE registers are not tieable
31744        because of missing QImode and HImode moves to, from or between
31745        MMX/SSE registers.  */
31746 return MAX (8, ix86_cost->mmxsse_to_integer);
31747
31748 if (MAYBE_FLOAT_CLASS_P (class1))
31749 return ix86_cost->fp_move;
31750 if (MAYBE_SSE_CLASS_P (class1))
31751 return ix86_cost->sse_move;
31752 if (MAYBE_MMX_CLASS_P (class1))
31753 return ix86_cost->mmx_move;
31754 return 2;
31755 }
31756
31757 /* Return TRUE if hard register REGNO can hold a value of machine-mode
31758 MODE. */
31759
31760 bool
31761 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
31762 {
31763   /* Only the flags register can hold CCmode values, and it can hold nothing else.  */
31764 if (CC_REGNO_P (regno))
31765 return GET_MODE_CLASS (mode) == MODE_CC;
31766 if (GET_MODE_CLASS (mode) == MODE_CC
31767 || GET_MODE_CLASS (mode) == MODE_RANDOM
31768 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
31769 return false;
31770 if (FP_REGNO_P (regno))
31771 return VALID_FP_MODE_P (mode);
31772 if (SSE_REGNO_P (regno))
31773 {
31774 /* We implement the move patterns for all vector modes into and
31775 out of SSE registers, even when no operation instructions
31776 are available. OImode move is available only when AVX is
31777 enabled. */
31778 return ((TARGET_AVX && mode == OImode)
31779 || VALID_AVX256_REG_MODE (mode)
31780 || VALID_SSE_REG_MODE (mode)
31781 || VALID_SSE2_REG_MODE (mode)
31782 || VALID_MMX_REG_MODE (mode)
31783 || VALID_MMX_REG_MODE_3DNOW (mode));
31784 }
31785 if (MMX_REGNO_P (regno))
31786 {
31787 /* We implement the move patterns for 3DNOW modes even in MMX mode,
31788 so if the register is available at all, then we can move data of
31789 the given mode into or out of it. */
31790 return (VALID_MMX_REG_MODE (mode)
31791 || VALID_MMX_REG_MODE_3DNOW (mode));
31792 }
31793
31794 if (mode == QImode)
31795 {
31796       /* Take care with QImode values - they can live in non-QI regs,
31797 	 but then they cause partial register stalls.  */
31798 if (regno <= BX_REG || TARGET_64BIT)
31799 return true;
31800 if (!TARGET_PARTIAL_REG_STALL)
31801 return true;
31802 return !can_create_pseudo_p ();
31803 }
31804 /* We handle both integer and floats in the general purpose registers. */
31805 else if (VALID_INT_MODE_P (mode))
31806 return true;
31807 else if (VALID_FP_MODE_P (mode))
31808 return true;
31809 else if (VALID_DFP_MODE_P (mode))
31810 return true;
31811 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
31812 on to use that value in smaller contexts, this can easily force a
31813 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
31814 supporting DImode, allow it. */
31815 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
31816 return true;
31817
31818 return false;
31819 }
31820
31821 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
31822 tieable integer mode. */
31823
31824 static bool
31825 ix86_tieable_integer_mode_p (enum machine_mode mode)
31826 {
31827 switch (mode)
31828 {
31829 case HImode:
31830 case SImode:
31831 return true;
31832
31833 case QImode:
31834 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
31835
31836 case DImode:
31837 return TARGET_64BIT;
31838
31839 default:
31840 return false;
31841 }
31842 }
31843
31844 /* Return true if MODE1 is accessible in a register that can hold MODE2
31845 without copying. That is, all register classes that can hold MODE2
31846 can also hold MODE1. */
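/* For example: ix86_modes_tieable_p (SFmode, DFmode) is true, since
   every register class that can hold DFmode can also hold SFmode,
   while the reverse is false: a register allocated for an SFmode value
   need not be able to hold the wider DFmode value.  */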
31847
31848 bool
31849 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
31850 {
31851 if (mode1 == mode2)
31852 return true;
31853
31854 if (ix86_tieable_integer_mode_p (mode1)
31855 && ix86_tieable_integer_mode_p (mode2))
31856 return true;
31857
31858 /* MODE2 being XFmode implies fp stack or general regs, which means we
31859 can tie any smaller floating point modes to it. Note that we do not
31860 tie this with TFmode. */
31861 if (mode2 == XFmode)
31862 return mode1 == SFmode || mode1 == DFmode;
31863
31864 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
31865 that we can tie it with SFmode. */
31866 if (mode2 == DFmode)
31867 return mode1 == SFmode;
31868
31869 /* If MODE2 is only appropriate for an SSE register, then tie with
31870 any other mode acceptable to SSE registers. */
31871 if (GET_MODE_SIZE (mode2) == 32
31872 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
31873 return (GET_MODE_SIZE (mode1) == 32
31874 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
31875 if (GET_MODE_SIZE (mode2) == 16
31876 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
31877 return (GET_MODE_SIZE (mode1) == 16
31878 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
31879
31880 /* If MODE2 is appropriate for an MMX register, then tie
31881 with any other mode acceptable to MMX registers. */
31882 if (GET_MODE_SIZE (mode2) == 8
31883 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
31884 return (GET_MODE_SIZE (mode1) == 8
31885 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
31886
31887 return false;
31888 }
31889
31890 /* Return the cost of moving between two registers of mode MODE. */
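/* For example: a DImode register-to-register copy on a 32-bit target is
   split into word-sized pieces and costs COSTS_N_INSNS (2), whereas a
   V4SFmode copy with SSE enabled moves as one 16-byte piece and costs
   COSTS_N_INSNS (1).  */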
31891
31892 static int
31893 ix86_set_reg_reg_cost (enum machine_mode mode)
31894 {
31895 unsigned int units = UNITS_PER_WORD;
31896
31897 switch (GET_MODE_CLASS (mode))
31898 {
31899 default:
31900 break;
31901
31902 case MODE_CC:
31903 units = GET_MODE_SIZE (CCmode);
31904 break;
31905
31906 case MODE_FLOAT:
31907 if ((TARGET_SSE2 && mode == TFmode)
31908 || (TARGET_80387 && mode == XFmode)
31909 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
31910 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
31911 units = GET_MODE_SIZE (mode);
31912 break;
31913
31914 case MODE_COMPLEX_FLOAT:
31915 if ((TARGET_SSE2 && mode == TCmode)
31916 || (TARGET_80387 && mode == XCmode)
31917 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
31918 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
31919 units = GET_MODE_SIZE (mode);
31920 break;
31921
31922 case MODE_VECTOR_INT:
31923 case MODE_VECTOR_FLOAT:
31924 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
31925 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
31926 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
31927 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
31928 units = GET_MODE_SIZE (mode);
31929 }
31930
31931 /* Return the cost of moving between two registers of mode MODE,
31932 assuming that the move will be in pieces of at most UNITS bytes. */
31933 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
31934 }
31935
31936 /* Compute a (partial) cost for rtx X. Return true if the complete
31937 cost has been computed, and false if subexpressions should be
31938 scanned. In either case, *TOTAL contains the cost result. */
31939
31940 static bool
31941 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
31942 bool speed)
31943 {
31944 enum rtx_code code = (enum rtx_code) code_i;
31945 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
31946 enum machine_mode mode = GET_MODE (x);
31947 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
31948
31949 switch (code)
31950 {
31951 case SET:
31952 if (register_operand (SET_DEST (x), VOIDmode)
31953 && reg_or_0_operand (SET_SRC (x), VOIDmode))
31954 {
31955 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
31956 return true;
31957 }
31958 return false;
31959
31960 case CONST_INT:
31961 case CONST:
31962 case LABEL_REF:
31963 case SYMBOL_REF:
31964 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
31965 *total = 3;
31966 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
31967 *total = 2;
31968 else if (flag_pic && SYMBOLIC_CONST (x)
31969 && (!TARGET_64BIT
31970 	       || (GET_CODE (x) != LABEL_REF
31971 && (GET_CODE (x) != SYMBOL_REF
31972 || !SYMBOL_REF_LOCAL_P (x)))))
31973 *total = 1;
31974 else
31975 *total = 0;
31976 return true;
31977
31978 case CONST_DOUBLE:
31979 if (mode == VOIDmode)
31980 *total = 0;
31981 else
31982 switch (standard_80387_constant_p (x))
31983 {
31984 case 1: /* 0.0 */
31985 *total = 1;
31986 break;
31987 default: /* Other constants */
31988 *total = 2;
31989 break;
31990 case 0:
31991 case -1:
31992 break;
31993 }
31994 /* FALLTHRU */
31995
31996 case CONST_VECTOR:
31997 /* Start with (MEM (SYMBOL_REF)), since that's where
31998 it'll probably end up. Add a penalty for size. */
31999 *total = (COSTS_N_INSNS (1)
32000 + (flag_pic != 0 && !TARGET_64BIT)
32001 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
32002 return true;
32003
32004 case ZERO_EXTEND:
32005       /* The zero extension is often completely free on x86_64, so make
32006 it as cheap as possible. */
32007 if (TARGET_64BIT && mode == DImode
32008 && GET_MODE (XEXP (x, 0)) == SImode)
32009 *total = 1;
32010 else if (TARGET_ZERO_EXTEND_WITH_AND)
32011 *total = cost->add;
32012 else
32013 *total = cost->movzx;
32014 return false;
32015
32016 case SIGN_EXTEND:
32017 *total = cost->movsx;
32018 return false;
32019
32020 case ASHIFT:
32021 if (SCALAR_INT_MODE_P (mode)
32022 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
32023 && CONST_INT_P (XEXP (x, 1)))
32024 {
32025 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
32026 if (value == 1)
32027 {
32028 *total = cost->add;
32029 return false;
32030 }
32031 if ((value == 2 || value == 3)
32032 && cost->lea <= cost->shift_const)
32033 {
32034 *total = cost->lea;
32035 return false;
32036 }
32037 }
32038 /* FALLTHRU */
32039
32040 case ROTATE:
32041 case ASHIFTRT:
32042 case LSHIFTRT:
32043 case ROTATERT:
32044 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
32045 {
32046 /* ??? Should be SSE vector operation cost. */
32047 /* At least for published AMD latencies, this really is the same
32048 as the latency for a simple fpu operation like fabs. */
32049 /* V*QImode is emulated with 1-11 insns. */
32050 if (mode == V16QImode || mode == V32QImode)
32051 {
32052 int count;
32053 if (TARGET_XOP && mode == V16QImode)
32054 {
32055 /* For XOP we use vpshab, which requires a broadcast of the
32056 value to the variable shift insn. For constants this
32057 		     means a V16QI constant in memory; even when we can perform the
32058 		     shift with one insn, set the cost to prefer paddb.  */
32059 if (CONSTANT_P (XEXP (x, 1)))
32060 {
32061 *total = (cost->fabs
32062 + rtx_cost (XEXP (x, 0), code, 0, speed)
32063 + (speed ? 2 : COSTS_N_BYTES (16)));
32064 return true;
32065 }
32066 count = 3;
32067 }
32068 else
32069 count = TARGET_SSSE3 ? 7 : 11;
32070 *total = cost->fabs * count;
32071 }
32072 else
32073 *total = cost->fabs;
32074 return false;
32075 }
32076 if (GET_MODE_SIZE (mode) < UNITS_PER_WORD)
32077 {
32078 if (CONST_INT_P (XEXP (x, 1)))
32079 {
32080 if (INTVAL (XEXP (x, 1)) > 32)
32081 *total = cost->shift_const + COSTS_N_INSNS (2);
32082 else
32083 *total = cost->shift_const * 2;
32084 }
32085 else
32086 {
32087 if (GET_CODE (XEXP (x, 1)) == AND)
32088 *total = cost->shift_var * 2;
32089 else
32090 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
32091 }
32092 }
32093 else
32094 {
32095 if (CONST_INT_P (XEXP (x, 1)))
32096 *total = cost->shift_const;
32097 else
32098 *total = cost->shift_var;
32099 }
32100 return false;
32101
32102 case FMA:
32103 {
32104 rtx sub;
32105
32106 gcc_assert (FLOAT_MODE_P (mode));
32107 gcc_assert (TARGET_FMA || TARGET_FMA4);
32108
32109 /* ??? SSE scalar/vector cost should be used here. */
32110 /* ??? Bald assumption that fma has the same cost as fmul. */
32111 *total = cost->fmul;
32112 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
32113
32114 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
32115 sub = XEXP (x, 0);
32116 if (GET_CODE (sub) == NEG)
32117 sub = XEXP (sub, 0);
32118 *total += rtx_cost (sub, FMA, 0, speed);
32119
32120 sub = XEXP (x, 2);
32121 if (GET_CODE (sub) == NEG)
32122 sub = XEXP (sub, 0);
32123 *total += rtx_cost (sub, FMA, 2, speed);
32124 return true;
32125 }
32126
32127 case MULT:
32128 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32129 {
32130 /* ??? SSE scalar cost should be used here. */
32131 *total = cost->fmul;
32132 return false;
32133 }
32134 else if (X87_FLOAT_MODE_P (mode))
32135 {
32136 *total = cost->fmul;
32137 return false;
32138 }
32139 else if (FLOAT_MODE_P (mode))
32140 {
32141 /* ??? SSE vector cost should be used here. */
32142 *total = cost->fmul;
32143 return false;
32144 }
32145 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
32146 {
32147 /* V*QImode is emulated with 7-13 insns. */
32148 if (mode == V16QImode || mode == V32QImode)
32149 {
32150 int extra = TARGET_XOP ? 5 : TARGET_SSSE3 ? 6 : 11;
32151 *total = cost->fmul * 2 + cost->fabs * extra;
32152 }
32153 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
32154 insns, including two PMULUDQ. */
32155 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
32156 *total = cost->fmul * 2 + cost->fabs * 5;
32157 else
32158 *total = cost->fmul;
32159 return false;
32160 }
32161 else
32162 {
32163 rtx op0 = XEXP (x, 0);
32164 rtx op1 = XEXP (x, 1);
32165 int nbits;
32166 if (CONST_INT_P (XEXP (x, 1)))
32167 {
32168 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
32169 for (nbits = 0; value != 0; value &= value - 1)
32170 nbits++;
32171 }
32172 else
32173 /* This is arbitrary. */
32174 nbits = 7;
32175
32176 /* Compute costs correctly for widening multiplication. */
32177 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
32178 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
32179 == GET_MODE_SIZE (mode))
32180 {
32181 int is_mulwiden = 0;
32182 enum machine_mode inner_mode = GET_MODE (op0);
32183
32184 if (GET_CODE (op0) == GET_CODE (op1))
32185 is_mulwiden = 1, op1 = XEXP (op1, 0);
32186 else if (CONST_INT_P (op1))
32187 {
32188 if (GET_CODE (op0) == SIGN_EXTEND)
32189 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
32190 == INTVAL (op1);
32191 else
32192 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
32193 }
32194
32195 if (is_mulwiden)
32196 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
32197 }
32198
32199 *total = (cost->mult_init[MODE_INDEX (mode)]
32200 + nbits * cost->mult_bit
32201 + rtx_cost (op0, outer_code, opno, speed)
32202 + rtx_cost (op1, outer_code, opno, speed));
32203
32204 return true;
32205 }
32206
32207 case DIV:
32208 case UDIV:
32209 case MOD:
32210 case UMOD:
32211 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32212 /* ??? SSE cost should be used here. */
32213 *total = cost->fdiv;
32214 else if (X87_FLOAT_MODE_P (mode))
32215 *total = cost->fdiv;
32216 else if (FLOAT_MODE_P (mode))
32217 /* ??? SSE vector cost should be used here. */
32218 *total = cost->fdiv;
32219 else
32220 *total = cost->divide[MODE_INDEX (mode)];
32221 return false;
32222
32223 case PLUS:
32224 if (GET_MODE_CLASS (mode) == MODE_INT
32225 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
32226 {
32227 if (GET_CODE (XEXP (x, 0)) == PLUS
32228 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
32229 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
32230 && CONSTANT_P (XEXP (x, 1)))
32231 {
32232 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
32233 if (val == 2 || val == 4 || val == 8)
32234 {
32235 *total = cost->lea;
32236 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
32237 outer_code, opno, speed);
32238 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
32239 outer_code, opno, speed);
32240 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
32241 return true;
32242 }
32243 }
32244 else if (GET_CODE (XEXP (x, 0)) == MULT
32245 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
32246 {
32247 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
32248 if (val == 2 || val == 4 || val == 8)
32249 {
32250 *total = cost->lea;
32251 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
32252 outer_code, opno, speed);
32253 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
32254 return true;
32255 }
32256 }
32257 else if (GET_CODE (XEXP (x, 0)) == PLUS)
32258 {
32259 *total = cost->lea;
32260 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
32261 outer_code, opno, speed);
32262 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
32263 outer_code, opno, speed);
32264 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
32265 return true;
32266 }
32267 }
32268 /* FALLTHRU */
32269
32270 case MINUS:
32271 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32272 {
32273 /* ??? SSE cost should be used here. */
32274 *total = cost->fadd;
32275 return false;
32276 }
32277 else if (X87_FLOAT_MODE_P (mode))
32278 {
32279 *total = cost->fadd;
32280 return false;
32281 }
32282 else if (FLOAT_MODE_P (mode))
32283 {
32284 /* ??? SSE vector cost should be used here. */
32285 *total = cost->fadd;
32286 return false;
32287 }
32288 /* FALLTHRU */
32289
32290 case AND:
32291 case IOR:
32292 case XOR:
32293 if (!TARGET_64BIT && mode == DImode)
32294 {
32295 *total = (cost->add * 2
32296 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
32297 << (GET_MODE (XEXP (x, 0)) != DImode))
32298 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
32299 << (GET_MODE (XEXP (x, 1)) != DImode)));
32300 return true;
32301 }
32302 /* FALLTHRU */
32303
32304 case NEG:
32305 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32306 {
32307 /* ??? SSE cost should be used here. */
32308 *total = cost->fchs;
32309 return false;
32310 }
32311 else if (X87_FLOAT_MODE_P (mode))
32312 {
32313 *total = cost->fchs;
32314 return false;
32315 }
32316 else if (FLOAT_MODE_P (mode))
32317 {
32318 /* ??? SSE vector cost should be used here. */
32319 *total = cost->fchs;
32320 return false;
32321 }
32322 /* FALLTHRU */
32323
32324 case NOT:
32325 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
32326 {
32327 /* ??? Should be SSE vector operation cost. */
32328 /* At least for published AMD latencies, this really is the same
32329 as the latency for a simple fpu operation like fabs. */
32330 *total = cost->fabs;
32331 return false;
32332 }
32333 if (!TARGET_64BIT && mode == DImode)
32334 *total = cost->add * 2;
32335 else
32336 *total = cost->add;
32337 return false;
32338
32339 case COMPARE:
32340 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
32341 && XEXP (XEXP (x, 0), 1) == const1_rtx
32342 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
32343 && XEXP (x, 1) == const0_rtx)
32344 {
32345 /* This kind of construct is implemented using test[bwl].
32346 Treat it as if we had an AND. */
32347 *total = (cost->add
32348 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
32349 + rtx_cost (const1_rtx, outer_code, opno, speed));
32350 return true;
32351 }
32352 return false;
32353
32354 case FLOAT_EXTEND:
32355 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
32356 *total = 0;
32357 return false;
32358
32359 case ABS:
32360 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32361 /* ??? SSE cost should be used here. */
32362 *total = cost->fabs;
32363 else if (X87_FLOAT_MODE_P (mode))
32364 *total = cost->fabs;
32365 else if (FLOAT_MODE_P (mode))
32366 /* ??? SSE vector cost should be used here. */
32367 *total = cost->fabs;
32368 return false;
32369
32370 case SQRT:
32371 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32372 /* ??? SSE cost should be used here. */
32373 *total = cost->fsqrt;
32374 else if (X87_FLOAT_MODE_P (mode))
32375 *total = cost->fsqrt;
32376 else if (FLOAT_MODE_P (mode))
32377 /* ??? SSE vector cost should be used here. */
32378 *total = cost->fsqrt;
32379 return false;
32380
32381 case UNSPEC:
32382 if (XINT (x, 1) == UNSPEC_TP)
32383 *total = 0;
32384 return false;
32385
32386 case VEC_SELECT:
32387 case VEC_CONCAT:
32388 case VEC_MERGE:
32389 case VEC_DUPLICATE:
32390       /* ??? Assume all of these vector manipulation patterns are
32391 	 recognizable, in which case they all pretty much have the
32392 	 same cost.  */
32393 *total = cost->fabs;
32394 return true;
32395
32396 default:
32397 return false;
32398 }
32399 }
32400
32401 #if TARGET_MACHO
32402
32403 static int current_machopic_label_num;
32404
32405 /* Given a symbol name and its associated stub, write out the
32406 definition of the stub. */
32407
32408 void
32409 machopic_output_stub (FILE *file, const char *symb, const char *stub)
32410 {
32411 unsigned int length;
32412 char *binder_name, *symbol_name, lazy_ptr_name[32];
32413 int label = ++current_machopic_label_num;
32414
32415 /* For 64-bit we shouldn't get here. */
32416 gcc_assert (!TARGET_64BIT);
32417
32418 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
32419 symb = targetm.strip_name_encoding (symb);
32420
32421 length = strlen (stub);
32422 binder_name = XALLOCAVEC (char, length + 32);
32423 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
32424
32425 length = strlen (symb);
32426 symbol_name = XALLOCAVEC (char, length + 32);
32427 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
32428
32429 sprintf (lazy_ptr_name, "L%d$lz", label);
32430
32431 if (MACHOPIC_ATT_STUB)
32432 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
32433 else if (MACHOPIC_PURE)
32434 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
32435 else
32436 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
32437
32438 fprintf (file, "%s:\n", stub);
32439 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
32440
32441 if (MACHOPIC_ATT_STUB)
32442 {
32443 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
32444 }
32445 else if (MACHOPIC_PURE)
32446 {
32447 /* PIC stub. */
32448 /* 25-byte PIC stub using "CALL get_pc_thunk". */
32449 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
32450 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
32451 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
32452 label, lazy_ptr_name, label);
32453 fprintf (file, "\tjmp\t*%%ecx\n");
32454 }
32455 else
32456 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
32457
32458 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
32459 it needs no stub-binding-helper. */
32460 if (MACHOPIC_ATT_STUB)
32461 return;
32462
32463 fprintf (file, "%s:\n", binder_name);
32464
32465 if (MACHOPIC_PURE)
32466 {
32467 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
32468 fprintf (file, "\tpushl\t%%ecx\n");
32469 }
32470 else
32471 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
32472
32473 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
32474
32475 /* N.B. Keep the correspondence of these
32476 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
32477 old-pic/new-pic/non-pic stubs; altering this will break
32478 compatibility with existing dylibs. */
32479 if (MACHOPIC_PURE)
32480 {
32481 /* 25-byte PIC stub using "CALL get_pc_thunk". */
32482 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
32483 }
32484 else
32485 /* 16-byte -mdynamic-no-pic stub. */
32486 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
32487
32488 fprintf (file, "%s:\n", lazy_ptr_name);
32489 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
32490 fprintf (file, ASM_LONG "%s\n", binder_name);
32491 }
32492 #endif /* TARGET_MACHO */
32493
32494 /* Order the registers for register allocator. */
32495
32496 void
32497 x86_order_regs_for_local_alloc (void)
32498 {
32499 int pos = 0;
32500 int i;
32501
32502 /* First allocate the local general purpose registers. */
32503 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
32504 if (GENERAL_REGNO_P (i) && call_used_regs[i])
32505 reg_alloc_order [pos++] = i;
32506
32507 /* Global general purpose registers. */
32508 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
32509 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
32510 reg_alloc_order [pos++] = i;
32511
32512 /* x87 registers come first in case we are doing FP math
32513 using them. */
32514 if (!TARGET_SSE_MATH)
32515 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
32516 reg_alloc_order [pos++] = i;
32517
32518 /* SSE registers. */
32519 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
32520 reg_alloc_order [pos++] = i;
32521 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
32522 reg_alloc_order [pos++] = i;
32523
32524 /* x87 registers. */
32525 if (TARGET_SSE_MATH)
32526 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
32527 reg_alloc_order [pos++] = i;
32528
32529 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
32530 reg_alloc_order [pos++] = i;
32531
32532   /* Initialize the rest of the array, as we do not allocate some registers
32533      at all.  */
32534 while (pos < FIRST_PSEUDO_REGISTER)
32535 reg_alloc_order [pos++] = 0;
32536 }
32537
32538 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
32539    in struct attribute_spec.handler.  */
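/* An illustrative declaration this handler validates (the function and
   struct names below are hypothetical):

     struct S f (int) __attribute__ ((callee_pop_aggregate_return (1)));

   The argument must be the integer constant 0 or 1, and the attribute
   is refused on 64-bit targets, as checked below.  */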
32540 static tree
32541 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
32542 tree args,
32543 int flags ATTRIBUTE_UNUSED,
32544 bool *no_add_attrs)
32545 {
32546 if (TREE_CODE (*node) != FUNCTION_TYPE
32547 && TREE_CODE (*node) != METHOD_TYPE
32548 && TREE_CODE (*node) != FIELD_DECL
32549 && TREE_CODE (*node) != TYPE_DECL)
32550 {
32551 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32552 name);
32553 *no_add_attrs = true;
32554 return NULL_TREE;
32555 }
32556 if (TARGET_64BIT)
32557 {
32558 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
32559 name);
32560 *no_add_attrs = true;
32561 return NULL_TREE;
32562 }
32563 if (is_attribute_p ("callee_pop_aggregate_return", name))
32564 {
32565 tree cst;
32566
32567 cst = TREE_VALUE (args);
32568 if (TREE_CODE (cst) != INTEGER_CST)
32569 {
32570 warning (OPT_Wattributes,
32571 "%qE attribute requires an integer constant argument",
32572 name);
32573 *no_add_attrs = true;
32574 }
32575 else if (compare_tree_int (cst, 0) != 0
32576 && compare_tree_int (cst, 1) != 0)
32577 {
32578 warning (OPT_Wattributes,
32579 "argument to %qE attribute is neither zero, nor one",
32580 name);
32581 *no_add_attrs = true;
32582 }
32583
32584 return NULL_TREE;
32585 }
32586
32587 return NULL_TREE;
32588 }
32589
32590 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
32591 struct attribute_spec.handler. */
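/* For example (the function name is hypothetical):

     int f (int) __attribute__ ((ms_abi));

   marks f as using the Microsoft calling convention; combining ms_abi
   with sysv_abi on the same type is diagnosed below.  */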
32592 static tree
32593 ix86_handle_abi_attribute (tree *node, tree name,
32594 tree args ATTRIBUTE_UNUSED,
32595 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32596 {
32597 if (TREE_CODE (*node) != FUNCTION_TYPE
32598 && TREE_CODE (*node) != METHOD_TYPE
32599 && TREE_CODE (*node) != FIELD_DECL
32600 && TREE_CODE (*node) != TYPE_DECL)
32601 {
32602 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32603 name);
32604 *no_add_attrs = true;
32605 return NULL_TREE;
32606 }
32607
32608   /* ms_abi and sysv_abi are mutually exclusive; diagnose attempts to
32609      combine them on the same type.  */
32609 if (is_attribute_p ("ms_abi", name))
32610 {
32611 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
32612 {
32613 error ("ms_abi and sysv_abi attributes are not compatible");
32614 }
32615
32616 return NULL_TREE;
32617 }
32618 else if (is_attribute_p ("sysv_abi", name))
32619 {
32620 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
32621 {
32622 error ("ms_abi and sysv_abi attributes are not compatible");
32623 }
32624
32625 return NULL_TREE;
32626 }
32627
32628 return NULL_TREE;
32629 }
32630
32631 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
32632 struct attribute_spec.handler. */
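/* For example (the type below is hypothetical):

     struct S { char c; double d; } __attribute__ ((ms_struct));

   requests MSVC-compatible record layout for S; pairing ms_struct with
   gcc_struct on the same type is diagnosed below.  */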
32633 static tree
32634 ix86_handle_struct_attribute (tree *node, tree name,
32635 tree args ATTRIBUTE_UNUSED,
32636 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32637 {
32638 tree *type = NULL;
32639 if (DECL_P (*node))
32640 {
32641 if (TREE_CODE (*node) == TYPE_DECL)
32642 type = &TREE_TYPE (*node);
32643 }
32644 else
32645 type = node;
32646
32647 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
32648 {
32649 warning (OPT_Wattributes, "%qE attribute ignored",
32650 name);
32651 *no_add_attrs = true;
32652 }
32653
32654 else if ((is_attribute_p ("ms_struct", name)
32655 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
32656 || ((is_attribute_p ("gcc_struct", name)
32657 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
32658 {
32659 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
32660 name);
32661 *no_add_attrs = true;
32662 }
32663
32664 return NULL_TREE;
32665 }
32666
32667 static tree
32668 ix86_handle_fndecl_attribute (tree *node, tree name,
32669 tree args ATTRIBUTE_UNUSED,
32670 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32671 {
32672 if (TREE_CODE (*node) != FUNCTION_DECL)
32673 {
32674 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32675 name);
32676 *no_add_attrs = true;
32677 }
32678 return NULL_TREE;
32679 }
32680
32681 static bool
32682 ix86_ms_bitfield_layout_p (const_tree record_type)
32683 {
32684 return ((TARGET_MS_BITFIELD_LAYOUT
32685 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
32686 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
32687 }
32688
32689 /* Returns an expression indicating where the this parameter is
32690 located on entry to the FUNCTION. */
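/* For example: for a 32-bit fastcall method the this pointer arrives in
   %ecx, but if the method returns an aggregate in memory the hidden
   return-pointer argument comes first and this moves to %edx; for
   64-bit SysV code it is simply the first integer parameter register,
   %rdi (or %rsi with a hidden aggregate return).  */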
32691
32692 static rtx
32693 x86_this_parameter (tree function)
32694 {
32695 tree type = TREE_TYPE (function);
32696 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
32697 int nregs;
32698
32699 if (TARGET_64BIT)
32700 {
32701 const int *parm_regs;
32702
32703 if (ix86_function_type_abi (type) == MS_ABI)
32704 parm_regs = x86_64_ms_abi_int_parameter_registers;
32705 else
32706 parm_regs = x86_64_int_parameter_registers;
32707 return gen_rtx_REG (Pmode, parm_regs[aggr]);
32708 }
32709
32710 nregs = ix86_function_regparm (type, function);
32711
32712 if (nregs > 0 && !stdarg_p (type))
32713 {
32714 int regno;
32715 unsigned int ccvt = ix86_get_callcvt (type);
32716
32717 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
32718 regno = aggr ? DX_REG : CX_REG;
32719 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
32720 {
32721 regno = CX_REG;
32722 if (aggr)
32723 return gen_rtx_MEM (SImode,
32724 plus_constant (Pmode, stack_pointer_rtx, 4));
32725 }
32726 else
32727 {
32728 regno = AX_REG;
32729 if (aggr)
32730 {
32731 regno = DX_REG;
32732 if (nregs == 1)
32733 return gen_rtx_MEM (SImode,
32734 plus_constant (Pmode,
32735 stack_pointer_rtx, 4));
32736 }
32737 }
32738 return gen_rtx_REG (SImode, regno);
32739 }
32740
32741 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
32742 aggr ? 8 : 4));
32743 }
32744
32745 /* Determine whether x86_output_mi_thunk can succeed. */
32746
32747 static bool
32748 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
32749 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
32750 HOST_WIDE_INT vcall_offset, const_tree function)
32751 {
32752 /* 64-bit can handle anything. */
32753 if (TARGET_64BIT)
32754 return true;
32755
32756 /* For 32-bit, everything's fine if we have one free register. */
32757 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
32758 return true;
32759
32760 /* Need a free register for vcall_offset. */
32761 if (vcall_offset)
32762 return false;
32763
32764 /* Need a free register for GOT references. */
32765 if (flag_pic && !targetm.binds_local_p (function))
32766 return false;
32767
32768 /* Otherwise ok. */
32769 return true;
32770 }
32771
32772 /* Output the assembler code for a thunk function. THUNK_DECL is the
32773 declaration for the thunk function itself, FUNCTION is the decl for
32774 the target function. DELTA is an immediate constant offset to be
32775 added to THIS. If VCALL_OFFSET is nonzero, the word at
32776 *(*this + vcall_offset) should be added to THIS. */
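/* As a rough sketch, assuming the 64-bit SysV ABI and a locally bound
   target: a thunk with DELTA == 16 and no VCALL_OFFSET comes out as
   something like

       addq	$16, %rdi
       jmp	target_function

   i.e. adjust the this pointer in its parameter register and tail-jump
   to the real method.  */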
32777
32778 static void
32779 x86_output_mi_thunk (FILE *file,
32780 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
32781 HOST_WIDE_INT vcall_offset, tree function)
32782 {
32783 rtx this_param = x86_this_parameter (function);
32784 rtx this_reg, tmp, fnaddr;
32785
32786 emit_note (NOTE_INSN_PROLOGUE_END);
32787
32788 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
32789 pull it in now and let DELTA benefit. */
32790 if (REG_P (this_param))
32791 this_reg = this_param;
32792 else if (vcall_offset)
32793 {
32794 /* Put the this parameter into %eax. */
32795 this_reg = gen_rtx_REG (Pmode, AX_REG);
32796 emit_move_insn (this_reg, this_param);
32797 }
32798 else
32799 this_reg = NULL_RTX;
32800
32801 /* Adjust the this parameter by a fixed constant. */
32802 if (delta)
32803 {
32804 rtx delta_rtx = GEN_INT (delta);
32805 rtx delta_dst = this_reg ? this_reg : this_param;
32806
32807 if (TARGET_64BIT)
32808 {
32809 if (!x86_64_general_operand (delta_rtx, Pmode))
32810 {
32811 tmp = gen_rtx_REG (Pmode, R10_REG);
32812 emit_move_insn (tmp, delta_rtx);
32813 delta_rtx = tmp;
32814 }
32815 }
32816
32817 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
32818 }
32819
32820 /* Adjust the this parameter by a value stored in the vtable. */
32821 if (vcall_offset)
32822 {
32823 rtx vcall_addr, vcall_mem, this_mem;
32824 unsigned int tmp_regno;
32825
32826 if (TARGET_64BIT)
32827 tmp_regno = R10_REG;
32828 else
32829 {
32830 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
32831 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
32832 tmp_regno = AX_REG;
32833 else
32834 tmp_regno = CX_REG;
32835 }
32836 tmp = gen_rtx_REG (Pmode, tmp_regno);
32837
32838 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
32839 if (Pmode != ptr_mode)
32840 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
32841 emit_move_insn (tmp, this_mem);
32842
32843 /* Adjust the this parameter. */
32844 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
32845 if (TARGET_64BIT
32846 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
32847 {
32848 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
32849 emit_move_insn (tmp2, GEN_INT (vcall_offset));
32850 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
32851 }
32852
32853 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
32854 if (Pmode != ptr_mode)
32855 emit_insn (gen_addsi_1_zext (this_reg,
32856 gen_rtx_REG (ptr_mode,
32857 REGNO (this_reg)),
32858 vcall_mem));
32859 else
32860 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
32861 }
32862
32863 /* If necessary, drop THIS back to its stack slot. */
32864 if (this_reg && this_reg != this_param)
32865 emit_move_insn (this_param, this_reg);
32866
32867 fnaddr = XEXP (DECL_RTL (function), 0);
32868 if (TARGET_64BIT)
32869 {
32870 if (!flag_pic || targetm.binds_local_p (function)
32871 || cfun->machine->call_abi == MS_ABI)
32872 ;
32873 else
32874 {
32875 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
32876 tmp = gen_rtx_CONST (Pmode, tmp);
32877 fnaddr = gen_rtx_MEM (Pmode, tmp);
32878 }
32879 }
32880 else
32881 {
32882 if (!flag_pic || targetm.binds_local_p (function))
32883 ;
32884 #if TARGET_MACHO
32885 else if (TARGET_MACHO)
32886 {
32887 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
32888 fnaddr = XEXP (fnaddr, 0);
32889 }
32890 #endif /* TARGET_MACHO */
32891 else
32892 {
32893 tmp = gen_rtx_REG (Pmode, CX_REG);
32894 output_set_got (tmp, NULL_RTX);
32895
32896 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
32897 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
32898 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
32899 }
32900 }
32901
32902 /* Our sibling call patterns do not allow memories, because we have no
32903 predicate that can distinguish between frame and non-frame memory.
32904 For our purposes here, we can get away with (ab)using a jump pattern,
32905 because we're going to do no optimization. */
32906 if (MEM_P (fnaddr))
32907 emit_jump_insn (gen_indirect_jump (fnaddr));
32908 else
32909 {
32910 tmp = gen_rtx_MEM (QImode, fnaddr);
32911 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
32912 tmp = emit_call_insn (tmp);
32913 SIBLING_CALL_P (tmp) = 1;
32914 }
32915 emit_barrier ();
32916
32917 /* Emit just enough of rest_of_compilation to get the insns emitted.
32918 Note that use_thunk calls assemble_start_function et al. */
32919 tmp = get_insns ();
32920 insn_locators_alloc ();
32921 shorten_branches (tmp);
32922 final_start_function (tmp, file, 1);
32923 final (tmp, file, 1);
32924 final_end_function ();
32925 }
32926
32927 static void
32928 x86_file_start (void)
32929 {
32930 default_file_start ();
32931 #if TARGET_MACHO
32932 darwin_file_start ();
32933 #endif
32934 if (X86_FILE_START_VERSION_DIRECTIVE)
32935 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
32936 if (X86_FILE_START_FLTUSED)
32937 fputs ("\t.global\t__fltused\n", asm_out_file);
32938 if (ix86_asm_dialect == ASM_INTEL)
32939 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
32940 }
32941
32942 int
32943 x86_field_alignment (tree field, int computed)
32944 {
32945 enum machine_mode mode;
32946 tree type = TREE_TYPE (field);
32947
32948 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
32949 return computed;
32950 mode = TYPE_MODE (strip_array_types (type));
32951 if (mode == DFmode || mode == DCmode
32952 || GET_MODE_CLASS (mode) == MODE_INT
32953 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
32954 return MIN (32, computed);
32955 return computed;
32956 }
32957
32958 /* Output assembler code to FILE to increment profiler label # LABELNO
32959 for profiling a function entry. */
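/* For illustration only (names depend on LPREFIX and MCOUNT_NAME for the
   target): on a typical non-PIC 64-bit configuration with profile counters
   enabled, the code below emits something along the lines of

	leaq	.LP0(%rip),%r11
	call	mcount

   while the 32-bit PIC path loads the counter address @GOTOFF and calls
   mcount through the GOT instead.  */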
32960 void
32961 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
32962 {
32963 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
32964 : MCOUNT_NAME);
32965
32966 if (TARGET_64BIT)
32967 {
32968 #ifndef NO_PROFILE_COUNTERS
32969 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
32970 #endif
32971
32972 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
32973 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
32974 else
32975 fprintf (file, "\tcall\t%s\n", mcount_name);
32976 }
32977 else if (flag_pic)
32978 {
32979 #ifndef NO_PROFILE_COUNTERS
32980 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
32981 LPREFIX, labelno);
32982 #endif
32983 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
32984 }
32985 else
32986 {
32987 #ifndef NO_PROFILE_COUNTERS
32988 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
32989 LPREFIX, labelno);
32990 #endif
32991 fprintf (file, "\tcall\t%s\n", mcount_name);
32992 }
32993 }
32994
32995 /* We don't have exact information about the insn sizes, but we may assume
32996 quite safely that we are informed about all 1 byte insns and memory
32997 address sizes. This is enough to eliminate unnecessary padding in
32998 99% of cases. */
32999
33000 static int
33001 min_insn_size (rtx insn)
33002 {
33003 int l = 0, len;
33004
33005 if (!INSN_P (insn) || !active_insn_p (insn))
33006 return 0;
33007
33008   /* Discard alignments we have emitted, and jump table data.  */
33009 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
33010 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
33011 return 0;
33012 if (JUMP_TABLE_DATA_P (insn))
33013 return 0;
33014
33015   /* Important case - calls are always 5 bytes.
33016      It is common to have many calls in a row.  */
33017 if (CALL_P (insn)
33018 && symbolic_reference_mentioned_p (PATTERN (insn))
33019 && !SIBLING_CALL_P (insn))
33020 return 5;
33021 len = get_attr_length (insn);
33022 if (len <= 1)
33023 return 1;
33024
33025 /* For normal instructions we rely on get_attr_length being exact,
33026 with a few exceptions. */
33027 if (!JUMP_P (insn))
33028 {
33029 enum attr_type type = get_attr_type (insn);
33030
33031 switch (type)
33032 {
33033 case TYPE_MULTI:
33034 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
33035 || asm_noperands (PATTERN (insn)) >= 0)
33036 return 0;
33037 break;
33038 case TYPE_OTHER:
33039 case TYPE_FCMP:
33040 break;
33041 default:
33042 /* Otherwise trust get_attr_length. */
33043 return len;
33044 }
33045
33046 l = get_attr_length_address (insn);
33047 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
33048 l = 4;
33049 }
33050 if (l)
33051 return 1+l;
33052 else
33053 return 2;
33054 }
33055
33056 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
33057
33058 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
33059    16-byte window.  */
33060
33061 static void
33062 ix86_avoid_jump_mispredicts (void)
33063 {
33064 rtx insn, start = get_insns ();
33065 int nbytes = 0, njumps = 0;
33066 int isjump = 0;
33067
33068   /* Look for all minimal intervals of instructions containing 4 jumps.
33069      The intervals are bounded by START and INSN.  NBYTES is the total
33070      size of the instructions in the interval, including INSN and not
33071      including START.  When NBYTES is smaller than 16 bytes, it is possible
33072      that the end of START and INSN end up in the same 16-byte window.
33073
33074      The smallest offset at which INSN can start in that window occurs when
33075      START ends at offset 0.  The offset of INSN is then NBYTES - sizeof (INSN).
33076      We add a p2align to the 16-byte window with maxskip 15 - NBYTES + sizeof (INSN).
33077      */
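  /* For example (illustrative numbers only): with NBYTES = 10 and
     sizeof (INSN) = 2, INSN can start no earlier than offset 8 in the
     window, and the p2align emitted below uses maxskip 15 - 10 + 2 = 7,
     which is enough to push the fourth jump into the next 16-byte window.  */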
33078 for (insn = start; insn; insn = NEXT_INSN (insn))
33079 {
33080 int min_size;
33081
33082 if (LABEL_P (insn))
33083 {
33084 int align = label_to_alignment (insn);
33085 int max_skip = label_to_max_skip (insn);
33086
33087 if (max_skip > 15)
33088 max_skip = 15;
33089 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
33090 already in the current 16 byte page, because otherwise
33091 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
33092 bytes to reach 16 byte boundary. */
33093 if (align <= 0
33094 || (align <= 3 && max_skip != (1 << align) - 1))
33095 max_skip = 0;
33096 if (dump_file)
33097 fprintf (dump_file, "Label %i with max_skip %i\n",
33098 INSN_UID (insn), max_skip);
33099 if (max_skip)
33100 {
33101 while (nbytes + max_skip >= 16)
33102 {
33103 start = NEXT_INSN (start);
33104 if ((JUMP_P (start)
33105 && GET_CODE (PATTERN (start)) != ADDR_VEC
33106 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
33107 || CALL_P (start))
33108 njumps--, isjump = 1;
33109 else
33110 isjump = 0;
33111 nbytes -= min_insn_size (start);
33112 }
33113 }
33114 continue;
33115 }
33116
33117 min_size = min_insn_size (insn);
33118 nbytes += min_size;
33119 if (dump_file)
33120 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
33121 INSN_UID (insn), min_size);
33122 if ((JUMP_P (insn)
33123 && GET_CODE (PATTERN (insn)) != ADDR_VEC
33124 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
33125 || CALL_P (insn))
33126 njumps++;
33127 else
33128 continue;
33129
33130 while (njumps > 3)
33131 {
33132 start = NEXT_INSN (start);
33133 if ((JUMP_P (start)
33134 && GET_CODE (PATTERN (start)) != ADDR_VEC
33135 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
33136 || CALL_P (start))
33137 njumps--, isjump = 1;
33138 else
33139 isjump = 0;
33140 nbytes -= min_insn_size (start);
33141 }
33142 gcc_assert (njumps >= 0);
33143 if (dump_file)
33144 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
33145 INSN_UID (start), INSN_UID (insn), nbytes);
33146
33147 if (njumps == 3 && isjump && nbytes < 16)
33148 {
33149 int padsize = 15 - nbytes + min_insn_size (insn);
33150
33151 if (dump_file)
33152 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
33153 INSN_UID (insn), padsize);
33154 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
33155 }
33156 }
33157 }
33158 #endif
33159
33160 /* The AMD Athlon works faster
33161    when RET is not the destination of a conditional jump or directly preceded
33162    by another jump instruction.  We avoid the penalty by inserting a NOP just
33163    before the RET instruction in such cases.  */
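/* A note on the replacement used below: gen_simple_return_internal_long
   appears to be the two-byte "rep ret" style return pattern from i386.md,
   which serves the same purpose as a NOP in front of the RET; this is an
   explanatory sketch, not a guarantee about that pattern's exact expansion.  */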
33164 static void
33165 ix86_pad_returns (void)
33166 {
33167 edge e;
33168 edge_iterator ei;
33169
33170 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
33171 {
33172 basic_block bb = e->src;
33173 rtx ret = BB_END (bb);
33174 rtx prev;
33175 bool replace = false;
33176
33177 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
33178 || optimize_bb_for_size_p (bb))
33179 continue;
33180 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
33181 if (active_insn_p (prev) || LABEL_P (prev))
33182 break;
33183 if (prev && LABEL_P (prev))
33184 {
33185 edge e;
33186 edge_iterator ei;
33187
33188 FOR_EACH_EDGE (e, ei, bb->preds)
33189 if (EDGE_FREQUENCY (e) && e->src->index >= 0
33190 && !(e->flags & EDGE_FALLTHRU))
33191 replace = true;
33192 }
33193 if (!replace)
33194 {
33195 prev = prev_active_insn (ret);
33196 if (prev
33197 && ((JUMP_P (prev) && any_condjump_p (prev))
33198 || CALL_P (prev)))
33199 replace = true;
33200	  /* Empty functions get a branch mispredict even when
33201	     the jump destination is not visible to us.  */
33202 if (!prev && !optimize_function_for_size_p (cfun))
33203 replace = true;
33204 }
33205 if (replace)
33206 {
33207 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
33208 delete_insn (ret);
33209 }
33210 }
33211 }
33212
33213 /* Count the minimum number of instructions in BB. Return 4 if the
33214 number of instructions >= 4. */
33215
33216 static int
33217 ix86_count_insn_bb (basic_block bb)
33218 {
33219 rtx insn;
33220 int insn_count = 0;
33221
33222 /* Count number of instructions in this block. Return 4 if the number
33223 of instructions >= 4. */
33224 FOR_BB_INSNS (bb, insn)
33225 {
33226       /* This only happens in exit blocks.  */
33227 if (JUMP_P (insn)
33228 && ANY_RETURN_P (PATTERN (insn)))
33229 break;
33230
33231 if (NONDEBUG_INSN_P (insn)
33232 && GET_CODE (PATTERN (insn)) != USE
33233 && GET_CODE (PATTERN (insn)) != CLOBBER)
33234 {
33235 insn_count++;
33236 if (insn_count >= 4)
33237 return insn_count;
33238 }
33239 }
33240
33241 return insn_count;
33242 }
33243
33244
33245 /* Count the minimum number of instructions in code path in BB.
33246 Return 4 if the number of instructions >= 4. */
33247
33248 static int
33249 ix86_count_insn (basic_block bb)
33250 {
33251 edge e;
33252 edge_iterator ei;
33253 int min_prev_count;
33254
33255 /* Only bother counting instructions along paths with no
33256 more than 2 basic blocks between entry and exit. Given
33257 that BB has an edge to exit, determine if a predecessor
33258 of BB has an edge from entry. If so, compute the number
33259 of instructions in the predecessor block. If there
33260 happen to be multiple such blocks, compute the minimum. */
33261 min_prev_count = 4;
33262 FOR_EACH_EDGE (e, ei, bb->preds)
33263 {
33264 edge prev_e;
33265 edge_iterator prev_ei;
33266
33267 if (e->src == ENTRY_BLOCK_PTR)
33268 {
33269 min_prev_count = 0;
33270 break;
33271 }
33272 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
33273 {
33274 if (prev_e->src == ENTRY_BLOCK_PTR)
33275 {
33276 int count = ix86_count_insn_bb (e->src);
33277 if (count < min_prev_count)
33278 min_prev_count = count;
33279 break;
33280 }
33281 }
33282 }
33283
33284 if (min_prev_count < 4)
33285 min_prev_count += ix86_count_insn_bb (bb);
33286
33287 return min_prev_count;
33288 }
33289
33290 /* Pad short functions to 4 instructions.  */
33291
33292 static void
33293 ix86_pad_short_function (void)
33294 {
33295 edge e;
33296 edge_iterator ei;
33297
33298 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
33299 {
33300 rtx ret = BB_END (e->src);
33301 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
33302 {
33303 int insn_count = ix86_count_insn (e->src);
33304
33305 /* Pad short function. */
33306 if (insn_count < 4)
33307 {
33308 rtx insn = ret;
33309
33310 /* Find epilogue. */
33311 while (insn
33312 && (!NOTE_P (insn)
33313 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
33314 insn = PREV_INSN (insn);
33315
33316 if (!insn)
33317 insn = ret;
33318
33319 /* Two NOPs count as one instruction. */
33320 insn_count = 2 * (4 - insn_count);
33321 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
33322 }
33323 }
33324 }
33325 }
33326
33327 /* Implement machine specific optimizations.  We implement padding of returns
33328    for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window.  */
33329 static void
33330 ix86_reorg (void)
33331 {
33332 /* We are freeing block_for_insn in the toplev to keep compatibility
33333 with old MDEP_REORGS that are not CFG based. Recompute it now. */
33334 compute_bb_for_insn ();
33335
33336 /* Run the vzeroupper optimization if needed. */
33337 if (TARGET_VZEROUPPER)
33338 move_or_delete_vzeroupper ();
33339
33340 if (optimize && optimize_function_for_speed_p (cfun))
33341 {
33342 if (TARGET_PAD_SHORT_FUNCTION)
33343 ix86_pad_short_function ();
33344 else if (TARGET_PAD_RETURNS)
33345 ix86_pad_returns ();
33346 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
33347 if (TARGET_FOUR_JUMP_LIMIT)
33348 ix86_avoid_jump_mispredicts ();
33349 #endif
33350 }
33351 }
33352
33353 /* Return nonzero when a QImode register that must be represented via a REX
33354    prefix is used.  */
33355 bool
33356 x86_extended_QIreg_mentioned_p (rtx insn)
33357 {
33358 int i;
33359 extract_insn_cached (insn);
33360 for (i = 0; i < recog_data.n_operands; i++)
33361 if (REG_P (recog_data.operand[i])
33362 && REGNO (recog_data.operand[i]) > BX_REG)
33363 return true;
33364 return false;
33365 }
33366
33367 /* Return nonzero when P points to a register encoded via a REX prefix.
33368 Called via for_each_rtx. */
33369 static int
33370 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
33371 {
33372 unsigned int regno;
33373 if (!REG_P (*p))
33374 return 0;
33375 regno = REGNO (*p);
33376 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
33377 }
33378
33379 /* Return true when INSN mentions a register that must be encoded using a
33380    REX prefix.  */
33381 bool
33382 x86_extended_reg_mentioned_p (rtx insn)
33383 {
33384 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
33385 extended_reg_mentioned_1, NULL);
33386 }
33387
33388 /* If profitable, negate (without causing overflow) integer constant
33389 of mode MODE at location LOC. Return true in this case. */
33390 bool
33391 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
33392 {
33393 HOST_WIDE_INT val;
33394
33395 if (!CONST_INT_P (*loc))
33396 return false;
33397
33398 switch (mode)
33399 {
33400 case DImode:
33401 /* DImode x86_64 constants must fit in 32 bits. */
33402 gcc_assert (x86_64_immediate_operand (*loc, mode));
33403
33404 mode = SImode;
33405 break;
33406
33407 case SImode:
33408 case HImode:
33409 case QImode:
33410 break;
33411
33412 default:
33413 gcc_unreachable ();
33414 }
33415
33416 /* Avoid overflows. */
33417 if (mode_signbit_p (mode, *loc))
33418 return false;
33419
33420 val = INTVAL (*loc);
33421
33422 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
33423 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
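  /* For example, adding -4 becomes `subl $4,%eax', and adding 128 becomes
     `subl $-128,%eax', since -128 still fits in a sign-extended 8-bit
     immediate while 128 does not.  */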
33424 if ((val < 0 && val != -128)
33425 || val == 128)
33426 {
33427 *loc = GEN_INT (-val);
33428 return true;
33429 }
33430
33431 return false;
33432 }
33433
33434 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
33435 optabs would emit if we didn't have TFmode patterns. */
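/* Roughly, the emitted sequence computes (a sketch in C-like terms, where
   "fp" stands for the destination floating-point type):

     if ((signed) in >= 0)
       out = (fp) in;
     else
       {
	 tmp = (in >> 1) | (in & 1);
	 out = (fp) tmp;
	 out = out + out;
       }

   The low bit is folded back into the halved value so that the final
   doubling rounds the same way a direct conversion would.  */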
33436
33437 void
33438 x86_emit_floatuns (rtx operands[2])
33439 {
33440 rtx neglab, donelab, i0, i1, f0, in, out;
33441 enum machine_mode mode, inmode;
33442
33443 inmode = GET_MODE (operands[1]);
33444 gcc_assert (inmode == SImode || inmode == DImode);
33445
33446 out = operands[0];
33447 in = force_reg (inmode, operands[1]);
33448 mode = GET_MODE (out);
33449 neglab = gen_label_rtx ();
33450 donelab = gen_label_rtx ();
33451 f0 = gen_reg_rtx (mode);
33452
33453 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
33454
33455 expand_float (out, in, 0);
33456
33457 emit_jump_insn (gen_jump (donelab));
33458 emit_barrier ();
33459
33460 emit_label (neglab);
33461
33462 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
33463 1, OPTAB_DIRECT);
33464 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
33465 1, OPTAB_DIRECT);
33466 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
33467
33468 expand_float (f0, i0, 0);
33469
33470 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
33471
33472 emit_label (donelab);
33473 }
33474 \f
33475 /* AVX2 does support 32-byte integer vector operations,
33476 thus the longest vector we are faced with is V32QImode. */
33477 #define MAX_VECT_LEN 32
33478
33479 struct expand_vec_perm_d
33480 {
33481 rtx target, op0, op1;
33482 unsigned char perm[MAX_VECT_LEN];
33483 enum machine_mode vmode;
33484 unsigned char nelt;
33485 bool one_operand_p;
33486 bool testing_p;
33487 };
33488
33489 static bool canonicalize_perm (struct expand_vec_perm_d *d);
33490 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
33491 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
33492
33493 /* Get a vector mode of the same size as the original but with elements
33494 twice as wide. This is only guaranteed to apply to integral vectors. */
33495
33496 static inline enum machine_mode
33497 get_mode_wider_vector (enum machine_mode o)
33498 {
33499 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
33500 enum machine_mode n = GET_MODE_WIDER_MODE (o);
33501 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
33502 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
33503 return n;
33504 }
33505
33506 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33507 with all elements equal to VAR. Return true if successful. */
33508
33509 static bool
33510 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
33511 rtx target, rtx val)
33512 {
33513 bool ok;
33514
33515 switch (mode)
33516 {
33517 case V2SImode:
33518 case V2SFmode:
33519 if (!mmx_ok)
33520 return false;
33521 /* FALLTHRU */
33522
33523 case V4DFmode:
33524 case V4DImode:
33525 case V8SFmode:
33526 case V8SImode:
33527 case V2DFmode:
33528 case V2DImode:
33529 case V4SFmode:
33530 case V4SImode:
33531 {
33532 rtx insn, dup;
33533
33534 /* First attempt to recognize VAL as-is. */
33535 dup = gen_rtx_VEC_DUPLICATE (mode, val);
33536 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
33537 if (recog_memoized (insn) < 0)
33538 {
33539 rtx seq;
33540 /* If that fails, force VAL into a register. */
33541
33542 start_sequence ();
33543 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
33544 seq = get_insns ();
33545 end_sequence ();
33546 if (seq)
33547 emit_insn_before (seq, insn);
33548
33549 ok = recog_memoized (insn) >= 0;
33550 gcc_assert (ok);
33551 }
33552 }
33553 return true;
33554
33555 case V4HImode:
33556 if (!mmx_ok)
33557 return false;
33558 if (TARGET_SSE || TARGET_3DNOW_A)
33559 {
33560 rtx x;
33561
33562 val = gen_lowpart (SImode, val);
33563 x = gen_rtx_TRUNCATE (HImode, val);
33564 x = gen_rtx_VEC_DUPLICATE (mode, x);
33565 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33566 return true;
33567 }
33568 goto widen;
33569
33570 case V8QImode:
33571 if (!mmx_ok)
33572 return false;
33573 goto widen;
33574
33575 case V8HImode:
33576 if (TARGET_SSE2)
33577 {
33578 struct expand_vec_perm_d dperm;
33579 rtx tmp1, tmp2;
33580
33581 permute:
33582 memset (&dperm, 0, sizeof (dperm));
33583 dperm.target = target;
33584 dperm.vmode = mode;
33585 dperm.nelt = GET_MODE_NUNITS (mode);
33586 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
33587 dperm.one_operand_p = true;
33588
33589 /* Extend to SImode using a paradoxical SUBREG. */
33590 tmp1 = gen_reg_rtx (SImode);
33591 emit_move_insn (tmp1, gen_lowpart (SImode, val));
33592
33593 /* Insert the SImode value as low element of a V4SImode vector. */
33594 tmp2 = gen_lowpart (V4SImode, dperm.op0);
33595 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
33596
33597 ok = (expand_vec_perm_1 (&dperm)
33598 || expand_vec_perm_broadcast_1 (&dperm));
33599 gcc_assert (ok);
33600 return ok;
33601 }
33602 goto widen;
33603
33604 case V16QImode:
33605 if (TARGET_SSE2)
33606 goto permute;
33607 goto widen;
33608
33609 widen:
33610 /* Replicate the value once into the next wider mode and recurse. */
33611 {
33612 enum machine_mode smode, wsmode, wvmode;
33613 rtx x;
33614
33615 smode = GET_MODE_INNER (mode);
33616 wvmode = get_mode_wider_vector (mode);
33617 wsmode = GET_MODE_INNER (wvmode);
33618
33619 val = convert_modes (wsmode, smode, val, true);
33620 x = expand_simple_binop (wsmode, ASHIFT, val,
33621 GEN_INT (GET_MODE_BITSIZE (smode)),
33622 NULL_RTX, 1, OPTAB_LIB_WIDEN);
33623 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
33624
33625 x = gen_lowpart (wvmode, target);
33626 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
33627 gcc_assert (ok);
33628 return ok;
33629 }
33630
33631 case V16HImode:
33632 case V32QImode:
33633 {
33634 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
33635 rtx x = gen_reg_rtx (hvmode);
33636
33637 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
33638 gcc_assert (ok);
33639
33640 x = gen_rtx_VEC_CONCAT (mode, x, x);
33641 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33642 }
33643 return true;
33644
33645 default:
33646 return false;
33647 }
33648 }
33649
33650 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33651 whose ONE_VAR element is VAR, and other elements are zero. Return true
33652 if successful. */
33653
33654 static bool
33655 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
33656 rtx target, rtx var, int one_var)
33657 {
33658 enum machine_mode vsimode;
33659 rtx new_target;
33660 rtx x, tmp;
33661 bool use_vector_set = false;
33662
33663 switch (mode)
33664 {
33665 case V2DImode:
33666 /* For SSE4.1, we normally use vector set. But if the second
33667 element is zero and inter-unit moves are OK, we use movq
33668 instead. */
33669 use_vector_set = (TARGET_64BIT
33670 && TARGET_SSE4_1
33671 && !(TARGET_INTER_UNIT_MOVES
33672 && one_var == 0));
33673 break;
33674 case V16QImode:
33675 case V4SImode:
33676 case V4SFmode:
33677 use_vector_set = TARGET_SSE4_1;
33678 break;
33679 case V8HImode:
33680 use_vector_set = TARGET_SSE2;
33681 break;
33682 case V4HImode:
33683 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
33684 break;
33685 case V32QImode:
33686 case V16HImode:
33687 case V8SImode:
33688 case V8SFmode:
33689 case V4DFmode:
33690 use_vector_set = TARGET_AVX;
33691 break;
33692 case V4DImode:
33693 /* Use ix86_expand_vector_set in 64bit mode only. */
33694 use_vector_set = TARGET_AVX && TARGET_64BIT;
33695 break;
33696 default:
33697 break;
33698 }
33699
33700 if (use_vector_set)
33701 {
33702 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
33703 var = force_reg (GET_MODE_INNER (mode), var);
33704 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33705 return true;
33706 }
33707
33708 switch (mode)
33709 {
33710 case V2SFmode:
33711 case V2SImode:
33712 if (!mmx_ok)
33713 return false;
33714 /* FALLTHRU */
33715
33716 case V2DFmode:
33717 case V2DImode:
33718 if (one_var != 0)
33719 return false;
33720 var = force_reg (GET_MODE_INNER (mode), var);
33721 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
33722 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33723 return true;
33724
33725 case V4SFmode:
33726 case V4SImode:
33727 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
33728 new_target = gen_reg_rtx (mode);
33729 else
33730 new_target = target;
33731 var = force_reg (GET_MODE_INNER (mode), var);
33732 x = gen_rtx_VEC_DUPLICATE (mode, var);
33733 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
33734 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
33735 if (one_var != 0)
33736 {
33737 /* We need to shuffle the value to the correct position, so
33738 create a new pseudo to store the intermediate result. */
33739
33740 /* With SSE2, we can use the integer shuffle insns. */
33741 if (mode != V4SFmode && TARGET_SSE2)
33742 {
33743 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
33744 const1_rtx,
33745 GEN_INT (one_var == 1 ? 0 : 1),
33746 GEN_INT (one_var == 2 ? 0 : 1),
33747 GEN_INT (one_var == 3 ? 0 : 1)));
33748 if (target != new_target)
33749 emit_move_insn (target, new_target);
33750 return true;
33751 }
33752
33753 /* Otherwise convert the intermediate result to V4SFmode and
33754 use the SSE1 shuffle instructions. */
33755 if (mode != V4SFmode)
33756 {
33757 tmp = gen_reg_rtx (V4SFmode);
33758 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
33759 }
33760 else
33761 tmp = new_target;
33762
33763 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
33764 const1_rtx,
33765 GEN_INT (one_var == 1 ? 0 : 1),
33766 GEN_INT (one_var == 2 ? 0+4 : 1+4),
33767 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
33768
33769 if (mode != V4SFmode)
33770 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
33771 else if (tmp != target)
33772 emit_move_insn (target, tmp);
33773 }
33774 else if (target != new_target)
33775 emit_move_insn (target, new_target);
33776 return true;
33777
33778 case V8HImode:
33779 case V16QImode:
33780 vsimode = V4SImode;
33781 goto widen;
33782 case V4HImode:
33783 case V8QImode:
33784 if (!mmx_ok)
33785 return false;
33786 vsimode = V2SImode;
33787 goto widen;
33788 widen:
33789 if (one_var != 0)
33790 return false;
33791
33792 /* Zero extend the variable element to SImode and recurse. */
33793 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
33794
33795 x = gen_reg_rtx (vsimode);
33796 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
33797 var, one_var))
33798 gcc_unreachable ();
33799
33800 emit_move_insn (target, gen_lowpart (mode, x));
33801 return true;
33802
33803 default:
33804 return false;
33805 }
33806 }
33807
33808 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33809 consisting of the values in VALS. It is known that all elements
33810 except ONE_VAR are constants. Return true if successful. */
33811
33812 static bool
33813 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
33814 rtx target, rtx vals, int one_var)
33815 {
33816 rtx var = XVECEXP (vals, 0, one_var);
33817 enum machine_mode wmode;
33818 rtx const_vec, x;
33819
33820 const_vec = copy_rtx (vals);
33821 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
33822 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
33823
33824 switch (mode)
33825 {
33826 case V2DFmode:
33827 case V2DImode:
33828 case V2SFmode:
33829 case V2SImode:
33830 /* For the two element vectors, it's just as easy to use
33831 the general case. */
33832 return false;
33833
33834 case V4DImode:
33835 /* Use ix86_expand_vector_set in 64bit mode only. */
33836 if (!TARGET_64BIT)
33837 return false;
33838 case V4DFmode:
33839 case V8SFmode:
33840 case V8SImode:
33841 case V16HImode:
33842 case V32QImode:
33843 case V4SFmode:
33844 case V4SImode:
33845 case V8HImode:
33846 case V4HImode:
33847 break;
33848
33849 case V16QImode:
33850 if (TARGET_SSE4_1)
33851 break;
33852 wmode = V8HImode;
33853 goto widen;
33854 case V8QImode:
33855 wmode = V4HImode;
33856 goto widen;
33857 widen:
33858 /* There's no way to set one QImode entry easily. Combine
33859 the variable value with its adjacent constant value, and
33860 promote to an HImode set. */
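      /* For example (illustrative): for one_var == 5 in V16QImode, the
	 variable byte is paired with the constant byte at index 4, combined
	 into the HImode value (var << 8) | (const & 0xff), and written at
	 HImode position 5 >> 1 == 2 of the V8HImode view of the vector.  */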
33861 x = XVECEXP (vals, 0, one_var ^ 1);
33862 if (one_var & 1)
33863 {
33864 var = convert_modes (HImode, QImode, var, true);
33865 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
33866 NULL_RTX, 1, OPTAB_LIB_WIDEN);
33867 x = GEN_INT (INTVAL (x) & 0xff);
33868 }
33869 else
33870 {
33871 var = convert_modes (HImode, QImode, var, true);
33872 x = gen_int_mode (INTVAL (x) << 8, HImode);
33873 }
33874 if (x != const0_rtx)
33875 var = expand_simple_binop (HImode, IOR, var, x, var,
33876 1, OPTAB_LIB_WIDEN);
33877
33878 x = gen_reg_rtx (wmode);
33879 emit_move_insn (x, gen_lowpart (wmode, const_vec));
33880 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
33881
33882 emit_move_insn (target, gen_lowpart (mode, x));
33883 return true;
33884
33885 default:
33886 return false;
33887 }
33888
33889 emit_move_insn (target, const_vec);
33890 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33891 return true;
33892 }
33893
33894 /* A subroutine of ix86_expand_vector_init_general. Use vector
33895 concatenate to handle the most general case: all values variable,
33896 and none identical. */
33897
33898 static void
33899 ix86_expand_vector_init_concat (enum machine_mode mode,
33900 rtx target, rtx *ops, int n)
33901 {
33902 enum machine_mode cmode, hmode = VOIDmode;
33903 rtx first[8], second[4];
33904 rtvec v;
33905 int i, j;
33906
33907 switch (n)
33908 {
33909 case 2:
33910 switch (mode)
33911 {
33912 case V8SImode:
33913 cmode = V4SImode;
33914 break;
33915 case V8SFmode:
33916 cmode = V4SFmode;
33917 break;
33918 case V4DImode:
33919 cmode = V2DImode;
33920 break;
33921 case V4DFmode:
33922 cmode = V2DFmode;
33923 break;
33924 case V4SImode:
33925 cmode = V2SImode;
33926 break;
33927 case V4SFmode:
33928 cmode = V2SFmode;
33929 break;
33930 case V2DImode:
33931 cmode = DImode;
33932 break;
33933 case V2SImode:
33934 cmode = SImode;
33935 break;
33936 case V2DFmode:
33937 cmode = DFmode;
33938 break;
33939 case V2SFmode:
33940 cmode = SFmode;
33941 break;
33942 default:
33943 gcc_unreachable ();
33944 }
33945
33946 if (!register_operand (ops[1], cmode))
33947 ops[1] = force_reg (cmode, ops[1]);
33948 if (!register_operand (ops[0], cmode))
33949 ops[0] = force_reg (cmode, ops[0]);
33950 emit_insn (gen_rtx_SET (VOIDmode, target,
33951 gen_rtx_VEC_CONCAT (mode, ops[0],
33952 ops[1])));
33953 break;
33954
33955 case 4:
33956 switch (mode)
33957 {
33958 case V4DImode:
33959 cmode = V2DImode;
33960 break;
33961 case V4DFmode:
33962 cmode = V2DFmode;
33963 break;
33964 case V4SImode:
33965 cmode = V2SImode;
33966 break;
33967 case V4SFmode:
33968 cmode = V2SFmode;
33969 break;
33970 default:
33971 gcc_unreachable ();
33972 }
33973 goto half;
33974
33975 case 8:
33976 switch (mode)
33977 {
33978 case V8SImode:
33979 cmode = V2SImode;
33980 hmode = V4SImode;
33981 break;
33982 case V8SFmode:
33983 cmode = V2SFmode;
33984 hmode = V4SFmode;
33985 break;
33986 default:
33987 gcc_unreachable ();
33988 }
33989 goto half;
33990
33991 half:
33992 /* FIXME: We process inputs backward to help RA. PR 36222. */
33993 i = n - 1;
33994 j = (n >> 1) - 1;
33995 for (; i > 0; i -= 2, j--)
33996 {
33997 first[j] = gen_reg_rtx (cmode);
33998 v = gen_rtvec (2, ops[i - 1], ops[i]);
33999 ix86_expand_vector_init (false, first[j],
34000 gen_rtx_PARALLEL (cmode, v));
34001 }
34002
34003 n >>= 1;
34004 if (n > 2)
34005 {
34006 gcc_assert (hmode != VOIDmode);
34007 for (i = j = 0; i < n; i += 2, j++)
34008 {
34009 second[j] = gen_reg_rtx (hmode);
34010 ix86_expand_vector_init_concat (hmode, second [j],
34011 &first [i], 2);
34012 }
34013 n >>= 1;
34014 ix86_expand_vector_init_concat (mode, target, second, n);
34015 }
34016 else
34017 ix86_expand_vector_init_concat (mode, target, first, n);
34018 break;
34019
34020 default:
34021 gcc_unreachable ();
34022 }
34023 }
34024
34025 /* A subroutine of ix86_expand_vector_init_general. Use vector
34026 interleave to handle the most general case: all values variable,
34027 and none identical. */
34028
34029 static void
34030 ix86_expand_vector_init_interleave (enum machine_mode mode,
34031 rtx target, rtx *ops, int n)
34032 {
34033 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
34034 int i, j;
34035 rtx op0, op1;
34036 rtx (*gen_load_even) (rtx, rtx, rtx);
34037 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
34038 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
34039
34040 switch (mode)
34041 {
34042 case V8HImode:
34043 gen_load_even = gen_vec_setv8hi;
34044 gen_interleave_first_low = gen_vec_interleave_lowv4si;
34045 gen_interleave_second_low = gen_vec_interleave_lowv2di;
34046 inner_mode = HImode;
34047 first_imode = V4SImode;
34048 second_imode = V2DImode;
34049 third_imode = VOIDmode;
34050 break;
34051 case V16QImode:
34052 gen_load_even = gen_vec_setv16qi;
34053 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
34054 gen_interleave_second_low = gen_vec_interleave_lowv4si;
34055 inner_mode = QImode;
34056 first_imode = V8HImode;
34057 second_imode = V4SImode;
34058 third_imode = V2DImode;
34059 break;
34060 default:
34061 gcc_unreachable ();
34062 }
34063
34064 for (i = 0; i < n; i++)
34065 {
34066       /* Extend the odd element to SImode using a paradoxical SUBREG.  */
34067 op0 = gen_reg_rtx (SImode);
34068 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
34069
34070 /* Insert the SImode value as low element of V4SImode vector. */
34071 op1 = gen_reg_rtx (V4SImode);
34072 op0 = gen_rtx_VEC_MERGE (V4SImode,
34073 gen_rtx_VEC_DUPLICATE (V4SImode,
34074 op0),
34075 CONST0_RTX (V4SImode),
34076 const1_rtx);
34077 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
34078
34079       /* Cast the V4SImode vector back to a vector in the original mode.  */
34080 op0 = gen_reg_rtx (mode);
34081 emit_move_insn (op0, gen_lowpart (mode, op1));
34082
34083       /* Load even elements into the second position.  */
34084 emit_insn (gen_load_even (op0,
34085 force_reg (inner_mode,
34086 ops [i + i + 1]),
34087 const1_rtx));
34088
34089 /* Cast vector to FIRST_IMODE vector. */
34090 ops[i] = gen_reg_rtx (first_imode);
34091 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
34092 }
34093
34094 /* Interleave low FIRST_IMODE vectors. */
34095 for (i = j = 0; i < n; i += 2, j++)
34096 {
34097 op0 = gen_reg_rtx (first_imode);
34098 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
34099
34100 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
34101 ops[j] = gen_reg_rtx (second_imode);
34102 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
34103 }
34104
34105 /* Interleave low SECOND_IMODE vectors. */
34106 switch (second_imode)
34107 {
34108 case V4SImode:
34109 for (i = j = 0; i < n / 2; i += 2, j++)
34110 {
34111 op0 = gen_reg_rtx (second_imode);
34112 emit_insn (gen_interleave_second_low (op0, ops[i],
34113 ops[i + 1]));
34114
34115 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
34116 vector. */
34117 ops[j] = gen_reg_rtx (third_imode);
34118 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
34119 }
34120 second_imode = V2DImode;
34121 gen_interleave_second_low = gen_vec_interleave_lowv2di;
34122 /* FALLTHRU */
34123
34124 case V2DImode:
34125 op0 = gen_reg_rtx (second_imode);
34126 emit_insn (gen_interleave_second_low (op0, ops[0],
34127 ops[1]));
34128
34129       /* Cast the SECOND_IMODE vector back to a vector in the original
34130 	 mode.  */
34131 emit_insn (gen_rtx_SET (VOIDmode, target,
34132 gen_lowpart (mode, op0)));
34133 break;
34134
34135 default:
34136 gcc_unreachable ();
34137 }
34138 }
34139
34140 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
34141 all values variable, and none identical. */
34142
34143 static void
34144 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
34145 rtx target, rtx vals)
34146 {
34147 rtx ops[32], op0, op1;
34148 enum machine_mode half_mode = VOIDmode;
34149 int n, i;
34150
34151 switch (mode)
34152 {
34153 case V2SFmode:
34154 case V2SImode:
34155 if (!mmx_ok && !TARGET_SSE)
34156 break;
34157 /* FALLTHRU */
34158
34159 case V8SFmode:
34160 case V8SImode:
34161 case V4DFmode:
34162 case V4DImode:
34163 case V4SFmode:
34164 case V4SImode:
34165 case V2DFmode:
34166 case V2DImode:
34167 n = GET_MODE_NUNITS (mode);
34168 for (i = 0; i < n; i++)
34169 ops[i] = XVECEXP (vals, 0, i);
34170 ix86_expand_vector_init_concat (mode, target, ops, n);
34171 return;
34172
34173 case V32QImode:
34174 half_mode = V16QImode;
34175 goto half;
34176
34177 case V16HImode:
34178 half_mode = V8HImode;
34179 goto half;
34180
34181 half:
34182 n = GET_MODE_NUNITS (mode);
34183 for (i = 0; i < n; i++)
34184 ops[i] = XVECEXP (vals, 0, i);
34185 op0 = gen_reg_rtx (half_mode);
34186 op1 = gen_reg_rtx (half_mode);
34187 ix86_expand_vector_init_interleave (half_mode, op0, ops,
34188 n >> 2);
34189 ix86_expand_vector_init_interleave (half_mode, op1,
34190 &ops [n >> 1], n >> 2);
34191 emit_insn (gen_rtx_SET (VOIDmode, target,
34192 gen_rtx_VEC_CONCAT (mode, op0, op1)));
34193 return;
34194
34195 case V16QImode:
34196 if (!TARGET_SSE4_1)
34197 break;
34198 /* FALLTHRU */
34199
34200 case V8HImode:
34201 if (!TARGET_SSE2)
34202 break;
34203
34204 /* Don't use ix86_expand_vector_init_interleave if we can't
34205 move from GPR to SSE register directly. */
34206 if (!TARGET_INTER_UNIT_MOVES)
34207 break;
34208
34209 n = GET_MODE_NUNITS (mode);
34210 for (i = 0; i < n; i++)
34211 ops[i] = XVECEXP (vals, 0, i);
34212 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
34213 return;
34214
34215 case V4HImode:
34216 case V8QImode:
34217 break;
34218
34219 default:
34220 gcc_unreachable ();
34221 }
34222
34223 {
34224 int i, j, n_elts, n_words, n_elt_per_word;
34225 enum machine_mode inner_mode;
34226 rtx words[4], shift;
34227
34228 inner_mode = GET_MODE_INNER (mode);
34229 n_elts = GET_MODE_NUNITS (mode);
34230 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
34231 n_elt_per_word = n_elts / n_words;
34232 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
34233
34234 for (i = 0; i < n_words; ++i)
34235 {
34236 rtx word = NULL_RTX;
34237
34238 for (j = 0; j < n_elt_per_word; ++j)
34239 {
34240 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
34241 elt = convert_modes (word_mode, inner_mode, elt, true);
34242
34243 if (j == 0)
34244 word = elt;
34245 else
34246 {
34247 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
34248 word, 1, OPTAB_LIB_WIDEN);
34249 word = expand_simple_binop (word_mode, IOR, word, elt,
34250 word, 1, OPTAB_LIB_WIDEN);
34251 }
34252 }
34253
34254 words[i] = word;
34255 }
34256
34257 if (n_words == 1)
34258 emit_move_insn (target, gen_lowpart (mode, words[0]));
34259 else if (n_words == 2)
34260 {
34261 rtx tmp = gen_reg_rtx (mode);
34262 emit_clobber (tmp);
34263 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
34264 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
34265 emit_move_insn (target, tmp);
34266 }
34267 else if (n_words == 4)
34268 {
34269 rtx tmp = gen_reg_rtx (V4SImode);
34270 gcc_assert (word_mode == SImode);
34271 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
34272 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
34273 emit_move_insn (target, gen_lowpart (mode, tmp));
34274 }
34275 else
34276 gcc_unreachable ();
34277 }
34278 }
34279
34280 /* Initialize vector TARGET via VALS. Suppress the use of MMX
34281 instructions unless MMX_OK is true. */
34282
34283 void
34284 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
34285 {
34286 enum machine_mode mode = GET_MODE (target);
34287 enum machine_mode inner_mode = GET_MODE_INNER (mode);
34288 int n_elts = GET_MODE_NUNITS (mode);
34289 int n_var = 0, one_var = -1;
34290 bool all_same = true, all_const_zero = true;
34291 int i;
34292 rtx x;
34293
34294 for (i = 0; i < n_elts; ++i)
34295 {
34296 x = XVECEXP (vals, 0, i);
34297 if (!(CONST_INT_P (x)
34298 || GET_CODE (x) == CONST_DOUBLE
34299 || GET_CODE (x) == CONST_FIXED))
34300 n_var++, one_var = i;
34301 else if (x != CONST0_RTX (inner_mode))
34302 all_const_zero = false;
34303 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
34304 all_same = false;
34305 }
34306
34307 /* Constants are best loaded from the constant pool. */
34308 if (n_var == 0)
34309 {
34310 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
34311 return;
34312 }
34313
34314 /* If all values are identical, broadcast the value. */
34315 if (all_same
34316 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
34317 XVECEXP (vals, 0, 0)))
34318 return;
34319
34320 /* Values where only one field is non-constant are best loaded from
34321 the pool and overwritten via move later. */
34322 if (n_var == 1)
34323 {
34324 if (all_const_zero
34325 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
34326 XVECEXP (vals, 0, one_var),
34327 one_var))
34328 return;
34329
34330 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
34331 return;
34332 }
34333
34334 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
34335 }
34336
34337 void
34338 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
34339 {
34340 enum machine_mode mode = GET_MODE (target);
34341 enum machine_mode inner_mode = GET_MODE_INNER (mode);
34342 enum machine_mode half_mode;
34343 bool use_vec_merge = false;
34344 rtx tmp;
34345 static rtx (*gen_extract[6][2]) (rtx, rtx)
34346 = {
34347 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
34348 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
34349 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
34350 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
34351 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
34352 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
34353 };
34354 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
34355 = {
34356 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
34357 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
34358 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
34359 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
34360 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
34361 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
34362 };
34363 int i, j, n;
34364
34365 switch (mode)
34366 {
34367 case V2SFmode:
34368 case V2SImode:
34369 if (mmx_ok)
34370 {
34371 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
34372 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
34373 if (elt == 0)
34374 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
34375 else
34376 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
34377 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34378 return;
34379 }
34380 break;
34381
34382 case V2DImode:
34383 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
34384 if (use_vec_merge)
34385 break;
34386
34387 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
34388 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
34389 if (elt == 0)
34390 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
34391 else
34392 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
34393 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34394 return;
34395
34396 case V2DFmode:
34397 {
34398 rtx op0, op1;
34399
34400 /* For the two element vectors, we implement a VEC_CONCAT with
34401 the extraction of the other element. */
34402
34403 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
34404 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
34405
34406 if (elt == 0)
34407 op0 = val, op1 = tmp;
34408 else
34409 op0 = tmp, op1 = val;
34410
34411 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
34412 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34413 }
34414 return;
34415
34416 case V4SFmode:
34417 use_vec_merge = TARGET_SSE4_1;
34418 if (use_vec_merge)
34419 break;
34420
34421 switch (elt)
34422 {
34423 case 0:
34424 use_vec_merge = true;
34425 break;
34426
34427 case 1:
34428 /* tmp = target = A B C D */
34429 tmp = copy_to_reg (target);
34430 /* target = A A B B */
34431 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
34432 /* target = X A B B */
34433 ix86_expand_vector_set (false, target, val, 0);
34434 /* target = A X C D */
34435 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
34436 const1_rtx, const0_rtx,
34437 GEN_INT (2+4), GEN_INT (3+4)));
34438 return;
34439
34440 case 2:
34441 /* tmp = target = A B C D */
34442 tmp = copy_to_reg (target);
34443 /* tmp = X B C D */
34444 ix86_expand_vector_set (false, tmp, val, 0);
34445 /* target = A B X D */
34446 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
34447 const0_rtx, const1_rtx,
34448 GEN_INT (0+4), GEN_INT (3+4)));
34449 return;
34450
34451 case 3:
34452 /* tmp = target = A B C D */
34453 tmp = copy_to_reg (target);
34454 /* tmp = X B C D */
34455 ix86_expand_vector_set (false, tmp, val, 0);
34456 	  /* target = A B C X  */
34457 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
34458 const0_rtx, const1_rtx,
34459 GEN_INT (2+4), GEN_INT (0+4)));
34460 return;
34461
34462 default:
34463 gcc_unreachable ();
34464 }
34465 break;
34466
34467 case V4SImode:
34468 use_vec_merge = TARGET_SSE4_1;
34469 if (use_vec_merge)
34470 break;
34471
34472 /* Element 0 handled by vec_merge below. */
34473 if (elt == 0)
34474 {
34475 use_vec_merge = true;
34476 break;
34477 }
34478
34479 if (TARGET_SSE2)
34480 {
34481 /* With SSE2, use integer shuffles to swap element 0 and ELT,
34482 store into element 0, then shuffle them back. */
34483
34484 rtx order[4];
34485
34486 order[0] = GEN_INT (elt);
34487 order[1] = const1_rtx;
34488 order[2] = const2_rtx;
34489 order[3] = GEN_INT (3);
34490 order[elt] = const0_rtx;
34491
34492 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
34493 order[1], order[2], order[3]));
34494
34495 ix86_expand_vector_set (false, target, val, 0);
34496
34497 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
34498 order[1], order[2], order[3]));
34499 }
34500 else
34501 {
34502 /* For SSE1, we have to reuse the V4SF code. */
34503 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
34504 gen_lowpart (SFmode, val), elt);
34505 }
34506 return;
34507
34508 case V8HImode:
34509 use_vec_merge = TARGET_SSE2;
34510 break;
34511 case V4HImode:
34512 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
34513 break;
34514
34515 case V16QImode:
34516 use_vec_merge = TARGET_SSE4_1;
34517 break;
34518
34519 case V8QImode:
34520 break;
34521
34522 case V32QImode:
34523 half_mode = V16QImode;
34524 j = 0;
34525 n = 16;
34526 goto half;
34527
34528 case V16HImode:
34529 half_mode = V8HImode;
34530 j = 1;
34531 n = 8;
34532 goto half;
34533
34534 case V8SImode:
34535 half_mode = V4SImode;
34536 j = 2;
34537 n = 4;
34538 goto half;
34539
34540 case V4DImode:
34541 half_mode = V2DImode;
34542 j = 3;
34543 n = 2;
34544 goto half;
34545
34546 case V8SFmode:
34547 half_mode = V4SFmode;
34548 j = 4;
34549 n = 4;
34550 goto half;
34551
34552 case V4DFmode:
34553 half_mode = V2DFmode;
34554 j = 5;
34555 n = 2;
34556 goto half;
34557
34558 half:
34559 /* Compute offset. */
34560 i = elt / n;
34561 elt %= n;
34562
34563 gcc_assert (i <= 1);
34564
34565 /* Extract the half. */
34566 tmp = gen_reg_rtx (half_mode);
34567 emit_insn (gen_extract[j][i] (tmp, target));
34568
34569 /* Put val in tmp at elt. */
34570 ix86_expand_vector_set (false, tmp, val, elt);
34571
34572 /* Put it back. */
34573 emit_insn (gen_insert[j][i] (target, target, tmp));
34574 return;
34575
34576 default:
34577 break;
34578 }
34579
34580 if (use_vec_merge)
34581 {
34582 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
34583 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
34584 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34585 }
34586 else
34587 {
34588 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
34589
34590 emit_move_insn (mem, target);
34591
34592 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
34593 emit_move_insn (tmp, val);
34594
34595 emit_move_insn (target, mem);
34596 }
34597 }
34598
34599 void
34600 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
34601 {
34602 enum machine_mode mode = GET_MODE (vec);
34603 enum machine_mode inner_mode = GET_MODE_INNER (mode);
34604 bool use_vec_extr = false;
34605 rtx tmp;
34606
34607 switch (mode)
34608 {
34609 case V2SImode:
34610 case V2SFmode:
34611 if (!mmx_ok)
34612 break;
34613 /* FALLTHRU */
34614
34615 case V2DFmode:
34616 case V2DImode:
34617 use_vec_extr = true;
34618 break;
34619
34620 case V4SFmode:
34621 use_vec_extr = TARGET_SSE4_1;
34622 if (use_vec_extr)
34623 break;
34624
34625 switch (elt)
34626 {
34627 case 0:
34628 tmp = vec;
34629 break;
34630
34631 case 1:
34632 case 3:
34633 tmp = gen_reg_rtx (mode);
34634 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
34635 GEN_INT (elt), GEN_INT (elt),
34636 GEN_INT (elt+4), GEN_INT (elt+4)));
34637 break;
34638
34639 case 2:
34640 tmp = gen_reg_rtx (mode);
34641 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
34642 break;
34643
34644 default:
34645 gcc_unreachable ();
34646 }
34647 vec = tmp;
34648 use_vec_extr = true;
34649 elt = 0;
34650 break;
34651
34652 case V4SImode:
34653 use_vec_extr = TARGET_SSE4_1;
34654 if (use_vec_extr)
34655 break;
34656
34657 if (TARGET_SSE2)
34658 {
34659 switch (elt)
34660 {
34661 case 0:
34662 tmp = vec;
34663 break;
34664
34665 case 1:
34666 case 3:
34667 tmp = gen_reg_rtx (mode);
34668 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
34669 GEN_INT (elt), GEN_INT (elt),
34670 GEN_INT (elt), GEN_INT (elt)));
34671 break;
34672
34673 case 2:
34674 tmp = gen_reg_rtx (mode);
34675 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
34676 break;
34677
34678 default:
34679 gcc_unreachable ();
34680 }
34681 vec = tmp;
34682 use_vec_extr = true;
34683 elt = 0;
34684 }
34685 else
34686 {
34687 /* For SSE1, we have to reuse the V4SF code. */
34688 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
34689 gen_lowpart (V4SFmode, vec), elt);
34690 return;
34691 }
34692 break;
34693
34694 case V8HImode:
34695 use_vec_extr = TARGET_SSE2;
34696 break;
34697 case V4HImode:
34698 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
34699 break;
34700
34701 case V16QImode:
34702 use_vec_extr = TARGET_SSE4_1;
34703 break;
34704
34705 case V8SFmode:
34706 if (TARGET_AVX)
34707 {
34708 tmp = gen_reg_rtx (V4SFmode);
34709 if (elt < 4)
34710 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
34711 else
34712 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
34713 ix86_expand_vector_extract (false, target, tmp, elt & 3);
34714 return;
34715 }
34716 break;
34717
34718 case V4DFmode:
34719 if (TARGET_AVX)
34720 {
34721 tmp = gen_reg_rtx (V2DFmode);
34722 if (elt < 2)
34723 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
34724 else
34725 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
34726 ix86_expand_vector_extract (false, target, tmp, elt & 1);
34727 return;
34728 }
34729 break;
34730
34731 case V32QImode:
34732 if (TARGET_AVX)
34733 {
34734 tmp = gen_reg_rtx (V16QImode);
34735 if (elt < 16)
34736 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
34737 else
34738 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
34739 ix86_expand_vector_extract (false, target, tmp, elt & 15);
34740 return;
34741 }
34742 break;
34743
34744 case V16HImode:
34745 if (TARGET_AVX)
34746 {
34747 tmp = gen_reg_rtx (V8HImode);
34748 if (elt < 8)
34749 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
34750 else
34751 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
34752 ix86_expand_vector_extract (false, target, tmp, elt & 7);
34753 return;
34754 }
34755 break;
34756
34757 case V8SImode:
34758 if (TARGET_AVX)
34759 {
34760 tmp = gen_reg_rtx (V4SImode);
34761 if (elt < 4)
34762 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
34763 else
34764 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
34765 ix86_expand_vector_extract (false, target, tmp, elt & 3);
34766 return;
34767 }
34768 break;
34769
34770 case V4DImode:
34771 if (TARGET_AVX)
34772 {
34773 tmp = gen_reg_rtx (V2DImode);
34774 if (elt < 2)
34775 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
34776 else
34777 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
34778 ix86_expand_vector_extract (false, target, tmp, elt & 1);
34779 return;
34780 }
34781 break;
34782
34783 case V8QImode:
34784 /* ??? Could extract the appropriate HImode element and shift. */
34785 default:
34786 break;
34787 }
34788
34789 if (use_vec_extr)
34790 {
34791 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
34792 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
34793
34794 /* Let the rtl optimizers know about the zero extension performed. */
34795 if (inner_mode == QImode || inner_mode == HImode)
34796 {
34797 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
34798 target = gen_lowpart (SImode, target);
34799 }
34800
34801 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34802 }
34803 else
34804 {
34805 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
34806
34807 emit_move_insn (mem, vec);
34808
34809 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
34810 emit_move_insn (target, tmp);
34811 }
34812 }
34813
34814 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
34815 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
34816 The upper bits of DEST are undefined, though they shouldn't cause
34817 exceptions (some bits from src or all zeros are ok). */
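/* For instance (illustrative): for V4SFmode and i == 128 this is just
   movhlps, which copies the high 64 bits of SRC into the low 64 bits of
   DEST; for the 128-bit integer modes it is a whole-register logical shift
   right by i / 2 bits.  */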
34818
34819 static void
34820 emit_reduc_half (rtx dest, rtx src, int i)
34821 {
34822 rtx tem;
34823 switch (GET_MODE (src))
34824 {
34825 case V4SFmode:
34826 if (i == 128)
34827 tem = gen_sse_movhlps (dest, src, src);
34828 else
34829 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
34830 GEN_INT (1 + 4), GEN_INT (1 + 4));
34831 break;
34832 case V2DFmode:
34833 tem = gen_vec_interleave_highv2df (dest, src, src);
34834 break;
34835 case V16QImode:
34836 case V8HImode:
34837 case V4SImode:
34838 case V2DImode:
34839 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
34840 gen_lowpart (V1TImode, src),
34841 GEN_INT (i / 2));
34842 break;
34843 case V8SFmode:
34844 if (i == 256)
34845 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
34846 else
34847 tem = gen_avx_shufps256 (dest, src, src,
34848 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
34849 break;
34850 case V4DFmode:
34851 if (i == 256)
34852 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
34853 else
34854 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
34855 break;
34856 case V32QImode:
34857 case V16HImode:
34858 case V8SImode:
34859 case V4DImode:
34860 if (i == 256)
34861 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
34862 gen_lowpart (V4DImode, src),
34863 gen_lowpart (V4DImode, src),
34864 const1_rtx);
34865 else
34866 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
34867 gen_lowpart (V2TImode, src),
34868 GEN_INT (i / 2));
34869 break;
34870 default:
34871 gcc_unreachable ();
34872 }
34873 emit_insn (tem);
34874 }
34875
34876 /* Expand a vector reduction. FN is the binary pattern to reduce;
34877 DEST is the destination; IN is the input vector. */
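/* For example (illustrative): reducing a V4SImode vector runs the loop below
   with i = 128 and then i = 64, i.e. first combine the two 64-bit halves and
   then the two remaining 32-bit elements; the scalar result ends up in
   element 0 of DEST.  */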
34878
34879 void
34880 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
34881 {
34882 rtx half, dst, vec = in;
34883 enum machine_mode mode = GET_MODE (in);
34884 int i;
34885
34886 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
34887 if (TARGET_SSE4_1
34888 && mode == V8HImode
34889 && fn == gen_uminv8hi3)
34890 {
34891 emit_insn (gen_sse4_1_phminposuw (dest, in));
34892 return;
34893 }
34894
34895 for (i = GET_MODE_BITSIZE (mode);
34896 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
34897 i >>= 1)
34898 {
34899 half = gen_reg_rtx (mode);
34900 emit_reduc_half (half, vec, i);
34901 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
34902 dst = dest;
34903 else
34904 dst = gen_reg_rtx (mode);
34905 emit_insn (fn (dst, half, vec));
34906 vec = dst;
34907 }
34908 }
34909 \f
34910 /* Target hook for scalar_mode_supported_p. */
34911 static bool
34912 ix86_scalar_mode_supported_p (enum machine_mode mode)
34913 {
34914 if (DECIMAL_FLOAT_MODE_P (mode))
34915 return default_decimal_float_supported_p ();
34916 else if (mode == TFmode)
34917 return true;
34918 else
34919 return default_scalar_mode_supported_p (mode);
34920 }
34921
34922 /* Implements target hook vector_mode_supported_p. */
34923 static bool
34924 ix86_vector_mode_supported_p (enum machine_mode mode)
34925 {
34926 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34927 return true;
34928 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34929 return true;
34930 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34931 return true;
34932 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
34933 return true;
34934 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
34935 return true;
34936 return false;
34937 }
34938
34939 /* Target hook for c_mode_for_suffix. */
34940 static enum machine_mode
34941 ix86_c_mode_for_suffix (char suffix)
34942 {
34943 if (suffix == 'q')
34944 return TFmode;
34945 if (suffix == 'w')
34946 return XFmode;
34947
34948 return VOIDmode;
34949 }
34950
34951 /* Worker function for TARGET_MD_ASM_CLOBBERS.
34952
34953 We do this in the new i386 backend to maintain source compatibility
34954 with the old cc0-based compiler. */
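/* In effect, a hypothetical asm ("..." : "=r" (x) : : ) is treated as if it
   had also listed "flags" and "fpsr" in its clobber list.  */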
34955
34956 static tree
34957 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
34958 tree inputs ATTRIBUTE_UNUSED,
34959 tree clobbers)
34960 {
34961 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
34962 clobbers);
34963 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
34964 clobbers);
34965 return clobbers;
34966 }
34967
34968 /* Implements target vector targetm.asm.encode_section_info. */
34969
34970 static void ATTRIBUTE_UNUSED
34971 ix86_encode_section_info (tree decl, rtx rtl, int first)
34972 {
34973 default_encode_section_info (decl, rtl, first);
34974
34975 if (TREE_CODE (decl) == VAR_DECL
34976 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
34977 && ix86_in_large_data_p (decl))
34978 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
34979 }
34980
34981 /* Worker function for REVERSE_CONDITION. */
34982
34983 enum rtx_code
34984 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
34985 {
34986 return (mode != CCFPmode && mode != CCFPUmode
34987 ? reverse_condition (code)
34988 : reverse_condition_maybe_unordered (code));
34989 }
34990
34991 /* Output code to perform an x87 FP register move, from OPERANDS[1]
34992 to OPERANDS[0]. */
34993
34994 const char *
34995 output_387_reg_move (rtx insn, rtx *operands)
34996 {
34997 if (REG_P (operands[0]))
34998 {
34999 if (REG_P (operands[1])
35000 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
35001 {
35002 if (REGNO (operands[0]) == FIRST_STACK_REG)
35003 return output_387_ffreep (operands, 0);
35004 return "fstp\t%y0";
35005 }
35006 if (STACK_TOP_P (operands[0]))
35007 return "fld%Z1\t%y1";
35008 return "fst\t%y0";
35009 }
35010 else if (MEM_P (operands[0]))
35011 {
35012 gcc_assert (REG_P (operands[1]));
35013 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
35014 return "fstp%Z0\t%y0";
35015 else
35016 {
35017 /* There is no non-popping store to memory for XFmode.
35018 So if we need one, follow the store with a load. */
35019 if (GET_MODE (operands[0]) == XFmode)
35020 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
35021 else
35022 return "fst%Z0\t%y0";
35023 }
35024 }
35025 else
35026 gcc_unreachable ();
35027 }
35028
35029 /* Output code to perform a conditional jump to LABEL, if C2 flag in
35030 FP status register is set. */
35031
35032 void
35033 ix86_emit_fp_unordered_jump (rtx label)
35034 {
35035 rtx reg = gen_reg_rtx (HImode);
35036 rtx temp;
35037
35038 emit_insn (gen_x86_fnstsw_1 (reg));
35039
35040 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
35041 {
35042 emit_insn (gen_x86_sahf_1 (reg));
35043
35044 temp = gen_rtx_REG (CCmode, FLAGS_REG);
35045 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
35046 }
35047 else
35048 {
35049 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
35050
35051 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
35052 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
35053 }
35054
35055 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
35056 gen_rtx_LABEL_REF (VOIDmode, label),
35057 pc_rtx);
35058 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
35059
35060 emit_jump_insn (temp);
35061 predict_jump (REG_BR_PROB_BASE * 10 / 100);
35062 }
35063
35064 /* Output code to perform a log1p XFmode calculation. */
35065
35066 void ix86_emit_i387_log1p (rtx op0, rtx op1)
35067 {
35068 rtx label1 = gen_label_rtx ();
35069 rtx label2 = gen_label_rtx ();
35070
35071 rtx tmp = gen_reg_rtx (XFmode);
35072 rtx tmp2 = gen_reg_rtx (XFmode);
35073 rtx test;
35074
35075 emit_insn (gen_absxf2 (tmp, op1));
35076 test = gen_rtx_GE (VOIDmode, tmp,
35077 CONST_DOUBLE_FROM_REAL_VALUE (
35078 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
35079 XFmode));
35080 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
35081
35082 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
35083 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
35084 emit_jump (label2);
35085
35086 emit_label (label1);
35087 emit_move_insn (tmp, CONST1_RTX (XFmode));
35088 emit_insn (gen_addxf3 (tmp, op1, tmp));
35089 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
35090 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
35091
35092 emit_label (label2);
35093 }
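/* Editor's note: illustrative sketch only, not part of GCC; the helper name
   is made up and <math.h> is assumed.  It restates the expansion above in
   plain C: fyl2xp1 computes y * log2 (x + 1) but is only specified for
   |x| < 1 - sqrt(2)/2 (the constant compared against above), so larger
   arguments go through fld1/fadd followed by fyl2x.  With y = ln 2 (fldln2)
   both branches compute log1p (x).  */
#include <math.h>

static long double
i387_log1p_sketch (long double x)
{
  const long double ln2 = 0.6931471805599453094172321214581766L;

  if (fabsl (x) < 0.29289321881345247561810596348408353L)
    return ln2 * log2l (x + 1.0L);        /* fldln2; fyl2xp1 */

  long double t = 1.0L + x;               /* fld1; faddp */
  return ln2 * log2l (t);                 /* fldln2; fyl2x */
}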
35094
35095 /* Output x87 code to compute round (OP1), rounding to the nearest integer with halfway cases away from zero, and store the result into OP0 (a floating-point or integer mode). */
35096 void ix86_emit_i387_round (rtx op0, rtx op1)
35097 {
35098 enum machine_mode inmode = GET_MODE (op1);
35099 enum machine_mode outmode = GET_MODE (op0);
35100 rtx e1, e2, res, tmp, tmp1, half;
35101 rtx scratch = gen_reg_rtx (HImode);
35102 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
35103 rtx jump_label = gen_label_rtx ();
35104 rtx insn;
35105 rtx (*gen_abs) (rtx, rtx);
35106 rtx (*gen_neg) (rtx, rtx);
35107
35108 switch (inmode)
35109 {
35110 case SFmode:
35111 gen_abs = gen_abssf2;
35112 break;
35113 case DFmode:
35114 gen_abs = gen_absdf2;
35115 break;
35116 case XFmode:
35117 gen_abs = gen_absxf2;
35118 break;
35119 default:
35120 gcc_unreachable ();
35121 }
35122
35123 switch (outmode)
35124 {
35125 case SFmode:
35126 gen_neg = gen_negsf2;
35127 break;
35128 case DFmode:
35129 gen_neg = gen_negdf2;
35130 break;
35131 case XFmode:
35132 gen_neg = gen_negxf2;
35133 break;
35134 case HImode:
35135 gen_neg = gen_neghi2;
35136 break;
35137 case SImode:
35138 gen_neg = gen_negsi2;
35139 break;
35140 case DImode:
35141 gen_neg = gen_negdi2;
35142 break;
35143 default:
35144 gcc_unreachable ();
35145 }
35146
35147 e1 = gen_reg_rtx (inmode);
35148 e2 = gen_reg_rtx (inmode);
35149 res = gen_reg_rtx (outmode);
35150
35151 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
35152
35153 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
35154
35155 /* scratch = fxam(op1) */
35156 emit_insn (gen_rtx_SET (VOIDmode, scratch,
35157 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
35158 UNSPEC_FXAM)));
35159 /* e1 = fabs(op1) */
35160 emit_insn (gen_abs (e1, op1));
35161
35162 /* e2 = e1 + 0.5 */
35163 half = force_reg (inmode, half);
35164 emit_insn (gen_rtx_SET (VOIDmode, e2,
35165 gen_rtx_PLUS (inmode, e1, half)));
35166
35167 /* res = floor(e2) */
35168 if (inmode != XFmode)
35169 {
35170 tmp1 = gen_reg_rtx (XFmode);
35171
35172 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
35173 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
35174 }
35175 else
35176 tmp1 = e2;
35177
35178 switch (outmode)
35179 {
35180 case SFmode:
35181 case DFmode:
35182 {
35183 rtx tmp0 = gen_reg_rtx (XFmode);
35184
35185 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
35186
35187 emit_insn (gen_rtx_SET (VOIDmode, res,
35188 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
35189 UNSPEC_TRUNC_NOOP)));
35190 }
35191 break;
35192 case XFmode:
35193 emit_insn (gen_frndintxf2_floor (res, tmp1));
35194 break;
35195 case HImode:
35196 emit_insn (gen_lfloorxfhi2 (res, tmp1));
35197 break;
35198 case SImode:
35199 emit_insn (gen_lfloorxfsi2 (res, tmp1));
35200 break;
35201 case DImode:
35202 emit_insn (gen_lfloorxfdi2 (res, tmp1));
35203 break;
35204 default:
35205 gcc_unreachable ();
35206 }
35207
35208 /* flags = signbit(a) */
35209 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
35210
35211 /* if (flags) then res = -res */
35212 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
35213 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
35214 gen_rtx_LABEL_REF (VOIDmode, jump_label),
35215 pc_rtx);
35216 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
35217 predict_jump (REG_BR_PROB_BASE * 50 / 100);
35218 JUMP_LABEL (insn) = jump_label;
35219
35220 emit_insn (gen_neg (res, res));
35221
35222 emit_label (jump_label);
35223 LABEL_NUSES (jump_label) = 1;
35224
35225 emit_move_insn (op0, res);
35226 }
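/* Editor's note: illustrative sketch only, not part of GCC; the helper name
   is made up and <math.h> is assumed.  Scalar semantics of the sequence
   emitted above: round to nearest with halfway cases away from zero, built
   from fabs/floor plus the sign bit that the expander reads out of the fxam
   status word (the 0x02 test).  */
#include <math.h>

static double
i387_round_sketch (double a)
{
  double r = floor (fabs (a) + 0.5);   /* e2 = fabs (a) + 0.5; floor */
  return signbit (a) ? -r : r;         /* negate if the sign bit was set */
}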
35227
35228 /* Output code to perform a Newton-Raphson approximation of a single precision
35229 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
35230
35231 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
35232 {
35233 rtx x0, x1, e0, e1;
35234
35235 x0 = gen_reg_rtx (mode);
35236 e0 = gen_reg_rtx (mode);
35237 e1 = gen_reg_rtx (mode);
35238 x1 = gen_reg_rtx (mode);
35239
35240 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
35241
35242 b = force_reg (mode, b);
35243
35244 /* x0 = rcp(b) estimate */
35245 emit_insn (gen_rtx_SET (VOIDmode, x0,
35246 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
35247 UNSPEC_RCP)));
35248 /* e0 = x0 * b */
35249 emit_insn (gen_rtx_SET (VOIDmode, e0,
35250 gen_rtx_MULT (mode, x0, b)));
35251
35252 /* e0 = x0 * e0 */
35253 emit_insn (gen_rtx_SET (VOIDmode, e0,
35254 gen_rtx_MULT (mode, x0, e0)));
35255
35256 /* e1 = x0 + x0 */
35257 emit_insn (gen_rtx_SET (VOIDmode, e1,
35258 gen_rtx_PLUS (mode, x0, x0)));
35259
35260 /* x1 = e1 - e0 */
35261 emit_insn (gen_rtx_SET (VOIDmode, x1,
35262 gen_rtx_MINUS (mode, e1, e0)));
35263
35264 /* res = a * x1 */
35265 emit_insn (gen_rtx_SET (VOIDmode, res,
35266 gen_rtx_MULT (mode, a, x1)));
35267 }
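/* Editor's note: illustrative sketch only, not part of GCC; the helper name
   is made up.  The RTL above performs one Newton-Raphson refinement of the
   hardware reciprocal estimate:  x1 = 2*x0 - b*x0*x0 = x0*(2 - b*x0).  */
static float
swdiv_sketch (float a, float b, float x0 /* hardware estimate of 1/b */)
{
  float e0 = x0 * b;        /* e0 = x0 * b      */
  e0 = x0 * e0;             /* e0 = b * x0 * x0 */
  float e1 = x0 + x0;       /* e1 = 2 * x0      */
  float x1 = e1 - e0;       /* refined 1/b      */
  return a * x1;            /* a / b            */
}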
35268
35269 /* Output code to perform a Newton-Raphson approximation of a
35270 single precision floating point [reciprocal] square root. */
35271
35272 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
35273 bool recip)
35274 {
35275 rtx x0, e0, e1, e2, e3, mthree, mhalf;
35276 REAL_VALUE_TYPE r;
35277
35278 x0 = gen_reg_rtx (mode);
35279 e0 = gen_reg_rtx (mode);
35280 e1 = gen_reg_rtx (mode);
35281 e2 = gen_reg_rtx (mode);
35282 e3 = gen_reg_rtx (mode);
35283
35284 real_from_integer (&r, VOIDmode, -3, -1, 0);
35285 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
35286
35287 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
35288 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
35289
35290 if (VECTOR_MODE_P (mode))
35291 {
35292 mthree = ix86_build_const_vector (mode, true, mthree);
35293 mhalf = ix86_build_const_vector (mode, true, mhalf);
35294 }
35295
35296 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
35297 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
35298
35299 a = force_reg (mode, a);
35300
35301 /* x0 = rsqrt(a) estimate */
35302 emit_insn (gen_rtx_SET (VOIDmode, x0,
35303 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
35304 UNSPEC_RSQRT)));
35305
35306 /* If a == 0.0, mask out the infinite rsqrt estimate (rsqrtss (0.0) = inf) so that sqrt (0.0) yields 0.0 instead of NaN. */
35307 if (!recip)
35308 {
35309 rtx zero, mask;
35310
35311 zero = gen_reg_rtx (mode);
35312 mask = gen_reg_rtx (mode);
35313
35314 zero = force_reg (mode, CONST0_RTX(mode));
35315 emit_insn (gen_rtx_SET (VOIDmode, mask,
35316 gen_rtx_NE (mode, zero, a)));
35317
35318 emit_insn (gen_rtx_SET (VOIDmode, x0,
35319 gen_rtx_AND (mode, x0, mask)));
35320 }
35321
35322 /* e0 = x0 * a */
35323 emit_insn (gen_rtx_SET (VOIDmode, e0,
35324 gen_rtx_MULT (mode, x0, a)));
35325 /* e1 = e0 * x0 */
35326 emit_insn (gen_rtx_SET (VOIDmode, e1,
35327 gen_rtx_MULT (mode, e0, x0)));
35328
35329 /* e2 = e1 - 3. */
35330 mthree = force_reg (mode, mthree);
35331 emit_insn (gen_rtx_SET (VOIDmode, e2,
35332 gen_rtx_PLUS (mode, e1, mthree)));
35333
35334 mhalf = force_reg (mode, mhalf);
35335 if (recip)
35336 /* e3 = -.5 * x0 */
35337 emit_insn (gen_rtx_SET (VOIDmode, e3,
35338 gen_rtx_MULT (mode, x0, mhalf)));
35339 else
35340 /* e3 = -.5 * e0 */
35341 emit_insn (gen_rtx_SET (VOIDmode, e3,
35342 gen_rtx_MULT (mode, e0, mhalf)));
35343 /* ret = e2 * e3 */
35344 emit_insn (gen_rtx_SET (VOIDmode, res,
35345 gen_rtx_MULT (mode, e2, e3)));
35346 }
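/* Editor's note: illustrative sketch only, not part of GCC; the helper name
   is made up.  The refinement above is one Newton-Raphson step for
   1/sqrt(a):  x1 = 0.5 * x0 * (3 - a*x0*x0) = -0.5 * x0 * (a*x0*x0 - 3);
   for sqrt(a) the final product uses e0 = a*x0 instead of x0, which
   multiplies the whole expression by a.  The zero filtering above is
   omitted here.  */
static float
swsqrt_sketch (float a, float x0 /* hardware estimate of 1/sqrt(a) */,
               int recip)
{
  float e0 = x0 * a;                    /* e0 = a * x0            */
  float e1 = e0 * x0;                   /* e1 = a * x0 * x0       */
  float e2 = e1 + -3.0f;                /* e2 = a * x0 * x0 - 3   */
  float e3 = (recip ? x0 : e0) * -0.5f; /* e3 = -0.5 * x0 (or e0) */
  return e2 * e3;
}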
35347
35348 #ifdef TARGET_SOLARIS
35349 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
35350
35351 static void
35352 i386_solaris_elf_named_section (const char *name, unsigned int flags,
35353 tree decl)
35354 {
35355 /* With Binutils 2.15, the "@unwind" marker must be specified on
35356 every occurrence of the ".eh_frame" section, not just the first
35357 one. */
35358 if (TARGET_64BIT
35359 && strcmp (name, ".eh_frame") == 0)
35360 {
35361 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
35362 flags & SECTION_WRITE ? "aw" : "a");
35363 return;
35364 }
35365
35366 #ifndef USE_GAS
35367 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
35368 {
35369 solaris_elf_asm_comdat_section (name, flags, decl);
35370 return;
35371 }
35372 #endif
35373
35374 default_elf_asm_named_section (name, flags, decl);
35375 }
35376 #endif /* TARGET_SOLARIS */
35377
35378 /* Return the mangling of TYPE if it is an extended fundamental type. */
35379
35380 static const char *
35381 ix86_mangle_type (const_tree type)
35382 {
35383 type = TYPE_MAIN_VARIANT (type);
35384
35385 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
35386 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
35387 return NULL;
35388
35389 switch (TYPE_MODE (type))
35390 {
35391 case TFmode:
35392 /* __float128 is "g". */
35393 return "g";
35394 case XFmode:
35395 /* "long double" or __float80 is "e". */
35396 return "e";
35397 default:
35398 return NULL;
35399 }
35400 }
35401
35402 /* For 32-bit code we can save PIC register setup by using
35403 the __stack_chk_fail_local hidden function instead of calling
35404 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
35405 register, so it is better to call __stack_chk_fail directly. */
35406
35407 static tree ATTRIBUTE_UNUSED
35408 ix86_stack_protect_fail (void)
35409 {
35410 return TARGET_64BIT
35411 ? default_external_stack_protect_fail ()
35412 : default_hidden_stack_protect_fail ();
35413 }
35414
35415 /* Select a format to encode pointers in exception handling data. CODE
35416 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
35417 true if the symbol may be affected by dynamic relocations.
35418
35419 ??? All x86 object file formats are capable of representing this.
35420 After all, the relocation needed is the same as for the call insn.
35421 Whether or not a particular assembler allows us to enter such, I
35422 guess we'll have to see. */
35423 int
35424 asm_preferred_eh_data_format (int code, int global)
35425 {
35426 if (flag_pic)
35427 {
35428 int type = DW_EH_PE_sdata8;
35429 if (!TARGET_64BIT
35430 || ix86_cmodel == CM_SMALL_PIC
35431 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
35432 type = DW_EH_PE_sdata4;
35433 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
35434 }
35435 if (ix86_cmodel == CM_SMALL
35436 || (ix86_cmodel == CM_MEDIUM && code))
35437 return DW_EH_PE_udata4;
35438 return DW_EH_PE_absptr;
35439 }
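/* Editor's note: illustrative examples only, read off the code above.  With
   -fpic on a 32-bit target a global data symbol gets
   DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4, while a local code
   label gets DW_EH_PE_pcrel | DW_EH_PE_sdata4.  Without PIC, the small code
   model (and the medium model for code labels) uses DW_EH_PE_udata4;
   everything else falls back to DW_EH_PE_absptr.  */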
35440 \f
35441 /* Expand a copysign operation: copy the sign bit of SIGN onto the
35442 nonnegative value ABS_VALUE, storing the result in RESULT. If MASK is
35443 non-null, it is a mask that masks out the sign-bit. */
35444 static void
35445 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
35446 {
35447 enum machine_mode mode = GET_MODE (sign);
35448 rtx sgn = gen_reg_rtx (mode);
35449 if (mask == NULL_RTX)
35450 {
35451 enum machine_mode vmode;
35452
35453 if (mode == SFmode)
35454 vmode = V4SFmode;
35455 else if (mode == DFmode)
35456 vmode = V2DFmode;
35457 else
35458 vmode = mode;
35459
35460 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
35461 if (!VECTOR_MODE_P (mode))
35462 {
35463 /* We need to generate a scalar mode mask in this case. */
35464 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
35465 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
35466 mask = gen_reg_rtx (mode);
35467 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
35468 }
35469 }
35470 else
35471 mask = gen_rtx_NOT (mode, mask);
35472 emit_insn (gen_rtx_SET (VOIDmode, sgn,
35473 gen_rtx_AND (mode, mask, sign)));
35474 emit_insn (gen_rtx_SET (VOIDmode, result,
35475 gen_rtx_IOR (mode, abs_value, sgn)));
35476 }
35477
35478 /* Expand fabs (OP0) and return a new rtx that holds the result. The
35479 mask for masking out the sign-bit is stored in *SMASK, if that is
35480 non-null. */
35481 static rtx
35482 ix86_expand_sse_fabs (rtx op0, rtx *smask)
35483 {
35484 enum machine_mode vmode, mode = GET_MODE (op0);
35485 rtx xa, mask;
35486
35487 xa = gen_reg_rtx (mode);
35488 if (mode == SFmode)
35489 vmode = V4SFmode;
35490 else if (mode == DFmode)
35491 vmode = V2DFmode;
35492 else
35493 vmode = mode;
35494 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
35495 if (!VECTOR_MODE_P (mode))
35496 {
35497 /* We need to generate a scalar mode mask in this case. */
35498 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
35499 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
35500 mask = gen_reg_rtx (mode);
35501 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
35502 }
35503 emit_insn (gen_rtx_SET (VOIDmode, xa,
35504 gen_rtx_AND (mode, op0, mask)));
35505
35506 if (smask)
35507 *smask = mask;
35508
35509 return xa;
35510 }
35511
35512 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
35513 swapping the operands if SWAP_OPERANDS is true. The expanded
35514 code is a forward jump to a newly created label in case the
35515 comparison is true. The generated label rtx is returned. */
35516 static rtx
35517 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
35518 bool swap_operands)
35519 {
35520 rtx label, tmp;
35521
35522 if (swap_operands)
35523 {
35524 tmp = op0;
35525 op0 = op1;
35526 op1 = tmp;
35527 }
35528
35529 label = gen_label_rtx ();
35530 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
35531 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35532 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
35533 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
35534 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
35535 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
35536 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
35537 JUMP_LABEL (tmp) = label;
35538
35539 return label;
35540 }
35541
35542 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
35543 using comparison code CODE. Operands are swapped for the comparison if
35544 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
35545 static rtx
35546 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
35547 bool swap_operands)
35548 {
35549 rtx (*insn)(rtx, rtx, rtx, rtx);
35550 enum machine_mode mode = GET_MODE (op0);
35551 rtx mask = gen_reg_rtx (mode);
35552
35553 if (swap_operands)
35554 {
35555 rtx tmp = op0;
35556 op0 = op1;
35557 op1 = tmp;
35558 }
35559
35560 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
35561
35562 emit_insn (insn (mask, op0, op1,
35563 gen_rtx_fmt_ee (code, mode, op0, op1)));
35564 return mask;
35565 }
35566
35567 /* Generate and return a rtx of mode MODE for 2**n where n is the number
35568 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
35569 static rtx
35570 ix86_gen_TWO52 (enum machine_mode mode)
35571 {
35572 REAL_VALUE_TYPE TWO52r;
35573 rtx TWO52;
35574
35575 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
35576 TWO52 = const_double_from_real_value (TWO52r, mode);
35577 TWO52 = force_reg (mode, TWO52);
35578
35579 return TWO52;
35580 }
35581
35582 /* Expand SSE sequence for computing lround from OP1 storing
35583 into OP0. */
35584 void
35585 ix86_expand_lround (rtx op0, rtx op1)
35586 {
35587 /* C code for the stuff we're doing below:
35588 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
35589 return (long)tmp;
35590 */
35591 enum machine_mode mode = GET_MODE (op1);
35592 const struct real_format *fmt;
35593 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35594 rtx adj;
35595
35596 /* load nextafter (0.5, 0.0) */
35597 fmt = REAL_MODE_FORMAT (mode);
35598 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35599 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35600
35601 /* adj = copysign (0.5, op1) */
35602 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
35603 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
35604
35605 /* adj = op1 + adj */
35606 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
35607
35608 /* op0 = (imode)adj */
35609 expand_fix (op0, adj, 0);
35610 }
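/* Editor's note: illustrative sketch only, not part of GCC; the helper name
   is made up and <math.h> is assumed.  It shows why nextafter (0.5, 0.0) is
   loaded rather than 0.5: for x = 0x1.fffffffffffffp-2 (the largest double
   below 0.5), x + 0.5 rounds up to 1.0 and truncation would return 1,
   whereas adding pred(0.5) keeps the sum below 1.0 and yields the correct
   result 0.  */
#include <math.h>

static long
lround_sketch (double x)
{
  double adj = copysign (nextafter (0.5, 0.0), x);
  return (long) (x + adj);      /* expand_fix: truncating conversion */
}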
35611
35612 /* Expand SSE2 sequence for computing lfloor or lceil (depending on
35613 DO_FLOOR) from OP1, storing the result into OP0. */
35614 void
35615 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
35616 {
35617 /* C code for the stuff we're doing below (for do_floor):
35618 xi = (long)op1;
35619 xi -= (double)xi > op1 ? 1 : 0;
35620 return xi;
35621 */
35622 enum machine_mode fmode = GET_MODE (op1);
35623 enum machine_mode imode = GET_MODE (op0);
35624 rtx ireg, freg, label, tmp;
35625
35626 /* reg = (long)op1 */
35627 ireg = gen_reg_rtx (imode);
35628 expand_fix (ireg, op1, 0);
35629
35630 /* freg = (double)reg */
35631 freg = gen_reg_rtx (fmode);
35632 expand_float (freg, ireg, 0);
35633
35634 /* ireg = (freg > op1) ? ireg - 1 : ireg */
35635 label = ix86_expand_sse_compare_and_jump (UNLE,
35636 freg, op1, !do_floor);
35637 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
35638 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
35639 emit_move_insn (ireg, tmp);
35640
35641 emit_label (label);
35642 LABEL_NUSES (label) = 1;
35643
35644 emit_move_insn (op0, ireg);
35645 }
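/* Editor's note: illustrative sketch only, not part of GCC; the helper name
   is made up.  The do_floor case above in plain C: the truncating
   conversion is one too large for negative non-integral inputs, so 1 is
   subtracted whenever (double) xi compares greater than the input.  */
static long
lfloor_sketch (double x)
{
  long xi = (long) x;       /* truncate toward zero */
  if ((double) xi > x)      /* true only for negative non-integers */
    xi -= 1;
  return xi;
}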
35646
35647 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
35648 result in OPERAND0. */
35649 void
35650 ix86_expand_rint (rtx operand0, rtx operand1)
35651 {
35652 /* C code for the stuff we're doing below:
35653 xa = fabs (operand1);
35654 if (!isless (xa, 2**52))
35655 return operand1;
35656 xa = xa + 2**52 - 2**52;
35657 return copysign (xa, operand1);
35658 */
35659 enum machine_mode mode = GET_MODE (operand0);
35660 rtx res, xa, label, TWO52, mask;
35661
35662 res = gen_reg_rtx (mode);
35663 emit_move_insn (res, operand1);
35664
35665 /* xa = abs (operand1) */
35666 xa = ix86_expand_sse_fabs (res, &mask);
35667
35668 /* if (!isless (xa, TWO52)) goto label; */
35669 TWO52 = ix86_gen_TWO52 (mode);
35670 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35671
35672 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35673 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35674
35675 ix86_sse_copysign_to_positive (res, xa, res, mask);
35676
35677 emit_label (label);
35678 LABEL_NUSES (label) = 1;
35679
35680 emit_move_insn (operand0, res);
35681 }
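/* Editor's note: illustrative sketch only, not part of GCC; the helper name
   is made up, <math.h> is assumed and strict IEEE evaluation is required
   (no -ffast-math).  It shows the 2**52 trick used above: every double in
   [2**52, 2**53) is an integer, so for |x| < 2**52 adding and then
   subtracting 2**52 makes the addition itself round x to an integer in the
   current (to-nearest) rounding mode; larger magnitudes are already
   integral and take the early exit.  */
#include <math.h>

static double
rint_sketch (double x)
{
  const double two52 = 4503599627370496.0;    /* 2**52 */
  double xa = fabs (x);

  if (!(xa < two52))                           /* !isless (xa, 2**52) */
    return x;

  xa = (xa + two52) - two52;                   /* round to nearest integer */
  return copysign (xa, x);                     /* restore sign, keep -0.0 */
}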
35682
35683 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35684 into OPERAND0. */
35685 void
35686 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
35687 {
35688 /* C code for the stuff we expand below.
35689 double xa = fabs (x), x2;
35690 if (!isless (xa, TWO52))
35691 return x;
35692 xa = xa + TWO52 - TWO52;
35693 x2 = copysign (xa, x);
35694 Compensate. Floor:
35695 if (x2 > x)
35696 x2 -= 1;
35697 Compensate. Ceil:
35698 if (x2 < x)
35699 x2 -= -1;
35700 return x2;
35701 */
35702 enum machine_mode mode = GET_MODE (operand0);
35703 rtx xa, TWO52, tmp, label, one, res, mask;
35704
35705 TWO52 = ix86_gen_TWO52 (mode);
35706
35707 /* Temporary for holding the result, initialized to the input
35708 operand to ease control flow. */
35709 res = gen_reg_rtx (mode);
35710 emit_move_insn (res, operand1);
35711
35712 /* xa = abs (operand1) */
35713 xa = ix86_expand_sse_fabs (res, &mask);
35714
35715 /* if (!isless (xa, TWO52)) goto label; */
35716 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35717
35718 /* xa = xa + TWO52 - TWO52; */
35719 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35720 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35721
35722 /* xa = copysign (xa, operand1) */
35723 ix86_sse_copysign_to_positive (xa, xa, res, mask);
35724
35725 /* generate 1.0 or -1.0 */
35726 one = force_reg (mode,
35727 const_double_from_real_value (do_floor
35728 ? dconst1 : dconstm1, mode));
35729
35730 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35731 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35732 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35733 gen_rtx_AND (mode, one, tmp)));
35734 /* We always need to subtract here to preserve signed zero. */
35735 tmp = expand_simple_binop (mode, MINUS,
35736 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35737 emit_move_insn (res, tmp);
35738
35739 emit_label (label);
35740 LABEL_NUSES (label) = 1;
35741
35742 emit_move_insn (operand0, res);
35743 }
35744
35745 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35746 into OPERAND0. */
35747 void
35748 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
35749 {
35750 /* C code for the stuff we expand below.
35751 double xa = fabs (x), x2;
35752 if (!isless (xa, TWO52))
35753 return x;
35754 x2 = (double)(long)x;
35755 Compensate. Floor:
35756 if (x2 > x)
35757 x2 -= 1;
35758 Compensate. Ceil:
35759 if (x2 < x)
35760 x2 += 1;
35761 if (HONOR_SIGNED_ZEROS (mode))
35762 return copysign (x2, x);
35763 return x2;
35764 */
35765 enum machine_mode mode = GET_MODE (operand0);
35766 rtx xa, xi, TWO52, tmp, label, one, res, mask;
35767
35768 TWO52 = ix86_gen_TWO52 (mode);
35769
35770 /* Temporary for holding the result, initialized to the input
35771 operand to ease control flow. */
35772 res = gen_reg_rtx (mode);
35773 emit_move_insn (res, operand1);
35774
35775 /* xa = abs (operand1) */
35776 xa = ix86_expand_sse_fabs (res, &mask);
35777
35778 /* if (!isless (xa, TWO52)) goto label; */
35779 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35780
35781 /* xa = (double)(long)x */
35782 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35783 expand_fix (xi, res, 0);
35784 expand_float (xa, xi, 0);
35785
35786 /* generate 1.0 */
35787 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35788
35789 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35790 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35791 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35792 gen_rtx_AND (mode, one, tmp)));
35793 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
35794 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35795 emit_move_insn (res, tmp);
35796
35797 if (HONOR_SIGNED_ZEROS (mode))
35798 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35799
35800 emit_label (label);
35801 LABEL_NUSES (label) = 1;
35802
35803 emit_move_insn (operand0, res);
35804 }
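/* Editor's note: illustrative sketch only, not part of GCC; the helper name
   is made up and <math.h> is assumed.  Branch-free compensation as emitted
   above for the floor case: the SSE compare yields an all-ones (or all-zero)
   mask, which ANDed with 1.0 gives exactly the value to subtract.  The
   copysign step for signed zeros is omitted here.  */
#include <math.h>

static double
floor_sketch (double x)
{
  const double two52 = 4503599627370496.0;   /* 2**52 */

  if (!(fabs (x) < two52))
    return x;                                 /* already integral */

  double x2 = (double) (long long) x;         /* x2 = (double)(long)x */
  double adj = (x2 > x) ? 1.0 : 0.0;          /* compare mask & 1.0 */
  return x2 - adj;
}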
35805
35806 /* Expand SSE sequence for computing round from OPERAND1 storing
35807 into OPERAND0. This sequence works without relying on DImode truncation
35808 via cvttsd2siq, which is only available on 64-bit targets. */
35809 void
35810 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
35811 {
35812 /* C code for the stuff we expand below.
35813 double xa = fabs (x), xa2, x2;
35814 if (!isless (xa, TWO52))
35815 return x;
35816 Using the absolute value and copying back sign makes
35817 -0.0 -> -0.0 correct.
35818 xa2 = xa + TWO52 - TWO52;
35819 Compensate.
35820 dxa = xa2 - xa;
35821 if (dxa <= -0.5)
35822 xa2 += 1;
35823 else if (dxa > 0.5)
35824 xa2 -= 1;
35825 x2 = copysign (xa2, x);
35826 return x2;
35827 */
35828 enum machine_mode mode = GET_MODE (operand0);
35829 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
35830
35831 TWO52 = ix86_gen_TWO52 (mode);
35832
35833 /* Temporary for holding the result, initialized to the input
35834 operand to ease control flow. */
35835 res = gen_reg_rtx (mode);
35836 emit_move_insn (res, operand1);
35837
35838 /* xa = abs (operand1) */
35839 xa = ix86_expand_sse_fabs (res, &mask);
35840
35841 /* if (!isless (xa, TWO52)) goto label; */
35842 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35843
35844 /* xa2 = xa + TWO52 - TWO52; */
35845 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35846 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
35847
35848 /* dxa = xa2 - xa; */
35849 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
35850
35851 /* generate 0.5, 1.0 and -0.5 */
35852 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
35853 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
35854 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
35855 0, OPTAB_DIRECT);
35856
35857 /* Compensate. */
35858 tmp = gen_reg_rtx (mode);
35859 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
35860 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
35861 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35862 gen_rtx_AND (mode, one, tmp)));
35863 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35864 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
35865 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
35866 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35867 gen_rtx_AND (mode, one, tmp)));
35868 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35869
35870 /* res = copysign (xa2, operand1) */
35871 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
35872
35873 emit_label (label);
35874 LABEL_NUSES (label) = 1;
35875
35876 emit_move_insn (operand0, res);
35877 }
35878
35879 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35880 into OPERAND0. */
35881 void
35882 ix86_expand_trunc (rtx operand0, rtx operand1)
35883 {
35884 /* C code for SSE variant we expand below.
35885 double xa = fabs (x), x2;
35886 if (!isless (xa, TWO52))
35887 return x;
35888 x2 = (double)(long)x;
35889 if (HONOR_SIGNED_ZEROS (mode))
35890 return copysign (x2, x);
35891 return x2;
35892 */
35893 enum machine_mode mode = GET_MODE (operand0);
35894 rtx xa, xi, TWO52, label, res, mask;
35895
35896 TWO52 = ix86_gen_TWO52 (mode);
35897
35898 /* Temporary for holding the result, initialized to the input
35899 operand to ease control flow. */
35900 res = gen_reg_rtx (mode);
35901 emit_move_insn (res, operand1);
35902
35903 /* xa = abs (operand1) */
35904 xa = ix86_expand_sse_fabs (res, &mask);
35905
35906 /* if (!isless (xa, TWO52)) goto label; */
35907 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35908
35909 /* x = (double)(long)x */
35910 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35911 expand_fix (xi, res, 0);
35912 expand_float (res, xi, 0);
35913
35914 if (HONOR_SIGNED_ZEROS (mode))
35915 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35916
35917 emit_label (label);
35918 LABEL_NUSES (label) = 1;
35919
35920 emit_move_insn (operand0, res);
35921 }
35922
35923 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35924 into OPERAND0, without relying on DImode truncation (usable on 32-bit targets). */
35925 void
35926 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
35927 {
35928 enum machine_mode mode = GET_MODE (operand0);
35929 rtx xa, mask, TWO52, label, one, res, smask, tmp;
35930
35931 /* C code for SSE variant we expand below.
35932 double xa = fabs (x), x2;
35933 if (!isless (xa, TWO52))
35934 return x;
35935 xa2 = xa + TWO52 - TWO52;
35936 Compensate:
35937 if (xa2 > xa)
35938 xa2 -= 1.0;
35939 x2 = copysign (xa2, x);
35940 return x2;
35941 */
35942
35943 TWO52 = ix86_gen_TWO52 (mode);
35944
35945 /* Temporary for holding the result, initialized to the input
35946 operand to ease control flow. */
35947 res = gen_reg_rtx (mode);
35948 emit_move_insn (res, operand1);
35949
35950 /* xa = abs (operand1) */
35951 xa = ix86_expand_sse_fabs (res, &smask);
35952
35953 /* if (!isless (xa, TWO52)) goto label; */
35954 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35955
35956 /* res = xa + TWO52 - TWO52; */
35957 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35958 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
35959 emit_move_insn (res, tmp);
35960
35961 /* generate 1.0 */
35962 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35963
35964 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
35965 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
35966 emit_insn (gen_rtx_SET (VOIDmode, mask,
35967 gen_rtx_AND (mode, mask, one)));
35968 tmp = expand_simple_binop (mode, MINUS,
35969 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
35970 emit_move_insn (res, tmp);
35971
35972 /* res = copysign (res, operand1) */
35973 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
35974
35975 emit_label (label);
35976 LABEL_NUSES (label) = 1;
35977
35978 emit_move_insn (operand0, res);
35979 }
35980
35981 /* Expand SSE sequence for computing round from OPERAND1 storing
35982 into OPERAND0. */
35983 void
35984 ix86_expand_round (rtx operand0, rtx operand1)
35985 {
35986 /* C code for the stuff we're doing below:
35987 double xa = fabs (x);
35988 if (!isless (xa, TWO52))
35989 return x;
35990 xa = (double)(long)(xa + nextafter (0.5, 0.0));
35991 return copysign (xa, x);
35992 */
35993 enum machine_mode mode = GET_MODE (operand0);
35994 rtx res, TWO52, xa, label, xi, half, mask;
35995 const struct real_format *fmt;
35996 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35997
35998 /* Temporary for holding the result, initialized to the input
35999 operand to ease control flow. */
36000 res = gen_reg_rtx (mode);
36001 emit_move_insn (res, operand1);
36002
36003 TWO52 = ix86_gen_TWO52 (mode);
36004 xa = ix86_expand_sse_fabs (res, &mask);
36005 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36006
36007 /* load nextafter (0.5, 0.0) */
36008 fmt = REAL_MODE_FORMAT (mode);
36009 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
36010 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
36011
36012 /* xa = xa + 0.5 */
36013 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
36014 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
36015
36016 /* xa = (double)(int64_t)xa */
36017 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
36018 expand_fix (xi, xa, 0);
36019 expand_float (xa, xi, 0);
36020
36021 /* res = copysign (xa, operand1) */
36022 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
36023
36024 emit_label (label);
36025 LABEL_NUSES (label) = 1;
36026
36027 emit_move_insn (operand0, res);
36028 }
36029
36030 /* Expand SSE sequence for computing round
36031 from OP1 storing into OP0 using sse4 round insn. */
36032 void
36033 ix86_expand_round_sse4 (rtx op0, rtx op1)
36034 {
36035 enum machine_mode mode = GET_MODE (op0);
36036 rtx e1, e2, res, half;
36037 const struct real_format *fmt;
36038 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
36039 rtx (*gen_copysign) (rtx, rtx, rtx);
36040 rtx (*gen_round) (rtx, rtx, rtx);
36041
36042 switch (mode)
36043 {
36044 case SFmode:
36045 gen_copysign = gen_copysignsf3;
36046 gen_round = gen_sse4_1_roundsf2;
36047 break;
36048 case DFmode:
36049 gen_copysign = gen_copysigndf3;
36050 gen_round = gen_sse4_1_rounddf2;
36051 break;
36052 default:
36053 gcc_unreachable ();
36054 }
36055
36056 /* round (a) = trunc (a + copysign (0.5, a)) */
36057
36058 /* load nextafter (0.5, 0.0) */
36059 fmt = REAL_MODE_FORMAT (mode);
36060 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
36061 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
36062 half = const_double_from_real_value (pred_half, mode);
36063
36064 /* e1 = copysign (0.5, op1) */
36065 e1 = gen_reg_rtx (mode);
36066 emit_insn (gen_copysign (e1, half, op1));
36067
36068 /* e2 = op1 + e1 */
36069 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
36070
36071 /* res = trunc (e2) */
36072 res = gen_reg_rtx (mode);
36073 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
36074
36075 emit_move_insn (op0, res);
36076 }
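/* Editor's note: illustrative sketch only, not part of GCC; the helper name
   is made up and <math.h> is assumed.  The SSE4.1 expansion above computes
   round (a) = trunc (a + copysign (nextafter (0.5, 0.0), a)), with the
   final truncation done by the roundss/roundsd instruction using
   ROUND_TRUNC.  */
#include <math.h>

static double
round_sse4_sketch (double a)
{
  double half = nextafter (0.5, 0.0);   /* pred (0.5) */
  return trunc (a + copysign (half, a));
}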
36077 \f
36078
36079 /* Table of valid machine attributes. */
36080 static const struct attribute_spec ix86_attribute_table[] =
36081 {
36082 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
36083 affects_type_identity } */
36084 /* Stdcall attribute says callee is responsible for popping arguments
36085 if they are not variable. */
36086 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36087 true },
36088 /* Fastcall attribute says callee is responsible for popping arguments
36089 if they are not variable. */
36090 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36091 true },
36092 /* Thiscall attribute says callee is responsible for popping arguments
36093 if they are not variable. */
36094 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36095 true },
36096 /* Cdecl attribute says the callee is a normal C declaration. */
36097 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36098 true },
36099 /* Regparm attribute specifies how many integer arguments are to be
36100 passed in registers. */
36101 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
36102 true },
36103 /* Sseregparm attribute says we are using x86_64 calling conventions
36104 for FP arguments. */
36105 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36106 true },
36107 /* The transactional memory builtins are implicitly regparm or fastcall
36108 depending on the ABI. Override the generic do-nothing attribute that
36109 these builtins were declared with. */
36110 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
36111 true },
36112 /* force_align_arg_pointer says this function realigns the stack at entry. */
36113 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
36114 false, true, true, ix86_handle_cconv_attribute, false },
36115 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
36116 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
36117 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
36118 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
36119 false },
36120 #endif
36121 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
36122 false },
36123 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
36124 false },
36125 #ifdef SUBTARGET_ATTRIBUTE_TABLE
36126 SUBTARGET_ATTRIBUTE_TABLE,
36127 #endif
36128 /* ms_abi and sysv_abi calling convention function attributes. */
36129 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
36130 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
36131 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
36132 false },
36133 { "callee_pop_aggregate_return", 1, 1, false, true, true,
36134 ix86_handle_callee_pop_aggregate_return, true },
36135 /* End element. */
36136 { NULL, 0, 0, false, false, false, NULL, false }
36137 };
36138
36139 /* Implement targetm.vectorize.builtin_vectorization_cost. */
36140 static int
36141 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
36142 tree vectype,
36143 int misalign ATTRIBUTE_UNUSED)
36144 {
36145 unsigned elements;
36146
36147 switch (type_of_cost)
36148 {
36149 case scalar_stmt:
36150 return ix86_cost->scalar_stmt_cost;
36151
36152 case scalar_load:
36153 return ix86_cost->scalar_load_cost;
36154
36155 case scalar_store:
36156 return ix86_cost->scalar_store_cost;
36157
36158 case vector_stmt:
36159 return ix86_cost->vec_stmt_cost;
36160
36161 case vector_load:
36162 return ix86_cost->vec_align_load_cost;
36163
36164 case vector_store:
36165 return ix86_cost->vec_store_cost;
36166
36167 case vec_to_scalar:
36168 return ix86_cost->vec_to_scalar_cost;
36169
36170 case scalar_to_vec:
36171 return ix86_cost->scalar_to_vec_cost;
36172
36173 case unaligned_load:
36174 case unaligned_store:
36175 return ix86_cost->vec_unalign_load_cost;
36176
36177 case cond_branch_taken:
36178 return ix86_cost->cond_taken_branch_cost;
36179
36180 case cond_branch_not_taken:
36181 return ix86_cost->cond_not_taken_branch_cost;
36182
36183 case vec_perm:
36184 case vec_promote_demote:
36185 return ix86_cost->vec_stmt_cost;
36186
36187 case vec_construct:
36188 elements = TYPE_VECTOR_SUBPARTS (vectype);
36189 return elements / 2 + 1;
36190
36191 default:
36192 gcc_unreachable ();
36193 }
36194 }
36195
36196 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
36197 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
36198 insn every time. */
36199
36200 static GTY(()) rtx vselect_insn;
36201
36202 /* Initialize vselect_insn. */
36203
36204 static void
36205 init_vselect_insn (void)
36206 {
36207 unsigned i;
36208 rtx x;
36209
36210 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
36211 for (i = 0; i < MAX_VECT_LEN; ++i)
36212 XVECEXP (x, 0, i) = const0_rtx;
36213 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
36214 const0_rtx), x);
36215 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
36216 start_sequence ();
36217 vselect_insn = emit_insn (x);
36218 end_sequence ();
36219 }
36220
36221 /* Construct (set target (vec_select op0 (parallel perm))) and
36222 return true if that's a valid instruction in the active ISA. */
36223
36224 static bool
36225 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
36226 unsigned nelt, bool testing_p)
36227 {
36228 unsigned int i;
36229 rtx x, save_vconcat;
36230 int icode;
36231
36232 if (vselect_insn == NULL_RTX)
36233 init_vselect_insn ();
36234
36235 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
36236 PUT_NUM_ELEM (XVEC (x, 0), nelt);
36237 for (i = 0; i < nelt; ++i)
36238 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
36239 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
36240 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
36241 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
36242 SET_DEST (PATTERN (vselect_insn)) = target;
36243 icode = recog_memoized (vselect_insn);
36244
36245 if (icode >= 0 && !testing_p)
36246 emit_insn (copy_rtx (PATTERN (vselect_insn)));
36247
36248 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
36249 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
36250 INSN_CODE (vselect_insn) = -1;
36251
36252 return icode >= 0;
36253 }
36254
36255 /* Similar, but generate a vec_concat from op0 and op1 as well. */
36256
36257 static bool
36258 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
36259 const unsigned char *perm, unsigned nelt,
36260 bool testing_p)
36261 {
36262 enum machine_mode v2mode;
36263 rtx x;
36264 bool ok;
36265
36266 if (vselect_insn == NULL_RTX)
36267 init_vselect_insn ();
36268
36269 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
36270 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
36271 PUT_MODE (x, v2mode);
36272 XEXP (x, 0) = op0;
36273 XEXP (x, 1) = op1;
36274 ok = expand_vselect (target, x, perm, nelt, testing_p);
36275 XEXP (x, 0) = const0_rtx;
36276 XEXP (x, 1) = const0_rtx;
36277 return ok;
36278 }
36279
36280 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36281 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
36282
36283 static bool
36284 expand_vec_perm_blend (struct expand_vec_perm_d *d)
36285 {
36286 enum machine_mode vmode = d->vmode;
36287 unsigned i, mask, nelt = d->nelt;
36288 rtx target, op0, op1, x;
36289 rtx rperm[32], vperm;
36290
36291 if (d->one_operand_p)
36292 return false;
36293 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
36294 ;
36295 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
36296 ;
36297 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
36298 ;
36299 else
36300 return false;
36301
36302 /* This is a blend, not a permute. Elements must stay in their
36303 respective lanes. */
36304 for (i = 0; i < nelt; ++i)
36305 {
36306 unsigned e = d->perm[i];
36307 if (!(e == i || e == i + nelt))
36308 return false;
36309 }
36310
36311 if (d->testing_p)
36312 return true;
36313
36314 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
36315 decision should be extracted elsewhere, so that we only try that
36316 sequence once all budget==3 options have been tried. */
36317 target = d->target;
36318 op0 = d->op0;
36319 op1 = d->op1;
36320 mask = 0;
36321
36322 switch (vmode)
36323 {
36324 case V4DFmode:
36325 case V8SFmode:
36326 case V2DFmode:
36327 case V4SFmode:
36328 case V8HImode:
36329 case V8SImode:
36330 for (i = 0; i < nelt; ++i)
36331 mask |= (d->perm[i] >= nelt) << i;
36332 break;
36333
36334 case V2DImode:
36335 for (i = 0; i < 2; ++i)
36336 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
36337 vmode = V8HImode;
36338 goto do_subreg;
36339
36340 case V4SImode:
36341 for (i = 0; i < 4; ++i)
36342 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
36343 vmode = V8HImode;
36344 goto do_subreg;
36345
36346 case V16QImode:
36347 /* See if bytes move in pairs so we can use pblendw with
36348 an immediate argument, rather than pblendvb with a vector
36349 argument. */
36350 for (i = 0; i < 16; i += 2)
36351 if (d->perm[i] + 1 != d->perm[i + 1])
36352 {
36353 use_pblendvb:
36354 for (i = 0; i < nelt; ++i)
36355 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
36356
36357 finish_pblendvb:
36358 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
36359 vperm = force_reg (vmode, vperm);
36360
36361 if (GET_MODE_SIZE (vmode) == 16)
36362 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
36363 else
36364 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
36365 return true;
36366 }
36367
36368 for (i = 0; i < 8; ++i)
36369 mask |= (d->perm[i * 2] >= 16) << i;
36370 vmode = V8HImode;
36371 /* FALLTHRU */
36372
36373 do_subreg:
36374 target = gen_lowpart (vmode, target);
36375 op0 = gen_lowpart (vmode, op0);
36376 op1 = gen_lowpart (vmode, op1);
36377 break;
36378
36379 case V32QImode:
36380 /* See if bytes move in pairs. If not, vpblendvb must be used. */
36381 for (i = 0; i < 32; i += 2)
36382 if (d->perm[i] + 1 != d->perm[i + 1])
36383 goto use_pblendvb;
36384 /* See if bytes move in quadruplets. If yes, vpblendd
36385 with immediate can be used. */
36386 for (i = 0; i < 32; i += 4)
36387 if (d->perm[i] + 2 != d->perm[i + 2])
36388 break;
36389 if (i < 32)
36390 {
36391 /* See if bytes move the same in both lanes. If yes,
36392 vpblendw with immediate can be used. */
36393 for (i = 0; i < 16; i += 2)
36394 if (d->perm[i] + 16 != d->perm[i + 16])
36395 goto use_pblendvb;
36396
36397 /* Use vpblendw. */
36398 for (i = 0; i < 16; ++i)
36399 mask |= (d->perm[i * 2] >= 32) << i;
36400 vmode = V16HImode;
36401 goto do_subreg;
36402 }
36403
36404 /* Use vpblendd. */
36405 for (i = 0; i < 8; ++i)
36406 mask |= (d->perm[i * 4] >= 32) << i;
36407 vmode = V8SImode;
36408 goto do_subreg;
36409
36410 case V16HImode:
36411 /* See if words move in pairs. If yes, vpblendd can be used. */
36412 for (i = 0; i < 16; i += 2)
36413 if (d->perm[i] + 1 != d->perm[i + 1])
36414 break;
36415 if (i < 16)
36416 {
36417 /* See if words move the same in both lanes. If not,
36418 vpblendvb must be used. */
36419 for (i = 0; i < 8; i++)
36420 if (d->perm[i] + 8 != d->perm[i + 8])
36421 {
36422 /* Use vpblendvb. */
36423 for (i = 0; i < 32; ++i)
36424 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
36425
36426 vmode = V32QImode;
36427 nelt = 32;
36428 target = gen_lowpart (vmode, target);
36429 op0 = gen_lowpart (vmode, op0);
36430 op1 = gen_lowpart (vmode, op1);
36431 goto finish_pblendvb;
36432 }
36433
36434 /* Use vpblendw. */
36435 for (i = 0; i < 16; ++i)
36436 mask |= (d->perm[i] >= 16) << i;
36437 break;
36438 }
36439
36440 /* Use vpblendd. */
36441 for (i = 0; i < 8; ++i)
36442 mask |= (d->perm[i * 2] >= 16) << i;
36443 vmode = V8SImode;
36444 goto do_subreg;
36445
36446 case V4DImode:
36447 /* Use vpblendd. */
36448 for (i = 0; i < 4; ++i)
36449 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
36450 vmode = V8SImode;
36451 goto do_subreg;
36452
36453 default:
36454 gcc_unreachable ();
36455 }
36456
36457 /* This matches five different patterns with the different modes. */
36458 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
36459 x = gen_rtx_SET (VOIDmode, target, x);
36460 emit_insn (x);
36461
36462 return true;
36463 }
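/* Editor's note: illustrative worked example, not part of GCC.  For
   V4SFmode with d->perm = { 0, 5, 2, 7 }, elements 1 and 3 come from op1,
   so the loop above builds mask = (1 << 1) | (1 << 3) = 0x0a, which becomes
   the immediate of the blendps emitted through the VEC_MERGE pattern.  */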
36464
36465 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36466 in terms of the variable form of vpermilps.
36467
36468 Note that we will have already failed the immediate input vpermilps,
36469 which requires that the high and low part shuffle be identical; the
36470 variable form doesn't require that. */
36471
36472 static bool
36473 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
36474 {
36475 rtx rperm[8], vperm;
36476 unsigned i;
36477
36478 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
36479 return false;
36480
36481 /* We can only permute within the 128-bit lane. */
36482 for (i = 0; i < 8; ++i)
36483 {
36484 unsigned e = d->perm[i];
36485 if (i < 4 ? e >= 4 : e < 4)
36486 return false;
36487 }
36488
36489 if (d->testing_p)
36490 return true;
36491
36492 for (i = 0; i < 8; ++i)
36493 {
36494 unsigned e = d->perm[i];
36495
36496 /* Within each 128-bit lane, the elements of op0 are numbered
36497 from 0 and the elements of op1 are numbered from 4. */
36498 if (e >= 8 + 4)
36499 e -= 8;
36500 else if (e >= 4)
36501 e -= 4;
36502
36503 rperm[i] = GEN_INT (e);
36504 }
36505
36506 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
36507 vperm = force_reg (V8SImode, vperm);
36508 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
36509
36510 return true;
36511 }
36512
36513 /* Return true if the permutation D can be performed as a VMODE
36514 permutation instead. */
36515
36516 static bool
36517 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
36518 {
36519 unsigned int i, j, chunk;
36520
36521 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
36522 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
36523 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
36524 return false;
36525
36526 if (GET_MODE_NUNITS (vmode) >= d->nelt)
36527 return true;
36528
36529 chunk = d->nelt / GET_MODE_NUNITS (vmode);
36530 for (i = 0; i < d->nelt; i += chunk)
36531 if (d->perm[i] & (chunk - 1))
36532 return false;
36533 else
36534 for (j = 1; j < chunk; ++j)
36535 if (d->perm[i] + j != d->perm[i + j])
36536 return false;
36537
36538 return true;
36539 }
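/* Editor's note: illustrative worked example, not part of GCC.  Asking
   whether a V16QImode permutation is really a V4SImode one gives
   chunk = 16 / 4 = 4: the byte permutation { 4,5,6,7, 0,1,2,3, ... } passes
   (every group of four indices is 4-aligned and consecutive), while
   { 1,2,3,4, ... } is rejected by the (d->perm[i] & (chunk - 1)) test.  */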
36540
36541 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36542 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
36543
36544 static bool
36545 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
36546 {
36547 unsigned i, nelt, eltsz, mask;
36548 unsigned char perm[32];
36549 enum machine_mode vmode = V16QImode;
36550 rtx rperm[32], vperm, target, op0, op1;
36551
36552 nelt = d->nelt;
36553
36554 if (!d->one_operand_p)
36555 {
36556 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
36557 {
36558 if (TARGET_AVX2
36559 && valid_perm_using_mode_p (V2TImode, d))
36560 {
36561 if (d->testing_p)
36562 return true;
36563
36564 /* Use vperm2i128 insn. The pattern uses
36565 V4DImode instead of V2TImode. */
36566 target = gen_lowpart (V4DImode, d->target);
36567 op0 = gen_lowpart (V4DImode, d->op0);
36568 op1 = gen_lowpart (V4DImode, d->op1);
36569 rperm[0]
36570 = GEN_INT ((d->perm[0] / (nelt / 2))
36571 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
36572 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
36573 return true;
36574 }
36575 return false;
36576 }
36577 }
36578 else
36579 {
36580 if (GET_MODE_SIZE (d->vmode) == 16)
36581 {
36582 if (!TARGET_SSSE3)
36583 return false;
36584 }
36585 else if (GET_MODE_SIZE (d->vmode) == 32)
36586 {
36587 if (!TARGET_AVX2)
36588 return false;
36589
36590 /* V4DImode should already be handled through
36591 expand_vselect by the vpermq instruction. */
36592 gcc_assert (d->vmode != V4DImode);
36593
36594 vmode = V32QImode;
36595 if (d->vmode == V8SImode
36596 || d->vmode == V16HImode
36597 || d->vmode == V32QImode)
36598 {
36599 /* First see if vpermq can be used for
36600 V8SImode/V16HImode/V32QImode. */
36601 if (valid_perm_using_mode_p (V4DImode, d))
36602 {
36603 for (i = 0; i < 4; i++)
36604 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
36605 if (d->testing_p)
36606 return true;
36607 return expand_vselect (gen_lowpart (V4DImode, d->target),
36608 gen_lowpart (V4DImode, d->op0),
36609 perm, 4, false);
36610 }
36611
36612 /* Next see if vpermd can be used. */
36613 if (valid_perm_using_mode_p (V8SImode, d))
36614 vmode = V8SImode;
36615 }
36616 /* Or if vpermps can be used. */
36617 else if (d->vmode == V8SFmode)
36618 vmode = V8SImode;
36619
36620 if (vmode == V32QImode)
36621 {
36622 /* vpshufb only works within 128-bit lanes; it is not
36623 possible to shuffle bytes across lanes. */
36624 for (i = 0; i < nelt; ++i)
36625 if ((d->perm[i] ^ i) & (nelt / 2))
36626 return false;
36627 }
36628 }
36629 else
36630 return false;
36631 }
36632
36633 if (d->testing_p)
36634 return true;
36635
36636 if (vmode == V8SImode)
36637 for (i = 0; i < 8; ++i)
36638 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
36639 else
36640 {
36641 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36642 if (!d->one_operand_p)
36643 mask = 2 * nelt - 1;
36644 else if (vmode == V16QImode)
36645 mask = nelt - 1;
36646 else
36647 mask = nelt / 2 - 1;
36648
36649 for (i = 0; i < nelt; ++i)
36650 {
36651 unsigned j, e = d->perm[i] & mask;
36652 for (j = 0; j < eltsz; ++j)
36653 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
36654 }
36655 }
36656
36657 vperm = gen_rtx_CONST_VECTOR (vmode,
36658 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
36659 vperm = force_reg (vmode, vperm);
36660
36661 target = gen_lowpart (vmode, d->target);
36662 op0 = gen_lowpart (vmode, d->op0);
36663 if (d->one_operand_p)
36664 {
36665 if (vmode == V16QImode)
36666 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
36667 else if (vmode == V32QImode)
36668 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
36669 else if (vmode == V8SFmode)
36670 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
36671 else
36672 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
36673 }
36674 else
36675 {
36676 op1 = gen_lowpart (vmode, d->op1);
36677 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
36678 }
36679
36680 return true;
36681 }
36682
36683 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
36684 in a single instruction. */
36685
36686 static bool
36687 expand_vec_perm_1 (struct expand_vec_perm_d *d)
36688 {
36689 unsigned i, nelt = d->nelt;
36690 unsigned char perm2[MAX_VECT_LEN];
36691
36692 /* Check plain VEC_SELECT first, because AVX has instructions that could
36693 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
36694 input where SEL+CONCAT may not. */
36695 if (d->one_operand_p)
36696 {
36697 int mask = nelt - 1;
36698 bool identity_perm = true;
36699 bool broadcast_perm = true;
36700
36701 for (i = 0; i < nelt; i++)
36702 {
36703 perm2[i] = d->perm[i] & mask;
36704 if (perm2[i] != i)
36705 identity_perm = false;
36706 if (perm2[i])
36707 broadcast_perm = false;
36708 }
36709
36710 if (identity_perm)
36711 {
36712 if (!d->testing_p)
36713 emit_move_insn (d->target, d->op0);
36714 return true;
36715 }
36716 else if (broadcast_perm && TARGET_AVX2)
36717 {
36718 /* Use vpbroadcast{b,w,d} / vbroadcastss. */
36719 rtx (*gen) (rtx, rtx) = NULL;
36720 switch (d->vmode)
36721 {
36722 case V32QImode:
36723 gen = gen_avx2_pbroadcastv32qi_1;
36724 break;
36725 case V16HImode:
36726 gen = gen_avx2_pbroadcastv16hi_1;
36727 break;
36728 case V8SImode:
36729 gen = gen_avx2_pbroadcastv8si_1;
36730 break;
36731 case V16QImode:
36732 gen = gen_avx2_pbroadcastv16qi;
36733 break;
36734 case V8HImode:
36735 gen = gen_avx2_pbroadcastv8hi;
36736 break;
36737 case V8SFmode:
36738 gen = gen_avx2_vec_dupv8sf_1;
36739 break;
36740 /* For other modes prefer other shuffles this function creates. */
36741 default: break;
36742 }
36743 if (gen != NULL)
36744 {
36745 if (!d->testing_p)
36746 emit_insn (gen (d->target, d->op0));
36747 return true;
36748 }
36749 }
36750
36751 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
36752 return true;
36753
36754 /* There are plenty of patterns in sse.md that are written for
36755 SEL+CONCAT and are not replicated for a single op. Perhaps
36756 that should be changed, to avoid the nastiness here. */
36757
36758 /* Recognize interleave style patterns, which means incrementing
36759 every other permutation operand. */
36760 for (i = 0; i < nelt; i += 2)
36761 {
36762 perm2[i] = d->perm[i] & mask;
36763 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
36764 }
36765 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
36766 d->testing_p))
36767 return true;
36768
36769 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
36770 if (nelt >= 4)
36771 {
36772 for (i = 0; i < nelt; i += 4)
36773 {
36774 perm2[i + 0] = d->perm[i + 0] & mask;
36775 perm2[i + 1] = d->perm[i + 1] & mask;
36776 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
36777 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
36778 }
36779
36780 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
36781 d->testing_p))
36782 return true;
36783 }
36784 }
36785
36786 /* Finally, try the fully general two operand permute. */
36787 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
36788 d->testing_p))
36789 return true;
36790
36791 /* Recognize interleave style patterns with reversed operands. */
36792 if (!d->one_operand_p)
36793 {
36794 for (i = 0; i < nelt; ++i)
36795 {
36796 unsigned e = d->perm[i];
36797 if (e >= nelt)
36798 e -= nelt;
36799 else
36800 e += nelt;
36801 perm2[i] = e;
36802 }
36803
36804 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
36805 d->testing_p))
36806 return true;
36807 }
36808
36809 /* Try the SSE4.1 blend variable merge instructions. */
36810 if (expand_vec_perm_blend (d))
36811 return true;
36812
36813 /* Try one of the AVX vpermil variable permutations. */
36814 if (expand_vec_perm_vpermil (d))
36815 return true;
36816
36817 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
36818 vpshufb, vpermd, vpermps or vpermq variable permutation. */
36819 if (expand_vec_perm_pshufb (d))
36820 return true;
36821
36822 return false;
36823 }
36824
36825 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36826 in terms of a pair of pshuflw + pshufhw instructions. */
36827
36828 static bool
36829 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
36830 {
36831 unsigned char perm2[MAX_VECT_LEN];
36832 unsigned i;
36833 bool ok;
36834
36835 if (d->vmode != V8HImode || !d->one_operand_p)
36836 return false;
36837
36838 /* The two permutations only operate in 64-bit lanes. */
36839 for (i = 0; i < 4; ++i)
36840 if (d->perm[i] >= 4)
36841 return false;
36842 for (i = 4; i < 8; ++i)
36843 if (d->perm[i] < 4)
36844 return false;
36845
36846 if (d->testing_p)
36847 return true;
36848
36849 /* Emit the pshuflw. */
36850 memcpy (perm2, d->perm, 4);
36851 for (i = 4; i < 8; ++i)
36852 perm2[i] = i;
36853 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
36854 gcc_assert (ok);
36855
36856 /* Emit the pshufhw. */
36857 memcpy (perm2 + 4, d->perm + 4, 4);
36858 for (i = 0; i < 4; ++i)
36859 perm2[i] = i;
36860 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
36861 gcc_assert (ok);
36862
36863 return true;
36864 }
36865
36866 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36867 the permutation using the SSSE3 palignr instruction. This succeeds
36868 when all of the elements in PERM fit within one vector and we merely
36869 need to shift them down so that a single vector permutation has a
36870 chance to succeed. */
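 /* For example (illustrative values only): a V8HImode permutation
 {3,4,5,6,7,8,9,10} has min == 3, so a palignr by three halfwords
 (6 bytes) shifts the needed elements down and the residual permutation
 becomes the identity; in general the residual is handled by pshufb. */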
36871
36872 static bool
36873 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
36874 {
36875 unsigned i, nelt = d->nelt;
36876 unsigned min, max;
36877 bool in_order, ok;
36878 rtx shift;
36879
36880 /* Even with AVX, palignr only operates on 128-bit vectors. */
36881 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36882 return false;
36883
36884 min = nelt, max = 0;
36885 for (i = 0; i < nelt; ++i)
36886 {
36887 unsigned e = d->perm[i];
36888 if (e < min)
36889 min = e;
36890 if (e > max)
36891 max = e;
36892 }
36893 if (min == 0 || max - min >= nelt)
36894 return false;
36895
36896 /* Given that we have SSSE3, we know we'll be able to implement the
36897 single operand permutation after the palignr with pshufb. */
36898 if (d->testing_p)
36899 return true;
36900
36901 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
36902 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
36903 gen_lowpart (TImode, d->op1),
36904 gen_lowpart (TImode, d->op0), shift));
36905
36906 d->op0 = d->op1 = d->target;
36907 d->one_operand_p = true;
36908
36909 in_order = true;
36910 for (i = 0; i < nelt; ++i)
36911 {
36912 unsigned e = d->perm[i] - min;
36913 if (e != i)
36914 in_order = false;
36915 d->perm[i] = e;
36916 }
36917
36918 /* Test for the degenerate case where the alignment by itself
36919 produces the desired permutation. */
36920 if (in_order)
36921 return true;
36922
36923 ok = expand_vec_perm_1 (d);
36924 gcc_assert (ok);
36925
36926 return ok;
36927 }
36928
36929 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
36930
36931 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36932 a two vector permutation into a single vector permutation by using
36933 an interleave operation to merge the vectors. */
36934
36935 static bool
36936 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
36937 {
36938 struct expand_vec_perm_d dremap, dfinal;
36939 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
36940 unsigned HOST_WIDE_INT contents;
36941 unsigned char remap[2 * MAX_VECT_LEN];
36942 rtx seq;
36943 bool ok, same_halves = false;
36944
36945 if (GET_MODE_SIZE (d->vmode) == 16)
36946 {
36947 if (d->one_operand_p)
36948 return false;
36949 }
36950 else if (GET_MODE_SIZE (d->vmode) == 32)
36951 {
36952 if (!TARGET_AVX)
36953 return false;
36954 /* For 32-byte modes allow even d->one_operand_p.
36955 The lack of cross-lane shuffling in some instructions
36956 might prevent a single insn shuffle. */
36957 dfinal = *d;
36958 dfinal.testing_p = true;
 36959 /* If expand_vec_perm_interleave3 can expand this into
 36960 a 3 insn sequence, give up and let it be expanded as
 36961 a 3 insn sequence. While that is one insn longer,
 36962 it doesn't need a memory operand, and in the common
 36963 case that both the interleave low and interleave high
 36964 permutations with the same operands are adjacent, it
 36965 needs only 4 insns for both after CSE. */
36966 if (expand_vec_perm_interleave3 (&dfinal))
36967 return false;
36968 }
36969 else
36970 return false;
36971
36972 /* Examine from whence the elements come. */
36973 contents = 0;
36974 for (i = 0; i < nelt; ++i)
36975 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
36976
36977 memset (remap, 0xff, sizeof (remap));
36978 dremap = *d;
36979
36980 if (GET_MODE_SIZE (d->vmode) == 16)
36981 {
36982 unsigned HOST_WIDE_INT h1, h2, h3, h4;
36983
36984 /* Split the two input vectors into 4 halves. */
36985 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
36986 h2 = h1 << nelt2;
36987 h3 = h2 << nelt2;
36988 h4 = h3 << nelt2;
36989
 36990 /* If the elements all come from the low halves, use interleave low; if
 36991 they all come from the high halves, use interleave high. If the elements
 36992 are from mismatched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
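 /* As an illustration (values chosen for this comment only): for a
 V4SImode permutation {1,5,0,4} all elements come from the two low
 halves, so dremap becomes the interleave-low {0,4,1,5} (punpckldq)
 and dfinal, computed via remap below, becomes {2,3,0,1}, i.e. a
 single pshufd ($0x4e) on the interleaved register. */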
36993 if ((contents & (h1 | h3)) == contents)
36994 {
36995 /* punpckl* */
36996 for (i = 0; i < nelt2; ++i)
36997 {
36998 remap[i] = i * 2;
36999 remap[i + nelt] = i * 2 + 1;
37000 dremap.perm[i * 2] = i;
37001 dremap.perm[i * 2 + 1] = i + nelt;
37002 }
37003 if (!TARGET_SSE2 && d->vmode == V4SImode)
37004 dremap.vmode = V4SFmode;
37005 }
37006 else if ((contents & (h2 | h4)) == contents)
37007 {
37008 /* punpckh* */
37009 for (i = 0; i < nelt2; ++i)
37010 {
37011 remap[i + nelt2] = i * 2;
37012 remap[i + nelt + nelt2] = i * 2 + 1;
37013 dremap.perm[i * 2] = i + nelt2;
37014 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
37015 }
37016 if (!TARGET_SSE2 && d->vmode == V4SImode)
37017 dremap.vmode = V4SFmode;
37018 }
37019 else if ((contents & (h1 | h4)) == contents)
37020 {
37021 /* shufps */
37022 for (i = 0; i < nelt2; ++i)
37023 {
37024 remap[i] = i;
37025 remap[i + nelt + nelt2] = i + nelt2;
37026 dremap.perm[i] = i;
37027 dremap.perm[i + nelt2] = i + nelt + nelt2;
37028 }
37029 if (nelt != 4)
37030 {
37031 /* shufpd */
37032 dremap.vmode = V2DImode;
37033 dremap.nelt = 2;
37034 dremap.perm[0] = 0;
37035 dremap.perm[1] = 3;
37036 }
37037 }
37038 else if ((contents & (h2 | h3)) == contents)
37039 {
37040 /* shufps */
37041 for (i = 0; i < nelt2; ++i)
37042 {
37043 remap[i + nelt2] = i;
37044 remap[i + nelt] = i + nelt2;
37045 dremap.perm[i] = i + nelt2;
37046 dremap.perm[i + nelt2] = i + nelt;
37047 }
37048 if (nelt != 4)
37049 {
37050 /* shufpd */
37051 dremap.vmode = V2DImode;
37052 dremap.nelt = 2;
37053 dremap.perm[0] = 1;
37054 dremap.perm[1] = 2;
37055 }
37056 }
37057 else
37058 return false;
37059 }
37060 else
37061 {
37062 unsigned int nelt4 = nelt / 4, nzcnt = 0;
37063 unsigned HOST_WIDE_INT q[8];
37064 unsigned int nonzero_halves[4];
37065
37066 /* Split the two input vectors into 8 quarters. */
37067 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
37068 for (i = 1; i < 8; ++i)
37069 q[i] = q[0] << (nelt4 * i);
37070 for (i = 0; i < 4; ++i)
37071 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
37072 {
37073 nonzero_halves[nzcnt] = i;
37074 ++nzcnt;
37075 }
37076
37077 if (nzcnt == 1)
37078 {
37079 gcc_assert (d->one_operand_p);
37080 nonzero_halves[1] = nonzero_halves[0];
37081 same_halves = true;
37082 }
37083 else if (d->one_operand_p)
37084 {
37085 gcc_assert (nonzero_halves[0] == 0);
37086 gcc_assert (nonzero_halves[1] == 1);
37087 }
37088
37089 if (nzcnt <= 2)
37090 {
37091 if (d->perm[0] / nelt2 == nonzero_halves[1])
37092 {
37093 /* Attempt to increase the likelihood that dfinal
37094 shuffle will be intra-lane. */
37095 char tmph = nonzero_halves[0];
37096 nonzero_halves[0] = nonzero_halves[1];
37097 nonzero_halves[1] = tmph;
37098 }
37099
37100 /* vperm2f128 or vperm2i128. */
37101 for (i = 0; i < nelt2; ++i)
37102 {
37103 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
37104 remap[i + nonzero_halves[0] * nelt2] = i;
37105 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
37106 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
37107 }
37108
37109 if (d->vmode != V8SFmode
37110 && d->vmode != V4DFmode
37111 && d->vmode != V8SImode)
37112 {
37113 dremap.vmode = V8SImode;
37114 dremap.nelt = 8;
37115 for (i = 0; i < 4; ++i)
37116 {
37117 dremap.perm[i] = i + nonzero_halves[0] * 4;
37118 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
37119 }
37120 }
37121 }
37122 else if (d->one_operand_p)
37123 return false;
37124 else if (TARGET_AVX2
37125 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
37126 {
37127 /* vpunpckl* */
37128 for (i = 0; i < nelt4; ++i)
37129 {
37130 remap[i] = i * 2;
37131 remap[i + nelt] = i * 2 + 1;
37132 remap[i + nelt2] = i * 2 + nelt2;
37133 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
37134 dremap.perm[i * 2] = i;
37135 dremap.perm[i * 2 + 1] = i + nelt;
37136 dremap.perm[i * 2 + nelt2] = i + nelt2;
37137 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
37138 }
37139 }
37140 else if (TARGET_AVX2
37141 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
37142 {
37143 /* vpunpckh* */
37144 for (i = 0; i < nelt4; ++i)
37145 {
37146 remap[i + nelt4] = i * 2;
37147 remap[i + nelt + nelt4] = i * 2 + 1;
37148 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
37149 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
37150 dremap.perm[i * 2] = i + nelt4;
37151 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
37152 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
37153 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
37154 }
37155 }
37156 else
37157 return false;
37158 }
37159
37160 /* Use the remapping array set up above to move the elements from their
37161 swizzled locations into their final destinations. */
37162 dfinal = *d;
37163 for (i = 0; i < nelt; ++i)
37164 {
37165 unsigned e = remap[d->perm[i]];
37166 gcc_assert (e < nelt);
37167 /* If same_halves is true, both halves of the remapped vector are the
37168 same. Avoid cross-lane accesses if possible. */
37169 if (same_halves && i >= nelt2)
37170 {
37171 gcc_assert (e < nelt2);
37172 dfinal.perm[i] = e + nelt2;
37173 }
37174 else
37175 dfinal.perm[i] = e;
37176 }
37177 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
37178 dfinal.op1 = dfinal.op0;
37179 dfinal.one_operand_p = true;
37180 dremap.target = dfinal.op0;
37181
37182 /* Test if the final remap can be done with a single insn. For V4SFmode or
37183 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
37184 start_sequence ();
37185 ok = expand_vec_perm_1 (&dfinal);
37186 seq = get_insns ();
37187 end_sequence ();
37188
37189 if (!ok)
37190 return false;
37191
37192 if (d->testing_p)
37193 return true;
37194
37195 if (dremap.vmode != dfinal.vmode)
37196 {
37197 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
37198 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
37199 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
37200 }
37201
37202 ok = expand_vec_perm_1 (&dremap);
37203 gcc_assert (ok);
37204
37205 emit_insn (seq);
37206 return true;
37207 }
37208
37209 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
37210 a single vector cross-lane permutation into vpermq followed
37211 by any of the single insn permutations. */
37212
37213 static bool
37214 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
37215 {
37216 struct expand_vec_perm_d dremap, dfinal;
37217 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
37218 unsigned contents[2];
37219 bool ok;
37220
37221 if (!(TARGET_AVX2
37222 && (d->vmode == V32QImode || d->vmode == V16HImode)
37223 && d->one_operand_p))
37224 return false;
37225
37226 contents[0] = 0;
37227 contents[1] = 0;
37228 for (i = 0; i < nelt2; ++i)
37229 {
37230 contents[0] |= 1u << (d->perm[i] / nelt4);
37231 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
37232 }
37233
37234 for (i = 0; i < 2; ++i)
37235 {
37236 unsigned int cnt = 0;
37237 for (j = 0; j < 4; ++j)
37238 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
37239 return false;
37240 }
37241
37242 if (d->testing_p)
37243 return true;
37244
37245 dremap = *d;
37246 dremap.vmode = V4DImode;
37247 dremap.nelt = 4;
37248 dremap.target = gen_reg_rtx (V4DImode);
37249 dremap.op0 = gen_lowpart (V4DImode, d->op0);
37250 dremap.op1 = dremap.op0;
37251 dremap.one_operand_p = true;
37252 for (i = 0; i < 2; ++i)
37253 {
37254 unsigned int cnt = 0;
37255 for (j = 0; j < 4; ++j)
37256 if ((contents[i] & (1u << j)) != 0)
37257 dremap.perm[2 * i + cnt++] = j;
37258 for (; cnt < 2; ++cnt)
37259 dremap.perm[2 * i + cnt] = 0;
37260 }
37261
37262 dfinal = *d;
37263 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
37264 dfinal.op1 = dfinal.op0;
37265 dfinal.one_operand_p = true;
37266 for (i = 0, j = 0; i < nelt; ++i)
37267 {
37268 if (i == nelt2)
37269 j = 2;
37270 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
37271 if ((d->perm[i] / nelt4) == dremap.perm[j])
37272 ;
37273 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
37274 dfinal.perm[i] |= nelt4;
37275 else
37276 gcc_unreachable ();
37277 }
37278
37279 ok = expand_vec_perm_1 (&dremap);
37280 gcc_assert (ok);
37281
37282 ok = expand_vec_perm_1 (&dfinal);
37283 gcc_assert (ok);
37284
37285 return true;
37286 }
37287
37288 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
37289 a vector permutation using two instructions, vperm2f128 resp.
37290 vperm2i128 followed by any single in-lane permutation. */
37291
37292 static bool
37293 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
37294 {
37295 struct expand_vec_perm_d dfirst, dsecond;
37296 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
37297 bool ok;
37298
37299 if (!TARGET_AVX
37300 || GET_MODE_SIZE (d->vmode) != 32
37301 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
37302 return false;
37303
37304 dsecond = *d;
37305 dsecond.one_operand_p = false;
37306 dsecond.testing_p = true;
37307
37308 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
37309 immediate. For perm < 16 the second permutation uses
37310 d->op0 as first operand, for perm >= 16 it uses d->op1
37311 as first operand. The second operand is the result of
37312 vperm2[fi]128. */
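 /* For example (illustrative value only): perm == 6 gives the immediate
 ((6 << 2) | 6) & 0x33 == 0x12, i.e. the low lane of the vperm2[fi]128
 result comes from the low lane of d->op1 and its high lane from the
 high lane of d->op0. */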
37313 for (perm = 0; perm < 32; perm++)
37314 {
37315 /* Ignore permutations which do not move anything cross-lane. */
37316 if (perm < 16)
37317 {
37318 /* The second shuffle for e.g. V4DFmode has
37319 0123 and ABCD operands.
37320 Ignore AB23, as 23 is already in the second lane
37321 of the first operand. */
37322 if ((perm & 0xc) == (1 << 2)) continue;
37323 /* And 01CD, as 01 is in the first lane of the first
37324 operand. */
37325 if ((perm & 3) == 0) continue;
37326 /* And 4567, as then the vperm2[fi]128 doesn't change
37327 anything on the original 4567 second operand. */
37328 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
37329 }
37330 else
37331 {
37332 /* The second shuffle for e.g. V4DFmode has
37333 4567 and ABCD operands.
37334 Ignore AB67, as 67 is already in the second lane
37335 of the first operand. */
37336 if ((perm & 0xc) == (3 << 2)) continue;
37337 /* And 45CD, as 45 is in the first lane of the first
37338 operand. */
37339 if ((perm & 3) == 2) continue;
37340 /* And 0123, as then the vperm2[fi]128 doesn't change
37341 anything on the original 0123 first operand. */
37342 if ((perm & 0xf) == (1 << 2)) continue;
37343 }
37344
37345 for (i = 0; i < nelt; i++)
37346 {
37347 j = d->perm[i] / nelt2;
37348 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
37349 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
37350 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
37351 dsecond.perm[i] = d->perm[i] & (nelt - 1);
37352 else
37353 break;
37354 }
37355
37356 if (i == nelt)
37357 {
37358 start_sequence ();
37359 ok = expand_vec_perm_1 (&dsecond);
37360 end_sequence ();
37361 }
37362 else
37363 ok = false;
37364
37365 if (ok)
37366 {
37367 if (d->testing_p)
37368 return true;
37369
37370 /* Found a usable second shuffle. dfirst will be
37371 vperm2f128 on d->op0 and d->op1. */
37372 dsecond.testing_p = false;
37373 dfirst = *d;
37374 dfirst.target = gen_reg_rtx (d->vmode);
37375 for (i = 0; i < nelt; i++)
37376 dfirst.perm[i] = (i & (nelt2 - 1))
37377 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
37378
37379 ok = expand_vec_perm_1 (&dfirst);
37380 gcc_assert (ok);
37381
37382 /* And dsecond is some single insn shuffle, taking
37383 d->op0 and result of vperm2f128 (if perm < 16) or
37384 d->op1 and result of vperm2f128 (otherwise). */
37385 dsecond.op1 = dfirst.target;
37386 if (perm >= 16)
37387 dsecond.op0 = dfirst.op1;
37388
37389 ok = expand_vec_perm_1 (&dsecond);
37390 gcc_assert (ok);
37391
37392 return true;
37393 }
37394
 37395 /* For one operand, the only useful vperm2f128 permutation is 0x01, i.e. a swap of the two 128-bit lanes. */
37396 if (d->one_operand_p)
37397 return false;
37398 }
37399
37400 return false;
37401 }
37402
37403 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
37404 a two vector permutation using 2 intra-lane interleave insns
37405 and cross-lane shuffle for 32-byte vectors. */
37406
37407 static bool
37408 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
37409 {
37410 unsigned i, nelt;
37411 rtx (*gen) (rtx, rtx, rtx);
37412
37413 if (d->one_operand_p)
37414 return false;
37415 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
37416 ;
37417 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
37418 ;
37419 else
37420 return false;
37421
37422 nelt = d->nelt;
37423 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
37424 return false;
37425 for (i = 0; i < nelt; i += 2)
37426 if (d->perm[i] != d->perm[0] + i / 2
37427 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
37428 return false;
37429
37430 if (d->testing_p)
37431 return true;
37432
37433 switch (d->vmode)
37434 {
37435 case V32QImode:
37436 if (d->perm[0])
37437 gen = gen_vec_interleave_highv32qi;
37438 else
37439 gen = gen_vec_interleave_lowv32qi;
37440 break;
37441 case V16HImode:
37442 if (d->perm[0])
37443 gen = gen_vec_interleave_highv16hi;
37444 else
37445 gen = gen_vec_interleave_lowv16hi;
37446 break;
37447 case V8SImode:
37448 if (d->perm[0])
37449 gen = gen_vec_interleave_highv8si;
37450 else
37451 gen = gen_vec_interleave_lowv8si;
37452 break;
37453 case V4DImode:
37454 if (d->perm[0])
37455 gen = gen_vec_interleave_highv4di;
37456 else
37457 gen = gen_vec_interleave_lowv4di;
37458 break;
37459 case V8SFmode:
37460 if (d->perm[0])
37461 gen = gen_vec_interleave_highv8sf;
37462 else
37463 gen = gen_vec_interleave_lowv8sf;
37464 break;
37465 case V4DFmode:
37466 if (d->perm[0])
37467 gen = gen_vec_interleave_highv4df;
37468 else
37469 gen = gen_vec_interleave_lowv4df;
37470 break;
37471 default:
37472 gcc_unreachable ();
37473 }
37474
37475 emit_insn (gen (d->target, d->op0, d->op1));
37476 return true;
37477 }
37478
37479 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
37480 a single vector permutation using a single intra-lane vector
37481 permutation, vperm2f128 swapping the lanes and vblend* insn blending
37482 the non-swapped and swapped vectors together. */
37483
37484 static bool
37485 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
37486 {
37487 struct expand_vec_perm_d dfirst, dsecond;
37488 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
37489 rtx seq;
37490 bool ok;
37491 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
37492
37493 if (!TARGET_AVX
37494 || TARGET_AVX2
37495 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
37496 || !d->one_operand_p)
37497 return false;
37498
37499 dfirst = *d;
37500 for (i = 0; i < nelt; i++)
37501 dfirst.perm[i] = 0xff;
37502 for (i = 0, msk = 0; i < nelt; i++)
37503 {
37504 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
37505 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
37506 return false;
37507 dfirst.perm[j] = d->perm[i];
37508 if (j != i)
37509 msk |= (1 << i);
37510 }
37511 for (i = 0; i < nelt; i++)
37512 if (dfirst.perm[i] == 0xff)
37513 dfirst.perm[i] = i;
37514
37515 if (!d->testing_p)
37516 dfirst.target = gen_reg_rtx (dfirst.vmode);
37517
37518 start_sequence ();
37519 ok = expand_vec_perm_1 (&dfirst);
37520 seq = get_insns ();
37521 end_sequence ();
37522
37523 if (!ok)
37524 return false;
37525
37526 if (d->testing_p)
37527 return true;
37528
37529 emit_insn (seq);
37530
37531 dsecond = *d;
37532 dsecond.op0 = dfirst.target;
37533 dsecond.op1 = dfirst.target;
37534 dsecond.one_operand_p = true;
37535 dsecond.target = gen_reg_rtx (dsecond.vmode);
37536 for (i = 0; i < nelt; i++)
37537 dsecond.perm[i] = i ^ nelt2;
37538
37539 ok = expand_vec_perm_1 (&dsecond);
37540 gcc_assert (ok);
37541
37542 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
37543 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
37544 return true;
37545 }
37546
37547 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
37548 permutation using two vperm2f128, followed by a vshufpd insn blending
37549 the two vectors together. */
37550
37551 static bool
37552 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
37553 {
37554 struct expand_vec_perm_d dfirst, dsecond, dthird;
37555 bool ok;
37556
37557 if (!TARGET_AVX || (d->vmode != V4DFmode))
37558 return false;
37559
37560 if (d->testing_p)
37561 return true;
37562
37563 dfirst = *d;
37564 dsecond = *d;
37565 dthird = *d;
37566
37567 dfirst.perm[0] = (d->perm[0] & ~1);
37568 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
37569 dfirst.perm[2] = (d->perm[2] & ~1);
37570 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
37571 dsecond.perm[0] = (d->perm[1] & ~1);
37572 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
37573 dsecond.perm[2] = (d->perm[3] & ~1);
37574 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
37575 dthird.perm[0] = (d->perm[0] % 2);
37576 dthird.perm[1] = (d->perm[1] % 2) + 4;
37577 dthird.perm[2] = (d->perm[2] % 2) + 2;
37578 dthird.perm[3] = (d->perm[3] % 2) + 6;
37579
37580 dfirst.target = gen_reg_rtx (dfirst.vmode);
37581 dsecond.target = gen_reg_rtx (dsecond.vmode);
37582 dthird.op0 = dfirst.target;
37583 dthird.op1 = dsecond.target;
37584 dthird.one_operand_p = false;
37585
37586 canonicalize_perm (&dfirst);
37587 canonicalize_perm (&dsecond);
37588
37589 ok = expand_vec_perm_1 (&dfirst)
37590 && expand_vec_perm_1 (&dsecond)
37591 && expand_vec_perm_1 (&dthird);
37592
37593 gcc_assert (ok);
37594
37595 return true;
37596 }
37597
37598 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
37599 permutation with two pshufb insns and an ior. We should have already
37600 failed all two instruction sequences. */
37601
37602 static bool
37603 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
37604 {
37605 rtx rperm[2][16], vperm, l, h, op, m128;
37606 unsigned int i, nelt, eltsz;
37607
37608 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
37609 return false;
37610 gcc_assert (!d->one_operand_p);
37611
37612 nelt = d->nelt;
37613 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37614
37615 /* Generate two permutation masks. If the required element is within
37616 the given vector it is shuffled into the proper lane. If the required
37617 element is in the other vector, force a zero into the lane by setting
37618 bit 7 in the permutation mask. */
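 /* For example (illustrative case only): a V8HImode extract-even
 permutation {0,2,4,6,8,10,12,14} yields the byte masks
 {0,1,4,5,8,9,12,13, -128 x 8} for op0 and
 {-128 x 8, 0,1,4,5,8,9,12,13} for op1; the final por merges the two
 half results. */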
37619 m128 = GEN_INT (-128);
37620 for (i = 0; i < nelt; ++i)
37621 {
37622 unsigned j, e = d->perm[i];
37623 unsigned which = (e >= nelt);
37624 if (e >= nelt)
37625 e -= nelt;
37626
37627 for (j = 0; j < eltsz; ++j)
37628 {
37629 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
37630 rperm[1-which][i*eltsz + j] = m128;
37631 }
37632 }
37633
37634 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
37635 vperm = force_reg (V16QImode, vperm);
37636
37637 l = gen_reg_rtx (V16QImode);
37638 op = gen_lowpart (V16QImode, d->op0);
37639 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
37640
37641 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
37642 vperm = force_reg (V16QImode, vperm);
37643
37644 h = gen_reg_rtx (V16QImode);
37645 op = gen_lowpart (V16QImode, d->op1);
37646 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
37647
37648 op = gen_lowpart (V16QImode, d->target);
37649 emit_insn (gen_iorv16qi3 (op, l, h));
37650
37651 return true;
37652 }
37653
 37654 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
37655 with two vpshufb insns, vpermq and vpor. We should have already failed
37656 all two or three instruction sequences. */
37657
37658 static bool
37659 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
37660 {
37661 rtx rperm[2][32], vperm, l, h, hp, op, m128;
37662 unsigned int i, nelt, eltsz;
37663
37664 if (!TARGET_AVX2
37665 || !d->one_operand_p
37666 || (d->vmode != V32QImode && d->vmode != V16HImode))
37667 return false;
37668
37669 if (d->testing_p)
37670 return true;
37671
37672 nelt = d->nelt;
37673 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37674
37675 /* Generate two permutation masks. If the required element is within
 37676 the same lane, it is shuffled in. If the required element is from
 37677 the other lane, force a zero by setting bit 7 in the permutation
 37678 mask. In the other mask, an element is non-negative if it is
 37679 requested from the other lane; it is also moved to the other lane,
 37680 so that the result of vpshufb can have its two V2TImode halves
 37681 swapped. */
37682 m128 = GEN_INT (-128);
37683 for (i = 0; i < nelt; ++i)
37684 {
37685 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37686 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
37687
37688 for (j = 0; j < eltsz; ++j)
37689 {
37690 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
37691 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
37692 }
37693 }
37694
37695 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
37696 vperm = force_reg (V32QImode, vperm);
37697
37698 h = gen_reg_rtx (V32QImode);
37699 op = gen_lowpart (V32QImode, d->op0);
37700 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
37701
 37702 /* Swap the 128-bit lanes of h into hp. */
37703 hp = gen_reg_rtx (V4DImode);
37704 op = gen_lowpart (V4DImode, h);
37705 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
37706 const1_rtx));
37707
37708 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
37709 vperm = force_reg (V32QImode, vperm);
37710
37711 l = gen_reg_rtx (V32QImode);
37712 op = gen_lowpart (V32QImode, d->op0);
37713 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
37714
37715 op = gen_lowpart (V32QImode, d->target);
37716 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
37717
37718 return true;
37719 }
37720
37721 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
 37722 and extract-odd permutations of two V32QImode or V16HImode operands
37723 with two vpshufb insns, vpor and vpermq. We should have already
37724 failed all two or three instruction sequences. */
37725
37726 static bool
37727 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
37728 {
37729 rtx rperm[2][32], vperm, l, h, ior, op, m128;
37730 unsigned int i, nelt, eltsz;
37731
37732 if (!TARGET_AVX2
37733 || d->one_operand_p
37734 || (d->vmode != V32QImode && d->vmode != V16HImode))
37735 return false;
37736
37737 for (i = 0; i < d->nelt; ++i)
37738 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
37739 return false;
37740
37741 if (d->testing_p)
37742 return true;
37743
37744 nelt = d->nelt;
37745 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37746
37747 /* Generate two permutation masks. In the first permutation mask
37748 the first quarter will contain indexes for the first half
37749 of the op0, the second quarter will contain bit 7 set, third quarter
37750 will contain indexes for the second half of the op0 and the
37751 last quarter bit 7 set. In the second permutation mask
37752 the first quarter will contain bit 7 set, the second quarter
37753 indexes for the first half of the op1, the third quarter bit 7 set
37754 and last quarter indexes for the second half of the op1.
37755 I.e. the first mask e.g. for V32QImode extract even will be:
37756 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
37757 (all values masked with 0xf except for -128) and second mask
37758 for extract even will be
37759 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
37760 m128 = GEN_INT (-128);
37761 for (i = 0; i < nelt; ++i)
37762 {
37763 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37764 unsigned which = d->perm[i] >= nelt;
37765 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
37766
37767 for (j = 0; j < eltsz; ++j)
37768 {
37769 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
37770 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
37771 }
37772 }
37773
37774 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
37775 vperm = force_reg (V32QImode, vperm);
37776
37777 l = gen_reg_rtx (V32QImode);
37778 op = gen_lowpart (V32QImode, d->op0);
37779 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
37780
37781 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
37782 vperm = force_reg (V32QImode, vperm);
37783
37784 h = gen_reg_rtx (V32QImode);
37785 op = gen_lowpart (V32QImode, d->op1);
37786 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
37787
37788 ior = gen_reg_rtx (V32QImode);
37789 emit_insn (gen_iorv32qi3 (ior, l, h));
37790
37791 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
37792 op = gen_lowpart (V4DImode, d->target);
37793 ior = gen_lowpart (V4DImode, ior);
37794 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
37795 const1_rtx, GEN_INT (3)));
37796
37797 return true;
37798 }
37799
37800 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
37801 and extract-odd permutations. */
37802
37803 static bool
37804 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
37805 {
37806 rtx t1, t2, t3;
37807
37808 switch (d->vmode)
37809 {
37810 case V4DFmode:
37811 t1 = gen_reg_rtx (V4DFmode);
37812 t2 = gen_reg_rtx (V4DFmode);
37813
37814 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
37815 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
37816 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
37817
37818 /* Now an unpck[lh]pd will produce the result required. */
37819 if (odd)
37820 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
37821 else
37822 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
37823 emit_insn (t3);
37824 break;
37825
37826 case V8SFmode:
37827 {
37828 int mask = odd ? 0xdd : 0x88;
37829
37830 t1 = gen_reg_rtx (V8SFmode);
37831 t2 = gen_reg_rtx (V8SFmode);
37832 t3 = gen_reg_rtx (V8SFmode);
37833
37834 /* Shuffle within the 128-bit lanes to produce:
37835 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
37836 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
37837 GEN_INT (mask)));
37838
37839 /* Shuffle the lanes around to produce:
37840 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
37841 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
37842 GEN_INT (0x3)));
37843
37844 /* Shuffle within the 128-bit lanes to produce:
37845 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
37846 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
37847
37848 /* Shuffle within the 128-bit lanes to produce:
37849 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
37850 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
37851
37852 /* Shuffle the lanes around to produce:
37853 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
37854 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
37855 GEN_INT (0x20)));
37856 }
37857 break;
37858
37859 case V2DFmode:
37860 case V4SFmode:
37861 case V2DImode:
37862 case V4SImode:
37863 /* These are always directly implementable by expand_vec_perm_1. */
37864 gcc_unreachable ();
37865
37866 case V8HImode:
37867 if (TARGET_SSSE3)
37868 return expand_vec_perm_pshufb2 (d);
37869 else
37870 {
37871 /* We need 2*log2(N)-1 operations to achieve odd/even
37872 with interleave. */
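 /* Illustration (even extraction, elements numbered 0..15 across the
 two operands): interleave low/high gives {0,8,1,9,2,10,3,11} and
 {4,12,5,13,6,14,7,15}; interleaving those gives {0,4,8,12,1,5,9,13}
 and {2,6,10,14,3,7,11,15}; a final interleave low yields the evens
 {0,2,4,6,8,10,12,14}, for 2*log2(8)-1 == 5 insns in total. */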
37873 t1 = gen_reg_rtx (V8HImode);
37874 t2 = gen_reg_rtx (V8HImode);
37875 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
37876 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
37877 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
37878 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
37879 if (odd)
37880 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
37881 else
37882 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
37883 emit_insn (t3);
37884 }
37885 break;
37886
37887 case V16QImode:
37888 if (TARGET_SSSE3)
37889 return expand_vec_perm_pshufb2 (d);
37890 else
37891 {
37892 t1 = gen_reg_rtx (V16QImode);
37893 t2 = gen_reg_rtx (V16QImode);
37894 t3 = gen_reg_rtx (V16QImode);
37895 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
37896 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
37897 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
37898 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
37899 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
37900 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
37901 if (odd)
37902 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
37903 else
37904 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
37905 emit_insn (t3);
37906 }
37907 break;
37908
37909 case V16HImode:
37910 case V32QImode:
37911 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
37912
37913 case V4DImode:
37914 if (!TARGET_AVX2)
37915 {
37916 struct expand_vec_perm_d d_copy = *d;
37917 d_copy.vmode = V4DFmode;
37918 d_copy.target = gen_lowpart (V4DFmode, d->target);
37919 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
37920 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
37921 return expand_vec_perm_even_odd_1 (&d_copy, odd);
37922 }
37923
37924 t1 = gen_reg_rtx (V4DImode);
37925 t2 = gen_reg_rtx (V4DImode);
37926
37927 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
37928 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
37929 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
37930
37931 /* Now an vpunpck[lh]qdq will produce the result required. */
37932 if (odd)
37933 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
37934 else
37935 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
37936 emit_insn (t3);
37937 break;
37938
37939 case V8SImode:
37940 if (!TARGET_AVX2)
37941 {
37942 struct expand_vec_perm_d d_copy = *d;
37943 d_copy.vmode = V8SFmode;
37944 d_copy.target = gen_lowpart (V8SFmode, d->target);
37945 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
37946 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
37947 return expand_vec_perm_even_odd_1 (&d_copy, odd);
37948 }
37949
37950 t1 = gen_reg_rtx (V8SImode);
37951 t2 = gen_reg_rtx (V8SImode);
37952
37953 /* Shuffle the lanes around into
37954 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
37955 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
37956 gen_lowpart (V4DImode, d->op0),
37957 gen_lowpart (V4DImode, d->op1),
37958 GEN_INT (0x20)));
37959 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
37960 gen_lowpart (V4DImode, d->op0),
37961 gen_lowpart (V4DImode, d->op1),
37962 GEN_INT (0x31)));
37963
37964 /* Swap the 2nd and 3rd position in each lane into
37965 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
37966 emit_insn (gen_avx2_pshufdv3 (t1, t1,
37967 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
37968 emit_insn (gen_avx2_pshufdv3 (t2, t2,
37969 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
37970
37971 /* Now an vpunpck[lh]qdq will produce
37972 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
37973 if (odd)
37974 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
37975 gen_lowpart (V4DImode, t1),
37976 gen_lowpart (V4DImode, t2));
37977 else
37978 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
37979 gen_lowpart (V4DImode, t1),
37980 gen_lowpart (V4DImode, t2));
37981 emit_insn (t3);
37982 break;
37983
37984 default:
37985 gcc_unreachable ();
37986 }
37987
37988 return true;
37989 }
37990
37991 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
37992 extract-even and extract-odd permutations. */
37993
37994 static bool
37995 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
37996 {
37997 unsigned i, odd, nelt = d->nelt;
37998
37999 odd = d->perm[0];
38000 if (odd != 0 && odd != 1)
38001 return false;
38002
38003 for (i = 1; i < nelt; ++i)
38004 if (d->perm[i] != 2 * i + odd)
38005 return false;
38006
38007 return expand_vec_perm_even_odd_1 (d, odd);
38008 }
38009
38010 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
38011 permutations. We assume that expand_vec_perm_1 has already failed. */
38012
38013 static bool
38014 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
38015 {
38016 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
38017 enum machine_mode vmode = d->vmode;
38018 unsigned char perm2[4];
38019 rtx op0 = d->op0;
38020 bool ok;
38021
38022 switch (vmode)
38023 {
38024 case V4DFmode:
38025 case V8SFmode:
38026 /* These are special-cased in sse.md so that we can optionally
38027 use the vbroadcast instruction. They expand to two insns
38028 if the input happens to be in a register. */
38029 gcc_unreachable ();
38030
38031 case V2DFmode:
38032 case V2DImode:
38033 case V4SFmode:
38034 case V4SImode:
38035 /* These are always implementable using standard shuffle patterns. */
38036 gcc_unreachable ();
38037
38038 case V8HImode:
38039 case V16QImode:
38040 /* These can be implemented via interleave. We save one insn by
 38041 stopping once we have promoted to V4SImode and then using pshufd. */
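 /* For example (illustrative case only): broadcasting byte 5 of a
 V16QImode vector uses punpcklbw (byte 5 lands duplicated in halfword 5),
 then punpckhwd of the V8HImode view (it lands in dword 1 of the
 V4SImode view), and finally a pshufd broadcasting dword 1. */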
38042 do
38043 {
38044 rtx dest;
38045 rtx (*gen) (rtx, rtx, rtx)
38046 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
38047 : gen_vec_interleave_lowv8hi;
38048
38049 if (elt >= nelt2)
38050 {
38051 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
38052 : gen_vec_interleave_highv8hi;
38053 elt -= nelt2;
38054 }
38055 nelt2 /= 2;
38056
38057 dest = gen_reg_rtx (vmode);
38058 emit_insn (gen (dest, op0, op0));
38059 vmode = get_mode_wider_vector (vmode);
38060 op0 = gen_lowpart (vmode, dest);
38061 }
38062 while (vmode != V4SImode);
38063
38064 memset (perm2, elt, 4);
38065 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
38066 d->testing_p);
38067 gcc_assert (ok);
38068 return true;
38069
38070 case V32QImode:
38071 case V16HImode:
38072 case V8SImode:
38073 case V4DImode:
38074 /* For AVX2 broadcasts of the first element vpbroadcast* or
38075 vpermq should be used by expand_vec_perm_1. */
38076 gcc_assert (!TARGET_AVX2 || d->perm[0]);
38077 return false;
38078
38079 default:
38080 gcc_unreachable ();
38081 }
38082 }
38083
38084 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
38085 broadcast permutations. */
38086
38087 static bool
38088 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
38089 {
38090 unsigned i, elt, nelt = d->nelt;
38091
38092 if (!d->one_operand_p)
38093 return false;
38094
38095 elt = d->perm[0];
38096 for (i = 1; i < nelt; ++i)
38097 if (d->perm[i] != elt)
38098 return false;
38099
38100 return expand_vec_perm_broadcast_1 (d);
38101 }
38102
 38103 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
38104 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
38105 all the shorter instruction sequences. */
38106
38107 static bool
38108 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
38109 {
38110 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
38111 unsigned int i, nelt, eltsz;
38112 bool used[4];
38113
38114 if (!TARGET_AVX2
38115 || d->one_operand_p
38116 || (d->vmode != V32QImode && d->vmode != V16HImode))
38117 return false;
38118
38119 if (d->testing_p)
38120 return true;
38121
38122 nelt = d->nelt;
38123 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38124
38125 /* Generate 4 permutation masks. If the required element is within
 38126 the same lane, it is shuffled in. If the required element is from
 38127 the other lane, force a zero by setting bit 7 in the permutation
 38128 mask. In the cross-lane mask for an operand, an element is
 38129 non-negative if it is requested from the other lane; it is also
 38130 moved to the other lane, so that the result of vpshufb can have its
 38131 two V2TImode halves swapped. */
38132 m128 = GEN_INT (-128);
38133 for (i = 0; i < 32; ++i)
38134 {
38135 rperm[0][i] = m128;
38136 rperm[1][i] = m128;
38137 rperm[2][i] = m128;
38138 rperm[3][i] = m128;
38139 }
38140 used[0] = false;
38141 used[1] = false;
38142 used[2] = false;
38143 used[3] = false;
38144 for (i = 0; i < nelt; ++i)
38145 {
38146 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
38147 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
38148 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
38149
38150 for (j = 0; j < eltsz; ++j)
38151 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
38152 used[which] = true;
38153 }
38154
38155 for (i = 0; i < 2; ++i)
38156 {
38157 if (!used[2 * i + 1])
38158 {
38159 h[i] = NULL_RTX;
38160 continue;
38161 }
38162 vperm = gen_rtx_CONST_VECTOR (V32QImode,
38163 gen_rtvec_v (32, rperm[2 * i + 1]));
38164 vperm = force_reg (V32QImode, vperm);
38165 h[i] = gen_reg_rtx (V32QImode);
38166 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
38167 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
38168 }
38169
 38170 /* Swap the 128-bit lanes of h[X]. */
38171 for (i = 0; i < 2; ++i)
38172 {
38173 if (h[i] == NULL_RTX)
38174 continue;
38175 op = gen_reg_rtx (V4DImode);
38176 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
38177 const2_rtx, GEN_INT (3), const0_rtx,
38178 const1_rtx));
38179 h[i] = gen_lowpart (V32QImode, op);
38180 }
38181
38182 for (i = 0; i < 2; ++i)
38183 {
38184 if (!used[2 * i])
38185 {
38186 l[i] = NULL_RTX;
38187 continue;
38188 }
38189 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
38190 vperm = force_reg (V32QImode, vperm);
38191 l[i] = gen_reg_rtx (V32QImode);
38192 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
38193 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
38194 }
38195
38196 for (i = 0; i < 2; ++i)
38197 {
38198 if (h[i] && l[i])
38199 {
38200 op = gen_reg_rtx (V32QImode);
38201 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
38202 l[i] = op;
38203 }
38204 else if (h[i])
38205 l[i] = h[i];
38206 }
38207
38208 gcc_assert (l[0] && l[1]);
38209 op = gen_lowpart (V32QImode, d->target);
38210 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
38211 return true;
38212 }
38213
38214 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
38215 With all of the interface bits taken care of, perform the expansion
38216 in D and return true on success. */
38217
38218 static bool
38219 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
38220 {
38221 /* Try a single instruction expansion. */
38222 if (expand_vec_perm_1 (d))
38223 return true;
38224
38225 /* Try sequences of two instructions. */
38226
38227 if (expand_vec_perm_pshuflw_pshufhw (d))
38228 return true;
38229
38230 if (expand_vec_perm_palignr (d))
38231 return true;
38232
38233 if (expand_vec_perm_interleave2 (d))
38234 return true;
38235
38236 if (expand_vec_perm_broadcast (d))
38237 return true;
38238
38239 if (expand_vec_perm_vpermq_perm_1 (d))
38240 return true;
38241
38242 if (expand_vec_perm_vperm2f128 (d))
38243 return true;
38244
38245 /* Try sequences of three instructions. */
38246
38247 if (expand_vec_perm_2vperm2f128_vshuf (d))
38248 return true;
38249
38250 if (expand_vec_perm_pshufb2 (d))
38251 return true;
38252
38253 if (expand_vec_perm_interleave3 (d))
38254 return true;
38255
38256 if (expand_vec_perm_vperm2f128_vblend (d))
38257 return true;
38258
38259 /* Try sequences of four instructions. */
38260
38261 if (expand_vec_perm_vpshufb2_vpermq (d))
38262 return true;
38263
38264 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
38265 return true;
38266
38267 /* ??? Look for narrow permutations whose element orderings would
38268 allow the promotion to a wider mode. */
38269
38270 /* ??? Look for sequences of interleave or a wider permute that place
38271 the data into the correct lanes for a half-vector shuffle like
38272 pshuf[lh]w or vpermilps. */
38273
38274 /* ??? Look for sequences of interleave that produce the desired results.
38275 The combinatorics of punpck[lh] get pretty ugly... */
38276
38277 if (expand_vec_perm_even_odd (d))
38278 return true;
38279
38280 /* Even longer sequences. */
38281 if (expand_vec_perm_vpshufb4_vpermq2 (d))
38282 return true;
38283
38284 return false;
38285 }
38286
38287 /* If a permutation only uses one operand, make it clear. Returns true
38288 if the permutation references both operands. */
38289
38290 static bool
38291 canonicalize_perm (struct expand_vec_perm_d *d)
38292 {
38293 int i, which, nelt = d->nelt;
38294
38295 for (i = which = 0; i < nelt; ++i)
38296 which |= (d->perm[i] < nelt ? 1 : 2);
38297
38298 d->one_operand_p = true;
38299 switch (which)
38300 {
38301 default:
38302 gcc_unreachable();
38303
38304 case 3:
38305 if (!rtx_equal_p (d->op0, d->op1))
38306 {
38307 d->one_operand_p = false;
38308 break;
38309 }
38310 /* The elements of PERM do not suggest that only the first operand
38311 is used, but both operands are identical. Allow easier matching
38312 of the permutation by folding the permutation into the single
38313 input vector. */
38314 /* FALLTHRU */
38315
38316 case 2:
38317 for (i = 0; i < nelt; ++i)
38318 d->perm[i] &= nelt - 1;
38319 d->op0 = d->op1;
38320 break;
38321
38322 case 1:
38323 d->op1 = d->op0;
38324 break;
38325 }
38326
38327 return (which == 3);
38328 }
38329
38330 bool
38331 ix86_expand_vec_perm_const (rtx operands[4])
38332 {
38333 struct expand_vec_perm_d d;
38334 unsigned char perm[MAX_VECT_LEN];
38335 int i, nelt;
38336 bool two_args;
38337 rtx sel;
38338
38339 d.target = operands[0];
38340 d.op0 = operands[1];
38341 d.op1 = operands[2];
38342 sel = operands[3];
38343
38344 d.vmode = GET_MODE (d.target);
38345 gcc_assert (VECTOR_MODE_P (d.vmode));
38346 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
38347 d.testing_p = false;
38348
38349 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
38350 gcc_assert (XVECLEN (sel, 0) == nelt);
38351 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
38352
38353 for (i = 0; i < nelt; ++i)
38354 {
38355 rtx e = XVECEXP (sel, 0, i);
38356 int ei = INTVAL (e) & (2 * nelt - 1);
38357 d.perm[i] = ei;
38358 perm[i] = ei;
38359 }
38360
38361 two_args = canonicalize_perm (&d);
38362
38363 if (ix86_expand_vec_perm_const_1 (&d))
38364 return true;
38365
38366 /* If the selector says both arguments are needed, but the operands are the
38367 same, the above tried to expand with one_operand_p and flattened selector.
38368 If that didn't work, retry without one_operand_p; we succeeded with that
38369 during testing. */
38370 if (two_args && d.one_operand_p)
38371 {
38372 d.one_operand_p = false;
38373 memcpy (d.perm, perm, sizeof (perm));
38374 return ix86_expand_vec_perm_const_1 (&d);
38375 }
38376
38377 return false;
38378 }
38379
38380 /* Implement targetm.vectorize.vec_perm_const_ok. */
38381
38382 static bool
38383 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
38384 const unsigned char *sel)
38385 {
38386 struct expand_vec_perm_d d;
38387 unsigned int i, nelt, which;
38388 bool ret;
38389
38390 d.vmode = vmode;
38391 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
38392 d.testing_p = true;
38393
38394 /* Given sufficient ISA support we can just return true here
38395 for selected vector modes. */
38396 if (GET_MODE_SIZE (d.vmode) == 16)
38397 {
38398 /* All implementable with a single vpperm insn. */
38399 if (TARGET_XOP)
38400 return true;
38401 /* All implementable with 2 pshufb + 1 ior. */
38402 if (TARGET_SSSE3)
38403 return true;
38404 /* All implementable with shufpd or unpck[lh]pd. */
38405 if (d.nelt == 2)
38406 return true;
38407 }
38408
38409 /* Extract the values from the vector CST into the permutation
38410 array in D. */
38411 memcpy (d.perm, sel, nelt);
38412 for (i = which = 0; i < nelt; ++i)
38413 {
38414 unsigned char e = d.perm[i];
38415 gcc_assert (e < 2 * nelt);
38416 which |= (e < nelt ? 1 : 2);
38417 }
38418
38419 /* For all elements from second vector, fold the elements to first. */
38420 if (which == 2)
38421 for (i = 0; i < nelt; ++i)
38422 d.perm[i] -= nelt;
38423
38424 /* Check whether the mask can be applied to the vector type. */
38425 d.one_operand_p = (which != 3);
38426
38427 /* Implementable with shufps or pshufd. */
38428 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
38429 return true;
38430
38431 /* Otherwise we have to go through the motions and see if we can
38432 figure out how to generate the requested permutation. */
38433 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
38434 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
38435 if (!d.one_operand_p)
38436 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
38437
38438 start_sequence ();
38439 ret = ix86_expand_vec_perm_const_1 (&d);
38440 end_sequence ();
38441
38442 return ret;
38443 }
38444
38445 void
38446 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
38447 {
38448 struct expand_vec_perm_d d;
38449 unsigned i, nelt;
38450
38451 d.target = targ;
38452 d.op0 = op0;
38453 d.op1 = op1;
38454 d.vmode = GET_MODE (targ);
38455 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
38456 d.one_operand_p = false;
38457 d.testing_p = false;
38458
38459 for (i = 0; i < nelt; ++i)
38460 d.perm[i] = i * 2 + odd;
38461
38462 /* We'll either be able to implement the permutation directly... */
38463 if (expand_vec_perm_1 (&d))
38464 return;
38465
38466 /* ... or we use the special-case patterns. */
38467 expand_vec_perm_even_odd_1 (&d, odd);
38468 }
38469
38470 /* Expand a vector operation CODE for a V*QImode in terms of the
38471 same operation on V*HImode. */
38472
38473 void
38474 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
38475 {
38476 enum machine_mode qimode = GET_MODE (dest);
38477 enum machine_mode himode;
38478 rtx (*gen_il) (rtx, rtx, rtx);
38479 rtx (*gen_ih) (rtx, rtx, rtx);
38480 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
38481 struct expand_vec_perm_d d;
38482 bool ok, full_interleave;
38483 bool uns_p = false;
38484 int i;
38485
38486 switch (qimode)
38487 {
38488 case V16QImode:
38489 himode = V8HImode;
38490 gen_il = gen_vec_interleave_lowv16qi;
38491 gen_ih = gen_vec_interleave_highv16qi;
38492 break;
38493 case V32QImode:
38494 himode = V16HImode;
38495 gen_il = gen_avx2_interleave_lowv32qi;
38496 gen_ih = gen_avx2_interleave_highv32qi;
38497 break;
38498 default:
38499 gcc_unreachable ();
38500 }
38501
38502 op2_l = op2_h = op2;
38503 switch (code)
38504 {
38505 case MULT:
38506 /* Unpack data such that we've got a source byte in each low byte of
38507 each word. We don't care what goes into the high byte of each word.
 38508 Rather than trying to get zero in there, it is most convenient to let
38509 it be a copy of the low byte. */
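 /* To see why the high bytes do not matter: with the low byte copied
 into the high byte each word holds a * 0x0101, and any product term
 involving a high byte is scaled by 256, so it cannot reach bits 0-7.
 The low byte of each word product is therefore still the low byte of
 a * b, which is all the final extraction keeps. */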
38510 op2_l = gen_reg_rtx (qimode);
38511 op2_h = gen_reg_rtx (qimode);
38512 emit_insn (gen_il (op2_l, op2, op2));
38513 emit_insn (gen_ih (op2_h, op2, op2));
38514 /* FALLTHRU */
38515
38516 op1_l = gen_reg_rtx (qimode);
38517 op1_h = gen_reg_rtx (qimode);
38518 emit_insn (gen_il (op1_l, op1, op1));
38519 emit_insn (gen_ih (op1_h, op1, op1));
38520 full_interleave = qimode == V16QImode;
38521 break;
38522
38523 case ASHIFT:
38524 case LSHIFTRT:
38525 uns_p = true;
38526 /* FALLTHRU */
38527 case ASHIFTRT:
38528 op1_l = gen_reg_rtx (himode);
38529 op1_h = gen_reg_rtx (himode);
38530 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
38531 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
38532 full_interleave = true;
38533 break;
38534 default:
38535 gcc_unreachable ();
38536 }
38537
38538 /* Perform the operation. */
38539 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
38540 1, OPTAB_DIRECT);
38541 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
38542 1, OPTAB_DIRECT);
38543 gcc_assert (res_l && res_h);
38544
38545 /* Merge the data back into the right place. */
38546 d.target = dest;
38547 d.op0 = gen_lowpart (qimode, res_l);
38548 d.op1 = gen_lowpart (qimode, res_h);
38549 d.vmode = qimode;
38550 d.nelt = GET_MODE_NUNITS (qimode);
38551 d.one_operand_p = false;
38552 d.testing_p = false;
38553
38554 if (full_interleave)
38555 {
 38556 /* For SSE2, we used a full interleave, so the desired
38557 results are in the even elements. */
38558 for (i = 0; i < 32; ++i)
38559 d.perm[i] = i * 2;
38560 }
38561 else
38562 {
 38563 /* For AVX, the interleave used above was not cross-lane. So the extraction
 38564 pattern is the even elements, but with the second and third quarters swapped.
 38565 Happily, that is even one insn shorter than a plain even extraction. */
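 /* Concretely (illustrative): res_l holds the results for source bytes
 0-7 and 16-23 (one 128-bit lane each) and res_h those for bytes 8-15
 and 24-31, so the required selector offsets per quarter are 0, +16,
 -16 and 0 relative to plain even extraction (2*i), which is what the
 expression below encodes. */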
38566 for (i = 0; i < 32; ++i)
38567 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
38568 }
38569
38570 ok = ix86_expand_vec_perm_const_1 (&d);
38571 gcc_assert (ok);
38572
38573 set_unique_reg_note (get_last_insn (), REG_EQUAL,
38574 gen_rtx_fmt_ee (code, qimode, op1, op2));
38575 }
38576
38577 void
38578 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
38579 {
38580 rtx op1_m1, op1_m2;
38581 rtx op2_m1, op2_m2;
38582 rtx res_1, res_2;
38583
38584 /* Shift both input vectors down one element, so that elements 3
38585 and 1 are now in the slots for elements 2 and 0. For K8, at
38586 least, this is faster than using a shuffle. */
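 /* Sketch of the whole sequence (illustrative): with op1 = {a0,a1,a2,a3}
 and op2 = {b0,b1,b2,b3}, the shifted copies are {a1,a2,a3,0} and
 {b1,b2,b3,0} (or the equivalent constant vectors); pmuludq then
 computes {a0*b0, a2*b2} and {a1*b1, a3*b3} as 64-bit products, the
 pshufd below moves the low halves of those products into elements 0
 and 1, and the final interleave low reassembles
 {a0*b0, a1*b1, a2*b2, a3*b3}. */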
38587 op1_m1 = op1 = force_reg (V4SImode, op1);
38588 op1_m2 = gen_reg_rtx (V4SImode);
38589 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op1_m2),
38590 gen_lowpart (V1TImode, op1),
38591 GEN_INT (32)));
38592
38593 if (GET_CODE (op2) == CONST_VECTOR)
38594 {
38595 rtvec v;
38596
 38597 /* Constant propagate the vector shift, leaving the don't-care
38598 vector elements as zero. */
38599 v = rtvec_alloc (4);
38600 RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 0);
38601 RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 2);
38602 RTVEC_ELT (v, 1) = const0_rtx;
38603 RTVEC_ELT (v, 3) = const0_rtx;
38604 op2_m1 = gen_rtx_CONST_VECTOR (V4SImode, v);
38605 op2_m1 = force_reg (V4SImode, op2_m1);
38606
38607 v = rtvec_alloc (4);
38608 RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 1);
38609 RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 3);
38610 RTVEC_ELT (v, 1) = const0_rtx;
38611 RTVEC_ELT (v, 3) = const0_rtx;
38612 op2_m2 = gen_rtx_CONST_VECTOR (V4SImode, v);
38613 op2_m2 = force_reg (V4SImode, op2_m2);
38614 }
38615 else
38616 {
38617 op2_m1 = op2 = force_reg (V4SImode, op2);
38618 op2_m2 = gen_reg_rtx (V4SImode);
38619 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op2_m2),
38620 gen_lowpart (V1TImode, op2),
38621 GEN_INT (32)));
38622 }
38623
38624 /* Widening multiply of elements 0+2, and 1+3. */
38625 res_1 = gen_reg_rtx (V4SImode);
38626 res_2 = gen_reg_rtx (V4SImode);
38627 emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_1),
38628 op1_m1, op2_m1));
38629 emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_2),
38630 op1_m2, op2_m2));
38631
38632 /* Move the results in element 2 down to element 1; we don't care
38633 what goes in elements 2 and 3. Then we can merge the parts
38634 back together with an interleave.
38635
38636 Note that two other sequences were tried:
38637 (1) Use interleaves at the start instead of psrldq, which allows
38638 us to use a single shufps to merge things back at the end.
38639 (2) Use shufps here to combine the two vectors, then pshufd to
38640 put the elements in the correct order.
38641 In both cases the cost of the reformatting stall was too high
38642 and the overall sequence slower. */
38643
38644 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
38645 const0_rtx, const0_rtx));
38646 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
38647 const0_rtx, const0_rtx));
38648 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
38649
38650 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
38651 }
38652
38653 /* Expand an insert into a vector register through pinsr insn.
38654 Return true if successful. */
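 /* For example (illustrative values only): inserting a 16-bit value at
 bit position 32 of a V8HImode destination gives pos / size == 2, i.e.
 a pinsrw into element 2 (vec_merge mask 1 << 2). */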
38655
38656 bool
38657 ix86_expand_pinsr (rtx *operands)
38658 {
38659 rtx dst = operands[0];
38660 rtx src = operands[3];
38661
38662 unsigned int size = INTVAL (operands[1]);
38663 unsigned int pos = INTVAL (operands[2]);
38664
38665 if (GET_CODE (dst) == SUBREG)
38666 {
38667 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
38668 dst = SUBREG_REG (dst);
38669 }
38670
38671 if (GET_CODE (src) == SUBREG)
38672 src = SUBREG_REG (src);
38673
38674 switch (GET_MODE (dst))
38675 {
38676 case V16QImode:
38677 case V8HImode:
38678 case V4SImode:
38679 case V2DImode:
38680 {
38681 enum machine_mode srcmode, dstmode;
38682 rtx (*pinsr)(rtx, rtx, rtx, rtx);
38683
38684 srcmode = mode_for_size (size, MODE_INT, 0);
38685
38686 switch (srcmode)
38687 {
38688 case QImode:
38689 if (!TARGET_SSE4_1)
38690 return false;
38691 dstmode = V16QImode;
38692 pinsr = gen_sse4_1_pinsrb;
38693 break;
38694
38695 case HImode:
38696 if (!TARGET_SSE2)
38697 return false;
38698 dstmode = V8HImode;
38699 pinsr = gen_sse2_pinsrw;
38700 break;
38701
38702 case SImode:
38703 if (!TARGET_SSE4_1)
38704 return false;
38705 dstmode = V4SImode;
38706 pinsr = gen_sse4_1_pinsrd;
38707 break;
38708
38709 case DImode:
38710 gcc_assert (TARGET_64BIT);
38711 if (!TARGET_SSE4_1)
38712 return false;
38713 dstmode = V2DImode;
38714 pinsr = gen_sse4_1_pinsrq;
38715 break;
38716
38717 default:
38718 return false;
38719 }
38720
38721 dst = gen_lowpart (dstmode, dst);
38722 src = gen_lowpart (srcmode, src);
38723
38724 pos /= size;
38725
38726 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
38727 return true;
38728 }
38729
38730 default:
38731 return false;
38732 }
38733 }
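/* A minimal sketch of the lane computation used above.  The helper name
   below is invented for illustration and is not part of the backend: for
   a field of SIZE bits inserted at bit position POS, the pinsr immediate
   selects lane POS / SIZE.  */

static unsigned int
example_pinsr_lane_mask (unsigned int pos, unsigned int size)
{
  /* E.g. a 16-bit element at bit position 32 of a V8HImode destination
     lands in lane 2, so the mask is 1 << 2 == 4.  */
  return 1u << (pos / size);
}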
38734 \f
38735 /* This function returns the calling ABI specific va_list type node.
38736 It returns the FNDECL specific va_list type. */
38737
38738 static tree
38739 ix86_fn_abi_va_list (tree fndecl)
38740 {
38741 if (!TARGET_64BIT)
38742 return va_list_type_node;
38743 gcc_assert (fndecl != NULL_TREE);
38744
38745 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
38746 return ms_va_list_type_node;
38747 else
38748 return sysv_va_list_type_node;
38749 }
38750
38751 /* Returns the canonical va_list type specified by TYPE. If there
38752 is no valid TYPE provided, it returns NULL_TREE. */
38753
38754 static tree
38755 ix86_canonical_va_list_type (tree type)
38756 {
38757 tree wtype, htype;
38758
38759 /* Resolve references and pointers to va_list type. */
38760 if (TREE_CODE (type) == MEM_REF)
38761 type = TREE_TYPE (type);
38762 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
38763 type = TREE_TYPE (type);
38764 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
38765 type = TREE_TYPE (type);
38766
38767 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
38768 {
38769 wtype = va_list_type_node;
38770 gcc_assert (wtype != NULL_TREE);
38771 htype = type;
38772 if (TREE_CODE (wtype) == ARRAY_TYPE)
38773 {
38774 /* If va_list is an array type, the argument may have decayed
38775 to a pointer type, e.g. by being passed to another function.
38776 In that case, unwrap both types so that we can compare the
38777 underlying records. */
38778 if (TREE_CODE (htype) == ARRAY_TYPE
38779 || POINTER_TYPE_P (htype))
38780 {
38781 wtype = TREE_TYPE (wtype);
38782 htype = TREE_TYPE (htype);
38783 }
38784 }
38785 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
38786 return va_list_type_node;
38787 wtype = sysv_va_list_type_node;
38788 gcc_assert (wtype != NULL_TREE);
38789 htype = type;
38790 if (TREE_CODE (wtype) == ARRAY_TYPE)
38791 {
38792 /* If va_list is an array type, the argument may have decayed
38793 to a pointer type, e.g. by being passed to another function.
38794 In that case, unwrap both types so that we can compare the
38795 underlying records. */
38796 if (TREE_CODE (htype) == ARRAY_TYPE
38797 || POINTER_TYPE_P (htype))
38798 {
38799 wtype = TREE_TYPE (wtype);
38800 htype = TREE_TYPE (htype);
38801 }
38802 }
38803 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
38804 return sysv_va_list_type_node;
38805 wtype = ms_va_list_type_node;
38806 gcc_assert (wtype != NULL_TREE);
38807 htype = type;
38808 if (TREE_CODE (wtype) == ARRAY_TYPE)
38809 {
38810 /* If va_list is an array type, the argument may have decayed
38811 to a pointer type, e.g. by being passed to another function.
38812 In that case, unwrap both types so that we can compare the
38813 underlying records. */
38814 if (TREE_CODE (htype) == ARRAY_TYPE
38815 || POINTER_TYPE_P (htype))
38816 {
38817 wtype = TREE_TYPE (wtype);
38818 htype = TREE_TYPE (htype);
38819 }
38820 }
38821 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
38822 return ms_va_list_type_node;
38823 return NULL_TREE;
38824 }
38825 return std_canonical_va_list_type (type);
38826 }
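/* A condensed sketch of the comparison pattern used three times above.
   The helper name is invented for illustration and is not referenced
   elsewhere: when the builtin va_list is an array type, strip one level
   from both types so that an argument that has decayed to a pointer
   still matches, then compare the main variants.  */

static bool
example_va_list_matches (tree wtype, tree htype)
{
  if (TREE_CODE (wtype) == ARRAY_TYPE
      && (TREE_CODE (htype) == ARRAY_TYPE || POINTER_TYPE_P (htype)))
    {
      /* Unwrap both the builtin type and the argument type.  */
      wtype = TREE_TYPE (wtype);
      htype = TREE_TYPE (htype);
    }
  return TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype);
}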
38827
38828 /* Iterate through the target-specific builtin types for va_list.
38829 IDX denotes the iterator, *PTREE is set to the result type of
38830 the va_list builtin, and *PNAME to its name.
38831 Returns zero if there is no element for this index, otherwise
38832 IDX should be increased upon the next call.
38833 Note, do not iterate a base builtin's name like __builtin_va_list.
38834 Used from c_common_nodes_and_builtins. */
38835
38836 static int
38837 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
38838 {
38839 if (TARGET_64BIT)
38840 {
38841 switch (idx)
38842 {
38843 default:
38844 break;
38845
38846 case 0:
38847 *ptree = ms_va_list_type_node;
38848 *pname = "__builtin_ms_va_list";
38849 return 1;
38850
38851 case 1:
38852 *ptree = sysv_va_list_type_node;
38853 *pname = "__builtin_sysv_va_list";
38854 return 1;
38855 }
38856 }
38857
38858 return 0;
38859 }
38860
38861 #undef TARGET_SCHED_DISPATCH
38862 #define TARGET_SCHED_DISPATCH has_dispatch
38863 #undef TARGET_SCHED_DISPATCH_DO
38864 #define TARGET_SCHED_DISPATCH_DO do_dispatch
38865 #undef TARGET_SCHED_REASSOCIATION_WIDTH
38866 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
38867 #undef TARGET_SCHED_REORDER
38868 #define TARGET_SCHED_REORDER ix86_sched_reorder
38869
38870 /* The size of the dispatch window is the total number of bytes of
38871 object code allowed in a window. */
38872 #define DISPATCH_WINDOW_SIZE 16
38873
38874 /* Number of dispatch windows considered for scheduling. */
38875 #define MAX_DISPATCH_WINDOWS 3
38876
38877 /* Maximum number of instructions in a window. */
38878 #define MAX_INSN 4
38879
38880 /* Maximum number of immediate operands in a window. */
38881 #define MAX_IMM 4
38882
38883 /* Maximum number of immediate bits allowed in a window. */
38884 #define MAX_IMM_SIZE 128
38885
38886 /* Maximum number of 32 bit immediates allowed in a window. */
38887 #define MAX_IMM_32 4
38888
38889 /* Maximum number of 64 bit immediates allowed in a window. */
38890 #define MAX_IMM_64 2
38891
38892 /* Maximum total of loads or prefetches allowed in a window. */
38893 #define MAX_LOAD 2
38894
38895 /* Maximum total of stores allowed in a window. */
38896 #define MAX_STORE 1
38897
38898 #undef BIG
38899 #define BIG 100
38900
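/* Illustrative summary of how the limits above combine (not used by the
   code): DISPATCH_WINDOW_SIZE (16) is the number of object-code bytes one
   window may hold and MAX_INSN (4) the number of instructions, so e.g.
   four 4-byte instructions exactly fill one window; the 48-byte limit
   used in the window management below corresponds to
   MAX_DISPATCH_WINDOWS * DISPATCH_WINDOW_SIZE bytes.  */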
38901
38902 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
38903 enum dispatch_group {
38904 disp_no_group = 0,
38905 disp_load,
38906 disp_store,
38907 disp_load_store,
38908 disp_prefetch,
38909 disp_imm,
38910 disp_imm_32,
38911 disp_imm_64,
38912 disp_branch,
38913 disp_cmp,
38914 disp_jcc,
38915 disp_last
38916 };
38917
38918 /* Number of allowable groups in a dispatch window. It is an array
38919 indexed by the dispatch_group enum. 100 is used as a big number
38920 because the number of these kinds of operations does not have any
38921 effect on the dispatch window, but we need entries for them in the
38922 table for other reasons. */
38923 static unsigned int num_allowable_groups[disp_last] = {
38924 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
38925 };
38926
38927 char group_name[disp_last + 1][16] = {
38928 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
38929 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
38930 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
38931 };
38932
38933 /* Instruction path. */
38934 enum insn_path {
38935 no_path = 0,
38936 path_single, /* Single micro op. */
38937 path_double, /* Double micro op. */
38938 path_multi, /* Instructions with more than 2 micro ops. */
38939 last_path
38940 };
38941
38942 /* sched_insn_info defines a window to the instructions scheduled in
38943 the basic block. It contains a pointer to the insn_info table and
38944 the instruction scheduled.
38945
38946 Windows are allocated for each basic block and are linked
38947 together. */
38948 typedef struct sched_insn_info_s {
38949 rtx insn;
38950 enum dispatch_group group;
38951 enum insn_path path;
38952 int byte_len;
38953 int imm_bytes;
38954 } sched_insn_info;
38955
38956 /* Linked list of dispatch windows. This is a two-way list of
38957 dispatch windows of a basic block. It contains information about
38958 the number of uops in the window and the total number of
38959 instructions and of bytes in the object code for this dispatch
38960 window. */
38961 typedef struct dispatch_windows_s {
38962 int num_insn; /* Number of insn in the window. */
38963 int num_uops; /* Number of uops in the window. */
38964 int window_size; /* Number of bytes in the window. */
38965 int window_num; /* Window number, either 0 or 1. */
38966 int num_imm; /* Number of immediates in the window. */
38967 int num_imm_32; /* Number of 32 bit immediates in the window. */
38968 int num_imm_64; /* Number of 64 bit immediates in the window. */
38969 int imm_size; /* Total size of immediates in the window. */
38970 int num_loads; /* Total memory loads in the window. */
38971 int num_stores; /* Total memory stores in the window. */
38972 int violation; /* Violation exists in window. */
38973 sched_insn_info *window; /* Pointer to the window. */
38974 struct dispatch_windows_s *next;
38975 struct dispatch_windows_s *prev;
38976 } dispatch_windows;
38977
38978 /* Immediate values used in an insn. */
38979 typedef struct imm_info_s
38980 {
38981 int imm;
38982 int imm32;
38983 int imm64;
38984 } imm_info;
38985
38986 static dispatch_windows *dispatch_window_list;
38987 static dispatch_windows *dispatch_window_list1;
38988
38989 /* Get dispatch group of insn. */
38990
38991 static enum dispatch_group
38992 get_mem_group (rtx insn)
38993 {
38994 enum attr_memory memory;
38995
38996 if (INSN_CODE (insn) < 0)
38997 return disp_no_group;
38998 memory = get_attr_memory (insn);
38999 if (memory == MEMORY_STORE)
39000 return disp_store;
39001
39002 if (memory == MEMORY_LOAD)
39003 return disp_load;
39004
39005 if (memory == MEMORY_BOTH)
39006 return disp_load_store;
39007
39008 return disp_no_group;
39009 }
39010
39011 /* Return true if insn is a compare instruction. */
39012
39013 static bool
39014 is_cmp (rtx insn)
39015 {
39016 enum attr_type type;
39017
39018 type = get_attr_type (insn);
39019 return (type == TYPE_TEST
39020 || type == TYPE_ICMP
39021 || type == TYPE_FCMP
39022 || GET_CODE (PATTERN (insn)) == COMPARE);
39023 }
39024
39025 /* Return true if a dispatch violation was encountered. */
39026
39027 static bool
39028 dispatch_violation (void)
39029 {
39030 if (dispatch_window_list->next)
39031 return dispatch_window_list->next->violation;
39032 return dispatch_window_list->violation;
39033 }
39034
39035 /* Return true if insn is a branch instruction. */
39036
39037 static bool
39038 is_branch (rtx insn)
39039 {
39040 return (CALL_P (insn) || JUMP_P (insn));
39041 }
39042
39043 /* Return true if insn is a prefetch instruction. */
39044
39045 static bool
39046 is_prefetch (rtx insn)
39047 {
39048 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
39049 }
39050
39051 /* This function initializes a dispatch window and the list container holding a
39052 pointer to the window. */
39053
39054 static void
39055 init_window (int window_num)
39056 {
39057 int i;
39058 dispatch_windows *new_list;
39059
39060 if (window_num == 0)
39061 new_list = dispatch_window_list;
39062 else
39063 new_list = dispatch_window_list1;
39064
39065 new_list->num_insn = 0;
39066 new_list->num_uops = 0;
39067 new_list->window_size = 0;
39068 new_list->next = NULL;
39069 new_list->prev = NULL;
39070 new_list->window_num = window_num;
39071 new_list->num_imm = 0;
39072 new_list->num_imm_32 = 0;
39073 new_list->num_imm_64 = 0;
39074 new_list->imm_size = 0;
39075 new_list->num_loads = 0;
39076 new_list->num_stores = 0;
39077 new_list->violation = false;
39078
39079 for (i = 0; i < MAX_INSN; i++)
39080 {
39081 new_list->window[i].insn = NULL;
39082 new_list->window[i].group = disp_no_group;
39083 new_list->window[i].path = no_path;
39084 new_list->window[i].byte_len = 0;
39085 new_list->window[i].imm_bytes = 0;
39086 }
39087 return;
39088 }
39089
39090 /* This function allocates and initializes a dispatch window and the
39091 list container holding a pointer to the window. */
39092
39093 static dispatch_windows *
39094 allocate_window (void)
39095 {
39096 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
39097 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
39098
39099 return new_list;
39100 }
39101
39102 /* This routine initializes the dispatch scheduling information. It
39103 initiates building dispatch scheduler tables and constructs the
39104 first dispatch window. */
39105
39106 static void
39107 init_dispatch_sched (void)
39108 {
39109 /* Allocate a dispatch list and a window. */
39110 dispatch_window_list = allocate_window ();
39111 dispatch_window_list1 = allocate_window ();
39112 init_window (0);
39113 init_window (1);
39114 }
39115
39116 /* This function returns true if a branch is detected. End of a basic block
39117 does not have to be a branch, but here we assume only branches end a
39118 window. */
39119
39120 static bool
39121 is_end_basic_block (enum dispatch_group group)
39122 {
39123 return group == disp_branch;
39124 }
39125
39126 /* This function is called when the processing of a window is finished. */
39127
39128 static void
39129 process_end_window (void)
39130 {
39131 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
39132 if (dispatch_window_list->next)
39133 {
39134 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
39135 gcc_assert (dispatch_window_list->window_size
39136 + dispatch_window_list1->window_size <= 48);
39137 init_window (1);
39138 }
39139 init_window (0);
39140 }
39141
39142 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
39143 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
39144 for 48 bytes of instructions. Note that these windows are not dispatch
39145 windows whose sizes are DISPATCH_WINDOW_SIZE. */
39146
39147 static dispatch_windows *
39148 allocate_next_window (int window_num)
39149 {
39150 if (window_num == 0)
39151 {
39152 if (dispatch_window_list->next)
39153 init_window (1);
39154 init_window (0);
39155 return dispatch_window_list;
39156 }
39157
39158 dispatch_window_list->next = dispatch_window_list1;
39159 dispatch_window_list1->prev = dispatch_window_list;
39160
39161 return dispatch_window_list1;
39162 }
39163
39164 /* Helper for find_constant. Count the immediate operands in *IN_RTX, updating IMM_VALUES. */
39165
39166 static int
39167 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
39168 {
39169 if (*in_rtx == 0)
39170 return 0;
39171
39172 switch ( GET_CODE (*in_rtx))
39173 {
39174 case CONST:
39175 case SYMBOL_REF:
39176 case CONST_INT:
39177 (imm_values->imm)++;
39178 if (x86_64_immediate_operand (*in_rtx, SImode))
39179 (imm_values->imm32)++;
39180 else
39181 (imm_values->imm64)++;
39182 break;
39183
39184 case CONST_DOUBLE:
39185 (imm_values->imm)++;
39186 (imm_values->imm64)++;
39187 break;
39188
39189 case CODE_LABEL:
39190 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
39191 {
39192 (imm_values->imm)++;
39193 (imm_values->imm32)++;
39194 }
39195 break;
39196
39197 default:
39198 break;
39199 }
39200
39201 return 0;
39202 }
39203
39204 /* Compute number of immediate operands of an instruction. */
39205
39206 static void
39207 find_constant (rtx in_rtx, imm_info *imm_values)
39208 {
39209 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
39210 (rtx_function) find_constant_1, (void *) imm_values);
39211 }
39212
39213 /* Return total size of immediate operands of an instruction along with number
39214 of corresponding immediate operands. It initializes the counters to zero
39215 before calling FIND_CONSTANT.
39216 INSN is the input instruction. IMM is the total of immediates.
39217 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
39218 bit immediates. */
39219
39220 static int
39221 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
39222 {
39223 imm_info imm_values = {0, 0, 0};
39224
39225 find_constant (insn, &imm_values);
39226 *imm = imm_values.imm;
39227 *imm32 = imm_values.imm32;
39228 *imm64 = imm_values.imm64;
39229 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
39230 }
39231
39232 /* This function indicates whether an instruction has an immediate
39233 operand. */
39234
39235 static bool
39236 has_immediate (rtx insn)
39237 {
39238 int num_imm_operand;
39239 int num_imm32_operand;
39240 int num_imm64_operand;
39241
39242 if (insn)
39243 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
39244 &num_imm64_operand);
39245 return false;
39246 }
39247
39248 /* Return single or double path for instructions. */
39249
39250 static enum insn_path
39251 get_insn_path (rtx insn)
39252 {
39253 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
39254
39255 if ((int)path == 0)
39256 return path_single;
39257
39258 if ((int)path == 1)
39259 return path_double;
39260
39261 return path_multi;
39262 }
39263
39264 /* Return insn dispatch group. */
39265
39266 static enum dispatch_group
39267 get_insn_group (rtx insn)
39268 {
39269 enum dispatch_group group = get_mem_group (insn);
39270 if (group)
39271 return group;
39272
39273 if (is_branch (insn))
39274 return disp_branch;
39275
39276 if (is_cmp (insn))
39277 return disp_cmp;
39278
39279 if (has_immediate (insn))
39280 return disp_imm;
39281
39282 if (is_prefetch (insn))
39283 return disp_prefetch;
39284
39285 return disp_no_group;
39286 }
39287
39288 /* Count number of GROUP restricted instructions in a dispatch
39289 window WINDOW_LIST. */
39290
39291 static int
39292 count_num_restricted (rtx insn, dispatch_windows *window_list)
39293 {
39294 enum dispatch_group group = get_insn_group (insn);
39295 int imm_size;
39296 int num_imm_operand;
39297 int num_imm32_operand;
39298 int num_imm64_operand;
39299
39300 if (group == disp_no_group)
39301 return 0;
39302
39303 if (group == disp_imm)
39304 {
39305 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
39306 &num_imm64_operand);
39307 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
39308 || num_imm_operand + window_list->num_imm > MAX_IMM
39309 || (num_imm32_operand > 0
39310 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
39311 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
39312 || (num_imm64_operand > 0
39313 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
39314 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
39315 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
39316 && num_imm64_operand > 0
39317 && ((window_list->num_imm_64 > 0
39318 && window_list->num_insn >= 2)
39319 || window_list->num_insn >= 3)))
39320 return BIG;
39321
39322 return 1;
39323 }
39324
39325 if ((group == disp_load_store
39326 && (window_list->num_loads >= MAX_LOAD
39327 || window_list->num_stores >= MAX_STORE))
39328 || ((group == disp_load
39329 || group == disp_prefetch)
39330 && window_list->num_loads >= MAX_LOAD)
39331 || (group == disp_store
39332 && window_list->num_stores >= MAX_STORE))
39333 return BIG;
39334
39335 return 1;
39336 }
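/* Worked example of the immediate budget checked above (a sketch with a
   hypothetical helper, not used by the backend): each 64-bit immediate
   counts as two 32-bit slots, so with MAX_IMM_32 == 4 and MAX_IMM_64 == 2
   a window already holding one 64-bit immediate can take at most two more
   32-bit immediates, since 2 * 1 + 3 == 5 would exceed the budget.  */

static bool
example_imm_slots_fit (int n_imm32, int n_imm64)
{
  return (n_imm64 <= MAX_IMM_64
	  && n_imm64 * 2 + n_imm32 <= MAX_IMM_32);
}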
39337
39338 /* This function returns true if insn satisfies dispatch rules on the
39339 last window scheduled. */
39340
39341 static bool
39342 fits_dispatch_window (rtx insn)
39343 {
39344 dispatch_windows *window_list = dispatch_window_list;
39345 dispatch_windows *window_list_next = dispatch_window_list->next;
39346 unsigned int num_restrict;
39347 enum dispatch_group group = get_insn_group (insn);
39348 enum insn_path path = get_insn_path (insn);
39349 int sum;
39350
39351 /* Make disp_cmp and disp_jcc get scheduled as late as possible. These
39352 instructions should be given the lowest priority in the
39353 scheduling process in the Haifa scheduler to make sure they will be
39354 scheduled in the same dispatch window as the instructions that reference them. */
39355 if (group == disp_jcc || group == disp_cmp)
39356 return false;
39357
39358 /* Check nonrestricted. */
39359 if (group == disp_no_group || group == disp_branch)
39360 return true;
39361
39362 /* Get last dispatch window. */
39363 if (window_list_next)
39364 window_list = window_list_next;
39365
39366 if (window_list->window_num == 1)
39367 {
39368 sum = window_list->prev->window_size + window_list->window_size;
39369
39370 if (sum == 32
39371 || (min_insn_size (insn) + sum) >= 48)
39372 /* Window 1 is full. Go for next window. */
39373 return true;
39374 }
39375
39376 num_restrict = count_num_restricted (insn, window_list);
39377
39378 if (num_restrict > num_allowable_groups[group])
39379 return false;
39380
39381 /* See if it fits in the first window. */
39382 if (window_list->window_num == 0)
39383 {
39384 /* The first window should have only single and double path
39385 uops. */
39386 if (path == path_double
39387 && (window_list->num_uops + 2) > MAX_INSN)
39388 return false;
39389 else if (path != path_single)
39390 return false;
39391 }
39392 return true;
39393 }
39394
39395 /* Add an instruction INSN with NUM_UOPS micro-operations to the
39396 dispatch window WINDOW_LIST. */
39397
39398 static void
39399 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
39400 {
39401 int byte_len = min_insn_size (insn);
39402 int num_insn = window_list->num_insn;
39403 int imm_size;
39404 sched_insn_info *window = window_list->window;
39405 enum dispatch_group group = get_insn_group (insn);
39406 enum insn_path path = get_insn_path (insn);
39407 int num_imm_operand;
39408 int num_imm32_operand;
39409 int num_imm64_operand;
39410
39411 if (!window_list->violation && group != disp_cmp
39412 && !fits_dispatch_window (insn))
39413 window_list->violation = true;
39414
39415 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
39416 &num_imm64_operand);
39417
39418 /* Initialize window with new instruction. */
39419 window[num_insn].insn = insn;
39420 window[num_insn].byte_len = byte_len;
39421 window[num_insn].group = group;
39422 window[num_insn].path = path;
39423 window[num_insn].imm_bytes = imm_size;
39424
39425 window_list->window_size += byte_len;
39426 window_list->num_insn = num_insn + 1;
39427 window_list->num_uops = window_list->num_uops + num_uops;
39428 window_list->imm_size += imm_size;
39429 window_list->num_imm += num_imm_operand;
39430 window_list->num_imm_32 += num_imm32_operand;
39431 window_list->num_imm_64 += num_imm64_operand;
39432
39433 if (group == disp_store)
39434 window_list->num_stores += 1;
39435 else if (group == disp_load
39436 || group == disp_prefetch)
39437 window_list->num_loads += 1;
39438 else if (group == disp_load_store)
39439 {
39440 window_list->num_stores += 1;
39441 window_list->num_loads += 1;
39442 }
39443 }
39444
39445 /* Adds a scheduled instruction, INSN, to the current dispatch window.
39446 If the total bytes of instructions or the number of instructions in
39447 the window exceed the allowed maximum, it allocates a new window. */
39448
39449 static void
39450 add_to_dispatch_window (rtx insn)
39451 {
39452 int byte_len;
39453 dispatch_windows *window_list;
39454 dispatch_windows *next_list;
39455 dispatch_windows *window0_list;
39456 enum insn_path path;
39457 enum dispatch_group insn_group;
39458 bool insn_fits;
39459 int num_insn;
39460 int num_uops;
39461 int window_num;
39462 int insn_num_uops;
39463 int sum;
39464
39465 if (INSN_CODE (insn) < 0)
39466 return;
39467
39468 byte_len = min_insn_size (insn);
39469 window_list = dispatch_window_list;
39470 next_list = window_list->next;
39471 path = get_insn_path (insn);
39472 insn_group = get_insn_group (insn);
39473
39474 /* Get the last dispatch window. */
39475 if (next_list)
39476 window_list = dispatch_window_list->next;
39477
39478 if (path == path_single)
39479 insn_num_uops = 1;
39480 else if (path == path_double)
39481 insn_num_uops = 2;
39482 else
39483 insn_num_uops = (int) path;
39484
39485 /* If the current window is full, get a new window.
39486 Window number zero is full if MAX_INSN uops are scheduled in it.
39487 Window number one is full if window zero's bytes plus window
39488 one's bytes equal 32, or if adding the bytes of the new instruction
39489 to that total makes it 48 or more, or if it already has MAX_INSN
39490 instructions in it. */
39491 num_insn = window_list->num_insn;
39492 num_uops = window_list->num_uops;
39493 window_num = window_list->window_num;
39494 insn_fits = fits_dispatch_window (insn);
39495
39496 if (num_insn >= MAX_INSN
39497 || num_uops + insn_num_uops > MAX_INSN
39498 || !(insn_fits))
39499 {
39500 window_num = ~window_num & 1;
39501 window_list = allocate_next_window (window_num);
39502 }
39503
39504 if (window_num == 0)
39505 {
39506 add_insn_window (insn, window_list, insn_num_uops);
39507 if (window_list->num_insn >= MAX_INSN
39508 && insn_group == disp_branch)
39509 {
39510 process_end_window ();
39511 return;
39512 }
39513 }
39514 else if (window_num == 1)
39515 {
39516 window0_list = window_list->prev;
39517 sum = window0_list->window_size + window_list->window_size;
39518 if (sum == 32
39519 || (byte_len + sum) >= 48)
39520 {
39521 process_end_window ();
39522 window_list = dispatch_window_list;
39523 }
39524
39525 add_insn_window (insn, window_list, insn_num_uops);
39526 }
39527 else
39528 gcc_unreachable ();
39529
39530 if (is_end_basic_block (insn_group))
39531 {
39532 /* End of basic block is reached; do end-of-basic-block processing. */
39533 process_end_window ();
39534 return;
39535 }
39536 }
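/* Worked example of the size checks above (illustrative only): if window
   0 and window 1 together already hold 32 bytes, window 1 is treated as
   full; likewise, a 9-byte instruction arriving when the two windows sum
   to 40 bytes would push the pair to 49 >= 48, so both windows are
   flushed through process_end_window before the instruction is added to
   a fresh window 0.  */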
39537
39538 /* Print the dispatch window, WINDOW_NUM, to FILE. */
39539
39540 DEBUG_FUNCTION static void
39541 debug_dispatch_window_file (FILE *file, int window_num)
39542 {
39543 dispatch_windows *list;
39544 int i;
39545
39546 if (window_num == 0)
39547 list = dispatch_window_list;
39548 else
39549 list = dispatch_window_list1;
39550
39551 fprintf (file, "Window #%d:\n", list->window_num);
39552 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
39553 list->num_insn, list->num_uops, list->window_size);
39554 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
39555 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
39556
39557 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
39558 list->num_stores);
39559 fprintf (file, " insn info:\n");
39560
39561 for (i = 0; i < MAX_INSN; i++)
39562 {
39563 if (!list->window[i].insn)
39564 break;
39565 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
39566 i, group_name[list->window[i].group],
39567 i, (void *)list->window[i].insn,
39568 i, list->window[i].path,
39569 i, list->window[i].byte_len,
39570 i, list->window[i].imm_bytes);
39571 }
39572 }
39573
39574 /* Print to stdout a dispatch window. */
39575
39576 DEBUG_FUNCTION void
39577 debug_dispatch_window (int window_num)
39578 {
39579 debug_dispatch_window_file (stdout, window_num);
39580 }
39581
39582 /* Print INSN dispatch information to FILE. */
39583
39584 DEBUG_FUNCTION static void
39585 debug_insn_dispatch_info_file (FILE *file, rtx insn)
39586 {
39587 int byte_len;
39588 enum insn_path path;
39589 enum dispatch_group group;
39590 int imm_size;
39591 int num_imm_operand;
39592 int num_imm32_operand;
39593 int num_imm64_operand;
39594
39595 if (INSN_CODE (insn) < 0)
39596 return;
39597
39598 byte_len = min_insn_size (insn);
39599 path = get_insn_path (insn);
39600 group = get_insn_group (insn);
39601 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
39602 &num_imm64_operand);
39603
39604 fprintf (file, " insn info:\n");
39605 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
39606 group_name[group], path, byte_len);
39607 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
39608 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
39609 }
39610
39611 /* Print to STDOUT the status of the ready list with respect to
39612 dispatch windows. */
39613
39614 DEBUG_FUNCTION void
39615 debug_ready_dispatch (void)
39616 {
39617 int i;
39618 int no_ready = number_in_ready ();
39619
39620 fprintf (stdout, "Number of ready: %d\n", no_ready);
39621
39622 for (i = 0; i < no_ready; i++)
39623 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
39624 }
39625
39626 /* This routine is the driver of the dispatch scheduler. */
39627
39628 static void
39629 do_dispatch (rtx insn, int mode)
39630 {
39631 if (mode == DISPATCH_INIT)
39632 init_dispatch_sched ();
39633 else if (mode == ADD_TO_DISPATCH_WINDOW)
39634 add_to_dispatch_window (insn);
39635 }
39636
39637 /* Return TRUE if dispatch scheduling is active and query ACTION holds for INSN. */
39638
39639 static bool
39640 has_dispatch (rtx insn, int action)
39641 {
39642 if ((TARGET_BDVER1 || TARGET_BDVER2)
39643 && flag_dispatch_scheduler)
39644 switch (action)
39645 {
39646 default:
39647 return false;
39648
39649 case IS_DISPATCH_ON:
39650 return true;
39651 break;
39652
39653 case IS_CMP:
39654 return is_cmp (insn);
39655
39656 case DISPATCH_VIOLATION:
39657 return dispatch_violation ();
39658
39659 case FITS_DISPATCH_WINDOW:
39660 return fits_dispatch_window (insn);
39661 }
39662
39663 return false;
39664 }
39665
39666 /* Implementation of reassociation_width target hook used by
39667 the reassoc phase to identify the parallelism level in a reassociated
39668 tree. The statement's tree_code is passed in OPC. The arguments'
39669 type is passed in MODE.
39670
39671 Currently parallel reassociation is enabled for Atom
39672 processors only and we set reassociation width to be 2
39673 because Atom may issue up to 2 instructions per cycle.
39674
39675 Return value should be fixed if parallel reassociation is
39676 enabled for other processors. */
39677
39678 static int
39679 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
39680 enum machine_mode mode)
39681 {
39682 int res = 1;
39683
39684 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
39685 res = 2;
39686 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
39687 res = 2;
39688
39689 return res;
39690 }
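/* Illustration of the effect of a width of 2 (a user-level sketch, not
   compiled here): for

     t = a + b + c + d;

   the reassoc pass may form the two independent partial sums t1 = a + b
   and t2 = c + d before the final t = t1 + t2, exposing parallelism that
   Atom's two issue slots per cycle can exploit.  */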
39691
39692 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
39693 place emms and femms instructions. */
39694
39695 static enum machine_mode
39696 ix86_preferred_simd_mode (enum machine_mode mode)
39697 {
39698 if (!TARGET_SSE)
39699 return word_mode;
39700
39701 switch (mode)
39702 {
39703 case QImode:
39704 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
39705 case HImode:
39706 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
39707 case SImode:
39708 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
39709 case DImode:
39710 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
39711
39712 case SFmode:
39713 if (TARGET_AVX && !TARGET_PREFER_AVX128)
39714 return V8SFmode;
39715 else
39716 return V4SFmode;
39717
39718 case DFmode:
39719 if (!TARGET_VECTORIZE_DOUBLE)
39720 return word_mode;
39721 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
39722 return V4DFmode;
39723 else if (TARGET_SSE2)
39724 return V2DFmode;
39725 /* FALLTHRU */
39726
39727 default:
39728 return word_mode;
39729 }
39730 }
39731
39732 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
39733 vectors. */
39734
39735 static unsigned int
39736 ix86_autovectorize_vector_sizes (void)
39737 {
39738 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
39739 }
39740
39741 /* Validate target specific memory model bits in VAL. */
39742
39743 static unsigned HOST_WIDE_INT
39744 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
39745 {
39746 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
39747 unsigned HOST_WIDE_INT strong;
39748
39749 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
39750 |MEMMODEL_MASK)
39751 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
39752 {
39753 warning (OPT_Winvalid_memory_model,
39754 "Unknown architecture specific memory model");
39755 return MEMMODEL_SEQ_CST;
39756 }
39757 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
39758 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
39759 {
39760 warning (OPT_Winvalid_memory_model,
39761 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
39762 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
39763 }
39764 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
39765 {
39766 warning (OPT_Winvalid_memory_model,
39767 "HLE_RELEASE not used with RELEASE or stronger memory model");
39768 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
39769 }
39770 return val;
39771 }
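/* User-level sketch of what this hook validates (illustrative; the HLE
   macros are the target-specific __ATOMIC_HLE_* additions on top of the
   standard __ATOMIC_* models):

     while (__atomic_exchange_n (&lock, 1,
				 __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
       ;
     ...critical section...
     __atomic_store_n (&lock, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);

   Combining __ATOMIC_HLE_ACQUIRE with a weaker model such as
   __ATOMIC_RELAXED is diagnosed by the warning above and the model is
   forced back to sequential consistency.  */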
39772
39773 /* Initialize the GCC target structure. */
39774 #undef TARGET_RETURN_IN_MEMORY
39775 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
39776
39777 #undef TARGET_LEGITIMIZE_ADDRESS
39778 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
39779
39780 #undef TARGET_ATTRIBUTE_TABLE
39781 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
39782 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
39783 # undef TARGET_MERGE_DECL_ATTRIBUTES
39784 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
39785 #endif
39786
39787 #undef TARGET_COMP_TYPE_ATTRIBUTES
39788 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
39789
39790 #undef TARGET_INIT_BUILTINS
39791 #define TARGET_INIT_BUILTINS ix86_init_builtins
39792 #undef TARGET_BUILTIN_DECL
39793 #define TARGET_BUILTIN_DECL ix86_builtin_decl
39794 #undef TARGET_EXPAND_BUILTIN
39795 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
39796
39797 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
39798 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
39799 ix86_builtin_vectorized_function
39800
39801 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
39802 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
39803
39804 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
39805 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
39806
39807 #undef TARGET_VECTORIZE_BUILTIN_GATHER
39808 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
39809
39810 #undef TARGET_BUILTIN_RECIPROCAL
39811 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
39812
39813 #undef TARGET_ASM_FUNCTION_EPILOGUE
39814 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
39815
39816 #undef TARGET_ENCODE_SECTION_INFO
39817 #ifndef SUBTARGET_ENCODE_SECTION_INFO
39818 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
39819 #else
39820 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
39821 #endif
39822
39823 #undef TARGET_ASM_OPEN_PAREN
39824 #define TARGET_ASM_OPEN_PAREN ""
39825 #undef TARGET_ASM_CLOSE_PAREN
39826 #define TARGET_ASM_CLOSE_PAREN ""
39827
39828 #undef TARGET_ASM_BYTE_OP
39829 #define TARGET_ASM_BYTE_OP ASM_BYTE
39830
39831 #undef TARGET_ASM_ALIGNED_HI_OP
39832 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
39833 #undef TARGET_ASM_ALIGNED_SI_OP
39834 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
39835 #ifdef ASM_QUAD
39836 #undef TARGET_ASM_ALIGNED_DI_OP
39837 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
39838 #endif
39839
39840 #undef TARGET_PROFILE_BEFORE_PROLOGUE
39841 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
39842
39843 #undef TARGET_ASM_UNALIGNED_HI_OP
39844 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
39845 #undef TARGET_ASM_UNALIGNED_SI_OP
39846 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
39847 #undef TARGET_ASM_UNALIGNED_DI_OP
39848 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
39849
39850 #undef TARGET_PRINT_OPERAND
39851 #define TARGET_PRINT_OPERAND ix86_print_operand
39852 #undef TARGET_PRINT_OPERAND_ADDRESS
39853 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
39854 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
39855 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
39856 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
39857 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
39858
39859 #undef TARGET_SCHED_INIT_GLOBAL
39860 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
39861 #undef TARGET_SCHED_ADJUST_COST
39862 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
39863 #undef TARGET_SCHED_ISSUE_RATE
39864 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
39865 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
39866 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
39867 ia32_multipass_dfa_lookahead
39868
39869 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
39870 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
39871
39872 #undef TARGET_MEMMODEL_CHECK
39873 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
39874
39875 #ifdef HAVE_AS_TLS
39876 #undef TARGET_HAVE_TLS
39877 #define TARGET_HAVE_TLS true
39878 #endif
39879 #undef TARGET_CANNOT_FORCE_CONST_MEM
39880 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
39881 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
39882 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
39883
39884 #undef TARGET_DELEGITIMIZE_ADDRESS
39885 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
39886
39887 #undef TARGET_MS_BITFIELD_LAYOUT_P
39888 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
39889
39890 #if TARGET_MACHO
39891 #undef TARGET_BINDS_LOCAL_P
39892 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
39893 #endif
39894 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
39895 #undef TARGET_BINDS_LOCAL_P
39896 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
39897 #endif
39898
39899 #undef TARGET_ASM_OUTPUT_MI_THUNK
39900 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
39901 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
39902 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
39903
39904 #undef TARGET_ASM_FILE_START
39905 #define TARGET_ASM_FILE_START x86_file_start
39906
39907 #undef TARGET_OPTION_OVERRIDE
39908 #define TARGET_OPTION_OVERRIDE ix86_option_override
39909
39910 #undef TARGET_REGISTER_MOVE_COST
39911 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
39912 #undef TARGET_MEMORY_MOVE_COST
39913 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
39914 #undef TARGET_RTX_COSTS
39915 #define TARGET_RTX_COSTS ix86_rtx_costs
39916 #undef TARGET_ADDRESS_COST
39917 #define TARGET_ADDRESS_COST ix86_address_cost
39918
39919 #undef TARGET_FIXED_CONDITION_CODE_REGS
39920 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
39921 #undef TARGET_CC_MODES_COMPATIBLE
39922 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
39923
39924 #undef TARGET_MACHINE_DEPENDENT_REORG
39925 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
39926
39927 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
39928 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
39929
39930 #undef TARGET_BUILD_BUILTIN_VA_LIST
39931 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
39932
39933 #undef TARGET_FOLD_BUILTIN
39934 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
39935
39936 #undef TARGET_ENUM_VA_LIST_P
39937 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
39938
39939 #undef TARGET_FN_ABI_VA_LIST
39940 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
39941
39942 #undef TARGET_CANONICAL_VA_LIST_TYPE
39943 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
39944
39945 #undef TARGET_EXPAND_BUILTIN_VA_START
39946 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
39947
39948 #undef TARGET_MD_ASM_CLOBBERS
39949 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
39950
39951 #undef TARGET_PROMOTE_PROTOTYPES
39952 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
39953 #undef TARGET_STRUCT_VALUE_RTX
39954 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
39955 #undef TARGET_SETUP_INCOMING_VARARGS
39956 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
39957 #undef TARGET_MUST_PASS_IN_STACK
39958 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
39959 #undef TARGET_FUNCTION_ARG_ADVANCE
39960 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
39961 #undef TARGET_FUNCTION_ARG
39962 #define TARGET_FUNCTION_ARG ix86_function_arg
39963 #undef TARGET_FUNCTION_ARG_BOUNDARY
39964 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
39965 #undef TARGET_PASS_BY_REFERENCE
39966 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
39967 #undef TARGET_INTERNAL_ARG_POINTER
39968 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
39969 #undef TARGET_UPDATE_STACK_BOUNDARY
39970 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
39971 #undef TARGET_GET_DRAP_RTX
39972 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
39973 #undef TARGET_STRICT_ARGUMENT_NAMING
39974 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
39975 #undef TARGET_STATIC_CHAIN
39976 #define TARGET_STATIC_CHAIN ix86_static_chain
39977 #undef TARGET_TRAMPOLINE_INIT
39978 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
39979 #undef TARGET_RETURN_POPS_ARGS
39980 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
39981
39982 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
39983 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
39984
39985 #undef TARGET_SCALAR_MODE_SUPPORTED_P
39986 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
39987
39988 #undef TARGET_VECTOR_MODE_SUPPORTED_P
39989 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
39990
39991 #undef TARGET_C_MODE_FOR_SUFFIX
39992 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
39993
39994 #ifdef HAVE_AS_TLS
39995 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
39996 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
39997 #endif
39998
39999 #ifdef SUBTARGET_INSERT_ATTRIBUTES
40000 #undef TARGET_INSERT_ATTRIBUTES
40001 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
40002 #endif
40003
40004 #undef TARGET_MANGLE_TYPE
40005 #define TARGET_MANGLE_TYPE ix86_mangle_type
40006
40007 #if !TARGET_MACHO
40008 #undef TARGET_STACK_PROTECT_FAIL
40009 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
40010 #endif
40011
40012 #undef TARGET_FUNCTION_VALUE
40013 #define TARGET_FUNCTION_VALUE ix86_function_value
40014
40015 #undef TARGET_FUNCTION_VALUE_REGNO_P
40016 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
40017
40018 #undef TARGET_PROMOTE_FUNCTION_MODE
40019 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
40020
40021 #undef TARGET_SECONDARY_RELOAD
40022 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
40023
40024 #undef TARGET_CLASS_MAX_NREGS
40025 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
40026
40027 #undef TARGET_PREFERRED_RELOAD_CLASS
40028 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
40029 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
40030 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
40031 #undef TARGET_CLASS_LIKELY_SPILLED_P
40032 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
40033
40034 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
40035 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
40036 ix86_builtin_vectorization_cost
40037 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
40038 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
40039 ix86_vectorize_vec_perm_const_ok
40040 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
40041 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
40042 ix86_preferred_simd_mode
40043 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
40044 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
40045 ix86_autovectorize_vector_sizes
40046
40047 #undef TARGET_SET_CURRENT_FUNCTION
40048 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
40049
40050 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
40051 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
40052
40053 #undef TARGET_OPTION_SAVE
40054 #define TARGET_OPTION_SAVE ix86_function_specific_save
40055
40056 #undef TARGET_OPTION_RESTORE
40057 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
40058
40059 #undef TARGET_OPTION_PRINT
40060 #define TARGET_OPTION_PRINT ix86_function_specific_print
40061
40062 #undef TARGET_CAN_INLINE_P
40063 #define TARGET_CAN_INLINE_P ix86_can_inline_p
40064
40065 #undef TARGET_EXPAND_TO_RTL_HOOK
40066 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
40067
40068 #undef TARGET_LEGITIMATE_ADDRESS_P
40069 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
40070
40071 #undef TARGET_LEGITIMATE_CONSTANT_P
40072 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
40073
40074 #undef TARGET_FRAME_POINTER_REQUIRED
40075 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
40076
40077 #undef TARGET_CAN_ELIMINATE
40078 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
40079
40080 #undef TARGET_EXTRA_LIVE_ON_ENTRY
40081 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
40082
40083 #undef TARGET_ASM_CODE_END
40084 #define TARGET_ASM_CODE_END ix86_code_end
40085
40086 #undef TARGET_CONDITIONAL_REGISTER_USAGE
40087 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
40088
40089 #if TARGET_MACHO
40090 #undef TARGET_INIT_LIBFUNCS
40091 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
40092 #endif
40093
40094 struct gcc_target targetm = TARGET_INITIALIZER;
40095 \f
40096 #include "gt-i386.h"