/* Copyright (C) 1988-2020 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "insn-attr.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "tree-pass.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "tree-iterator.h"
#include "case-cfn-macros.h"
#include "fold-const-call.h"
#include "tree-ssanames.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "symbol-summary.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "dwarf2out.h"
#include "i386-options.h"
#include "i386-builtins.h"
#include "i386-expand.h"
/* Split one or more double-mode RTL references into pairs of half-mode
   references.  The RTL can be REG, offsettable MEM, integer constant, or
   CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
   split and "num" is its length.  lo_half and hi_half are output arrays
   that parallel "operands".  */
void
split_double_mode (machine_mode mode, rtx operands[],
                   int num, rtx lo_half[], rtx hi_half[])
{
  machine_mode half_mode;
  unsigned int byte;
  rtx mem_op = NULL_RTX;
  int mem_num = 0;

  switch (mode)
    {
    case E_TImode:
      half_mode = DImode;
      break;
    case E_DImode:
      half_mode = SImode;
      break;
    default:
      gcc_unreachable ();
    }

  byte = GET_MODE_SIZE (half_mode);

  while (num--)
    {
      rtx op = operands[num];

      /* simplify_subreg refuse to split volatile memory addresses,
         but we still have to handle it.  */
      if (MEM_P (op))
        {
          if (mem_op && rtx_equal_p (op, mem_op))
            {
              lo_half[num] = lo_half[mem_num];
              hi_half[num] = hi_half[mem_num];
            }
          else
            {
              mem_op = op;
              mem_num = num;
              lo_half[num] = adjust_address (op, half_mode, 0);
              hi_half[num] = adjust_address (op, half_mode, byte);
            }
        }
      else
        {
          lo_half[num] = simplify_gen_subreg (half_mode, op,
                                              GET_MODE (op) == VOIDmode
                                              ? mode : GET_MODE (op), 0);
          hi_half[num] = simplify_gen_subreg (half_mode, op,
                                              GET_MODE (op) == VOIDmode
                                              ? mode : GET_MODE (op), byte);
        }
    }
}
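
/* Illustrative usage sketch (not part of GCC; the helper name and the
   two-operand scenario are assumptions for exposition).  A splitter
   lowering a 32-bit DImode register-to-register move could call
   split_double_mode to obtain the SImode halves and move them
   independently:  */
#if 0
static void
example_split_dimode_move (rtx operands[])
{
  rtx lo_half[2], hi_half[2];

  /* operands[0] is the DImode destination, operands[1] the source.  */
  split_double_mode (DImode, operands, 2, lo_half, hi_half);

  emit_move_insn (lo_half[0], lo_half[1]);
  emit_move_insn (hi_half[0], hi_half[1]);
}
#endif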
/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
   for the target.  */

void
ix86_expand_clear (rtx dest)
{
  rtx tmp;

  /* We play register width games, which are only valid after reload.  */
  gcc_assert (reload_completed);

  /* Avoid HImode and its attendant prefix byte.  */
  if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
    dest = gen_rtx_REG (SImode, REGNO (dest));
  tmp = gen_rtx_SET (dest, const0_rtx);

  if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
    {
      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
    }

  emit_insn (tmp);
}
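
/* For illustration (assumed typical output, not emitted text): clearing a
   32-bit register normally becomes "xorl %eax, %eax", expressed as the
   PARALLEL above with a FLAGS_REG clobber, while targets preferring
   TARGET_USE_MOV0 semantics keep the flags intact with a plain
   "movl $0, %eax" SET.  */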
void
ix86_expand_move (machine_mode mode, rtx operands[])
{
  rtx op0, op1, tmp, addend = NULL_RTX;
  enum tls_model model;

  op0 = operands[0];
  op1 = operands[1];

  switch (GET_CODE (op1))
    {
    case CONST:
      tmp = XEXP (op1, 0);

      if (GET_CODE (tmp) != PLUS
          || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
        break;

      op1 = XEXP (tmp, 0);
      addend = XEXP (tmp, 1);
      /* FALLTHRU */

    case SYMBOL_REF:
      model = SYMBOL_REF_TLS_MODEL (op1);

      if (model)
        op1 = legitimize_tls_address (op1, model, true);
      else if (ix86_force_load_from_GOT_p (op1))
        {
          /* Load the external function address via GOT slot to avoid PLT.  */
          op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
                                (TARGET_64BIT
                                 ? UNSPEC_GOTPCREL
                                 : UNSPEC_GOT));
          op1 = gen_rtx_CONST (Pmode, op1);
          op1 = gen_const_mem (Pmode, op1);
          set_mem_alias_set (op1, ix86_GOT_alias_set ());
        }
      else
        {
          tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
          if (tmp)
            {
              op1 = tmp;
              if (!addend)
                break;
            }
          else
            {
              op1 = operands[1];
              break;
            }
        }

      if (addend)
        {
          op1 = force_operand (op1, NULL_RTX);
          op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
                                     op0, 1, OPTAB_DIRECT);
        }
      else
        op1 = force_operand (op1, op0);

      if (op1 == op0)
        return;

      op1 = convert_to_mode (mode, op1, 1);

    default:
      break;
    }

  if ((flag_pic || MACHOPIC_INDIRECT)
      && symbolic_operand (op1, mode))
    {
      if (TARGET_MACHO && !TARGET_64BIT)
        {
#if TARGET_MACHO
          /* dynamic-no-pic */
          if (MACHOPIC_INDIRECT)
            {
              rtx temp = (op0 && REG_P (op0) && mode == Pmode)
                         ? op0 : gen_reg_rtx (Pmode);
              op1 = machopic_indirect_data_reference (op1, temp);
              if (MACHOPIC_PURE)
                op1 = machopic_legitimize_pic_address (op1, mode,
                                                       temp == op1 ? 0 : temp);
            }
          if (op0 != op1 && GET_CODE (op0) != MEM)
            {
              rtx insn = gen_rtx_SET (op0, op1);
              emit_insn (insn);
              return;
            }
          if (GET_CODE (op0) == MEM)
            op1 = force_reg (Pmode, op1);
          else
            {
              rtx temp = op0;
              if (GET_CODE (temp) != REG)
                temp = gen_reg_rtx (Pmode);
              temp = legitimize_pic_address (op1, temp);
              if (temp == op0)
                return;
              op1 = temp;
            }
          /* dynamic-no-pic */
#endif
        }
      else
        {
          if (GET_CODE (op0) == MEM)
            op1 = force_reg (Pmode, op1);
          else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
            {
              rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
              op1 = legitimize_pic_address (op1, reg);
              if (op0 == op1)
                return;
              op1 = convert_to_mode (mode, op1, 1);
            }
        }
    }
  else
    {
      if (MEM_P (op0)
          && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
              || !push_operand (op0, mode))
          && MEM_P (op1))
        op1 = force_reg (mode, op1);

      if (push_operand (op0, mode)
          && ! general_no_elim_operand (op1, mode))
        op1 = copy_to_mode_reg (mode, op1);

      /* Force large constants in 64bit compilation into register
         to get them CSEed.  */
      if (can_create_pseudo_p ()
          && (mode == DImode) && TARGET_64BIT
          && immediate_operand (op1, mode)
          && !x86_64_zext_immediate_operand (op1, VOIDmode)
          && !register_operand (op0, mode)
          && optimize)
        op1 = copy_to_mode_reg (mode, op1);

      if (can_create_pseudo_p ()
          && CONST_DOUBLE_P (op1))
        {
          /* If we are loading a floating point constant to a register,
             force the value to memory now, since we'll get better code
             out the back end.  */

          op1 = validize_mem (force_const_mem (mode, op1));
          if (!register_operand (op0, mode))
            {
              rtx temp = gen_reg_rtx (mode);
              emit_insn (gen_rtx_SET (temp, op1));
              emit_move_insn (op0, temp);
              return;
            }
        }
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
void
ix86_expand_vector_move (machine_mode mode, rtx operands[])
{
  rtx op0 = operands[0], op1 = operands[1];
  /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
     psABI since the biggest alignment is 4 byte for IA MCU psABI.  */
  unsigned int align = (TARGET_IAMCU
                        ? GET_MODE_BITSIZE (mode)
                        : GET_MODE_ALIGNMENT (mode));

  if (push_operand (op0, VOIDmode))
    op0 = emit_move_resolve_push (mode, op0);

  /* Force constants other than zero into memory.  We do not know how
     the instructions used to build constants modify the upper 64 bits
     of the register, once we have that information we may be able
     to handle some of them more efficiently.  */
  if (can_create_pseudo_p ()
      && (CONSTANT_P (op1)
          || (SUBREG_P (op1)
              && CONSTANT_P (SUBREG_REG (op1))))
      && ((register_operand (op0, mode)
           && !standard_sse_constant_p (op1, mode))
          /* ix86_expand_vector_move_misalign() does not like constants.  */
          || (SSE_REG_MODE_P (mode)
              && MEM_P (op0)
              && MEM_ALIGN (op0) < align)))
    {
      if (SUBREG_P (op1))
        {
          machine_mode imode = GET_MODE (SUBREG_REG (op1));
          rtx r = force_const_mem (imode, SUBREG_REG (op1));
          if (r)
            r = validize_mem (r);
          else
            r = force_reg (imode, SUBREG_REG (op1));
          op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
        }
      else
        op1 = validize_mem (force_const_mem (mode, op1));
    }

  /* We need to check memory alignment for SSE mode since attribute
     can make operands unaligned.  */
  if (can_create_pseudo_p ()
      && SSE_REG_MODE_P (mode)
      && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
          || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
    {
      rtx tmp[2];

      /* ix86_expand_vector_move_misalign() does not like both
         arguments in memory.  */
      if (!register_operand (op0, mode)
          && !register_operand (op1, mode))
        op1 = force_reg (mode, op1);

      tmp[0] = op0; tmp[1] = op1;
      ix86_expand_vector_move_misalign (mode, tmp);
      return;
    }

  /* Make operand1 a register if it isn't already.  */
  if (can_create_pseudo_p ()
      && !register_operand (op0, mode)
      && !register_operand (op1, mode))
    {
      emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
      return;
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
/* Split 32-byte AVX unaligned load and store if needed.  */

static void
ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
{
  rtx m;
  rtx (*extract) (rtx, rtx, rtx);
  machine_mode mode;

  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  rtx orig_op0 = NULL_RTX;
  mode = GET_MODE (op0);
  switch (GET_MODE_CLASS (mode))
    {
    case MODE_VECTOR_INT:
    case MODE_INT:
      if (mode != V32QImode)
        {
          if (!MEM_P (op0))
            {
              orig_op0 = op0;
              op0 = gen_reg_rtx (V32QImode);
            }
          else
            op0 = gen_lowpart (V32QImode, op0);
          op1 = gen_lowpart (V32QImode, op1);
          mode = V32QImode;
        }
      break;
    case MODE_VECTOR_FLOAT:
      break;
    default:
      gcc_unreachable ();
    }

  switch (mode)
    {
    default:
      gcc_unreachable ();
    case E_V32QImode:
      extract = gen_avx_vextractf128v32qi;
      mode = V16QImode;
      break;
    case E_V8SFmode:
      extract = gen_avx_vextractf128v8sf;
      mode = V4SFmode;
      break;
    case E_V4DFmode:
      extract = gen_avx_vextractf128v4df;
      mode = V2DFmode;
      break;
    }

  if (MEM_P (op1))
    {
      rtx r = gen_reg_rtx (mode);
      m = adjust_address (op1, mode, 0);
      emit_move_insn (r, m);
      m = adjust_address (op1, mode, 16);
      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
      emit_move_insn (op0, r);
    }
  else if (MEM_P (op0))
    {
      m = adjust_address (op0, mode, 0);
      emit_insn (extract (m, op1, const0_rtx));
      m = adjust_address (op0, mode, 16);
      emit_insn (extract (m, copy_rtx (op1), const1_rtx));
    }
  else
    gcc_unreachable ();

  if (orig_op0)
    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
}
/* Implement the movmisalign patterns for SSE.  Non-SSE modes go
   straight to ix86_expand_vector_move.  */
/* Code generation for scalar reg-reg moves of single and double precision data:
     if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
       movaps reg, reg
     else
       movss reg, reg
     if (x86_sse_partial_reg_dependency == true)
       movapd reg, reg
     else
       movsd reg, reg

   Code generation for scalar loads of double precision data:
     if (x86_sse_split_regs == true)
       movlpd mem, reg      (gas syntax)
     else
       movsd mem, reg

   Code generation for unaligned packed loads of single precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
     if (x86_sse_unaligned_move_optimal)
       movups mem, reg

     if (x86_sse_partial_reg_dependency == true)
       {
         xorps  reg, reg
         movlps mem, reg
         movhps mem+8, reg
       }
     else
       {
         movlps mem, reg
         movhps mem+8, reg
       }

   Code generation for unaligned packed loads of double precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
     if (x86_sse_unaligned_move_optimal)
       movupd mem, reg

     if (x86_sse_split_regs == true)
       {
         movlpd mem, reg
         movhpd mem+8, reg
       }
     else
       {
         movsd  mem, reg
         unpcklpd reg, reg
       }  */
void
ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
{
  rtx op0, op1, m;

  op0 = operands[0];
  op1 = operands[1];

  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_AVX)
    {
      if (GET_MODE_SIZE (mode) == 32)
        ix86_avx256_split_vector_move_misalign (op0, op1);
      else
        /* Always use 128-bit mov<mode>_internal pattern for AVX.  */
        emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  /* ??? If we have typed data, then it would appear that using
     movdqu is the only way to get unaligned data loaded with
     integer type.  */
  if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (MEM_P (op1))
    {
      if (TARGET_SSE2 && mode == V2DFmode)
        {
          rtx zero;

          /* When SSE registers are split into halves, we can avoid
             writing to the top half twice.  */
          if (TARGET_SSE_SPLIT_REGS)
            {
              emit_clobber (op0);
              zero = op0;
            }
          else
            {
              /* ??? Not sure about the best option for the Intel chips.
                 The following would seem to satisfy; the register is
                 entirely cleared, breaking the dependency chain.  We
                 then store to the upper half, with a dependency depth
                 of one.  A rumor has it that Intel recommends two movsd
                 followed by an unpacklpd, but this is unconfirmed.  And
                 given that the dependency depth of the unpacklpd would
                 still be one, I'm not sure why this would be better.  */
              zero = CONST0_RTX (V2DFmode);
            }

          m = adjust_address (op1, DFmode, 0);
          emit_insn (gen_sse2_loadlpd (op0, zero, m));
          m = adjust_address (op1, DFmode, 8);
          emit_insn (gen_sse2_loadhpd (op0, op0, m));
        }
      else
        {
          rtx t;

          if (mode != V4SFmode)
            t = gen_reg_rtx (V4SFmode);
          else
            t = op0;

          if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
            emit_move_insn (t, CONST0_RTX (V4SFmode));
          else
            emit_clobber (t);

          m = adjust_address (op1, V2SFmode, 0);
          emit_insn (gen_sse_loadlps (t, t, m));
          m = adjust_address (op1, V2SFmode, 8);
          emit_insn (gen_sse_loadhps (t, t, m));
          if (mode != V4SFmode)
            emit_move_insn (op0, gen_lowpart (mode, t));
        }
    }
  else if (MEM_P (op0))
    {
      if (TARGET_SSE2 && mode == V2DFmode)
        {
          m = adjust_address (op0, DFmode, 0);
          emit_insn (gen_sse2_storelpd (m, op1));
          m = adjust_address (op0, DFmode, 8);
          emit_insn (gen_sse2_storehpd (m, op1));
        }
      else
        {
          if (mode != V4SFmode)
            op1 = gen_lowpart (V4SFmode, op1);

          m = adjust_address (op0, V2SFmode, 0);
          emit_insn (gen_sse_storelps (m, op1));
          m = adjust_address (op0, V2SFmode, 8);
          emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
        }
    }
  else
    gcc_unreachable ();
}
/* Move bits 64:95 to bits 32:63.  */

static void
ix86_move_vector_high_sse_to_mmx (rtx op)
{
  rtx mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4, GEN_INT (0), GEN_INT (2),
                                          GEN_INT (0), GEN_INT (0)));
  rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
  op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
  rtx insn = gen_rtx_SET (dest, op);
  emit_insn (insn);
}
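
/* Worked example (exposition only): for a V4SImode value {a, b, c, d},
   the VEC_SELECT with mask (0 2 0 0) built above yields {a, c, a, a},
   i.e. element 2 (bits 64:95) lands in element 1's slot (bits 32:63),
   which is all a 64-bit MMX consumer of the low half will read.  */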
/* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */

void
ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];

  machine_mode dmode = GET_MODE (op0);
  machine_mode smode = GET_MODE (op1);
  machine_mode inner_dmode = GET_MODE_INNER (dmode);
  machine_mode inner_smode = GET_MODE_INNER (smode);

  /* Get the corresponding SSE mode for destination.  */
  int nunits = 16 / GET_MODE_SIZE (inner_dmode);
  machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
                                            nunits).require ();
  machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
                                                 nunits / 2).require ();

  /* Get the corresponding SSE mode for source.  */
  nunits = 16 / GET_MODE_SIZE (inner_smode);
  machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
                                            nunits).require ();

  /* Generate SSE pack with signed/unsigned saturation.  */
  rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));

  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
  rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
                                                    op1, op2));
  emit_insn (insn);

  ix86_move_vector_high_sse_to_mmx (op0);
}
/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */

void
ix86_split_mmx_punpck (rtx operands[], bool high_p)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  machine_mode mode = GET_MODE (op0);
  rtx mask;
  /* The corresponding SSE mode.  */
  machine_mode sse_mode, double_sse_mode;

  switch (mode)
    {
    case E_V8QImode:
      sse_mode = V16QImode;
      double_sse_mode = V32QImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (16,
                                          GEN_INT (0), GEN_INT (16),
                                          GEN_INT (1), GEN_INT (17),
                                          GEN_INT (2), GEN_INT (18),
                                          GEN_INT (3), GEN_INT (19),
                                          GEN_INT (4), GEN_INT (20),
                                          GEN_INT (5), GEN_INT (21),
                                          GEN_INT (6), GEN_INT (22),
                                          GEN_INT (7), GEN_INT (23)));
      break;

    case E_V4HImode:
      sse_mode = V8HImode;
      double_sse_mode = V16HImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (8,
                                          GEN_INT (0), GEN_INT (8),
                                          GEN_INT (1), GEN_INT (9),
                                          GEN_INT (2), GEN_INT (10),
                                          GEN_INT (3), GEN_INT (11)));
      break;

    case E_V2SImode:
      sse_mode = V4SImode;
      double_sse_mode = V8SImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4,
                                          GEN_INT (0), GEN_INT (4),
                                          GEN_INT (1), GEN_INT (5)));
      break;

    default:
      gcc_unreachable ();
    }

  /* Generate SSE punpcklXX.  */
  rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));

  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
  rtx insn = gen_rtx_SET (dest, op2);
  emit_insn (insn);

  if (high_p)
    {
      /* Move bits 64:127 to bits 0:63.  */
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4, GEN_INT (2), GEN_INT (3),
                                          GEN_INT (0), GEN_INT (0)));
      dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
      op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
      insn = gen_rtx_SET (dest, op1);
      emit_insn (insn);
    }
}
/* Helper function of ix86_fixup_binary_operands to canonicalize
   operand order.  Returns true if the operands should be swapped.  */

static bool
ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
                             rtx operands[])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* If the operation is not commutative, we can't do anything.  */
  if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
      && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
    return false;

  /* Highest priority is that src1 should match dst.  */
  if (rtx_equal_p (dst, src1))
    return false;
  if (rtx_equal_p (dst, src2))
    return true;

  /* Next highest priority is that immediate constants come second.  */
  if (immediate_operand (src2, mode))
    return false;
  if (immediate_operand (src1, mode))
    return true;

  /* Lowest priority is that memory references should come second.  */
  if (MEM_P (src2))
    return false;
  if (MEM_P (src1))
    return true;

  return false;
}
/* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
   destination to use for the operation.  If different from the true
   destination in operands[0], a copy operation will be required.  */

rtx
ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
                            rtx operands[])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Canonicalize operand order.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    {
      /* It is invalid to swap operands of different modes.  */
      gcc_assert (GET_MODE (src1) == GET_MODE (src2));

      std::swap (src1, src2);
    }

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))
    {
      /* Optimization: Only read from memory once.  */
      if (rtx_equal_p (src1, src2))
        {
          src2 = force_reg (mode, src2);
          src1 = src2;
        }
      else if (rtx_equal_p (dst, src1))
        src2 = force_reg (mode, src2);
      else
        src1 = force_reg (mode, src1);
    }

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    dst = gen_reg_rtx (mode);

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    src1 = force_reg (mode, src1);

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    src1 = force_reg (mode, src1);

  /* Improve address combine.  */
  if (code == PLUS
      && GET_MODE_CLASS (mode) == MODE_INT
      && MEM_P (src2))
    src2 = force_reg (mode, src2);

  operands[1] = src1;
  operands[2] = src2;
  return dst;
}
/* Similarly, but assume that the destination has already been
   set up properly.  */

void
ix86_fixup_binary_operands_no_copy (enum rtx_code code,
                                    machine_mode mode, rtx operands[])
{
  rtx dst = ix86_fixup_binary_operands (code, mode, operands);
  gcc_assert (dst == operands[0]);
}
/* Attempt to expand a binary operator.  Make the expansion closer to the
   actual machine, then just general_operand, which will allow 3 separate
   memory references (one output, two input) in a single insn.  */

void
ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
                             rtx operands[])
{
  rtx src1, src2, dst, op, clob;

  dst = ix86_fixup_binary_operands (code, mode, operands);
  src1 = operands[1];
  src2 = operands[2];

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));

  if (reload_completed
      && code == PLUS
      && !rtx_equal_p (dst, src1))
    {
      /* This is going to be an LEA; avoid splitting it later.  */
      emit_insn (op);
    }
  else
    {
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}
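
/* Illustrative usage sketch (assumed caller, not part of GCC): a named
   expander such as an addsi3 pattern would typically forward its operands
   here, letting the fixup logic above insert any reloads it needs:

     ix86_expand_binary_operator (PLUS, SImode, operands);

   after which operands[0] holds the (possibly copied-through) result.  */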
/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
   the given OPERANDS.  */

void
ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
                                     rtx operands[])
{
  rtx op1 = NULL_RTX, op2 = NULL_RTX;
  if (SUBREG_P (operands[1]))
    {
      op1 = operands[1];
      op2 = operands[2];
    }
  else if (SUBREG_P (operands[2]))
    {
      op1 = operands[2];
      op2 = operands[1];
    }
  /* Optimize (__m128i) d | (__m128i) e and similar code
     when d and e are float vectors into float vector logical
     insn.  In C/C++ without using intrinsics there is no other way
     to express vector logical operation on float vectors than
     to cast them temporarily to integer vectors.  */
  if (op1
      && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
      && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
      && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
      && SUBREG_BYTE (op1) == 0
      && (GET_CODE (op2) == CONST_VECTOR
          || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
              && SUBREG_BYTE (op2) == 0))
      && can_create_pseudo_p ())
    {
      rtx dst;
      switch (GET_MODE (SUBREG_REG (op1)))
        {
        case E_V4SFmode:
        case E_V8SFmode:
        case E_V16SFmode:
        case E_V2DFmode:
        case E_V4DFmode:
        case E_V8DFmode:
          dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
          if (GET_CODE (op2) == CONST_VECTOR)
            {
              op2 = gen_lowpart (GET_MODE (dst), op2);
              op2 = force_reg (GET_MODE (dst), op2);
            }
          else
            {
              op1 = operands[1];
              op2 = SUBREG_REG (operands[2]);
              if (!vector_operand (op2, GET_MODE (dst)))
                op2 = force_reg (GET_MODE (dst), op2);
            }
          op1 = SUBREG_REG (op1);
          if (!vector_operand (op1, GET_MODE (dst)))
            op1 = force_reg (GET_MODE (dst), op1);
          emit_insn (gen_rtx_SET (dst,
                                  gen_rtx_fmt_ee (code, GET_MODE (dst),
                                                  op1, op2)));
          emit_move_insn (operands[0], gen_lowpart (mode, dst));
          return;
        default:
          break;
        }
    }
  if (!vector_operand (operands[1], mode))
    operands[1] = force_reg (mode, operands[1]);
  if (!vector_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  ix86_fixup_binary_operands_no_copy (code, mode, operands);
  emit_insn (gen_rtx_SET (operands[0],
                          gen_rtx_fmt_ee (code, mode, operands[1],
                                          operands[2])));
}
/* Return TRUE or FALSE depending on whether the binary operator meets the
   appropriate constraints.  */

bool
ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
                         rtx operands[3])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))
    return false;

  /* Canonicalize operand order for commutative operators.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    std::swap (src1, src2);

  /* If the destination is memory, we must have a matching source operand.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    return false;

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    return false;

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    /* Support "andhi/andsi/anddi" as a zero-extending move.  */
    return (code == AND
            && (mode == HImode
                || mode == SImode
                || (TARGET_64BIT && mode == DImode))
            && satisfies_constraint_L (src2));

  return true;
}
/* Attempt to expand a unary operator.  Make the expansion closer to the
   actual machine, then just general_operand, which will allow 2 separate
   memory references (one output, one input) in a single insn.  */

void
ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
                            rtx operands[])
{
  bool matching_memory = false;
  rtx src, dst, op, clob;

  dst = operands[0];
  src = operands[1];

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst))
    {
      if (rtx_equal_p (dst, src))
        matching_memory = true;
      else
        dst = gen_reg_rtx (mode);
    }

  /* When source operand is memory, destination must match.  */
  if (MEM_P (src) && !matching_memory)
    src = force_reg (mode, src);

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));

  if (code == NOT)
    emit_insn (op);
  else
    {
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}
/* Predict just emitted jump instruction to be taken with probability PROB.  */

static void
predict_jump (int prob)
{
  rtx_insn *insn = get_last_insn ();
  gcc_assert (JUMP_P (insn));
  add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
}
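
/* Usage note (exposition only): PROB is measured against REG_BR_PROB_BASE,
   so a caller that considers a just-emitted jump a coin flip writes

     predict_jump (REG_BR_PROB_BASE * 50 / 100);

   exactly as ix86_split_idivmod does below.  */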
/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
   divisor are within the range [0-255].  */

void
ix86_split_idivmod (machine_mode mode, rtx operands[],
                    bool unsigned_p)
{
  rtx_code_label *end_label, *qimode_label;
  rtx div, mod;
  rtx_insn *insn;
  rtx scratch, tmp0, tmp1, tmp2;
  rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case E_SImode:
      if (GET_MODE (operands[0]) == SImode)
        {
          if (GET_MODE (operands[1]) == SImode)
            gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
          else
            gen_divmod4_1
              = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
        }
      else
        gen_divmod4_1
          = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
      break;

    case E_DImode:
      gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
      break;

    default:
      gcc_unreachable ();
    }

  end_label = gen_label_rtx ();
  qimode_label = gen_label_rtx ();

  scratch = gen_reg_rtx (mode);

  /* Use 8bit unsigned divimod if dividend and divisor are within
     the range [0-255].  */
  emit_move_insn (scratch, operands[2]);
  scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
                                 scratch, 1, OPTAB_DIRECT);
  emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
  tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
  tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
  tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
                               gen_rtx_LABEL_REF (VOIDmode, qimode_label),
                               pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = qimode_label;

  /* Generate original signed/unsigned divimod.  */
  emit_insn (gen_divmod4_1 (operands[0], operands[1],
                            operands[2], operands[3]));

  /* Branch to the end.  */
  emit_jump_insn (gen_jump (end_label));
  emit_barrier ();

  /* Generate 8bit unsigned divide.  */
  emit_label (qimode_label);
  /* Don't use operands[0] for result of 8bit divide since not all
     registers support QImode ZERO_EXTRACT.  */
  tmp0 = lowpart_subreg (HImode, scratch, mode);
  tmp1 = lowpart_subreg (HImode, operands[2], mode);
  tmp2 = lowpart_subreg (QImode, operands[3], mode);
  emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));

  if (unsigned_p)
    {
      div = gen_rtx_UDIV (mode, operands[2], operands[3]);
      mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
    }
  else
    {
      div = gen_rtx_DIV (mode, operands[2], operands[3]);
      mod = gen_rtx_MOD (mode, operands[2], operands[3]);
    }

  if (mode == SImode)
    {
      if (GET_MODE (operands[0]) != SImode)
        div = gen_rtx_ZERO_EXTEND (DImode, div);
      if (GET_MODE (operands[1]) != SImode)
        mod = gen_rtx_ZERO_EXTEND (DImode, mod);
    }

  /* Extract remainder from AH.  */
  scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
  tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
                               GEN_INT (8), GEN_INT (8));
  insn = emit_move_insn (operands[1], tmp1);
  set_unique_reg_note (insn, REG_EQUAL, mod);

  /* Zero extend quotient from AL.  */
  tmp1 = gen_lowpart (QImode, tmp0);
  insn = emit_insn (gen_extend_insn
                    (operands[0], tmp1,
                     GET_MODE (operands[0]), QImode, 1));
  set_unique_reg_note (insn, REG_EQUAL, div);

  emit_label (end_label);
}
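
/* Worked example of the range test above (exposition only): for dividend
   231 (0xe7) and divisor 5, (0xe7 | 0x5) = 0xe7 and 0xe7 & -0x100 == 0,
   so both values fit in [0-255] and the cheap 8-bit divide path is taken;
   the quotient 46 comes back in AL and the remainder 1 in AH.  */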
/* Emit x86 binary operand CODE in mode MODE, where the first operand
   matches destination.  RTX includes clobber of FLAGS_REG.  */

void
ix86_emit_binop (enum rtx_code code, machine_mode mode,
                 rtx dst, rtx src)
{
  rtx op, clob;

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));

  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
}
/* Return true if regno1 def is nearest to the insn.  */

static bool
find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
{
  rtx_insn *prev = insn;
  rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));

  if (insn == start)
    return false;
  while (prev && prev != start)
    {
      if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
        {
          prev = PREV_INSN (prev);
          continue;
        }
      if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
        return true;
      else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
        return false;
      prev = PREV_INSN (prev);
    }

  /* None of the regs is defined in the bb.  */
  return false;
}
/* Split lea instructions into a sequence of instructions
   which are executed on ALU to avoid AGU stalls.
   It is assumed that it is allowed to clobber flags register
   at lea position.  */

void
ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
{
  unsigned int regno0, regno1, regno2;
  struct ix86_address parts;
  rtx target, tmp;
  int ok, adds;

  ok = ix86_decompose_address (operands[1], &parts);
  gcc_assert (ok);

  target = gen_lowpart (mode, operands[0]);

  regno0 = true_regnum (target);
  regno1 = INVALID_REGNUM;
  regno2 = INVALID_REGNUM;

  if (parts.base)
    {
      parts.base = gen_lowpart (mode, parts.base);
      regno1 = true_regnum (parts.base);
    }

  if (parts.index)
    {
      parts.index = gen_lowpart (mode, parts.index);
      regno2 = true_regnum (parts.index);
    }

  if (parts.disp)
    parts.disp = gen_lowpart (mode, parts.disp);

  if (parts.scale > 1)
    {
      /* Case r1 = r1 + ...  */
      if (regno1 == regno0)
        {
          /* If we have a case r1 = r1 + C * r2 then we
             should use multiplication which is very
             expensive.  Assume cost model is wrong if we
             have such case here.  */
          gcc_assert (regno2 != regno0);

          for (adds = parts.scale; adds > 0; adds--)
            ix86_emit_binop (PLUS, mode, target, parts.index);
        }
      else
        {
          /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
          if (regno0 != regno2)
            emit_insn (gen_rtx_SET (target, parts.index));

          /* Use shift for scaling.  */
          ix86_emit_binop (ASHIFT, mode, target,
                           GEN_INT (exact_log2 (parts.scale)));

          if (parts.base)
            ix86_emit_binop (PLUS, mode, target, parts.base);

          if (parts.disp && parts.disp != const0_rtx)
            ix86_emit_binop (PLUS, mode, target, parts.disp);
        }
    }
  else if (!parts.base && !parts.index)
    {
      gcc_assert(parts.disp);
      emit_insn (gen_rtx_SET (target, parts.disp));
    }
  else
    {
      if (!parts.base)
        {
          if (regno0 != regno2)
            emit_insn (gen_rtx_SET (target, parts.index));
        }
      else if (!parts.index)
        {
          if (regno0 != regno1)
            emit_insn (gen_rtx_SET (target, parts.base));
        }
      else
        {
          if (regno0 == regno1)
            tmp = parts.index;
          else if (regno0 == regno2)
            tmp = parts.base;
          else
            {
              rtx tmp1;

              /* Find better operand for SET instruction, depending
                 on which definition is farther from the insn.  */
              if (find_nearest_reg_def (insn, regno1, regno2))
                tmp = parts.index, tmp1 = parts.base;
              else
                tmp = parts.base, tmp1 = parts.index;

              emit_insn (gen_rtx_SET (target, tmp));

              if (parts.disp && parts.disp != const0_rtx)
                ix86_emit_binop (PLUS, mode, target, parts.disp);

              ix86_emit_binop (PLUS, mode, target, tmp1);
              return;
            }

          ix86_emit_binop (PLUS, mode, target, tmp);
        }

      if (parts.disp && parts.disp != const0_rtx)
        ix86_emit_binop (PLUS, mode, target, parts.disp);
    }
}
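
/* Illustrative expansion (assumed register assignments, for exposition):
   "leal (%ebx,%ecx,4), %eax" with three distinct registers decomposes into

     movl %ecx, %eax        ; target = parts.index
     sall $2, %eax          ; scale 4 via shift, exact_log2 (4) == 2
     addl %ebx, %eax        ; add parts.base

   plus a final add of parts.disp when the displacement is nonzero.  */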
/* Post-reload splitter for converting an SF or DFmode value in an
   SSE register into an unsigned SImode.  */

void
ix86_split_convert_uns_si_sse (rtx operands[])
{
  machine_mode vecmode;
  rtx value, large, zero_or_two31, input, two31, x;

  large = operands[1];
  zero_or_two31 = operands[2];
  input = operands[3];
  two31 = operands[4];
  vecmode = GET_MODE (large);
  value = gen_rtx_REG (vecmode, REGNO (operands[0]));

  /* Load up the value into the low element.  We must ensure that the other
     elements are valid floats -- zero is the easiest such value.  */
  if (MEM_P (input))
    {
      if (vecmode == V4SFmode)
        emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
      else
        emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
    }
  else
    {
      input = gen_rtx_REG (vecmode, REGNO (input));
      emit_move_insn (value, CONST0_RTX (vecmode));
      if (vecmode == V4SFmode)
        emit_insn (gen_sse_movss (value, value, input));
      else
        emit_insn (gen_sse2_movsd (value, value, input));
    }

  emit_move_insn (large, two31);
  emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);

  x = gen_rtx_fmt_ee (LE, vecmode, large, value);
  emit_insn (gen_rtx_SET (large, x));

  x = gen_rtx_AND (vecmode, zero_or_two31, large);
  emit_insn (gen_rtx_SET (zero_or_two31, x));

  x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
  emit_insn (gen_rtx_SET (value, x));

  large = gen_rtx_REG (V4SImode, REGNO (large));
  emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));

  x = gen_rtx_REG (V4SImode, REGNO (value));
  if (vecmode == V4SFmode)
    emit_insn (gen_fix_truncv4sfv4si2 (x, value));
  else
    emit_insn (gen_sse2_cvttpd2dq (x, value));
  value = x;

  emit_insn (gen_xorv4si3 (value, value, large));
}
static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
                                                 machine_mode mode, rtx target,
                                                 rtx var, int one_var);
/* Convert an unsigned DImode value into a DFmode, using only SSE.
   Expects the 64-bit DImode to be supplied in a pair of integral
   registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
   -mfpmath=sse, !optimize_size only.  */

void
ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
  rtx int_xmm, fp_xmm;
  rtx biases, exponents;
  rtx x;

  int_xmm = gen_reg_rtx (V4SImode);
  if (TARGET_INTER_UNIT_MOVES_TO_VEC)
    emit_insn (gen_movdi_to_sse (int_xmm, input));
  else if (TARGET_SSE_SPLIT_REGS)
    {
      emit_clobber (int_xmm);
      emit_move_insn (gen_lowpart (DImode, int_xmm), input);
    }
  else
    {
      x = gen_reg_rtx (V2DImode);
      ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
      emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
    }

  x = gen_rtx_CONST_VECTOR (V4SImode,
                            gen_rtvec (4, GEN_INT (0x43300000UL),
                                       GEN_INT (0x45300000UL),
                                       const0_rtx, const0_rtx));
  exponents = validize_mem (force_const_mem (V4SImode, x));

  /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
  emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));

  /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
     yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
     Similarly (0x45300000UL ## fp_value_hi_xmm) yields
     (0x1.0p84 + double(fp_value_hi_xmm)).
     Note these exponents differ by 32.  */

  fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));

  /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
     in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
  real_ldexp (&bias_lo_rvt, &dconst1, 52);
  real_ldexp (&bias_hi_rvt, &dconst1, 84);
  biases = const_double_from_real_value (bias_lo_rvt, DFmode);
  x = const_double_from_real_value (bias_hi_rvt, DFmode);
  biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
  biases = validize_mem (force_const_mem (V2DFmode, biases));
  emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));

  /* Add the upper and lower DFmode values together.  */
  if (TARGET_SSE3)
    emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
  else
    {
      x = copy_to_mode_reg (V2DFmode, fp_xmm);
      emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
      emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
    }

  ix86_expand_vector_extract (false, target, fp_xmm, 0);
}
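
/* Worked arithmetic for the bias trick above (exposition only): with
   input 0x0000000100000002, the low word interleaved under exponent
   0x43300000 reads as the double 0x1.0p52 + 2, and the high word under
   0x45300000 as 0x1.0p84 + 1 * 0x1.0p32.  Subtracting the 0x1.0p52 and
   0x1.0p84 biases leaves exactly 2.0 and 4294967296.0, whose sum is the
   wanted 4294967298.0.  */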
/* Not used, but eases macroization of patterns.  */
void
ix86_expand_convert_uns_sixf_sse (rtx, rtx)
{
  gcc_unreachable ();
}
/* Convert an unsigned SImode value into a DFmode.  Only currently used
   for SSE, but applicable anywhere.  */

void
ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO31r;
  rtx x, fp;

  x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
                           NULL, 1, OPTAB_DIRECT);

  fp = gen_reg_rtx (DFmode);
  emit_insn (gen_floatsidf2 (fp, x));

  real_ldexp (&TWO31r, &dconst1, 31);
  x = const_double_from_real_value (TWO31r, DFmode);

  x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
  if (x != target)
    emit_move_insn (target, x);
}
/* Convert a signed DImode value into a DFmode.  Only used for SSE in
   32-bit mode; otherwise we have a direct convert instruction.  */

void
ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO32r;
  rtx fp_lo, fp_hi, x;

  fp_lo = gen_reg_rtx (DFmode);
  fp_hi = gen_reg_rtx (DFmode);

  emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));

  real_ldexp (&TWO32r, &dconst1, 32);
  x = const_double_from_real_value (TWO32r, DFmode);
  fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);

  ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));

  x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
                           0, OPTAB_DIRECT);
  if (x != target)
    emit_move_insn (target, x);
}
/* Convert an unsigned SImode value into a SFmode, using only SSE.
   For x86_32, -mfpmath=sse, !optimize_size only.  */

void
ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE ONE16r;
  rtx fp_hi, fp_lo, int_hi, int_lo, x;

  real_ldexp (&ONE16r, &dconst1, 16);
  x = const_double_from_real_value (ONE16r, SFmode);
  int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
                                NULL, 0, OPTAB_DIRECT);
  int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
                                NULL, 0, OPTAB_DIRECT);
  fp_hi = gen_reg_rtx (SFmode);
  fp_lo = gen_reg_rtx (SFmode);
  emit_insn (gen_floatsisf2 (fp_hi, int_hi));
  emit_insn (gen_floatsisf2 (fp_lo, int_lo));
  fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
                               0, OPTAB_DIRECT);
  fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
                               0, OPTAB_DIRECT);
  if (!rtx_equal_p (target, fp_hi))
    emit_move_insn (target, fp_hi);
}
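
/* Worked arithmetic (exposition only): for input 0x00050003, int_lo is
   0x0003 and int_hi is 0x0005; both halves convert exactly, and
   float (5) * 2**16 + float (3) = 327680.0 + 3.0 = 327683.0, the
   unsigned value, without ever taking the signed-negative path that a
   direct cvtsi2ss of a value >= 2**31 would.  */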
/* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
   a vector of unsigned ints VAL to vector of floats TARGET.  */

void
ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
{
  rtx tmp[8];
  REAL_VALUE_TYPE TWO16r;
  machine_mode intmode = GET_MODE (val);
  machine_mode fltmode = GET_MODE (target);
  rtx (*cvt) (rtx, rtx);

  if (intmode == V4SImode)
    cvt = gen_floatv4siv4sf2;
  else
    cvt = gen_floatv8siv8sf2;
  tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
  tmp[0] = force_reg (intmode, tmp[0]);
  tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
                                OPTAB_DIRECT);
  tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
                                NULL_RTX, 1, OPTAB_DIRECT);
  tmp[3] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[3], tmp[1]));
  tmp[4] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[4], tmp[2]));
  real_ldexp (&TWO16r, &dconst1, 16);
  tmp[5] = const_double_from_real_value (TWO16r, SFmode);
  tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
  tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
                                OPTAB_DIRECT);
  tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
                                OPTAB_DIRECT);
  if (tmp[7] != target)
    emit_move_insn (target, tmp[7]);
}
/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
   pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
   This is done by doing just signed conversion if < 0x1p31, and otherwise by
   subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */

rtx
ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
{
  REAL_VALUE_TYPE TWO31r;
  rtx two31r, tmp[4];
  machine_mode mode = GET_MODE (val);
  machine_mode scalarmode = GET_MODE_INNER (mode);
  machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
  rtx (*cmp) (rtx, rtx, rtx, rtx);
  int i;

  for (i = 0; i < 3; i++)
    tmp[i] = gen_reg_rtx (mode);
  real_ldexp (&TWO31r, &dconst1, 31);
  two31r = const_double_from_real_value (TWO31r, scalarmode);
  two31r = ix86_build_const_vector (mode, 1, two31r);
  two31r = force_reg (mode, two31r);
  switch (mode)
    {
    case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
    case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
    case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
    case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
    default: gcc_unreachable ();
    }
  tmp[3] = gen_rtx_LE (mode, two31r, val);
  emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
  tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
                                0, OPTAB_DIRECT);
  if (intmode == V4SImode || TARGET_AVX2)
    *xorp = expand_simple_binop (intmode, ASHIFT,
                                 gen_lowpart (intmode, tmp[0]),
                                 GEN_INT (31), NULL_RTX, 0,
                                 OPTAB_DIRECT);
  else
    {
      rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
      two31 = ix86_build_const_vector (intmode, 1, two31);
      *xorp = expand_simple_binop (intmode, AND,
                                   gen_lowpart (intmode, tmp[0]),
                                   two31, NULL_RTX, 0,
                                   OPTAB_DIRECT);
    }
  return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
                              0, OPTAB_DIRECT);
}
/* Generate code for floating point ABS or NEG.  */

void
ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
                                rtx operands[])
{
  rtx set, dst, src;
  bool use_sse = false;
  bool vector_mode = VECTOR_MODE_P (mode);
  machine_mode vmode = mode;
  rtvec par;

  if (vector_mode || mode == TFmode)
    use_sse = true;
  else if (TARGET_SSE_MATH)
    {
      use_sse = SSE_FLOAT_MODE_P (mode);
      if (mode == SFmode)
        vmode = V4SFmode;
      else if (mode == DFmode)
        vmode = V2DFmode;
    }

  dst = operands[0];
  src = operands[1];

  set = gen_rtx_fmt_e (code, mode, src);
  set = gen_rtx_SET (dst, set);

  if (use_sse)
    {
      rtx mask, use, clob;

      /* NEG and ABS performed with SSE use bitwise mask operations.
         Create the appropriate mask now.  */
      mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
      use = gen_rtx_USE (VOIDmode, mask);
      if (vector_mode || mode == TFmode)
        par = gen_rtvec (2, set, use);
      else
        {
          clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
          par = gen_rtvec (3, set, use, clob);
        }
    }
  else
    {
      rtx clob;

      /* Changing of sign for FP values is doable using integer unit too.  */
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      par = gen_rtvec (2, set, clob);
    }

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
/* Deconstruct a floating point ABS or NEG operation
   with integer registers into integer operations.  */

void
ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
                               rtx operands[])
{
  enum rtx_code absneg_op;
  rtx dst, set;

  gcc_assert (operands_match_p (operands[0], operands[1]));

  switch (mode)
    {
    case E_SFmode:
      dst = gen_lowpart (SImode, operands[0]);

      if (code == ABS)
        {
          set = gen_int_mode (0x7fffffff, SImode);
          absneg_op = AND;
        }
      else
        {
          set = gen_int_mode (0x80000000, SImode);
          absneg_op = XOR;
        }
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    case E_DFmode:
      if (TARGET_64BIT)
        {
          dst = gen_lowpart (DImode, operands[0]);
          dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));

          if (code == ABS)
            set = const0_rtx;
          else
            set = gen_rtx_NOT (DImode, dst);
        }
      else
        {
          dst = gen_highpart (SImode, operands[0]);

          if (code == ABS)
            {
              set = gen_int_mode (0x7fffffff, SImode);
              absneg_op = AND;
            }
          else
            {
              set = gen_int_mode (0x80000000, SImode);
              absneg_op = XOR;
            }
          set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
        }
      break;

    case E_XFmode:
      dst = gen_rtx_REG (SImode,
                         REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
      if (code == ABS)
        {
          set = GEN_INT (0x7fff);
          absneg_op = AND;
        }
      else
        {
          set = GEN_INT (0x8000);
          absneg_op = XOR;
        }
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    default:
      gcc_unreachable ();
    }

  set = gen_rtx_SET (dst, set);

  rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
  rtvec par = gen_rtvec (2, set, clob);

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
/* Expand a copysign operation.  Special case operand 0 being a constant.  */

void
ix86_expand_copysign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, op0, op1, mask;

  dest = operands[0];
  op0 = operands[1];
  op1 = operands[2];

  mode = GET_MODE (dest);

  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else if (mode == TFmode)
    vmode = mode;
  else
    gcc_unreachable ();

  mask = ix86_build_signbit_mask (vmode, 0, 0);

  if (CONST_DOUBLE_P (op0))
    {
      if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
        op0 = simplify_unary_operation (ABS, mode, op0, mode);

      if (mode == SFmode || mode == DFmode)
        {
          if (op0 == CONST0_RTX (mode))
            op0 = CONST0_RTX (vmode);
          else
            {
              rtx v = ix86_build_const_vector (vmode, false, op0);

              op0 = force_reg (vmode, v);
            }
        }
      else if (op0 != CONST0_RTX (mode))
        op0 = force_reg (mode, op0);

      emit_insn (gen_copysign3_const (mode, dest, op0, op1, mask));
    }
  else
    {
      rtx nmask = ix86_build_signbit_mask (vmode, 0, 1);

      emit_insn (gen_copysign3_var
                 (mode, dest, NULL_RTX, op0, op1, nmask, mask));
    }
}
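
/* Bit-level illustration (exposition only): with the DFmode sign mask
   0x8000000000000000 built above, copysign (x, y) is
   (x & ~mask) | (y & mask); the constant-op0 form can drop the first
   AND entirely because |op0| is already known, which is why op0 is
   rewritten to its absolute value before expansion.  */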
/* Deconstruct a copysign operation into bit masks.  Operand 0 is known to
   be a constant, and so has already been expanded into a vector constant.  */

void
ix86_split_copysign_const (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, op0, mask, x;

  dest = operands[0];
  op0 = operands[1];
  mask = operands[3];

  mode = GET_MODE (dest);
  vmode = GET_MODE (mask);

  dest = lowpart_subreg (vmode, dest, mode);
  x = gen_rtx_AND (vmode, dest, mask);
  emit_insn (gen_rtx_SET (dest, x));

  if (op0 != CONST0_RTX (vmode))
    {
      x = gen_rtx_IOR (vmode, dest, op0);
      emit_insn (gen_rtx_SET (dest, x));
    }
}
/* Deconstruct a copysign operation into bit masks.  Operand 0 is variable,
   so we have to do two masks.  */

void
ix86_split_copysign_var (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, scratch, op0, op1, mask, nmask, x;

  dest = operands[0];
  scratch = operands[1];
  op0 = operands[2];
  op1 = operands[3];
  nmask = operands[4];
  mask = operands[5];

  mode = GET_MODE (dest);
  vmode = GET_MODE (mask);

  if (rtx_equal_p (op0, op1))
    {
      /* Shouldn't happen often (it's useless, obviously), but when it does
         we'd generate incorrect code if we continue below.  */
      emit_move_insn (dest, op0);
      return;
    }

  if (REG_P (mask) && REGNO (dest) == REGNO (mask))     /* alternative 0 */
    {
      gcc_assert (REGNO (op1) == REGNO (scratch));

      x = gen_rtx_AND (vmode, scratch, mask);
      emit_insn (gen_rtx_SET (scratch, x));

      dest = mask;
      op0 = lowpart_subreg (vmode, op0, mode);
      x = gen_rtx_NOT (vmode, dest);
      x = gen_rtx_AND (vmode, x, op0);
      emit_insn (gen_rtx_SET (dest, x));
    }
  else
    {
      if (REGNO (op1) == REGNO (scratch))               /* alternative 1,3 */
        {
          x = gen_rtx_AND (vmode, scratch, mask);
        }
      else                                              /* alternative 2,4 */
        {
          gcc_assert (REGNO (mask) == REGNO (scratch));
          op1 = lowpart_subreg (vmode, op1, mode);
          x = gen_rtx_AND (vmode, scratch, op1);
        }
      emit_insn (gen_rtx_SET (scratch, x));

      if (REGNO (op0) == REGNO (dest))                  /* alternative 1,2 */
        {
          dest = lowpart_subreg (vmode, op0, mode);
          x = gen_rtx_AND (vmode, dest, nmask);
        }
      else                                              /* alternative 3,4 */
        {
          gcc_assert (REGNO (nmask) == REGNO (dest));
          dest = nmask;
          op0 = lowpart_subreg (vmode, op0, mode);
          x = gen_rtx_AND (vmode, dest, op0);
        }
      emit_insn (gen_rtx_SET (dest, x));
    }

  x = gen_rtx_IOR (vmode, dest, scratch);
  emit_insn (gen_rtx_SET (dest, x));
}
/* Expand an xorsign operation.  */

void
ix86_expand_xorsign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, op0, op1, mask;

  dest = operands[0];
  op0 = operands[1];
  op1 = operands[2];

  mode = GET_MODE (dest);

  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    gcc_unreachable ();

  mask = ix86_build_signbit_mask (vmode, 0, 0);

  emit_insn (gen_xorsign3_1 (mode, dest, op0, op1, mask));
}
/* Deconstruct an xorsign operation into bit masks.  */

void
ix86_split_xorsign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, op0, mask, x;

  dest = operands[0];
  op0 = operands[1];
  mask = operands[3];

  mode = GET_MODE (dest);
  vmode = GET_MODE (mask);

  dest = lowpart_subreg (vmode, dest, mode);
  x = gen_rtx_AND (vmode, dest, mask);
  emit_insn (gen_rtx_SET (dest, x));

  op0 = lowpart_subreg (vmode, op0, mode);
  x = gen_rtx_XOR (vmode, dest, op0);
  emit_insn (gen_rtx_SET (dest, x));
}
static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
void
ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
{
  machine_mode mode = GET_MODE (op0);
  rtx tmp;

  /* Handle special case - vector comparsion with boolean result, transform
     it using ptest instruction.  */
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;

      gcc_assert (code == EQ || code == NE);
      /* Generate XOR since we can't check that one operand is zero vector.  */
      tmp = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
      tmp = gen_lowpart (p_mode, tmp);
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
                              gen_rtx_UNSPEC (CCmode,
                                              gen_rtvec (2, tmp, tmp),
                                              UNSPEC_PTEST)));
      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                                  gen_rtx_LABEL_REF (VOIDmode, label),
                                  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;
    }

  switch (mode)
    {
    case E_SFmode:
    case E_DFmode:
    case E_XFmode:
    case E_QImode:
    case E_HImode:
    case E_SImode:
      simple:
      tmp = ix86_expand_compare (code, op0, op1);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                                  gen_rtx_LABEL_REF (VOIDmode, label),
                                  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;

    case E_DImode:
      if (TARGET_64BIT)
        goto simple;
      /* For 32-bit target DI comparison may be performed on
         SSE registers.  To allow this we should avoid split
         to SI mode which is achieved by doing xor in DI mode
         and then comparing with zero (which is recognized by
         STV pass).  We don't compare using xor when optimizing
         for size.  */
      if (!optimize_insn_for_size_p ()
          && TARGET_STV
          && (code == EQ || code == NE))
        {
          op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
          op1 = const0_rtx;
        }
      /* FALLTHRU */
    case E_TImode:
      /* Expand DImode branch into multiple compare+branch.  */
      {
        rtx lo[2], hi[2];
        rtx_code_label *label2;
        enum rtx_code code1, code2, code3;
        machine_mode submode;

        if (CONSTANT_P (op0) && !CONSTANT_P (op1))
          {
            std::swap (op0, op1);
            code = swap_condition (code);
          }

        split_double_mode (mode, &op0, 1, lo+0, hi+0);
        split_double_mode (mode, &op1, 1, lo+1, hi+1);

        submode = mode == DImode ? SImode : DImode;

        /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
           avoid two branches.  This costs one extra insn, so disable when
           optimizing for size.  */

        if ((code == EQ || code == NE)
            && (!optimize_insn_for_size_p ()
                || hi[1] == const0_rtx || lo[1] == const0_rtx))
          {
            rtx xor0, xor1;

            xor1 = hi[0];
            if (hi[1] != const0_rtx)
              xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
                                   NULL_RTX, 0, OPTAB_WIDEN);

            xor0 = lo[0];
            if (lo[1] != const0_rtx)
              xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
                                   NULL_RTX, 0, OPTAB_WIDEN);

            tmp = expand_binop (submode, ior_optab, xor1, xor0,
                                NULL_RTX, 0, OPTAB_WIDEN);

            ix86_expand_branch (code, tmp, const0_rtx, label);
            return;
          }

        /* Otherwise, if we are doing less-than or greater-or-equal-than,
           op1 is a constant and the low word is zero, then we can just
           examine the high word.  Similarly for low word -1 and
           less-or-equal-than or greater-than.  */

        if (CONST_INT_P (hi[1]))
          switch (code)
            {
            case LT: case LTU: case GE: case GEU:
              if (lo[1] == const0_rtx)
                {
                  ix86_expand_branch (code, hi[0], hi[1], label);
                  return;
                }
              break;
            case LE: case LEU: case GT: case GTU:
              if (lo[1] == constm1_rtx)
                {
                  ix86_expand_branch (code, hi[0], hi[1], label);
                  return;
                }
              break;
            default:
              break;
            }

        /* Emulate comparisons that do not depend on Zero flag with
           double-word subtraction.  Note that only Overflow, Sign
           and Carry flags are valid, so swap arguments and condition
           of comparisons that would otherwise test Zero flag.  */

        switch (code)
          {
          case LE: case LEU: case GT: case GTU:
            std::swap (lo[0], lo[1]);
            std::swap (hi[0], hi[1]);
            code = swap_condition (code);
            /* FALLTHRU */

          case LT: case LTU: case GE: case GEU:
            {
              bool uns = (code == LTU || code == GEU);
              rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
                = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;

              if (!nonimmediate_operand (lo[0], submode))
                lo[0] = force_reg (submode, lo[0]);
              if (!x86_64_general_operand (lo[1], submode))
                lo[1] = force_reg (submode, lo[1]);

              if (!register_operand (hi[0], submode))
                hi[0] = force_reg (submode, hi[0]);
              if ((uns && !nonimmediate_operand (hi[1], submode))
                  || (!uns && !x86_64_general_operand (hi[1], submode)))
                hi[1] = force_reg (submode, hi[1]);

              emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));

              tmp = gen_rtx_SCRATCH (submode);
              emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));

              tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
              ix86_expand_branch (code, tmp, const0_rtx, label);
              return;
            }

          default:
            break;
          }

        /* Otherwise, we need two or three jumps.  */

        label2 = gen_label_rtx ();

        code1 = code;
        code2 = swap_condition (code);
        code3 = unsigned_condition (code);

        switch (code)
          {
          case LT: case GT: case LTU: case GTU:
            break;

          case LE:   code1 = LT;  code2 = GT;  break;
          case GE:   code1 = GT;  code2 = LT;  break;
          case LEU:  code1 = LTU; code2 = GTU; break;
          case GEU:  code1 = GTU; code2 = LTU; break;

          case EQ:   code1 = UNKNOWN; code2 = NE;  break;
          case NE:   code2 = UNKNOWN; break;

          default:
            gcc_unreachable ();
          }

        /*
         * a < b =>
         *    if (hi(a) < hi(b)) goto true;
         *    if (hi(a) > hi(b)) goto false;
         *    if (lo(a) < lo(b)) goto true;
         *  false:
         */

        if (code1 != UNKNOWN)
          ix86_expand_branch (code1, hi[0], hi[1], label);
        if (code2 != UNKNOWN)
          ix86_expand_branch (code2, hi[0], hi[1], label2);

        ix86_expand_branch (code3, lo[0], lo[1], label);

        if (code2 != UNKNOWN)
          emit_label (label2);
        return;
      }

    default:
      gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
      goto simple;
    }
}
/* Figure out whether to use unordered fp comparisons.  */

static bool
ix86_unordered_fp_compare (enum rtx_code code)
{
  if (!TARGET_IEEE_FP)
    return false;

  switch (code)
    {
    case LT:
    case LE:
    case GT:
    case GE:
    case LTGT:
      return false;

    case EQ:
    case NE:

    case UNORDERED:
    case ORDERED:
    case UNLT:
    case UNLE:
    case UNGT:
    case UNGE:
    case UNEQ:
      return true;

    default:
      gcc_unreachable ();
    }
}
/* Return a comparison we can do and that it is equivalent to
   swap_condition (code) apart possibly from orderedness.
   But, never change orderedness if TARGET_IEEE_FP, returning
   UNKNOWN in that case if necessary.  */

static enum rtx_code
ix86_fp_swap_condition (enum rtx_code code)
{
  switch (code)
    {
    case GT:                    /* GTU - CF=0 & ZF=0 */
      return TARGET_IEEE_FP ? UNKNOWN : UNLT;
    case GE:                    /* GEU - CF=0 */
      return TARGET_IEEE_FP ? UNKNOWN : UNLE;
    case UNLT:                  /* LTU - CF=1 */
      return TARGET_IEEE_FP ? UNKNOWN : GT;
    case UNLE:                  /* LEU - CF=1 | ZF=1 */
      return TARGET_IEEE_FP ? UNKNOWN : GE;
    default:
      return swap_condition (code);
    }
}
/* Return cost of comparison CODE using the best strategy for performance.
   All following functions do use number of instructions as a cost metrics.
   In future this should be tweaked to compute bytes for optimize_size and
   take into account performance of various instructions on various CPUs.  */

static int
ix86_fp_comparison_cost (enum rtx_code code)
{
  int arith_cost;

  /* The cost of code using bit-twiddling on %ah.  */
  switch (code)
    {
    case UNLE:
    case UNLT:
    case LTGT:
    case GT:
    case GE:
    case UNORDERED:
    case ORDERED:
    case UNEQ:
      arith_cost = 4;
      break;
    case LT:
    case NE:
    case EQ:
    case UNGE:
      arith_cost = TARGET_IEEE_FP ? 5 : 4;
      break;
    case LE:
    case UNGT:
      arith_cost = TARGET_IEEE_FP ? 6 : 4;
      break;
    default:
      gcc_unreachable ();
    }

  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      return arith_cost > 4 ? 3 : 2;
    case IX86_FPCMP_SAHF:
      return arith_cost > 4 ? 4 : 3;
    default:
      return arith_cost;
    }
}
/* Swap, force into registers, or otherwise massage the two operands
   to a fp comparison.  The operands are updated in place; the new
   comparison code is returned.  */

static enum rtx_code
ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx op0 = *pop0, op1 = *pop1;
  machine_mode op_mode = GET_MODE (op0);
  bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);

  /* All of the unordered compare instructions only work on registers.
     The same is true of the fcomi compare instructions.  The XFmode
     compare instructions require registers except when comparing
     against zero or when converting operand 1 from fixed point to
     floating point.  */

  if (!is_sse
      && (unordered_compare
	  || (op_mode == XFmode
	      && ! (standard_80387_constant_p (op0) == 1
		    || standard_80387_constant_p (op1) == 1)
	      && GET_CODE (op1) != FLOAT)
	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
    {
      op0 = force_reg (op_mode, op0);
      op1 = force_reg (op_mode, op1);
    }
  else
    {
      /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
	 things around if they appear profitable, otherwise force op0
	 into a register.  */

      if (standard_80387_constant_p (op0) == 0
	  || (MEM_P (op0)
	      && ! (standard_80387_constant_p (op1) == 0
		    || MEM_P (op1))))
	{
	  enum rtx_code new_code = ix86_fp_swap_condition (code);
	  if (new_code != UNKNOWN)
	    {
	      std::swap (op0, op1);
	      code = new_code;
	    }
	}

      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);

      if (CONSTANT_P (op1))
	{
	  int tmp = standard_80387_constant_p (op1);
	  if (tmp == 0)
	    op1 = validize_mem (force_const_mem (op_mode, op1));
	  else if (tmp == 1)
	    {
	      if (TARGET_CMOVE)
		op1 = force_reg (op_mode, op1);
	    }
	  else
	    op1 = force_reg (op_mode, op1);
	}
    }

  /* Try to rearrange the comparison to make it cheaper.  */
  if (ix86_fp_comparison_cost (code)
      > ix86_fp_comparison_cost (swap_condition (code))
      && (REG_P (op1) || can_create_pseudo_p ()))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);
    }

  *pop0 = op0;
  *pop1 = op1;
  return code;
}
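/* Illustrative note (added commentary, not original code): for 387
   arithmetic only the second operand of a compare may live in memory,
   so a comparison like "1.0 < x" is profitably rewritten as "x > 1.0"
   via ix86_fp_swap_condition above, keeping the constant out of a
   register.  When TARGET_IEEE_FP is set the swap is refused whenever
   it would change orderedness, and the constant goes to the constant
   pool instead.  */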
/* Generate insn patterns to do a floating point compare of OPERANDS.  */

static rtx
ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  machine_mode cmp_mode;
  rtx tmp, scratch;

  code = ix86_prepare_fp_compare_args (code, &op0, &op1);

  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);

  /* Do fcomi/sahf based test when profitable.  */
  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      cmp_mode = CCFPmode;
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
      break;

    case IX86_FPCMP_SAHF:
      cmp_mode = CCFPmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));
      emit_insn (gen_x86_sahf_1 (scratch));
      break;

    case IX86_FPCMP_ARITH:
      cmp_mode = CCNOmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));

      /* In the unordered case, we have to check C2 for NaN's, which
	 doesn't happen to work out to anything nice combination-wise.
	 So do some bit twiddling on the value we've got in AH to come
	 up with an appropriate set of condition codes.  */

      switch (code)
	{
	case GT:
	case UNGT:
	  if (code == GT || !TARGET_IEEE_FP)
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
	      cmp_mode = CCmode;
	      code = GEU;
	    }
	  break;
	case LT:
	case UNLT:
	  if (code == LT && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
	      code = NE;
	    }
	  break;
	case GE:
	case UNGE:
	  if (code == GE || !TARGET_IEEE_FP)
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
	      code = NE;
	    }
	  break;
	case LE:
	case UNLE:
	  if (code == LE && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = LTU;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = NE;
	    }
	  break;
	case EQ:
	case UNEQ:
	  if (code == EQ && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = NE;
	    }
	  break;
	case NE:
	case LTGT:
	  if (code == NE && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
					     GEN_INT (0x40)));
	      code = NE;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = EQ;
	    }
	  break;

	case UNORDERED:
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = NE;
	  break;
	case ORDERED:
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = EQ;
	  break;

	default:
	  gcc_unreachable ();
	}
      break;

    default:
      gcc_unreachable ();
    }

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode,
			 gen_rtx_REG (cmp_mode, FLAGS_REG),
			 const0_rtx);
}
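/* Illustrative example (added commentary, not original code): the
   IX86_FPCMP_ARITH strategy materializes the 387 status word and tests
   the C0/C2/C3 condition bits (mask 0x45) in %ah.  For "a > b" without
   TARGET_IEEE_FP the sequence is roughly

       fcomp			; compare and pop
       fnstsw %ax		; status word into %ax
       testb  $0x45, %ah	; C0|C2|C3 all clear <=> a > b
       je     .Ltaken

   which is what gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45))
   followed by treating the result as EQ corresponds to.  */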
/* Generate insn patterns to do an integer compare of OPERANDS.  */

static rtx
ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
{
  machine_mode cmpmode;
  rtx tmp, flags;

  cmpmode = SELECT_CC_MODE (code, op0, op1);
  flags = gen_rtx_REG (cmpmode, FLAGS_REG);

  /* This is very simple, but making the interface the same as in the
     FP case makes the rest of the code easier.  */
  tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
  emit_insn (gen_rtx_SET (flags, tmp));

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
}
rtx
ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
{
  rtx ret;

  if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
    ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
  else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
    {
      gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
      ret = ix86_expand_fp_compare (code, op0, op1);
    }
  else
    ret = ix86_expand_int_compare (code, op0, op1);

  return ret;
}
void
ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
{
  rtx ret;

  gcc_assert (GET_MODE (dest) == QImode);

  ret = ix86_expand_compare (code, op0, op1);
  PUT_MODE (ret, QImode);
  emit_insn (gen_rtx_SET (dest, ret));
}
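/* Illustrative example (added commentary, not original code): for

       int f (int a, int b) { return a < b; }

   ix86_expand_setcc emits the COMPARE into the flags register and a
   QImode SET of the destination, which after register allocation
   becomes

       cmpl   %esi, %edi
       setl   %al
       movzbl %al, %eax

   (the final zero extension comes from the middle end, not from this
   function).  */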
/* Expand comparison setting or clearing carry flag.  Return true when
   successful and set pop for the operation.  */
static bool
ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
{
  machine_mode mode
    = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);

  /* Do not handle double-mode compares that go through special path.  */
  if (mode == (TARGET_64BIT ? TImode : DImode))
    return false;

  if (SCALAR_FLOAT_MODE_P (mode))
    {
      rtx compare_op;
      rtx_insn *compare_seq;

      gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));

      /* Shortcut:  following common codes never translate
	 into carry flag compares.  */
      if (code == EQ || code == NE || code == UNEQ || code == LTGT
	  || code == ORDERED || code == UNORDERED)
	return false;

      /* These comparisons require zero flag; swap operands so they won't.  */
      if ((code == GT || code == UNLE || code == LE || code == UNGT)
	  && !TARGET_IEEE_FP)
	{
	  std::swap (op0, op1);
	  code = swap_condition (code);
	}

      /* Try to expand the comparison and verify that we end up with
	 carry flag based comparison.  This fails to be true only when
	 we decide to expand comparison using arithmetic that is not
	 too common scenario.  */
      start_sequence ();
      compare_op = ix86_expand_fp_compare (code, op0, op1);
      compare_seq = get_insns ();
      end_sequence ();

      if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
	code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
      else
	code = GET_CODE (compare_op);

      if (code != LTU && code != GEU)
	return false;

      emit_insn (compare_seq);
      *pop = compare_op;
      return true;
    }

  if (!INTEGRAL_MODE_P (mode))
    return false;

  switch (code)
    {
    case LTU:
    case GEU:
      break;

    /* Convert a==0 into (unsigned)a<1.  */
    case EQ:
    case NE:
      if (op1 != const0_rtx)
	return false;
      op1 = const1_rtx;
      code = (code == EQ ? LTU : GEU);
      break;

    /* Convert a>b into b<a or a>=b-1.  */
    case GTU:
    case LEU:
      if (CONST_INT_P (op1))
	{
	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
	  /* Bail out on overflow.  We still can swap operands but that
	     would force loading of the constant into register.  */
	  if (op1 == const0_rtx
	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))
	    return false;
	  code = (code == GTU ? GEU : LTU);
	}
      else
	{
	  std::swap (op0, op1);
	  code = (code == GTU ? LTU : GEU);
	}
      break;

    /* Convert a>=0 into (unsigned)a<0x80000000.  */
    case LT:
    case GE:
      if (mode == DImode || op1 != const0_rtx)
	return false;
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LT ? GEU : LTU);
      break;
    case LE:
    case GT:
      if (mode == DImode || op1 != constm1_rtx)
	return false;
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LE ? GEU : LTU);
      break;

    default:
      return false;
    }
  /* Swapping operands may cause constant to appear as first operand.  */
  if (!nonimmediate_operand (op0, VOIDmode))
    {
      if (!can_create_pseudo_p ())
	return false;
      op0 = force_reg (mode, op0);
    }
  *pop = ix86_expand_compare (code, op0, op1);
  gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
  return true;
}
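/* Illustrative example (added commentary, not original code): the
   EQ/NE conversion above turns "a == 0" into "(unsigned) a < 1", which
   places the result in the carry flag:

       cmpl  $1, %eax		; CF = (a == 0)
       sbbl  %edx, %edx		; %edx = (a == 0) ? -1 : 0

   so a later adc/sbb or cmov user can consume CF directly instead of
   going through a setcc.  */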
/* Expand conditional increment or decrement using adc/sbb instructions.
   The default case using setcc followed by the conditional move can be
   done by generic code.  */
bool
ix86_expand_int_addcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]);
  rtx flags;
  rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
  rtx compare_op;
  rtx val = const0_rtx;
  bool fpcmp = false;
  machine_mode mode;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (operands[3] != const1_rtx
      && operands[3] != constm1_rtx)
    return false;
  if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
    return false;
  code = GET_CODE (compare_op);

  flags = XEXP (compare_op, 0);

  if (GET_MODE (flags) == CCFPmode)
    {
      fpcmp = true;
      code = ix86_fp_compare_code_to_integer (code);
    }

  if (code != LTU)
    {
      val = constm1_rtx;
      if (fpcmp)
	PUT_CODE (compare_op,
		  reverse_condition_maybe_unordered
		    (GET_CODE (compare_op)));
      else
	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
    }

  mode = GET_MODE (operands[0]);

  /* Construct either adc or sbb insn.  */
  if ((code == LTU) == (operands[3] == constm1_rtx))
    insn = gen_sub3_carry;
  else
    insn = gen_add3_carry;

  emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));

  return true;
}
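/* Illustrative example (added commentary, not original code): for

       if (a < b) x++;		-- unsigned a, b

   the carry-flag compare yields CF = (a < b), operands[3] is
   const1_rtx, so gen_add3_carry is chosen and the whole statement
   becomes

       cmpl  %ebx, %eax
       adcl  $0, %ecx		; x += CF

   with no branch and no setcc.  */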
bool
ix86_expand_int_movcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]), compare_code;
  rtx_insn *compare_seq;
  rtx compare_op;
  machine_mode mode = GET_MODE (operands[0]);
  bool sign_bit_compare_p = false;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode
	  && !TARGET_64BIT))
    return false;

  start_sequence ();
  compare_op = ix86_expand_compare (code, op0, op1);
  compare_seq = get_insns ();
  end_sequence ();

  compare_code = GET_CODE (compare_op);

  if ((op1 == const0_rtx && (code == GE || code == LT))
      || (op1 == constm1_rtx && (code == GT || code == LE)))
    sign_bit_compare_p = true;

  /* Don't attempt mode expansion here -- if we had to expand 5 or 6
     HImode insns, we'd be swallowed in word prefix ops.  */

  if ((mode != HImode || TARGET_FAST_PREFIX)
      && (mode != (TARGET_64BIT ? TImode : DImode))
      && CONST_INT_P (operands[2])
      && CONST_INT_P (operands[3]))
    {
      rtx out = operands[0];
      HOST_WIDE_INT ct = INTVAL (operands[2]);
      HOST_WIDE_INT cf = INTVAL (operands[3]);
      HOST_WIDE_INT diff;

      diff = ct - cf;
      /* Sign bit compares are better done using shifts than we do by using
	 sbb.  */
      if (sign_bit_compare_p
	  || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
	{
	  /* Detect overlap between destination and compare sources.  */
	  rtx tmp = out;

	  if (!sign_bit_compare_p)
	    {
	      rtx flags;
	      bool fpcmp = false;

	      compare_code = GET_CODE (compare_op);

	      flags = XEXP (compare_op, 0);

	      if (GET_MODE (flags) == CCFPmode)
		{
		  fpcmp = true;
		  compare_code
		    = ix86_fp_compare_code_to_integer (compare_code);
		}

	      /* To simplify rest of code, restrict to the GEU case.  */
	      if (compare_code == LTU)
		{
		  std::swap (ct, cf);
		  compare_code = reverse_condition (compare_code);
		  code = reverse_condition (code);
		}
	      else
		{
		  if (fpcmp)
		    PUT_CODE (compare_op,
			      reverse_condition_maybe_unordered
			        (GET_CODE (compare_op)));
		  else
		    PUT_CODE (compare_op,
			      reverse_condition (GET_CODE (compare_op)));
		}
	      diff = ct - cf;

	      if (reg_overlap_mentioned_p (out, op0)
		  || reg_overlap_mentioned_p (out, op1))
		tmp = gen_reg_rtx (mode);

	      if (mode == DImode)
		emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
	      else
		emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
						 flags, compare_op));
	    }
	  else
	    {
	      if (code == GT || code == GE)
		code = reverse_condition (code);
	      else
		{
		  std::swap (ct, cf);
		  diff = ct - cf;
		}
	      tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
	    }

	  if (diff == 1)
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * [addl dest, ct]
	       *
	       * Size 5 - 8.
	       */
	      if (ct)
		tmp = expand_simple_binop (mode, PLUS,
					   tmp, GEN_INT (ct),
					   copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }
	  else if (cf == -1)
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * orl $ct, dest
	       *
	       * Size 8.
	       */
	      tmp = expand_simple_binop (mode, IOR,
					 tmp, GEN_INT (ct),
					 copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }
	  else if (diff == -1 && ct)
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * notl dest
	       * [addl dest, cf]
	       *
	       * Size 8 - 11.
	       */
	      tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
	      if (cf)
		tmp = expand_simple_binop (mode, PLUS,
					   copy_rtx (tmp), GEN_INT (cf),
					   copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }
	  else
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * [notl dest]
	       * andl cf - ct, dest
	       * [addl dest, ct]
	       *
	       * Size 8 - 11.
	       */
	      if (cf == 0)
		{
		  cf = ct;
		  ct = 0;
		  tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
		}

	      tmp = expand_simple_binop (mode, AND,
					 copy_rtx (tmp),
					 gen_int_mode (cf - ct, mode),
					 copy_rtx (tmp), 1, OPTAB_DIRECT);
	      if (ct)
		tmp = expand_simple_binop (mode, PLUS,
					   copy_rtx (tmp), GEN_INT (ct),
					   copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }

	  if (!rtx_equal_p (tmp, out))
	    emit_move_insn (copy_rtx (out), copy_rtx (tmp));

	  return true;
	}

      if (diff < 0)
	{
	  machine_mode cmp_mode = GET_MODE (op0);
	  enum rtx_code new_code;

	  if (SCALAR_FLOAT_MODE_P (cmp_mode))
	    {
	      gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));

	      /* We may be reversing a non-trapping
		 comparison to a trapping comparison.  */
	      if (HONOR_NANS (cmp_mode) && flag_trapping_math
		  && code != EQ && code != NE
		  && code != ORDERED && code != UNORDERED)
		new_code = UNKNOWN;
	      else
		new_code = reverse_condition_maybe_unordered (code);
	    }
	  else
	    new_code = ix86_reverse_condition (code, cmp_mode);
	  if (new_code != UNKNOWN)
	    {
	      std::swap (ct, cf);
	      diff = -diff;
	      code = new_code;
	    }
	}

      compare_code = UNKNOWN;
      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
	  && CONST_INT_P (op1))
	{
	  if (op1 == const0_rtx
	      && (code == LT || code == GE))
	    compare_code = code;
	  else if (op1 == constm1_rtx)
	    {
	      if (code == LE)
		compare_code = LT;
	      else if (code == GT)
		compare_code = GE;
	    }
	}

      /* Optimize dest = (op0 < 0) ? -1 : cf.  */
      if (compare_code != UNKNOWN
	  && GET_MODE (op0) == GET_MODE (out)
	  && (cf == -1 || ct == -1))
	{
	  /* If lea code below could be used, only optimize
	     if it results in a 2 insn sequence.  */
	  if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
		 || diff == 3 || diff == 5 || diff == 9)
	      || (compare_code == LT && ct == -1)
	      || (compare_code == GE && cf == -1))
	    {
	      /*
	       * notl op1	(if necessary)
	       * sarl $31, op1
	       * orl cf, op1
	       */
	      if (ct != -1)
		{
		  cf = ct;
		  ct = -1;
		  code = reverse_condition (code);
		}

	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);

	      out = expand_simple_binop (mode, IOR,
					 out, GEN_INT (cf),
					 out, 1, OPTAB_DIRECT);
	      if (out != operands[0])
		emit_move_insn (operands[0], out);

	      return true;
	    }
	}

      if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
	   || diff == 3 || diff == 5 || diff == 9)
	  && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
	  && (mode != DImode
	      || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
	{
	  /*
	   * xorl dest,dest
	   * cmpl op1,op2
	   * setcc dest
	   * lea cf(dest*(ct-cf)),dest
	   *
	   * Size 14.
	   *
	   * This also catches the degenerate setcc-only case.
	   */

	  rtx tmp;
	  int nops;

	  out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);

	  nops = 0;
	  /* On x86_64 the lea instruction operates on Pmode, so we need
	     to get arithmetics done in proper mode to match.  */
	  if (diff == 1)
	    tmp = copy_rtx (out);
	  else
	    {
	      rtx out1;
	      out1 = copy_rtx (out);
	      tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
	      nops++;
	      if (diff & 1)
		{
		  tmp = gen_rtx_PLUS (mode, tmp, out1);
		  nops++;
		}
	    }
	  if (cf != 0)
	    {
	      tmp = plus_constant (mode, tmp, cf);
	      nops++;
	    }
	  if (!rtx_equal_p (tmp, out))
	    {
	      if (nops == 1)
		out = force_operand (tmp, copy_rtx (out));
	      else
		emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
	    }
	  if (!rtx_equal_p (out, operands[0]))
	    emit_move_insn (operands[0], copy_rtx (out));

	  return true;
	}

      /*
       * General case:			Jumpful:
       *   xorl dest,dest		cmpl op1, op2
       *   cmpl op1, op2		movl ct, dest
       *   setcc dest			jcc 1f
       *   decl dest			movl cf, dest
       *   andl (cf-ct),dest		1:
       *   addl ct,dest
       *
       * Size 20.			Size 14.
       *
       * This is reasonably steep, but branch mispredict costs are
       * high on modern cpus, so consider failing only if optimizing
       * for space.
       */

      if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
	  && BRANCH_COST (optimize_insn_for_speed_p (),
			  false) >= 2)
	{
	  if (cf == 0)
	    {
	      machine_mode cmp_mode = GET_MODE (op0);
	      enum rtx_code new_code;

	      if (SCALAR_FLOAT_MODE_P (cmp_mode))
		{
		  gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));

		  /* We may be reversing a non-trapping
		     comparison to a trapping comparison.  */
		  if (HONOR_NANS (cmp_mode) && flag_trapping_math
		      && code != EQ && code != NE
		      && code != ORDERED && code != UNORDERED)
		    new_code = UNKNOWN;
		  else
		    new_code = reverse_condition_maybe_unordered (code);
		}
	      else
		{
		  new_code = ix86_reverse_condition (code, cmp_mode);
		  if (compare_code != UNKNOWN && new_code != UNKNOWN)
		    compare_code = reverse_condition (compare_code);
		}

	      if (new_code != UNKNOWN)
		{
		  cf = ct;
		  ct = 0;
		  code = new_code;
		}
	    }

	  if (compare_code != UNKNOWN)
	    {
	      /* notl op1	(if needed)
		 sarl $31, op1
		 andl (cf-ct), op1
		 addl ct, op1

		 For x < 0 (resp. x <= -1) there will be no notl,
		 so if possible swap the constants to get rid of the
		 complement.
		 True/false will be -1/0 while code below (store flag
		 followed by decrement) is 0/-1, so the constants need
		 to be exchanged once more.  */

	      if (compare_code == GE || !cf)
		{
		  code = reverse_condition (code);
		  compare_code = LT;
		}
	      else
		std::swap (ct, cf);

	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
	    }
	  else
	    {
	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);

	      out = expand_simple_binop (mode, PLUS, copy_rtx (out),
					 constm1_rtx,
					 copy_rtx (out), 1, OPTAB_DIRECT);
	    }

	  out = expand_simple_binop (mode, AND, copy_rtx (out),
				     gen_int_mode (cf - ct, mode),
				     copy_rtx (out), 1, OPTAB_DIRECT);
	  if (ct)
	    out = expand_simple_binop (mode, PLUS, copy_rtx (out),
				       GEN_INT (ct),
				       copy_rtx (out), 1, OPTAB_DIRECT);
	  if (!rtx_equal_p (out, operands[0]))
	    emit_move_insn (operands[0], copy_rtx (out));

	  return true;
	}
    }

  if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
    {
      /* Try a few things more with specific constants and a variable.  */

      optab op;
      rtx var, orig_out, out, tmp;

      if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
	return false;

      /* If one of the two operands is an interesting constant, load a
	 constant with the above and mask it in with a logical operation.  */

      if (CONST_INT_P (operands[2]))
	{
	  var = operands[3];
	  if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
	    operands[3] = constm1_rtx, op = and_optab;
	  else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
	    operands[3] = const0_rtx, op = ior_optab;
	  else
	    return false;
	}
      else if (CONST_INT_P (operands[3]))
	{
	  var = operands[2];
	  if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
	    {
	      /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
		 "x <= 0 ? x : 0" to enable sign_bit_compare_p.  */
	      if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
		operands[1] = simplify_gen_relational (LT, VOIDmode,
						       GET_MODE (op0),
						       op0, const0_rtx);

	      operands[2] = constm1_rtx;
	      op = and_optab;
	    }
	  else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
	    operands[2] = const0_rtx, op = ior_optab;
	  else
	    return false;
	}
      else
	return false;

      orig_out = operands[0];
      tmp = gen_reg_rtx (mode);
      operands[0] = tmp;

      /* Recurse to get the constant loaded.  */
      if (!ix86_expand_int_movcc (operands))
	return false;

      /* Mask in the interesting variable.  */
      out = expand_binop (mode, op, var, tmp, orig_out, 0,
			  OPTAB_WIDEN);
      if (!rtx_equal_p (out, orig_out))
	emit_move_insn (copy_rtx (orig_out), copy_rtx (out));

      return true;
    }

  /*
   * For comparison with above,
   *
   * movl cf,dest
   * movl ct,tmp
   * cmpl op1,op2
   * cmovcc tmp,dest
   *
   * Size 15.
   */

  if (! nonimmediate_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  if (! nonimmediate_operand (operands[3], mode))
    operands[3] = force_reg (mode, operands[3]);

  if (! register_operand (operands[2], VOIDmode)
      && (mode == QImode
	  || ! register_operand (operands[3], VOIDmode)))
    operands[2] = force_reg (mode, operands[2]);

  if (mode == QImode
      && ! register_operand (operands[3], VOIDmode))
    operands[3] = force_reg (mode, operands[3]);

  emit_insn (compare_seq);
  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_IF_THEN_ELSE (mode,
						compare_op, operands[2],
						operands[3])));
  return true;
}
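/* Illustrative example (added commentary, not original code): the
   "general case" table above corresponds to compiling

       int f (int a, int b) { return a < b ? 17 : 42; }

   without cmov as

       xorl  %eax, %eax
       cmpl  %esi, %edi
       setl  %al		; 1 if a < b
       decl  %eax		; 0 if a < b, -1 otherwise
       andl  $25, %eax		; 0 or cf-ct = 42-17
       addl  $17, %eax		; ct or cf

   i.e. a branch-free select built from the 0/-1 mask.  */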
/* Detect conditional moves that exactly match min/max operational
   semantics.  Note that this is IEEE safe, as long as we don't
   interchange the operands.

   Returns FALSE if this conditional move doesn't match a MIN/MAX,
   and TRUE if the operation is successful and instructions are emitted.  */

static bool
ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
			   rtx cmp_op1, rtx if_true, rtx if_false)
{
  machine_mode mode;
  bool is_min;
  rtx tmp;

  if (code == LT)
    ;
  else if (code == UNGE)
    std::swap (if_true, if_false);
  else
    return false;

  if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
    is_min = true;
  else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
    is_min = false;
  else
    return false;

  mode = GET_MODE (dest);

  /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
     but MODE may be a vector mode and thus not appropriate.  */
  if (!flag_finite_math_only || flag_signed_zeros)
    {
      int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
      rtvec v;

      if_true = force_reg (mode, if_true);
      v = gen_rtvec (2, if_true, if_false);
      tmp = gen_rtx_UNSPEC (mode, v, u);
    }
  else
    {
      code = is_min ? SMIN : SMAX;
      if (MEM_P (if_true) && MEM_P (if_false))
	if_true = force_reg (mode, if_true);
      tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
    }

  emit_insn (gen_rtx_SET (dest, tmp));
  return true;
}
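/* Illustrative note (added commentary, not original code): the LT and
   UNGE forms accepted above are exactly the shapes minss/maxss
   implement.  "a < b ? a : b" maps to minss, and because minss returns
   its second operand when the inputs are unordered or equal, the
   operands must not be interchanged -- which is why only the matching
   operand orders are recognized instead of canonicalizing them.  */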
/* Return true if MODE is valid for vector compare to mask register,
   same result for conditional vector move with mask register.  */

static bool
ix86_valid_mask_cmp_mode (machine_mode mode)
{
  /* XOP has its own vector conditional movement.  */
  if (TARGET_XOP && !TARGET_AVX512F)
    return false;

  /* AVX512F is needed for mask operation.  */
  if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
    return false;

  /* AVX512BW is needed for vector QI/HImode,
     AVX512VL is needed for 128/256-bit vector.  */
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int vector_size = GET_MODE_SIZE (mode);
  if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
    return false;

  return vector_size == 64 || TARGET_AVX512VL;
}
/* Expand an SSE comparison.  Return the register with the result.  */

static rtx
ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
		     rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmp_ops_mode = GET_MODE (cmp_op0);

  /* In general case result of comparison can differ from operands' type.  */
  machine_mode cmp_mode;

  /* In AVX512F the result of comparison is an integer mask.  */
  bool maskcmp = false;
  rtx x;

  if (ix86_valid_mask_cmp_mode (cmp_ops_mode))
    {
      unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
      maskcmp = true;
      cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
    }
  else
    cmp_mode = cmp_ops_mode;

  cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);

  int (*op1_predicate)(rtx, machine_mode)
    = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;

  if (!op1_predicate (cmp_op1, cmp_ops_mode))
    cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);

  if (optimize
      || (maskcmp && cmp_mode != mode)
      || (op_true && reg_overlap_mentioned_p (dest, op_true))
      || (op_false && reg_overlap_mentioned_p (dest, op_false)))
    dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);

  if (maskcmp)
    {
      bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
      gcc_assert (ok);
      return dest;
    }

  x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);

  if (cmp_mode != mode && !maskcmp)
    {
      x = force_reg (cmp_ops_mode, x);
      convert_move (dest, x, false);
    }
  else
    emit_insn (gen_rtx_SET (dest, x));

  return dest;
}
/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
   operations.  This is used for both scalar and vector conditional moves.  */

void
ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmpmode = GET_MODE (cmp);

  /* In AVX512F the result of comparison is an integer mask.  */
  bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode);

  rtx t2, t3, x;

  /* If we have an integer mask and FP value then we need
     to cast mask to FP mode.  */
  if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
    {
      cmp = force_reg (cmpmode, cmp);
      cmp = gen_rtx_SUBREG (mode, cmp, 0);
    }

  if (maskcmp)
    {
      /* Using vector move with mask register.  */
      cmp = force_reg (cmpmode, cmp);
      /* Optimize for mask zero.  */
      op_true = (op_true != CONST0_RTX (mode)
		 ? force_reg (mode, op_true) : op_true);
      op_false = (op_false != CONST0_RTX (mode)
		  ? force_reg (mode, op_false) : op_false);
      if (op_true == CONST0_RTX (mode))
	{
	  rtx (*gen_not) (rtx, rtx);
	  switch (cmpmode)
	    {
	    case E_QImode: gen_not = gen_knotqi; break;
	    case E_HImode: gen_not = gen_knothi; break;
	    case E_SImode: gen_not = gen_knotsi; break;
	    case E_DImode: gen_not = gen_knotdi; break;
	    default: gcc_unreachable ();
	    }
	  rtx n = gen_reg_rtx (cmpmode);
	  emit_insn (gen_not (n, cmp));
	  cmp = n;
	  /* Reverse op_true op_false.  */
	  std::swap (op_true, op_false);
	}

      rtx vec_merge = gen_rtx_VEC_MERGE (mode, op_true, op_false, cmp);
      emit_insn (gen_rtx_SET (dest, vec_merge));
      return;
    }
  else if (vector_all_ones_operand (op_true, mode)
	   && op_false == CONST0_RTX (mode))
    {
      emit_insn (gen_rtx_SET (dest, cmp));
      return;
    }
  else if (op_false == CONST0_RTX (mode))
    {
      op_true = force_reg (mode, op_true);
      x = gen_rtx_AND (mode, cmp, op_true);
      emit_insn (gen_rtx_SET (dest, x));
      return;
    }
  else if (op_true == CONST0_RTX (mode))
    {
      op_false = force_reg (mode, op_false);
      x = gen_rtx_NOT (mode, cmp);
      x = gen_rtx_AND (mode, x, op_false);
      emit_insn (gen_rtx_SET (dest, x));
      return;
    }
  else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
    {
      op_false = force_reg (mode, op_false);
      x = gen_rtx_IOR (mode, cmp, op_false);
      emit_insn (gen_rtx_SET (dest, x));
      return;
    }
  else if (TARGET_XOP)
    {
      op_true = force_reg (mode, op_true);

      if (!nonimmediate_operand (op_false, mode))
	op_false = force_reg (mode, op_false);

      emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
							   op_true,
							   op_false)));
      return;
    }

  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
  rtx d = dest;

  if (!vector_operand (op_true, mode))
    op_true = force_reg (mode, op_true);

  op_false = force_reg (mode, op_false);

  switch (mode)
    {
    case E_V4SFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvps;
      break;
    case E_V2DFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvpd;
      break;
    case E_SFmode:
      if (TARGET_SSE4_1)
	{
	  gen = gen_sse4_1_blendvss;
	  op_true = force_reg (mode, op_true);
	}
      break;
    case E_DFmode:
      if (TARGET_SSE4_1)
	{
	  gen = gen_sse4_1_blendvsd;
	  op_true = force_reg (mode, op_true);
	}
      break;
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
      if (TARGET_SSE4_1)
	{
	  gen = gen_sse4_1_pblendvb;
	  if (mode != V16QImode)
	    d = gen_reg_rtx (V16QImode);
	  op_false = gen_lowpart (V16QImode, op_false);
	  op_true = gen_lowpart (V16QImode, op_true);
	  cmp = gen_lowpart (V16QImode, cmp);
	}
      break;
    case E_V8SFmode:
      if (TARGET_AVX)
	gen = gen_avx_blendvps256;
      break;
    case E_V4DFmode:
      if (TARGET_AVX)
	gen = gen_avx_blendvpd256;
      break;
    case E_V32QImode:
    case E_V16HImode:
    case E_V8SImode:
    case E_V4DImode:
      if (TARGET_AVX2)
	{
	  gen = gen_avx2_pblendvb;
	  if (mode != V32QImode)
	    d = gen_reg_rtx (V32QImode);
	  op_false = gen_lowpart (V32QImode, op_false);
	  op_true = gen_lowpart (V32QImode, op_true);
	  cmp = gen_lowpart (V32QImode, cmp);
	}
      break;

    case E_V64QImode:
      gen = gen_avx512bw_blendmv64qi;
      break;
    case E_V32HImode:
      gen = gen_avx512bw_blendmv32hi;
      break;
    case E_V16SImode:
      gen = gen_avx512f_blendmv16si;
      break;
    case E_V8DImode:
      gen = gen_avx512f_blendmv8di;
      break;
    case E_V8DFmode:
      gen = gen_avx512f_blendmv8df;
      break;
    case E_V16SFmode:
      gen = gen_avx512f_blendmv16sf;
      break;
    default:
      break;
    }

  if (gen != NULL)
    {
      emit_insn (gen (d, op_false, op_true, cmp));
      if (d != dest)
	emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
    }
  else
    {
      op_true = force_reg (mode, op_true);

      t2 = gen_reg_rtx (mode);
      if (optimize)
	t3 = gen_reg_rtx (mode);
      else
	t3 = dest;

      x = gen_rtx_AND (mode, op_true, cmp);
      emit_insn (gen_rtx_SET (t2, x));

      x = gen_rtx_NOT (mode, cmp);
      x = gen_rtx_AND (mode, x, op_false);
      emit_insn (gen_rtx_SET (t3, x));

      x = gen_rtx_IOR (mode, t3, t2);
      emit_insn (gen_rtx_SET (dest, x));
    }
}
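/* Illustrative example (added commentary, not original code): the
   fallback at the end of ix86_expand_sse_movcc is the classic mask
   blend.  Given a comparison result CMP with all-ones/all-zero lanes,

       dest = (op_true & cmp) | (op_false & ~cmp)

   which on plain SSE becomes three instructions, e.g.

       andps  %xmm2, %xmm1	; t2 = op_true & cmp
       andnps %xmm3, %xmm2	; t3 = ~cmp & op_false
       orps   %xmm2, %xmm1	; dest = t2 | t3

   SSE4.1 replaces the triple with a single blendv driven by the sign
   bit of each mask element.  */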
/* Swap, force into registers, or otherwise massage the two operands
   to an sse comparison with a mask result.  Thus we differ a bit from
   ix86_prepare_fp_compare_args which expects to produce a flags result.

   The DEST operand exists to help determine whether to commute commutative
   operators.  The POP0/POP1 operands are updated in place.  The new
   comparison code is returned, or UNKNOWN if not implementable.  */

static enum rtx_code
ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
				  rtx *pop0, rtx *pop1)
{
  switch (code)
    {
    case LTGT:
    case UNEQ:
      /* AVX supports all the needed comparisons.  */
      if (TARGET_AVX)
	break;
      /* We have no LTGT as an operator.  We could implement it with
	 NE & ORDERED, but this requires an extra temporary.  It's
	 not clear that it's worth it.  */
      return UNKNOWN;

    case LT:
    case LE:
    case UNGT:
    case UNGE:
      /* These are supported directly.  */
      break;

    case EQ:
    case NE:
    case UNORDERED:
    case ORDERED:
      /* AVX has 3 operand comparisons, no need to swap anything.  */
      if (TARGET_AVX)
	break;
      /* For commutative operators, try to canonicalize the destination
	 operand to be first in the comparison - this helps reload to
	 avoid extra moves.  */
      if (!dest || !rtx_equal_p (dest, *pop1))
	break;
      /* FALLTHRU */

    case GE:
    case GT:
    case UNLE:
    case UNLT:
      /* These are not supported directly before AVX, and furthermore
	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
	 comparison operands to transform into something that is
	 supported.  */
      std::swap (*pop0, *pop1);
      code = swap_condition (code);
      break;

    default:
      gcc_unreachable ();
    }

  return code;
}
/* Expand a floating-point conditional move.  Return true if successful.  */

bool
ix86_expand_fp_movcc (rtx operands[])
{
  machine_mode mode = GET_MODE (operands[0]);
  enum rtx_code code = GET_CODE (operands[1]);
  rtx tmp, compare_op;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
    {
      machine_mode cmode;

      /* Since we've no cmove for sse registers, don't force bad register
	 allocation just to gain access to it.  Deny movcc when the
	 comparison mode doesn't match the move mode.  */
      cmode = GET_MODE (op0);
      if (cmode == VOIDmode)
	cmode = GET_MODE (op1);
      if (cmode != mode)
	return false;

      code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
      if (code == UNKNOWN)
	return false;

      if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
				     operands[2], operands[3]))
	return true;

      tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
				 operands[2], operands[3]);
      ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
      return true;
    }

  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode
	  && !TARGET_64BIT))
    return false;

  /* The floating point conditional move instructions don't directly
     support conditions resulting from a signed integer comparison.  */

  compare_op = ix86_expand_compare (code, op0, op1);
  if (!fcmov_comparison_operator (compare_op, VOIDmode))
    {
      tmp = gen_reg_rtx (QImode);
      ix86_expand_setcc (tmp, code, op0, op1);

      compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
    }

  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_IF_THEN_ELSE (mode, compare_op,
						operands[2], operands[3])));

  return true;
}
/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes.  */

static int
ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
{
  /* ... */
}

/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes.  */

static int
ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
{
  /* ... */
}

/* Return immediate value to be used in UNSPEC_PCMP
   for comparison CODE in MODE.  */

static int
ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
{
  if (FLOAT_MODE_P (mode))
    return ix86_fp_cmp_code_to_pcmp_immediate (code);
  return ix86_int_cmp_code_to_pcmp_immediate (code);
}
/* Expand AVX-512 vector comparison.  */

bool
ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
{
  machine_mode mask_mode = GET_MODE (dest);
  machine_mode cmp_mode = GET_MODE (cmp_op0);
  rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
  int unspec_code;
  rtx unspec;

  switch (code)
    {
    case LEU:
    case GTU:
    case GEU:
    case LTU:
      unspec_code = UNSPEC_UNSIGNED_PCMP;
      break;

    default:
      unspec_code = UNSPEC_PCMP;
    }

  unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
			   unspec_code);
  emit_insn (gen_rtx_SET (dest, unspec));

  return true;
}
/* Expand fp vector comparison.  */

bool
ix86_expand_fp_vec_cmp (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]);
  rtx cmp;

  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
					   &operands[2], &operands[3]);
  if (code == UNKNOWN)
    {
      rtx temp;
      switch (GET_CODE (operands[1]))
	{
	case LTGT:
	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
				      operands[3], NULL, NULL);
	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
				     operands[3], NULL, NULL);
	  code = AND;
	  break;
	case UNEQ:
	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
				      operands[3], NULL, NULL);
	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
				     operands[3], NULL, NULL);
	  code = IOR;
	  break;
	default:
	  gcc_unreachable ();
	}
      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
				 OPTAB_DIRECT);
    }
  else
    cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
			       operands[1], operands[2]);

  if (operands[0] != cmp)
    emit_move_insn (operands[0], cmp);

  return true;
}
static rtx
ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
			 rtx op_true, rtx op_false, bool *negate)
{
  machine_mode data_mode = GET_MODE (dest);
  machine_mode mode = GET_MODE (cop0);
  rtx x;

  *negate = false;

  /* XOP supports all of the comparisons on all 128-bit vector int types.  */
  if (TARGET_XOP
      && (mode == V16QImode || mode == V8HImode
	  || mode == V4SImode || mode == V2DImode))
    ;
  /* AVX512F supports all of the comparisons
     on all 128/256/512-bit vector int types.  */
  else if (ix86_valid_mask_cmp_mode (mode))
    ;
  else
    {
      /* Canonicalize the comparison to EQ, GT, GTU.  */
      switch (code)
	{
	case EQ:
	case GT:
	case GTU:
	  break;

	case NE:
	case LE:
	case LEU:
	  code = reverse_condition (code);
	  *negate = true;
	  break;

	case GE:
	case GEU:
	  code = reverse_condition (code);
	  *negate = true;
	  /* FALLTHRU */

	case LT:
	case LTU:
	  std::swap (cop0, cop1);
	  code = swap_condition (code);
	  break;

	default:
	  gcc_unreachable ();
	}

      /* Only SSE4.1/SSE4.2 supports V2DImode.  */
      if (mode == V2DImode)
	{
	  switch (code)
	    {
	    case EQ:
	      /* SSE4.1 supports EQ.  */
	      if (!TARGET_SSE4_1)
		return NULL;
	      break;

	    case GT:
	    case GTU:
	      /* SSE4.2 supports GT/GTU.  */
	      if (!TARGET_SSE4_2)
		return NULL;
	      break;

	    default:
	      gcc_unreachable ();
	    }
	}

      rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
      rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
      if (*negate)
	std::swap (optrue, opfalse);

      /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
	 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
	 min (x, y) == x).  While we add one instruction (the minimum),
	 we remove the need for two instructions in the negation, as the
	 result is done this way.
	 When using masks, do it for SI/DImode element types, as it is shorter
	 than the two subtractions.  */
      if ((code != EQ
	   && GET_MODE_SIZE (mode) != 64
	   && vector_all_ones_operand (opfalse, data_mode)
	   && optrue == CONST0_RTX (data_mode))
	  || (code == GTU
	      && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
	      /* Don't do it if not using integer masks and we'd end up with
		 the right values in the registers though.  */
	      && (GET_MODE_SIZE (mode) == 64
		  || !vector_all_ones_operand (optrue, data_mode)
		  || opfalse != CONST0_RTX (data_mode))))
	{
	  rtx (*gen) (rtx, rtx, rtx) = NULL;

	  switch (mode)
	    {
	    case E_V16SImode:
	      gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
	      break;
	    case E_V8DImode:
	      gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
	      cop0 = force_reg (mode, cop0);
	      cop1 = force_reg (mode, cop1);
	      break;
	    case E_V32QImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
	      break;
	    case E_V16HImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
	      break;
	    case E_V8SImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
	      break;
	    case E_V4DImode:
	      if (TARGET_AVX512VL)
		{
		  gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
		  cop0 = force_reg (mode, cop0);
		  cop1 = force_reg (mode, cop1);
		}
	      break;
	    case E_V16QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv16qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv16qi3;
	      break;
	    case E_V8HImode:
	      if (code == GTU && TARGET_SSE4_1)
		gen = gen_uminv8hi3;
	      else if (code == GT && TARGET_SSE2)
		gen = gen_sminv8hi3;
	      break;
	    case E_V4SImode:
	      if (TARGET_SSE4_1)
		gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
	      break;
	    case E_V2DImode:
	      if (TARGET_AVX512VL)
		{
		  gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
		  cop0 = force_reg (mode, cop0);
		  cop1 = force_reg (mode, cop1);
		}
	      break;
	    default:
	      break;
	    }

	  if (gen)
	    {
	      rtx tem = gen_reg_rtx (mode);
	      if (!vector_operand (cop0, mode))
		cop0 = force_reg (mode, cop0);
	      if (!vector_operand (cop1, mode))
		cop1 = force_reg (mode, cop1);
	      *negate = !*negate;
	      emit_insn (gen (tem, cop0, cop1));
	      cop1 = tem;
	      code = EQ;
	    }
	}

      /* Unsigned parallel compare is not supported by the hardware.
	 Play some tricks to turn this into a signed comparison
	 against 0.  */
      if (code == GTU)
	{
	  cop0 = force_reg (mode, cop0);

	  switch (mode)
	    {
	    case E_V16SImode:
	    case E_V8DImode:
	    case E_V8SImode:
	    case E_V4DImode:
	    case E_V4SImode:
	    case E_V2DImode:
	      {
		rtx t1, t2, mask;

		/* Subtract (-(INT MAX) - 1) from both operands to make
		   them signed.  */
		mask = ix86_build_signbit_mask (mode, true, false);
		t1 = gen_reg_rtx (mode);
		emit_insn (gen_sub3_insn (t1, cop0, mask));

		t2 = gen_reg_rtx (mode);
		emit_insn (gen_sub3_insn (t2, cop1, mask));

		cop0 = t1;
		cop1 = t2;
		code = GT;
	      }
	      break;

	    case E_V64QImode:
	    case E_V32HImode:
	    case E_V32QImode:
	    case E_V16HImode:
	    case E_V16QImode:
	    case E_V8HImode:
	      /* Perform a parallel unsigned saturating subtraction.  */
	      x = gen_reg_rtx (mode);
	      emit_insn (gen_rtx_SET
			 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
	      cop0 = x;
	      cop1 = CONST0_RTX (mode);
	      code = EQ;
	      *negate = !*negate;
	      break;

	    default:
	      gcc_unreachable ();
	    }
	}
    }

  if (*negate)
    std::swap (op_true, op_false);

  /* Allow the comparison to be done in one mode, but the movcc to
     happen in another mode.  */
  if (data_mode == mode)
    {
      x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
			       op_true, op_false);
    }
  else
    {
      gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
      x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
			       op_true, op_false);
      if (GET_MODE (x) == mode)
	x = gen_lowpart (data_mode, x);
    }

  return x;
}
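/* Illustrative example (added commentary, not original code): SSE2
   offers only signed pcmpgt, so the GTU path above biases both
   operands by the sign-bit constant before comparing:

       (unsigned) a > (unsigned) b
	 <=>  (int) (a - 0x80000000) > (int) (b - 0x80000000)

   i.e. two psubd's followed by pcmpgtd.  For the narrow element types
   it instead uses a saturating subtract: a GTU b <=> (a -us b) != 0,
   which is the US_MINUS/EQ/negate combination emitted above.  */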
/* Expand integer vector comparison.  */

bool
ix86_expand_int_vec_cmp (rtx operands[])
{
  rtx_code code = GET_CODE (operands[1]);
  bool negate = false;
  rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
				     operands[3], NULL, NULL, &negate);

  if (!cmp)
    return false;

  if (negate)
    cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
				   CONST0_RTX (GET_MODE (cmp)),
				   NULL, NULL, &negate);

  gcc_assert (!negate);

  if (operands[0] != cmp)
    emit_move_insn (operands[0], cmp);

  return true;
}
/* Expand a floating-point vector conditional move; a vcond operation
   rather than a movcc operation.  */

bool
ix86_expand_fp_vcond (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[3]);
  rtx cmp;

  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
					   &operands[4], &operands[5]);
  if (code == UNKNOWN)
    {
      rtx temp;
      switch (GET_CODE (operands[3]))
	{
	case LTGT:
	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
				      operands[5], operands[0], operands[0]);
	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
				     operands[5], operands[1], operands[2]);
	  code = AND;
	  break;
	case UNEQ:
	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
				      operands[5], operands[0], operands[0]);
	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
				     operands[5], operands[1], operands[2]);
	  code = IOR;
	  break;
	default:
	  gcc_unreachable ();
	}
      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
				 OPTAB_DIRECT);
      ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
      return true;
    }

  if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
				 operands[5], operands[1], operands[2]))
    return true;

  cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
			     operands[1], operands[2]);
  ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
  return true;
}
/* Expand a signed/unsigned integral vector conditional move.  */

bool
ix86_expand_int_vcond (rtx operands[])
{
  machine_mode data_mode = GET_MODE (operands[0]);
  machine_mode mode = GET_MODE (operands[4]);
  enum rtx_code code = GET_CODE (operands[3]);
  bool negate = false;
  rtx x, cop0, cop1;

  cop0 = operands[4];
  cop1 = operands[5];

  /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
     and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
  if ((code == LT || code == GE)
      && data_mode == mode
      && cop1 == CONST0_RTX (mode)
      && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
      && GET_MODE_UNIT_SIZE (data_mode) > 1
      && GET_MODE_UNIT_SIZE (data_mode) <= 8
      && (GET_MODE_SIZE (data_mode) == 16
	  || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
    {
      rtx negop = operands[2 - (code == LT)];
      int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
      if (negop == CONST1_RTX (data_mode))
	{
	  rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
					 operands[0], 1, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
	  return true;
	}
      else if (GET_MODE_INNER (data_mode) != DImode
	       && vector_all_ones_operand (negop, data_mode))
	{
	  rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
					 operands[0], 0, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
	  return true;
	}
    }

  if (!nonimmediate_operand (cop1, mode))
    cop1 = force_reg (mode, cop1);
  if (!general_operand (operands[1], data_mode))
    operands[1] = force_reg (data_mode, operands[1]);
  if (!general_operand (operands[2], data_mode))
    operands[2] = force_reg (data_mode, operands[2]);

  x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
			       operands[1], operands[2], &negate);

  if (!x)
    return false;

  ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
			 operands[2-negate]);
  return true;
}
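/* Illustrative example (added commentary, not original code): the
   shortcut at the top of ix86_expand_int_vcond turns the V4SImode
   selects

       x < 0 ? -1 : 0	-->  psrad $31, %xmm0	; arithmetic shift
       x < 0 ?  1 : 0	-->  psrld $31, %xmm0	; logical shift

   replicating or isolating the sign bit per element, and so avoids
   the compare-and-blend sequence entirely.  */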
bool
ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
			      struct expand_vec_perm_d *d)
{
  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  machine_mode mode = GET_MODE (d ? d->op0 : op0);
  machine_mode maskmode = mode;
  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;

  switch (mode)
    {
    case E_V8HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv8hi3;
      break;
    case E_V16HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv16hi3;
      break;
    case E_V64QImode:
      if (TARGET_AVX512VBMI)
	gen = gen_avx512bw_vpermt2varv64qi3;
      break;
    case E_V32HImode:
      if (TARGET_AVX512BW)
	gen = gen_avx512bw_vpermt2varv32hi3;
      break;
    case E_V4SImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4si3;
      break;
    case E_V8SImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv8si3;
      break;
    case E_V16SImode:
      if (TARGET_AVX512F)
	gen = gen_avx512f_vpermt2varv16si3;
      break;
    case E_V4SFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv4sf3;
	  maskmode = V4SImode;
	}
      break;
    case E_V8SFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv8sf3;
	  maskmode = V8SImode;
	}
      break;
    case E_V16SFmode:
      if (TARGET_AVX512F)
	{
	  gen = gen_avx512f_vpermt2varv16sf3;
	  maskmode = V16SImode;
	}
      break;
    case E_V2DImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv2di3;
      break;
    case E_V4DImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4di3;
      break;
    case E_V8DImode:
      if (TARGET_AVX512F)
	gen = gen_avx512f_vpermt2varv8di3;
      break;
    case E_V2DFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv2df3;
	  maskmode = V2DImode;
	}
      break;
    case E_V4DFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv4df3;
	  maskmode = V4DImode;
	}
      break;
    case E_V8DFmode:
      if (TARGET_AVX512F)
	{
	  gen = gen_avx512f_vpermt2varv8df3;
	  maskmode = V8DImode;
	}
      break;
    default:
      break;
    }

  if (gen == NULL)
    return false;

  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  if (d)
    {
      rtx vec[64];
      target = d->target;
      op0 = d->op0;
      op1 = d->op1;
      for (int i = 0; i < d->nelt; ++i)
	vec[i] = GEN_INT (d->perm[i]);
      mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
    }

  emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
  return true;
}
/* Expand a variable vector permutation.  */

void
ix86_expand_vec_perm (rtx operands[])
{
  rtx target = operands[0];
  rtx op0 = operands[1];
  rtx op1 = operands[2];
  rtx mask = operands[3];
  rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
  machine_mode mode = GET_MODE (op0);
  machine_mode maskmode = GET_MODE (mask);
  int w, e, i;
  bool one_operand_shuffle = rtx_equal_p (op0, op1);

  /* Number of elements in the vector.  */
  w = GET_MODE_NUNITS (mode);
  e = GET_MODE_UNIT_SIZE (mode);
  gcc_assert (w <= 64);

  if (TARGET_AVX512F && one_operand_shuffle)
    {
      rtx (*gen) (rtx, rtx, rtx) = NULL;
      switch (mode)
	{
	case E_V16SImode:
	  gen = gen_avx512f_permvarv16si;
	  break;
	case E_V16SFmode:
	  gen = gen_avx512f_permvarv16sf;
	  break;
	case E_V8DImode:
	  gen = gen_avx512f_permvarv8di;
	  break;
	case E_V8DFmode:
	  gen = gen_avx512f_permvarv8df;
	  break;
	default:
	  break;
	}
      if (gen != NULL)
	{
	  emit_insn (gen (target, op0, mask));
	  return;
	}
    }

  if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
    return;

  if (TARGET_AVX2)
    {
      if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
	{
	  /* Unfortunately, the VPERMQ and VPERMPD instructions only support
	     a constant shuffle operand.  With a tiny bit of effort we can
	     use VPERMD instead.  A re-interpretation stall for V4DFmode is
	     unfortunate but there's no avoiding it.
	     Similarly for V16HImode we don't have instructions for variable
	     shuffling, while for V32QImode we can use after preparing suitable
	     masks vpshufb; vpshufb; vpermq; vpor.  */

	  if (mode == V16HImode)
	    {
	      maskmode = mode = V32QImode;
	      w = 32;
	      e = 1;
	    }
	  else
	    {
	      maskmode = mode = V8SImode;
	      w = 8;
	      e = 4;
	    }
	  t1 = gen_reg_rtx (maskmode);

	  /* Replicate the low bits of the V4DImode mask into V8SImode:
	       mask = { A B C D }
	       t1 = { A A B B C C D D }.  */
	  for (i = 0; i < w / 2; ++i)
	    vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
	  vt = force_reg (maskmode, vt);
	  mask = gen_lowpart (maskmode, mask);
	  if (maskmode == V8SImode)
	    emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
	  else
	    emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));

	  /* Multiply the shuffle indices by two.  */
	  t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
				    OPTAB_DIRECT);

	  /* Add one to the odd shuffle indices:
	       t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
	  for (i = 0; i < w / 2; ++i)
	    {
	      vec[i * 2] = const0_rtx;
	      vec[i * 2 + 1] = const1_rtx;
	    }
	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
	  vt = validize_mem (force_const_mem (maskmode, vt));
	  t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
				    OPTAB_DIRECT);

	  /* Continue as if V8SImode (resp. V32QImode) was used initially.  */
	  operands[3] = mask = t1;
	  target = gen_reg_rtx (mode);
	  op0 = gen_lowpart (mode, op0);
	  op1 = gen_lowpart (mode, op1);
	}

      switch (mode)
	{
	case E_V8SImode:
	  /* The VPERMD and VPERMPS instructions already properly ignore
	     the high bits of the shuffle elements.  No need for us to
	     perform an AND ourselves.  */
	  if (one_operand_shuffle)
	    {
	      emit_insn (gen_avx2_permvarv8si (target, op0, mask));
	      if (target != operands[0])
		emit_move_insn (operands[0],
				gen_lowpart (GET_MODE (operands[0]), target));
	    }
	  else
	    {
	      t1 = gen_reg_rtx (V8SImode);
	      t2 = gen_reg_rtx (V8SImode);
	      emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
	      emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
	      goto merge_two;
	    }
	  return;

	case E_V8SFmode:
	  mask = gen_lowpart (V8SImode, mask);
	  if (one_operand_shuffle)
	    emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
	  else
	    {
	      t1 = gen_reg_rtx (V8SFmode);
	      t2 = gen_reg_rtx (V8SFmode);
	      emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
	      emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
	      goto merge_two;
	    }
	  return;

	case E_V4SImode:
	  /* By combining the two 128-bit input vectors into one 256-bit
	     input vector, we can use VPERMD and VPERMPS for the full
	     two-operand shuffle.  */
	  t1 = gen_reg_rtx (V8SImode);
	  t2 = gen_reg_rtx (V8SImode);
	  emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
	  emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
	  emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
	  return;

	case E_V4SFmode:
	  t1 = gen_reg_rtx (V8SFmode);
	  t2 = gen_reg_rtx (V8SImode);
	  mask = gen_lowpart (V4SImode, mask);
	  emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
	  emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
	  emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
	  return;

	case E_V32QImode:
	  t1 = gen_reg_rtx (V32QImode);
	  t2 = gen_reg_rtx (V32QImode);
	  t3 = gen_reg_rtx (V32QImode);
	  vt2 = GEN_INT (-128);
	  vt = gen_const_vec_duplicate (V32QImode, vt2);
	  vt = force_reg (V32QImode, vt);
	  for (i = 0; i < 32; i++)
	    vec[i] = i < 16 ? vt2 : const0_rtx;
	  vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
	  vt2 = force_reg (V32QImode, vt2);
	  /* From mask create two adjusted masks, which contain the same
	     bits as mask in the low 7 bits of each vector element.
	     The first mask will have the most significant bit clear
	     if it requests element from the same 128-bit lane
	     and MSB set if it requests element from the other 128-bit lane.
	     The second mask will have the opposite values of the MSB,
	     and additionally will have its 128-bit lanes swapped.
	     E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
	     t1   { 07 92 9e 09 ... | 17 19 85 1f ... } and
	     t3   { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
	     stands for other 12 bytes.  */
	  /* The bit whether element is from the same lane or the other
	     lane is bit 4, so shift it up by 3 to the MSB position.  */
	  t5 = gen_reg_rtx (V4DImode);
	  emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
				    GEN_INT (3)));
	  /* Clear MSB bits from the mask just in case it had them set.  */
	  emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
	  /* After this t1 will have MSB set for elements from other lane.  */
	  emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
	  /* Clear bits other than MSB.  */
	  emit_insn (gen_andv32qi3 (t1, t1, vt));
	  /* Or in the lower bits from mask into t3.  */
	  emit_insn (gen_iorv32qi3 (t3, t1, t2));
	  /* And invert MSB bits in t1, so MSB is set for elements from the
	     same lane.  */
	  emit_insn (gen_xorv32qi3 (t1, t1, vt));
	  /* Swap 128-bit lanes in t3.  */
	  t6 = gen_reg_rtx (V4DImode);
	  emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
					  const2_rtx, GEN_INT (3),
					  const0_rtx, const1_rtx));
	  /* And or in the lower bits from mask into t1.  */
	  emit_insn (gen_iorv32qi3 (t1, t1, t2));
	  if (one_operand_shuffle)
	    {
	      /* Each of these shuffles will put 0s in places where
		 element from the other 128-bit lane is needed, otherwise
		 will shuffle in the requested value.  */
	      emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
						gen_lowpart (V32QImode, t6)));
	      emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
	      /* For t3 the 128-bit lanes are swapped again.  */
	      t7 = gen_reg_rtx (V4DImode);
	      emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
					      const2_rtx, GEN_INT (3),
					      const0_rtx, const1_rtx));
	      /* And oring both together leads to the result.  */
	      emit_insn (gen_iorv32qi3 (target, t1,
					gen_lowpart (V32QImode, t7)));
	      if (target != operands[0])
		emit_move_insn (operands[0],
				gen_lowpart (GET_MODE (operands[0]), target));
	      return;
	    }

	  t4 = gen_reg_rtx (V32QImode);
	  /* Similarly to the above one_operand_shuffle code,
	     just for repeated twice for each operand.  merge_two:
	     code will merge the two results together.  */
	  emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
					    gen_lowpart (V32QImode, t6)));
	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
					    gen_lowpart (V32QImode, t6)));
	  emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
	  emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
	  t7 = gen_reg_rtx (V4DImode);
	  emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
					  const2_rtx, GEN_INT (3),
					  const0_rtx, const1_rtx));
	  t8 = gen_reg_rtx (V4DImode);
	  emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
					  const2_rtx, GEN_INT (3),
					  const0_rtx, const1_rtx));
	  emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
	  emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
	  t1 = t4;
	  t2 = t3;
	  goto merge_two;

	default:
	  gcc_assert (GET_MODE_SIZE (mode) <= 16);
	  break;
	}
    }

  if (TARGET_XOP)
    {
      /* The XOP VPPERM insn supports three inputs.  By ignoring the
	 one_operand_shuffle special case, we avoid creating another
	 set of constant vectors in memory.  */
      one_operand_shuffle = false;

      /* mask = mask & {2*w-1, ...} */
      vt = GEN_INT (2*w - 1);
    }
  else
    {
      /* mask = mask & {w-1, ...} */
      vt = GEN_INT (w - 1);
    }

  vt = gen_const_vec_duplicate (maskmode, vt);
  mask = expand_simple_binop (maskmode, AND, mask, vt,
			      NULL_RTX, 0, OPTAB_DIRECT);

  /* For non-QImode operations, convert the word permutation control
     into a byte permutation control.  */
  if (mode != V16QImode)
    {
      mask = expand_simple_binop (maskmode, ASHIFT, mask,
				  GEN_INT (exact_log2 (e)),
				  NULL_RTX, 0, OPTAB_DIRECT);

      /* Convert mask to vector of chars.  */
      mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));

      /* Replicate each of the input bytes into byte positions:
	 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
	 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
	 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}.  */
      for (i = 0; i < 16; ++i)
	vec[i] = GEN_INT (i/e * e);
      vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
      vt = validize_mem (force_const_mem (V16QImode, vt));
      if (TARGET_XOP)
	emit_insn (gen_xop_pperm (mask, mask, mask, vt));
      else
	emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));

      /* Convert it into the byte positions by doing
	 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...}  */
      for (i = 0; i < 16; ++i)
	vec[i] = GEN_INT (i % e);
      vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
      vt = validize_mem (force_const_mem (V16QImode, vt));
      emit_insn (gen_addv16qi3 (mask, mask, vt));
    }

  /* The actual shuffle operations all operate on V16QImode.  */
  op0 = gen_lowpart (V16QImode, op0);
  op1 = gen_lowpart (V16QImode, op1);

  if (TARGET_XOP)
    {
      if (GET_MODE (target) != V16QImode)
	target = gen_reg_rtx (V16QImode);
      emit_insn (gen_xop_pperm (target, op0, op1, mask));
      if (target != operands[0])
	emit_move_insn (operands[0],
			gen_lowpart (GET_MODE (operands[0]), target));
    }
  else if (one_operand_shuffle)
    {
      if (GET_MODE (target) != V16QImode)
	target = gen_reg_rtx (V16QImode);
      emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
      if (target != operands[0])
	emit_move_insn (operands[0],
			gen_lowpart (GET_MODE (operands[0]), target));
    }
  else
    {
      rtx xops[6];
      bool ok;

      /* Shuffle the two input vectors independently.  */
      t1 = gen_reg_rtx (V16QImode);
      t2 = gen_reg_rtx (V16QImode);
      emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
      emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));

 merge_two:
      /* Then merge them together.  The key is whether any given control
	 element contained a bit set that indicates the second word.  */
      mask = operands[3];
      vt = GEN_INT (w);
      if (maskmode == V2DImode && !TARGET_SSE4_1)
	{
	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
	     more shuffle to convert the V2DI input mask into a V4SI
	     input mask.  At which point the masking that expand_int_vcond
	     will work as desired.  */
	  rtx t3 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
					const0_rtx, const0_rtx,
					const2_rtx, const2_rtx));
	  mask = t3;
	  maskmode = V4SImode;
	  e = w = 4;
	}

      vt = gen_const_vec_duplicate (maskmode, vt);
      vt = force_reg (maskmode, vt);
      mask = expand_simple_binop (maskmode, AND, mask, vt,
				  NULL_RTX, 0, OPTAB_DIRECT);

      if (GET_MODE (target) != mode)
	target = gen_reg_rtx (mode);
      xops[0] = target;
      xops[1] = gen_lowpart (mode, t2);
      xops[2] = gen_lowpart (mode, t1);
      xops[3] = gen_rtx_EQ (maskmode, mask, vt);
      xops[4] = mask;
      xops[5] = vt;
      ok = ix86_expand_int_vcond (xops);
      gcc_assert (ok);
      if (target != operands[0])
	emit_move_insn (operands[0],
			gen_lowpart (GET_MODE (operands[0]), target));
    }
}
4917 true if we should do zero extension, else sign extension. HIGH_P is
4918 true if we want the N/2 high elements, else the low elements. */
4921 ix86_expand_sse_unpack (rtx dest
, rtx src
, bool unsigned_p
, bool high_p
)
4923 machine_mode imode
= GET_MODE (src
);
4928 rtx (*unpack
)(rtx
, rtx
);
4929 rtx (*extract
)(rtx
, rtx
) = NULL
;
4930 machine_mode halfmode
= BLKmode
;
4936 unpack
= gen_avx512bw_zero_extendv32qiv32hi2
;
4938 unpack
= gen_avx512bw_sign_extendv32qiv32hi2
;
4939 halfmode
= V32QImode
;
4941 = high_p
? gen_vec_extract_hi_v64qi
: gen_vec_extract_lo_v64qi
;
4945 unpack
= gen_avx2_zero_extendv16qiv16hi2
;
4947 unpack
= gen_avx2_sign_extendv16qiv16hi2
;
4948 halfmode
= V16QImode
;
4950 = high_p
? gen_vec_extract_hi_v32qi
: gen_vec_extract_lo_v32qi
;
4954 unpack
= gen_avx512f_zero_extendv16hiv16si2
;
4956 unpack
= gen_avx512f_sign_extendv16hiv16si2
;
4957 halfmode
= V16HImode
;
4959 = high_p
? gen_vec_extract_hi_v32hi
: gen_vec_extract_lo_v32hi
;
4963 unpack
= gen_avx2_zero_extendv8hiv8si2
;
4965 unpack
= gen_avx2_sign_extendv8hiv8si2
;
4966 halfmode
= V8HImode
;
4968 = high_p
? gen_vec_extract_hi_v16hi
: gen_vec_extract_lo_v16hi
;
4972 unpack
= gen_avx512f_zero_extendv8siv8di2
;
4974 unpack
= gen_avx512f_sign_extendv8siv8di2
;
4975 halfmode
= V8SImode
;
4977 = high_p
? gen_vec_extract_hi_v16si
: gen_vec_extract_lo_v16si
;
4981 unpack
= gen_avx2_zero_extendv4siv4di2
;
4983 unpack
= gen_avx2_sign_extendv4siv4di2
;
4984 halfmode
= V4SImode
;
4986 = high_p
? gen_vec_extract_hi_v8si
: gen_vec_extract_lo_v8si
;
4990 unpack
= gen_sse4_1_zero_extendv8qiv8hi2
;
4992 unpack
= gen_sse4_1_sign_extendv8qiv8hi2
;
4996 unpack
= gen_sse4_1_zero_extendv4hiv4si2
;
4998 unpack
= gen_sse4_1_sign_extendv4hiv4si2
;
5002 unpack
= gen_sse4_1_zero_extendv2siv2di2
;
5004 unpack
= gen_sse4_1_sign_extendv2siv2di2
;
5010 if (GET_MODE_SIZE (imode
) >= 32)
5012 tmp
= gen_reg_rtx (halfmode
);
5013 emit_insn (extract (tmp
, src
));
5017 /* Shift higher 8 bytes to lower 8 bytes. */
5018 tmp
= gen_reg_rtx (V1TImode
);
5019 emit_insn (gen_sse2_lshrv1ti3 (tmp
, gen_lowpart (V1TImode
, src
),
5021 tmp
= gen_lowpart (imode
, tmp
);
5026 emit_insn (unpack (dest
, tmp
));
5030 rtx (*unpack
)(rtx
, rtx
, rtx
);
5036 unpack
= gen_vec_interleave_highv16qi
;
5038 unpack
= gen_vec_interleave_lowv16qi
;
5042 unpack
= gen_vec_interleave_highv8hi
;
5044 unpack
= gen_vec_interleave_lowv8hi
;
5048 unpack
= gen_vec_interleave_highv4si
;
5050 unpack
= gen_vec_interleave_lowv4si
;
5057 tmp
= force_reg (imode
, CONST0_RTX (imode
));
5059 tmp
= ix86_expand_sse_cmp (gen_reg_rtx (imode
), GT
, CONST0_RTX (imode
),
5060 src
, pc_rtx
, pc_rtx
);
5062 rtx tmp2
= gen_reg_rtx (imode
);
5063 emit_insn (unpack (tmp2
, src
, tmp
));
5064 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), tmp2
));
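
/* Illustrative sketch only, not GCC code: the elementwise semantics the
   expanders selected above implement, shown for a V8HI -> V4SI widening in
   plain host C.  The function name and the array-based "vectors" are
   hypothetical; in the compiler the work is done by the gen_* patterns
   chosen in the switch.  */

static void
sse_unpack_reference_model (const short src[8], int dest[4],
                            bool unsigned_p, bool high_p)
{
  int base = high_p ? 4 : 0;   /* HIGH_P selects the upper N/2 elements.  */

  for (int i = 0; i < 4; i++)
    dest[i] = unsigned_p
              ? (int) (unsigned short) src[base + i]  /* zero extension */
              : (int) src[base + i];                  /* sign extension */
}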
/* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
   but works for floating point parameters and non-offsettable memories.
   For pushes, it returns just stack offsets; the values will be saved
   in the right order.  Maximally three parts are generated.  */

static int
ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
{
  int size;

  if (!TARGET_64BIT)
    size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
  else
    size = (GET_MODE_SIZE (mode) + 4) / 8;

  gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
  gcc_assert (size >= 2 && size <= 4);

  /* Optimize constant pool reference to immediates.  This is used by fp
     moves, that force all constants to memory to allow combining.  */
  if (MEM_P (operand) && MEM_READONLY_P (operand))
    operand = avoid_constant_pool_reference (operand);

  if (MEM_P (operand) && !offsettable_memref_p (operand))
    {
      /* The only non-offsettable memories we handle are pushes.  */
      int ok = push_operand (operand, VOIDmode);

      gcc_assert (ok);

      operand = copy_rtx (operand);
      PUT_MODE (operand, word_mode);
      parts[0] = parts[1] = parts[2] = parts[3] = operand;
      return size;
    }

  if (GET_CODE (operand) == CONST_VECTOR)
    {
      scalar_int_mode imode = int_mode_for_mode (mode).require ();
      /* Caution: if we looked through a constant pool memory above,
         the operand may actually have a different mode now.  That's
         ok, since we want to pun this all the way back to an integer.  */
      operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
      gcc_assert (operand != NULL);
      mode = imode;
    }

  if (!TARGET_64BIT)
    {
      if (mode == DImode)
        split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      else
        {
          int i;

          if (REG_P (operand))
            {
              gcc_assert (reload_completed);
              for (i = 0; i < size; i++)
                parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
            }
          else if (offsettable_memref_p (operand))
            {
              operand = adjust_address (operand, SImode, 0);
              parts[0] = operand;
              for (i = 1; i < size; i++)
                parts[i] = adjust_address (operand, SImode, 4 * i);
            }
          else if (CONST_DOUBLE_P (operand))
            {
              const REAL_VALUE_TYPE *r;
              long l[4];

              r = CONST_DOUBLE_REAL_VALUE (operand);
              switch (mode)
                {
                case E_TFmode:
                  real_to_target (l, r, mode);
                  parts[3] = gen_int_mode (l[3], SImode);
                  parts[2] = gen_int_mode (l[2], SImode);
                  break;
                case E_XFmode:
                  /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
                     long double may not be 80-bit.  */
                  real_to_target (l, r, mode);
                  parts[2] = gen_int_mode (l[2], SImode);
                  break;
                case E_DFmode:
                  REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
                  break;
                default:
                  gcc_unreachable ();
                }
              parts[1] = gen_int_mode (l[1], SImode);
              parts[0] = gen_int_mode (l[0], SImode);
            }
          else
            gcc_unreachable ();
        }
    }
  else
    {
      if (mode == TImode)
        split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      if (mode == XFmode || mode == TFmode)
        {
          machine_mode upper_mode = mode == XFmode ? SImode : DImode;
          if (REG_P (operand))
            {
              gcc_assert (reload_completed);
              parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
              parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
            }
          else if (offsettable_memref_p (operand))
            {
              operand = adjust_address (operand, DImode, 0);
              parts[0] = operand;
              parts[1] = adjust_address (operand, upper_mode, 8);
            }
          else if (CONST_DOUBLE_P (operand))
            {
              long l[4];

              real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);

              /* real_to_target puts 32-bit pieces in each long.  */
              parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
                                       | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
                                          << 32), DImode);

              if (upper_mode == SImode)
                parts[1] = gen_int_mode (l[2], SImode);
              else
                parts[1]
                  = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
                                  | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
                                     << 32), DImode);
            }
          else
            gcc_unreachable ();
        }
    }

  return size;
}
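
/* Illustrative sketch only, not GCC code: the invariant the decomposition
   above maintains, written for a 64-bit value split into 32-bit parts on a
   32-bit target.  Names are hypothetical.  */

static void
split_di_reference_model (unsigned long long operand, unsigned int parts[2])
{
  parts[0] = (unsigned int) operand;          /* low half first */
  parts[1] = (unsigned int) (operand >> 32);  /* then the high half */
}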
/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
   Return false when normal moves are needed; true when all required
   insns have been emitted.  Operands 2-4 contain the input values
   in the correct order; operands 5-7 contain the output values.  */

void
ix86_split_long_move (rtx operands[])
{
  rtx part[2][4];
  int nparts, i, j;
  int push = 0;
  int collisions = 0;
  machine_mode mode = GET_MODE (operands[0]);
  bool collisionparts[4];

  /* The DFmode expanders may ask us to move double.
     For 64bit target this is single move.  By hiding the fact
     here we simplify i386.md splitters.  */
  if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
    {
      /* Optimize constant pool reference to immediates.  This is used by
         fp moves, that force all constants to memory to allow combining.  */
      if (MEM_P (operands[1])
          && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
          && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
        operands[1] = get_pool_constant (XEXP (operands[1], 0));
      if (push_operand (operands[0], VOIDmode))
        {
          operands[0] = copy_rtx (operands[0]);
          PUT_MODE (operands[0], word_mode);
        }
      else
        operands[0] = gen_lowpart (DImode, operands[0]);
      operands[1] = gen_lowpart (DImode, operands[1]);
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  /* The only non-offsettable memory we handle is push.  */
  if (push_operand (operands[0], VOIDmode))
    push = 1;
  else
    gcc_assert (!MEM_P (operands[0])
                || offsettable_memref_p (operands[0]));

  nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
  ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));

  /* When emitting push, take care for source operands on the stack.  */
  if (push && MEM_P (operands[1])
      && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
    {
      rtx src_base = XEXP (part[1][nparts - 1], 0);

      /* Compensate for the stack decrement by 4.  */
      if (!TARGET_64BIT && nparts == 3
          && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
        src_base = plus_constant (Pmode, src_base, 4);

      /* src_base refers to the stack pointer and is
         automatically decreased by emitted push.  */
      for (i = 0; i < nparts; i++)
        part[1][i] = change_address (part[1][i],
                                     GET_MODE (part[1][i]), src_base);
    }

  /* We need to do copy in the right order in case an address register
     of the source overlaps the destination.  */
  if (REG_P (part[0][0]) && MEM_P (part[1][0]))
    {
      rtx tmp;

      for (i = 0; i < nparts; i++)
        {
          collisionparts[i]
            = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
          if (collisionparts[i])
            collisions++;
        }

      /* Collision in the middle part can be handled by reordering.  */
      if (collisions == 1 && nparts == 3 && collisionparts[1])
        {
          std::swap (part[0][1], part[0][2]);
          std::swap (part[1][1], part[1][2]);
        }
      else if (collisions == 1
               && nparts == 4
               && (collisionparts[1] || collisionparts[2]))
        {
          if (collisionparts[1])
            {
              std::swap (part[0][1], part[0][2]);
              std::swap (part[1][1], part[1][2]);
            }
          else
            {
              std::swap (part[0][2], part[0][3]);
              std::swap (part[1][2], part[1][3]);
            }
        }

      /* If there are more collisions, we can't handle it by reordering.
         Do an lea to the last part and use only one colliding move.  */
      else if (collisions > 1)
        {
          rtx base, addr;

          collisions = 1;

          base = part[0][nparts - 1];

          /* Handle the case when the last part isn't valid for lea.
             Happens in 64-bit mode storing the 12-byte XFmode.  */
          if (GET_MODE (base) != Pmode)
            base = gen_rtx_REG (Pmode, REGNO (base));

          addr = XEXP (part[1][0], 0);
          if (TARGET_TLS_DIRECT_SEG_REFS)
            {
              struct ix86_address parts;
              int ok = ix86_decompose_address (addr, &parts);
              gcc_assert (ok);
              /* It is not valid to use %gs: or %fs: in lea.  */
              gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
            }
          emit_insn (gen_rtx_SET (base, addr));
          part[1][0] = replace_equiv_address (part[1][0], base);
          for (i = 1; i < nparts; i++)
            {
              tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
              part[1][i] = replace_equiv_address (part[1][i], tmp);
            }
        }
    }

  if (push)
    {
      if (!TARGET_64BIT)
        {
          if (nparts == 3)
            {
              if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
                emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
              emit_move_insn (part[0][2], part[1][2]);
            }
          else if (nparts == 4)
            {
              emit_move_insn (part[0][3], part[1][3]);
              emit_move_insn (part[0][2], part[1][2]);
            }
        }
      else
        {
          /* In 64bit mode we don't have 32bit push available.  In case this is
             register, it is OK - we will just use larger counterpart.  We also
             retype memory - these come from an attempt to avoid REX prefix on
             moving of second half of TFmode value.  */
          if (GET_MODE (part[1][1]) == SImode)
            {
              switch (GET_CODE (part[1][1]))
                {
                case MEM:
                  part[1][1] = adjust_address (part[1][1], DImode, 0);
                  break;

                case REG:
                  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
                  break;

                default:
                  gcc_unreachable ();
                }

              if (GET_MODE (part[1][0]) == SImode)
                part[1][0] = part[1][1];
            }
        }
      emit_move_insn (part[0][1], part[1][1]);
      emit_move_insn (part[0][0], part[1][0]);
      return;
    }

  /* Choose correct order to not overwrite the source before it is copied.  */
  if ((REG_P (part[0][0])
       && REG_P (part[1][1])
       && (REGNO (part[0][0]) == REGNO (part[1][1])
           || (nparts == 3
               && REGNO (part[0][0]) == REGNO (part[1][2]))
           || (nparts == 4
               && REGNO (part[0][0]) == REGNO (part[1][3]))))
      || (collisions > 0
          && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
    {
      for (i = 0, j = nparts - 1; i < nparts; i++, j--)
        {
          operands[2 + i] = part[0][j];
          operands[6 + i] = part[1][j];
        }
    }
  else
    {
      for (i = 0; i < nparts; i++)
        {
          operands[2 + i] = part[0][i];
          operands[6 + i] = part[1][i];
        }
    }

  /* If optimizing for size, attempt to locally unCSE nonzero constants.  */
  if (optimize_insn_for_size_p ())
    {
      for (j = 0; j < nparts - 1; j++)
        if (CONST_INT_P (operands[6 + j])
            && operands[6 + j] != const0_rtx
            && REG_P (operands[2 + j]))
          for (i = j; i < nparts - 1; i++)
            if (CONST_INT_P (operands[7 + i])
                && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
              operands[7 + i] = operands[2 + j];
    }

  for (i = 0; i < nparts; i++)
    emit_move_insn (operands[2 + i], operands[6 + i]);
}
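
/* Illustrative sketch only, not GCC code: why the copy order chosen above
   matters.  If the first destination register also participates in the
   source (as an address register or a later source part), copying parts
   low-to-high would clobber the source before it is fully read; reversing
   the order avoids that.  Names are hypothetical.  */

static void
ordered_parts_copy (unsigned int *dst, const unsigned int *src, int nparts,
                    bool dst0_overlaps_src)
{
  if (dst0_overlaps_src)
    for (int i = nparts - 1; i >= 0; i--)   /* high-to-low */
      dst[i] = src[i];
  else
    for (int i = 0; i < nparts; i++)        /* low-to-high */
      dst[i] = src[i];
}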
/* Helper function of ix86_split_ashl used to generate an SImode/DImode
   left shift by a constant, either using a single shift or
   a sequence of add instructions.  */

static void
ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
{
  if (count == 1
      || (count * ix86_cost->add <= ix86_cost->shift_const
          && !optimize_insn_for_size_p ()))
    {
      while (count-- > 0)
        emit_insn (gen_add2_insn (operand, operand));
    }
  else
    {
      rtx (*insn)(rtx, rtx, rtx);

      insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
      emit_insn (insn (operand, operand, GEN_INT (count)));
    }
}
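
/* Illustrative sketch only, not GCC code: the add-chain equivalence used
   above.  Adding a value to itself doubles it, so COUNT additions equal a
   left shift by COUNT; the chain wins when COUNT * cost(add) does not
   exceed cost(shift by constant).  */

static unsigned int
ashl_by_adds (unsigned int x, int count)
{
  while (count-- > 0)
    x += x;   /* one doubling == shift left by one */
  return x;
}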
void
ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashl3)(rtx, rtx, rtx);
  rtx (*gen_shld)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;
  machine_mode half_mode;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
        {
          emit_move_insn (high[0], low[1]);
          emit_move_insn (low[0], const0_rtx);

          if (count > half_width)
            ix86_expand_ashl_const (high[0], count - half_width, mode);
        }
      else
        {
          gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);

          emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
          ix86_expand_ashl_const (low[0], count, mode);
        }
      return;
    }

  split_double_mode (mode, operands, 1, low, high);
  half_mode = mode == DImode ? SImode : DImode;

  gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;

  if (operands[1] == const1_rtx)
    {
      /* Assuming we've chosen QImode-capable registers, then 1 << N
         can be done with two 32/64-bit shifts, no branches, no cmoves.  */
      if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
        {
          rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);

          ix86_expand_clear (low[0]);
          ix86_expand_clear (high[0]);
          emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));

          d = gen_lowpart (QImode, low[0]);
          d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
          s = gen_rtx_EQ (QImode, flags, const0_rtx);
          emit_insn (gen_rtx_SET (d, s));

          d = gen_lowpart (QImode, high[0]);
          d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
          s = gen_rtx_NE (QImode, flags, const0_rtx);
          emit_insn (gen_rtx_SET (d, s));
        }

      /* Otherwise, we can get the same results by manually performing
         a bit extract operation on bit 5/6, and then performing the two
         shifts.  The two methods of getting 0/1 into low/high are exactly
         the same size.  Avoiding the shift in the bit extract case helps
         pentium4 a bit; no one else seems to care much either way.  */
      else
        {
          rtx (*gen_lshr3)(rtx, rtx, rtx);
          rtx (*gen_and3)(rtx, rtx, rtx);
          rtx (*gen_xor3)(rtx, rtx, rtx);
          HOST_WIDE_INT bits;
          rtx x;

          if (mode == DImode)
            {
              gen_lshr3 = gen_lshrsi3;
              gen_and3 = gen_andsi3;
              gen_xor3 = gen_xorsi3;
              bits = 5;
            }
          else
            {
              gen_lshr3 = gen_lshrdi3;
              gen_and3 = gen_anddi3;
              gen_xor3 = gen_xordi3;
              bits = 6;
            }

          if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
            x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
          else
            x = gen_lowpart (half_mode, operands[2]);
          emit_insn (gen_rtx_SET (high[0], x));

          emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
          emit_insn (gen_and3 (high[0], high[0], const1_rtx));
          emit_move_insn (low[0], high[0]);
          emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
        }

      emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
      emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
      return;
    }

  if (operands[1] == constm1_rtx)
    {
      /* For -1 << N, we can avoid the shld instruction, because we
         know that we're shifting 0...31/63 ones into a -1.  */
      emit_move_insn (low[0], constm1_rtx);
      if (optimize_insn_for_size_p ())
        emit_move_insn (high[0], low[0]);
      else
        emit_move_insn (high[0], constm1_rtx);
    }
  else
    {
      gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

      if (!rtx_equal_p (operands[0], operands[1]))
        emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      emit_insn (gen_shld (high[0], low[0], operands[2]));
    }

  emit_insn (gen_ashl3 (low[0], low[0], operands[2]));

  if (TARGET_CMOVE && scratch)
    {
      ix86_expand_clear (scratch);
      emit_insn (gen_x86_shift_adj_1
                 (half_mode, high[0], low[0], operands[2], scratch));
    }
  else
    emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
}
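
/* Illustrative sketch only, not GCC code: a reference model of the
   double-word left shift the splitter above emits, for a 64-bit value held
   as two 32-bit halves.  The COUNT >= 32 case mirrors the "move low into
   high, clear low" path; the small-count case mirrors the shld path.  */

static void
dw_ashl_model (unsigned int *lo, unsigned int *hi, int count)
{
  count &= 63;
  if (count >= 32)
    {
      *hi = *lo << (count - 32);
      *lo = 0;
    }
  else if (count > 0)
    {
      *hi = (*hi << count) | (*lo >> (32 - count));   /* shld */
      *lo <<= count;
    }
}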
void
ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count == GET_MODE_BITSIZE (mode) - 1)
        {
          emit_move_insn (high[0], high[1]);
          emit_insn (gen_ashr3 (high[0], high[0],
                                GEN_INT (half_width - 1)));
          emit_move_insn (low[0], high[0]);
        }
      else if (count >= half_width)
        {
          emit_move_insn (low[0], high[1]);
          emit_move_insn (high[0], low[0]);
          emit_insn (gen_ashr3 (high[0], high[0],
                                GEN_INT (half_width - 1)));

          if (count > half_width)
            emit_insn (gen_ashr3 (low[0], low[0],
                                  GEN_INT (count - half_width)));
        }
      else
        {
          gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);

          emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
          emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
        }
    }
  else
    {
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
        emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_ashr3 (high[0], high[0], operands[2]));

      if (TARGET_CMOVE && scratch)
        {
          emit_move_insn (scratch, high[0]);
          emit_insn (gen_ashr3 (scratch, scratch,
                                GEN_INT (half_width - 1)));
          emit_insn (gen_x86_shift_adj_1
                     (half_mode, low[0], high[0], operands[2], scratch));
        }
      else
        emit_insn (gen_x86_shift_adj_3
                   (half_mode, low[0], high[0], operands[2]));
    }
}
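
/* Illustrative sketch only, not GCC code: the matching reference model for
   the arithmetic right shift above.  The high half is refilled with copies
   of the sign bit; the sketch assumes the usual arithmetic behaviour of
   ">>" on a negative signed int.  */

static void
dw_ashr_model (unsigned int *lo, unsigned int *hi, int count)
{
  int shi = (int) *hi;
  count &= 63;
  if (count >= 32)
    {
      *lo = (unsigned int) (shi >> (count - 32));
      *hi = (unsigned int) (shi >> 31);   /* all sign bits */
    }
  else if (count > 0)
    {
      *lo = (*lo >> count) | ((unsigned int) shi << (32 - count)); /* shrd */
      *hi = (unsigned int) (shi >> count);
    }
}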
void
ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_lshr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
        {
          emit_move_insn (low[0], high[1]);
          ix86_expand_clear (high[0]);

          if (count > half_width)
            emit_insn (gen_lshr3 (low[0], low[0],
                                  GEN_INT (count - half_width)));
        }
      else
        {
          gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);

          emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
          emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
        }
    }
  else
    {
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
        emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_lshr3 (high[0], high[0], operands[2]));

      if (TARGET_CMOVE && scratch)
        {
          ix86_expand_clear (scratch);
          emit_insn (gen_x86_shift_adj_1
                     (half_mode, low[0], high[0], operands[2], scratch));
        }
      else
        emit_insn (gen_x86_shift_adj_2
                   (half_mode, low[0], high[0], operands[2]));
    }
}
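
/* Illustrative sketch only, not GCC code: the logical variant differs from
   the arithmetic one only in filling the vacated high half with zeros.  */

static void
dw_lshr_model (unsigned int *lo, unsigned int *hi, int count)
{
  count &= 63;
  if (count >= 32)
    {
      *lo = *hi >> (count - 32);
      *hi = 0;                                        /* zero fill */
    }
  else if (count > 0)
    {
      *lo = (*lo >> count) | (*hi << (32 - count));   /* shrd */
      *hi >>= count;
    }
}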
/* Return mode for the memcpy/memset loop counter.  Prefer SImode over
   DImode for constant loop counts.  */

static machine_mode
counter_mode (rtx count_exp)
{
  if (GET_MODE (count_exp) != VOIDmode)
    return GET_MODE (count_exp);
  if (!CONST_INT_P (count_exp))
    return Pmode;
  if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
    return DImode;
  return SImode;
}
/* When ISSETMEM is FALSE, output a simple loop to move memory pointed to by
   SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times, overall size
   COUNT specified in bytes.  When ISSETMEM is TRUE, output the equivalent
   loop to set memory by VALUE (supposed to be in MODE).

   The size is rounded down to a whole number of the chunk size moved at
   once.  SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info.  */

static void
expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
                               rtx destptr, rtx srcptr, rtx value,
                               rtx count, machine_mode mode, int unroll,
                               int expected_size, bool issetmem)
{
  rtx_code_label *out_label, *top_label;
  rtx iter, tmp;
  machine_mode iter_mode = counter_mode (count);
  int piece_size_n = GET_MODE_SIZE (mode) * unroll;
  rtx piece_size = GEN_INT (piece_size_n);
  rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
  rtx size;
  int i;

  top_label = gen_label_rtx ();
  out_label = gen_label_rtx ();
  iter = gen_reg_rtx (iter_mode);

  size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
                              NULL, 1, OPTAB_DIRECT);
  /* Those two should combine.  */
  if (piece_size == const1_rtx)
    {
      emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
                               true, out_label);
      predict_jump (REG_BR_PROB_BASE * 10 / 100);
    }
  emit_move_insn (iter, const0_rtx);

  emit_label (top_label);

  tmp = convert_modes (Pmode, iter_mode, iter, true);

  /* This assert could be relaxed - in this case we'll need to compute
     the smallest power of two contained in PIECE_SIZE_N and pass it to
     offset_address.  */
  gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
  destmem = offset_address (destmem, tmp, piece_size_n);
  destmem = adjust_address (destmem, mode, 0);

  if (!issetmem)
    {
      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
      srcmem = adjust_address (srcmem, mode, 0);

      /* When unrolling for chips that reorder memory reads and writes,
         we can save registers by using a single temporary.
         Also using 4 temporaries is overkill in 32bit mode.  */
      if (!TARGET_64BIT && 0)
        {
          for (i = 0; i < unroll; i++)
            {
              if (i)
                {
                  destmem = adjust_address (copy_rtx (destmem), mode,
                                            GET_MODE_SIZE (mode));
                  srcmem = adjust_address (copy_rtx (srcmem), mode,
                                           GET_MODE_SIZE (mode));
                }
              emit_move_insn (destmem, srcmem);
            }
        }
      else
        {
          rtx tmpreg[4];
          gcc_assert (unroll <= 4);
          for (i = 0; i < unroll; i++)
            {
              tmpreg[i] = gen_reg_rtx (mode);
              if (i)
                srcmem = adjust_address (copy_rtx (srcmem), mode,
                                         GET_MODE_SIZE (mode));
              emit_move_insn (tmpreg[i], srcmem);
            }
          for (i = 0; i < unroll; i++)
            {
              if (i)
                destmem = adjust_address (copy_rtx (destmem), mode,
                                          GET_MODE_SIZE (mode));
              emit_move_insn (destmem, tmpreg[i]);
            }
        }
    }
  else
    for (i = 0; i < unroll; i++)
      {
        if (i)
          destmem = adjust_address (copy_rtx (destmem), mode,
                                    GET_MODE_SIZE (mode));
        emit_move_insn (destmem, value);
      }

  tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
                             true, OPTAB_LIB_WIDEN);
  if (tmp != iter)
    emit_move_insn (iter, tmp);

  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
                           true, top_label);
  if (expected_size != -1)
    {
      expected_size /= GET_MODE_SIZE (mode) * unroll;
      if (expected_size == 0)
        predict_jump (0);
      else if (expected_size > REG_BR_PROB_BASE)
        predict_jump (REG_BR_PROB_BASE - 1);
      else
        predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
                      / expected_size);
    }
  else
    predict_jump (REG_BR_PROB_BASE * 80 / 100);
  iter = ix86_zero_extend_to_Pmode (iter);
  tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
                             true, OPTAB_LIB_WIDEN);
  if (tmp != destptr)
    emit_move_insn (destptr, tmp);
  if (!issetmem)
    {
      tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
                                 true, OPTAB_LIB_WIDEN);
      if (tmp != srcptr)
        emit_move_insn (srcptr, tmp);
    }
  emit_label (out_label);
}
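
/* Illustrative sketch only, not GCC code: the shape of the loop emitted
   above for the copy case, as host C.  PIECE (chunk size times unroll
   factor) is assumed to be a power of two; the trailing COUNT % PIECE bytes
   are deliberately left to the epilogue code.  */

static void
cpymem_loop_model (char *dest, const char *src,
                   unsigned long count, unsigned long piece)
{
  unsigned long size = count & ~(piece - 1);   /* round down to PIECE */
  for (unsigned long iter = 0; iter < size; iter += piece)
    for (unsigned long i = 0; i < piece; i++)
      dest[iter + i] = src[iter + i];
  /* DEST/SRC pointers are then advanced by SIZE, as the emitted code does.  */
}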
/* Divide COUNTREG by SCALE.  */
static rtx
scale_counter (rtx countreg, int scale)
{
  rtx sc;

  if (scale == 1)
    return countreg;
  if (CONST_INT_P (countreg))
    return GEN_INT (INTVAL (countreg) / scale);
  gcc_assert (REG_P (countreg));

  sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
                            GEN_INT (exact_log2 (scale)),
                            NULL, 1, OPTAB_DIRECT);
  return sc;
}
/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
   When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
   When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
   For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
   ORIG_VALUE is the original value passed to memset to fill the memory with.
   Other arguments have the same meaning as for the previous function.  */

static void
expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
                              rtx destptr, rtx srcptr, rtx value, rtx orig_value,
                              rtx count,
                              machine_mode mode, bool issetmem)
{
  rtx destexp;
  rtx srcexp;
  rtx countreg;
  HOST_WIDE_INT rounded_count;

  /* If possible, it is shorter to use rep movs.
     TODO: Maybe it is better to move this logic to decide_alg.  */
  if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
      && (!issetmem || orig_value == const0_rtx))
    mode = SImode;

  if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
    destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);

  countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
                                                       GET_MODE_SIZE (mode)));
  if (mode != QImode)
    {
      destexp = gen_rtx_ASHIFT (Pmode, countreg,
                                GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
      destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
    }
  else
    destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
  if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
    {
      rounded_count
        = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
      destmem = shallow_copy_rtx (destmem);
      set_mem_size (destmem, rounded_count);
    }
  else if (MEM_SIZE_KNOWN_P (destmem))
    clear_mem_size (destmem);

  if (issetmem)
    {
      value = force_reg (mode, gen_lowpart (mode, value));
      emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
    }
  else
    {
      if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
        srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
      if (mode != QImode)
        {
          srcexp = gen_rtx_ASHIFT (Pmode, countreg,
                                   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
          srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
        }
      else
        srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
      if (CONST_INT_P (count))
        {
          rounded_count
            = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
          srcmem = shallow_copy_rtx (srcmem);
          set_mem_size (srcmem, rounded_count);
        }
      else if (MEM_SIZE_KNOWN_P (srcmem))
        clear_mem_size (srcmem);
      emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
                              destexp, srcexp));
    }
}
/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
   DESTMEM.
   SRC is passed by pointer to be updated on return.
   Return value is updated DST.  */
static rtx
emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
             HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem, src = *srcmem, tempreg;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
     it until a move of such size is supported.  */
  piece_size = 1 << floor_log2 (size_to_move);
  while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
         || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
    {
      gcc_assert (piece_size > 1);
      piece_size >>= 1;
    }

  /* Find the corresponding vector mode with the same size as MOVE_MODE.
     MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
  if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
    {
      int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
      if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
          || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
        {
          move_mode = word_mode;
          piece_size = GET_MODE_SIZE (move_mode);
          code = optab_handler (mov_optab, move_mode);
        }
    }
  gcc_assert (code != CODE_FOR_nothing);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      /* We move from memory to memory, so we'll need to do it via
         a temporary register.  */
      tempreg = gen_reg_rtx (move_mode);
      emit_insn (GEN_FCN (code) (tempreg, src));
      emit_insn (GEN_FCN (code) (dst, tempreg));

      emit_move_insn (destptr,
                      plus_constant (Pmode, copy_rtx (destptr), piece_size));
      emit_move_insn (srcptr,
                      plus_constant (Pmode, copy_rtx (srcptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
                                          piece_size);
      src = adjust_automodify_address_nv (src, move_mode, srcptr,
                                          piece_size);
    }

  /* Update DST and SRC rtx.  */
  *srcmem = src;
  return dst;
}
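
/* Illustrative sketch only, not GCC code: the piece-size search above in
   host C.  Start from the largest power of two not exceeding the byte
   count and halve until the (hypothetical) predicate accepts the size.  */

static int
widest_supported_piece (int size_to_move, bool (*supported_p) (int))
{
  int piece = 1;
  while (piece * 2 <= size_to_move)   /* 1 << floor_log2 (size_to_move) */
    piece *= 2;
  while (!supported_p (piece))
    piece /= 2;                       /* the real loop asserts piece > 1 */
  return piece;
}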
/* Helper function for the string operations below.  Test VARIABLE whether
   it is aligned to VALUE bytes.  If true, jump to the label.  */

static rtx_code_label *
ix86_expand_aligntest (rtx variable, int value, bool epilogue)
{
  rtx_code_label *label = gen_label_rtx ();
  rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
  if (GET_MODE (variable) == DImode)
    emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
  else
    emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
  emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
                           1, label);
  if (epilogue)
    predict_jump (REG_BR_PROB_BASE * 50 / 100);
  else
    predict_jump (REG_BR_PROB_BASE * 90 / 100);
  return label;
}
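
/* Illustrative sketch only, not GCC code: the test emitted above is simply
   a bitwise AND against the alignment mask; the emitted jump skips the
   copy/set when the tested bits are clear.  */

static bool
aligntest_model (unsigned long variable, int value)
{
  return (variable & value) == 0;   /* true: jump over the move */
}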
/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
static void
expand_cpymem_epilogue (rtx destmem, rtx srcmem,
                        rtx destptr, rtx srcptr, rtx count, int max_size)
{
  rtx src, dest;
  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
         relaxed, but it'll require a bit more complicated epilogue
         expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
        {
          if (epilogue_size & i)
            destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
        }
      return;
    }
  if (max_size > 8)
    {
      count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
                                   count, 1, OPTAB_DIRECT);
      expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
                                     count, QImode, 1, 4, false);
      return;
    }

  /* When there are stringops, we can cheaply increase dest and src pointers.
     Otherwise we save code size by maintaining offset (zero is readily
     available from preceding rep operation) and using x86 addressing modes.
   */
  if (TARGET_SINGLE_STRINGOP)
    {
      if (max_size > 4)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
          src = change_address (srcmem, SImode, srcptr);
          dest = change_address (destmem, SImode, destptr);
          emit_insn (gen_strmov (destptr, dest, srcptr, src));
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 2)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
          src = change_address (srcmem, HImode, srcptr);
          dest = change_address (destmem, HImode, destptr);
          emit_insn (gen_strmov (destptr, dest, srcptr, src));
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 1)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
          src = change_address (srcmem, QImode, srcptr);
          dest = change_address (destmem, QImode, destptr);
          emit_insn (gen_strmov (destptr, dest, srcptr, src));
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
    }
  else
    {
      rtx offset = force_reg (Pmode, const0_rtx);
      rtx tmp;

      if (max_size > 4)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
          src = change_address (srcmem, SImode, srcptr);
          dest = change_address (destmem, SImode, destptr);
          emit_move_insn (dest, src);
          tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
                                     true, OPTAB_LIB_WIDEN);
          if (tmp != offset)
            emit_move_insn (offset, tmp);
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 2)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
          tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
          src = change_address (srcmem, HImode, tmp);
          tmp = gen_rtx_PLUS (Pmode, destptr, offset);
          dest = change_address (destmem, HImode, tmp);
          emit_move_insn (dest, src);
          tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
                                     true, OPTAB_LIB_WIDEN);
          if (tmp != offset)
            emit_move_insn (offset, tmp);
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 1)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
          tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
          src = change_address (srcmem, QImode, tmp);
          tmp = gen_rtx_PLUS (Pmode, destptr, offset);
          dest = change_address (destmem, QImode, tmp);
          emit_move_insn (dest, src);
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
    }
}
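
/* Illustrative sketch only, not GCC code: with a constant COUNT the
   epilogue above walks the set bits of COUNT % MAX_SIZE from high to low,
   so e.g. a 7-byte tail becomes one 4-byte, one 2-byte and one 1-byte
   move.  */

static void
epilogue_model (char *dest, const char *src, int tail, int max_size)
{
  for (int i = max_size; i >= 1; i >>= 1)
    if (tail & i)
      for (int j = 0; j < i; j++)   /* one move of I bytes */
        *dest++ = *src++;
}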
/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
   with value PROMOTED_VAL.
   Return value is updated DST.  */
static rtx
emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
             HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
     it until a move of such size is supported.  */
  move_mode = GET_MODE (promoted_val);
  if (move_mode == VOIDmode)
    move_mode = QImode;
  if (size_to_move < GET_MODE_SIZE (move_mode))
    {
      unsigned int move_bits = size_to_move * BITS_PER_UNIT;
      move_mode = int_mode_for_size (move_bits, 0).require ();
      promoted_val = gen_lowpart (move_mode, promoted_val);
    }
  piece_size = GET_MODE_SIZE (move_mode);
  code = optab_handler (mov_optab, move_mode);
  gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      if (piece_size <= GET_MODE_SIZE (word_mode))
        {
          emit_insn (gen_strset (destptr, dst, promoted_val));
          dst = adjust_automodify_address_nv (dst, move_mode, destptr,
                                              piece_size);
          continue;
        }

      emit_insn (GEN_FCN (code) (dst, promoted_val));

      emit_move_insn (destptr,
                      plus_constant (Pmode, copy_rtx (destptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
                                          piece_size);
    }

  /* Update DST rtx.  */
  return dst;
}
/* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
static void
expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
                                 rtx count, int max_size)
{
  count = expand_simple_binop (counter_mode (count), AND, count,
                               GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
  expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
                                 gen_lowpart (QImode, value), count, QImode,
                                 1, max_size / 2, true);
}
/* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
static void
expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
                        rtx count, int max_size)
{
  rtx dest;

  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
         relaxed, but it'll require a bit more complicated epilogue
         expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
        {
          if (epilogue_size & i)
            {
              if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
                destmem = emit_memset (destmem, destptr, vec_value, i);
              else
                destmem = emit_memset (destmem, destptr, value, i);
            }
        }
      return;
    }
  if (max_size > 32)
    {
      expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
      return;
    }
  if (max_size > 16)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
      if (TARGET_64BIT)
        {
          dest = change_address (destmem, DImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
          emit_insn (gen_strset (destptr, dest, value));
        }
      else
        {
          dest = change_address (destmem, SImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
          emit_insn (gen_strset (destptr, dest, value));
        }
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 8)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
      if (TARGET_64BIT)
        {
          dest = change_address (destmem, DImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
        }
      else
        {
          dest = change_address (destmem, SImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
          emit_insn (gen_strset (destptr, dest, value));
        }
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 4)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
      dest = change_address (destmem, SImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 2)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
      dest = change_address (destmem, HImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 1)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
      dest = change_address (destmem, QImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
}
/* Adjust COUNTER by the VALUE.  */
static void
ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
{
  emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
}
/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
   DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
   Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
   ignored.
   Return value is updated DESTMEM.  */

static rtx
expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
                               rtx destptr, rtx srcptr, rtx value,
                               rtx vec_value, rtx count, int align,
                               int desired_alignment, bool issetmem)
{
  int i;
  for (i = 1; i < desired_alignment; i <<= 1)
    {
      if (align <= i)
        {
          rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
          if (issetmem)
            {
              if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
                destmem = emit_memset (destmem, destptr, vec_value, i);
              else
                destmem = emit_memset (destmem, destptr, value, i);
            }
          else
            destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
          ix86_adjust_counter (count, i);
          emit_label (label);
          LABEL_NUSES (label) = 1;
          set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
        }
    }
  return destmem;
}
/* Test if COUNT&SIZE is nonzero and if so, expand a cpymem
   or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
   and jump to DONE_LABEL.  */
static void
expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
                               rtx destptr, rtx srcptr,
                               rtx value, rtx vec_value,
                               rtx count, int size,
                               rtx done_label, bool issetmem)
{
  rtx_code_label *label = ix86_expand_aligntest (count, size, false);
  machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
  rtx modesize;
  int n;

  /* If we do not have a vector value to copy, we must reduce size.  */
  if (issetmem)
    {
      if (!vec_value)
        {
          if (GET_MODE (value) == VOIDmode && size > 8)
            mode = Pmode;
          else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
            mode = GET_MODE (value);
        }
      else
        mode = GET_MODE (vec_value), value = vec_value;
    }
  else
    {
      /* Choose appropriate vector mode.  */
      if (size >= 32)
        mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
      else if (size >= 16)
        mode = TARGET_SSE ? V16QImode : DImode;
      srcmem = change_address (srcmem, mode, srcptr);
    }
  destmem = change_address (destmem, mode, destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  gcc_assert (GET_MODE_SIZE (mode) <= size);
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
        emit_move_insn (destmem, gen_lowpart (mode, value));
      else
        {
          emit_move_insn (destmem, srcmem);
          srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
        }
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }

  destmem = offset_address (destmem, count, 1);
  destmem = offset_address (destmem, GEN_INT (-2 * size),
                            GET_MODE_SIZE (mode));
  if (!issetmem)
    {
      srcmem = offset_address (srcmem, count, 1);
      srcmem = offset_address (srcmem, GEN_INT (-2 * size),
                               GET_MODE_SIZE (mode));
    }
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
        emit_move_insn (destmem, gen_lowpart (mode, value));
      else
        {
          emit_move_insn (destmem, srcmem);
          srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
        }
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }
  emit_jump_insn (gen_jump (done_label));
  emit_barrier ();

  emit_label (label);
  LABEL_NUSES (label) = 1;
}
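
/* Illustrative sketch only, not GCC code: the trick above covers any length
   N with SIZE <= N <= 2*SIZE - 1 using two fixed-size, possibly overlapping
   block moves, avoiding a variable-length loop.  Distinct buffers are
   assumed.  */

static void
small_copy_model (char *dest, const char *src, unsigned long n,
                  unsigned long size)
{
  for (unsigned long i = 0; i < size; i++)
    dest[i] = src[i];                           /* first SIZE bytes */
  for (unsigned long i = 0; i < size; i++)
    dest[n - size + i] = src[n - size + i];     /* last SIZE bytes */
}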
/* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
   and get ready for the main memcpy loop by copying the initial
   DESIRED_ALIGN-ALIGN bytes and the last SIZE bytes, adjusting
   DESTPTR/SRCPTR/COUNT in a way that lets us proceed with a loop copying
   SIZE bytes at once.  Do moves in MODE.
   DONE_LABEL is a label after the whole copying sequence.  The label is
   created on demand if *DONE_LABEL is NULL.
   MIN_SIZE is minimal size of block copied.  This value gets adjusted for
   new bounds after the initial copies.

   DESTMEM/SRCMEM are memory expressions pointing to the copied block,
   DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
   we will dispatch to a library call for large blocks.

   In pseudocode we do:

   if (COUNT < SIZE)
     {
       Assume that SIZE is 4.  Bigger sizes are handled analogously.
       if (COUNT & 4)
	 {
	   copy 4 bytes from SRCPTR to DESTPTR
	   copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
	   goto done_label
	 }
       if (!COUNT)
	 goto done_label;
       copy 1 byte from SRCPTR to DESTPTR
       if (COUNT & 2)
	 {
	   copy 2 bytes from SRCPTR to DESTPTR
	   copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
	 }
     }
   else
     {
       copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
       copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE

       OLD_DESTPTR = DESTPTR;
       Align DESTPTR up to DESIRED_ALIGN
       SRCPTR += DESTPTR - OLD_DESTPTR
       COUNT -= DESTPTR - OLD_DESTPTR
       if (DYNAMIC_CHECK)
	 Round COUNT down to multiple of SIZE
       << optional caller supplied zero size guard is here >>
       << optional caller supplied dynamic check is here >>
       << caller supplied main copy loop is here >>
     }
   done_label:
  */
static void
expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
							    rtx *destptr, rtx *srcptr,
							    machine_mode mode,
							    rtx value, rtx vec_value,
							    rtx *count,
							    rtx_code_label **done_label,
							    int size, int desired_align,
							    int align,
							    unsigned HOST_WIDE_INT *min_size,
							    bool dynamic_check,
							    bool issetmem)
{
  rtx_code_label *loop_label = NULL, *label;
  int n;
  rtx modesize;
  int prolog_size = 0;
  rtx mode_value;

  /* Chose proper value to copy.  */
  if (issetmem && VECTOR_MODE_P (mode))
    mode_value = vec_value;
  else
    mode_value = value;
  gcc_assert (GET_MODE_SIZE (mode) <= size);

  /* See if block is big or small, handle small blocks.  */
  if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
    {
      int size2 = size;
      loop_label = gen_label_rtx ();

      if (!*done_label)
	*done_label = gen_label_rtx ();

      emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
			       1, loop_label);
      size2 >>= 1;

      /* Handle sizes > 3.  */
      for (;size2 > 2; size2 >>= 1)
	expand_small_cpymem_or_setmem (destmem, srcmem,
				       *destptr, *srcptr,
				       value, vec_value,
				       *count,
				       size2, *done_label, issetmem);
      /* Nothing to copy?  Jump to DONE_LABEL if so.  */
      emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
			       1, *done_label);

      /* Do a byte copy.  */
      destmem = change_address (destmem, QImode, *destptr);
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (QImode, value));
      else
	{
	  srcmem = change_address (srcmem, QImode, *srcptr);
	  emit_move_insn (destmem, srcmem);
	}

      /* Handle sizes 2 and 3.  */
      label = ix86_expand_aligntest (*count, 2, false);
      destmem = change_address (destmem, HImode, *destptr);
      destmem = offset_address (destmem, *count, 1);
      destmem = offset_address (destmem, GEN_INT (-2), 2);
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (HImode, value));
      else
	{
	  srcmem = change_address (srcmem, HImode, *srcptr);
	  srcmem = offset_address (srcmem, *count, 1);
	  srcmem = offset_address (srcmem, GEN_INT (-2), 2);
	  emit_move_insn (destmem, srcmem);
	}

      emit_label (label);
      LABEL_NUSES (label) = 1;
      emit_jump_insn (gen_jump (*done_label));
      emit_barrier ();
    }
  else
    gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
		|| UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);

  /* Start memcpy for COUNT >= SIZE.  */
  if (loop_label)
    {
      emit_label (loop_label);
      LABEL_NUSES (loop_label) = 1;
    }

  /* Copy first desired_align bytes.  */
  if (!issetmem)
    srcmem = change_address (srcmem, mode, *srcptr);
  destmem = change_address (destmem, mode, *destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  for (n = 0; prolog_size < desired_align - align; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, mode_value);
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
      prolog_size += GET_MODE_SIZE (mode);
    }

  /* Copy last SIZE bytes.  */
  destmem = offset_address (destmem, *count, 1);
  destmem = offset_address (destmem,
			    GEN_INT (-size - prolog_size),
			    1);
  if (issetmem)
    emit_move_insn (destmem, mode_value);
  else
    {
      srcmem = offset_address (srcmem, *count, 1);
      srcmem = offset_address (srcmem,
			       GEN_INT (-size - prolog_size),
			       1);
      emit_move_insn (destmem, srcmem);
    }
  for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
    {
      destmem = offset_address (destmem, modesize, 1);
      if (issetmem)
	emit_move_insn (destmem, mode_value);
      else
	{
	  srcmem = offset_address (srcmem, modesize, 1);
	  emit_move_insn (destmem, srcmem);
	}
    }

  /* Align destination.  */
  if (desired_align > 1 && desired_align > align)
    {
      rtx saveddest = *destptr;

      gcc_assert (desired_align <= size);
      /* Align destptr up, place it to new register.  */
      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
				      GEN_INT (prolog_size),
				      NULL_RTX, 1, OPTAB_DIRECT);
      if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
	REG_POINTER (*destptr) = 1;
      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
				      GEN_INT (-desired_align),
				      *destptr, 1, OPTAB_DIRECT);
      /* See how many bytes we skipped.  */
      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
				       *destptr,
				       saveddest, 1, OPTAB_DIRECT);
      /* Adjust srcptr and count.  */
      if (!issetmem)
	*srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
				       saveddest, *srcptr, 1, OPTAB_DIRECT);
      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
				    saveddest, *count, 1, OPTAB_DIRECT);
      /* We copied at most size + prolog_size.  */
      if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
	*min_size
	  = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
      else
	*min_size = 0;

      /* Our loops always round down the block size, but for dispatch to
	 library we need precise value.  */
      if (dynamic_check)
	*count = expand_simple_binop (GET_MODE (*count), AND, *count,
				      GEN_INT (-size), *count, 1, OPTAB_DIRECT);
    }
  else
    {
      gcc_assert (prolog_size == 0);
      /* Decrease count, so we won't end up copying last word twice.  */
      if (!CONST_INT_P (*count))
	*count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
				      constm1_rtx, *count, 1, OPTAB_DIRECT);
      else
	*count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
				      (unsigned HOST_WIDE_INT)size));
      if (*min_size)
	*min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
    }
}
/* This function is like the previous one, except here we know how many bytes
   need to be copied.  That allows us to update alignment not only of DST, which
   is returned, but also of SRC, which is passed as a pointer for that
   reason.  */
static rtx
expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
                                        rtx srcreg, rtx value, rtx vec_value,
                                        int desired_align, int align_bytes,
                                        bool issetmem)
{
  rtx src = NULL;
  rtx orig_dst = dst;
  rtx orig_src = NULL;
  int piece_size = 1;
  int copied_bytes = 0;

  if (!issetmem)
    {
      gcc_assert (srcp != NULL);
      src = *srcp;
      orig_src = src;
    }

  for (piece_size = 1;
       piece_size <= desired_align && copied_bytes < align_bytes;
       piece_size <<= 1)
    {
      if (align_bytes & piece_size)
        {
          if (issetmem)
            {
              if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
                dst = emit_memset (dst, destreg, vec_value, piece_size);
              else
                dst = emit_memset (dst, destreg, value, piece_size);
            }
          else
            dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
          copied_bytes += piece_size;
        }
    }
  if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
    set_mem_align (dst, desired_align * BITS_PER_UNIT);
  if (MEM_SIZE_KNOWN_P (orig_dst))
    set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);

  if (!issetmem)
    {
      int src_align_bytes = get_mem_align_offset (src, desired_align
                                                       * BITS_PER_UNIT);
      if (src_align_bytes >= 0)
        src_align_bytes = desired_align - src_align_bytes;
      if (src_align_bytes >= 0)
        {
          unsigned int src_align;
          for (src_align = desired_align; src_align >= 2; src_align >>= 1)
            {
              if ((src_align_bytes & (src_align - 1))
                  == (align_bytes & (src_align - 1)))
                break;
            }
          if (src_align > (unsigned int) desired_align)
            src_align = desired_align;
          if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
            set_mem_align (src, src_align * BITS_PER_UNIT);
        }
      if (MEM_SIZE_KNOWN_P (orig_src))
        set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
      *srcp = src;
    }

  return dst;
}
/* Return true if ALG can be used in current context.
   Assume we expand memset if MEMSET is true.  */
static bool
alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
{
  if (alg == no_stringop)
    return false;
  if (alg == vector_loop)
    return TARGET_SSE || TARGET_AVX;
  /* Algorithms using the rep prefix want at least edi and ecx;
     additionally, memset wants eax and memcpy wants esi.  Don't
     consider such algorithms if the user has appropriated those
     registers for their own purposes, or if we have a non-default
     address space, since some string insns cannot override the segment.  */
  if (alg == rep_prefix_1_byte
      || alg == rep_prefix_4_byte
      || alg == rep_prefix_8_byte)
    {
      if (have_as)
        return false;
      if (fixed_regs[CX_REG]
          || fixed_regs[DI_REG]
          || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
        return false;
    }
  return true;
}
/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
static enum stringop_alg
decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
            unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
            bool memset, bool zero_memset, bool have_as,
            int *dynamic_check, bool *noalign, bool recur)
{
  const struct stringop_algs *algs;
  bool optimize_for_speed;
  int max = 0;
  const struct processor_costs *cost;
  int i;
  bool any_alg_usable_p = false;

  *noalign = false;
  *dynamic_check = -1;

  /* Even if the string operation call is cold, we still might spend a lot
     of time processing large blocks.  */
  if (optimize_function_for_size_p (cfun)
      || (optimize_insn_for_size_p ()
          && (max_size < 256
              || (expected_size != -1 && expected_size < 256))))
    optimize_for_speed = false;
  else
    optimize_for_speed = true;

  cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
  if (memset)
    algs = &cost->memset[TARGET_64BIT != 0];
  else
    algs = &cost->memcpy[TARGET_64BIT != 0];

  /* See maximal size for user defined algorithm.  */
  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    {
      enum stringop_alg candidate = algs->size[i].alg;
      bool usable = alg_usable_p (candidate, memset, have_as);
      any_alg_usable_p |= usable;

      if (candidate != libcall && candidate && usable)
        max = algs->size[i].max;
    }

  /* If expected size is not known but max size is small enough
     so inline version is a win, set expected size into
     the range.  */
  if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
      && expected_size == -1)
    expected_size = min_size / 2 + max_size / 2;

  /* If user specified the algorithm, honor it if possible.  */
  if (ix86_stringop_alg != no_stringop
      && alg_usable_p (ix86_stringop_alg, memset, have_as))
    return ix86_stringop_alg;
  /* rep; movq or rep; movl is the smallest variant.  */
  else if (!optimize_for_speed)
    {
      *noalign = true;
      if (!count || (count & 3) || (memset && !zero_memset))
        return alg_usable_p (rep_prefix_1_byte, memset, have_as)
               ? rep_prefix_1_byte : loop_1_byte;
      else
        return alg_usable_p (rep_prefix_4_byte, memset, have_as)
               ? rep_prefix_4_byte : loop;
    }
  /* Very tiny blocks are best handled via the loop, REP is expensive to
     set up.  */
  else if (expected_size != -1 && expected_size < 4)
    return loop_1_byte;
  else if (expected_size != -1)
    {
      enum stringop_alg alg = libcall;
      bool alg_noalign = false;
      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
        {
          /* We get here if the algorithms that were not libcall-based
             were rep-prefix based and we are unable to use rep prefixes
             based on global register usage.  Break out of the loop and
             use the heuristic below.  */
          if (algs->size[i].max == 0)
            break;
          if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
            {
              enum stringop_alg candidate = algs->size[i].alg;

              if (candidate != libcall
                  && alg_usable_p (candidate, memset, have_as))
                {
                  alg = candidate;
                  alg_noalign = algs->size[i].noalign;
                }
              /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
                 last non-libcall inline algorithm.  */
              if (TARGET_INLINE_ALL_STRINGOPS)
                {
                  /* When the current size is best to be copied by a libcall,
                     but we are still forced to inline, run the heuristic below
                     that will pick code for medium sized blocks.  */
                  if (alg != libcall)
                    {
                      *noalign = alg_noalign;
                      return alg;
                    }
                  else if (!any_alg_usable_p)
                    break;
                }
              else if (alg_usable_p (candidate, memset, have_as))
                {
                  *noalign = algs->size[i].noalign;
                  return candidate;
                }
            }
        }
    }
  /* When asked to inline the call anyway, try to pick meaningful choice.
     We look for maximal size of block that is faster to copy by hand and
     take blocks of at most of that size guessing that average size will
     be roughly half of the block.

     If this turns out to be bad, we might simply specify the preferred
     choice in ix86_costs.  */
  if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
      && (algs->unknown_size == libcall
          || !alg_usable_p (algs->unknown_size, memset, have_as)))
    {
      enum stringop_alg alg;
      HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;

      /* If there aren't any usable algorithms or if recursing already,
         then recursing on smaller sizes or same size isn't going to
         find anything.  Just return the simple byte-at-a-time copy loop.  */
      if (!any_alg_usable_p || recur)
        {
          /* Pick something reasonable.  */
          if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
            *dynamic_check = 128;
          return loop_1_byte;
        }
      alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
                        zero_memset, have_as, dynamic_check, noalign, true);
      gcc_assert (*dynamic_check == -1);
      if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
        *dynamic_check = max;
      else
        gcc_assert (alg != libcall);
      return alg;
    }
  return (alg_usable_p (algs->unknown_size, memset, have_as)
          ? algs->unknown_size : libcall);
}
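
/* Illustrative sketch only, not GCC code: the core of the table walk above.
   Each cost-table entry gives a maximal block size and an algorithm; the
   first usable entry whose MAX covers the expected size (or is unbounded,
   MAX == -1) wins.  Types here are hypothetical stand-ins for the real
   stringop_algs data.  */

struct alg_entry { long max; int alg; bool usable; };

static int
pick_alg_model (const struct alg_entry *table, int n,
                long expected_size, int fallback_alg)
{
  for (int i = 0; i < n; i++)
    {
      if (table[i].max == 0)
        break;                          /* end of populated entries */
      if ((table[i].max >= expected_size || table[i].max == -1)
          && table[i].usable)
        return table[i].alg;
    }
  return fallback_alg;                  /* typically a library call */
}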
/* Decide on alignment.  We know that the operand is already aligned to ALIGN
   (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
static int
decide_alignment (int align,
                  enum stringop_alg alg,
                  int expected_size,
                  machine_mode move_mode)
{
  int desired_align = 0;

  gcc_assert (alg != no_stringop);

  if (alg == rep_prefix_1_byte)
    desired_align = 1;
  else
    {
      if (move_mode == VOIDmode)
        return align;

      desired_align = GET_MODE_SIZE (move_mode);
      /* PentiumPro has special logic triggering for 8 byte aligned blocks,
         copying whole cacheline at once.  */
      if (TARGET_PENTIUMPRO
          && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
        desired_align = 8;
    }

  if (optimize_size)
    desired_align = 1;
  if (desired_align < align)
    desired_align = align;
  if (expected_size != -1 && expected_size < 4)
    desired_align = align;

  return desired_align;
}
/* Helper function for memcpy.  For QImode value 0xXY produce
   0xXYXYXYXY of the width specified by MODE.  This is essentially
   a * 0x10101010, but we can do slightly better than
   synth_mult by unwinding the sequence by hand on CPUs with
   slow multiply.  */
static rtx
promote_duplicated_reg (machine_mode mode, rtx val)
{
  machine_mode valmode = GET_MODE (val);
  rtx tmp;
  int nops = mode == DImode ? 3 : 2;

  gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
  if (val == const0_rtx)
    return copy_to_mode_reg (mode, CONST0_RTX (mode));
  if (CONST_INT_P (val))
    {
      HOST_WIDE_INT v = INTVAL (val) & 255;

      v |= v << 8;
      v |= v << 16;
      if (mode == DImode)
        v |= (v << 16) << 16;
      return copy_to_mode_reg (mode, gen_int_mode (v, mode));
    }

  if (valmode == VOIDmode)
    valmode = QImode;
  if (valmode != QImode)
    val = gen_lowpart (QImode, val);
  if (mode == QImode)
    return val;

  if (!TARGET_PARTIAL_REG_STALL)
    nops--;
  if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
      + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
      <= (ix86_cost->shift_const + ix86_cost->add) * nops
          + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
    {
      rtx reg = convert_modes (mode, QImode, val, true);
      tmp = promote_duplicated_reg (mode, const1_rtx);
      return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
                                  OPTAB_DIRECT);
    }
  else
    {
      rtx reg = convert_modes (mode, QImode, val, true);

      if (!TARGET_PARTIAL_REG_STALL)
        emit_insn (gen_insv_1 (mode, reg, reg));
      else
        {
          tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
                                     NULL, 1, OPTAB_DIRECT);
          reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
                                     OPTAB_DIRECT);
        }
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
                                 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      if (mode == SImode)
        return reg;
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
                                 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      return reg;
    }
}
7095 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
7096 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
7097 alignment from ALIGN to DESIRED_ALIGN. */
7099 promote_duplicated_reg_to_size (rtx val
, int size_needed
, int desired_align
,
7105 && (size_needed
> 4 || (desired_align
> align
&& desired_align
> 4)))
7106 promoted_val
= promote_duplicated_reg (DImode
, val
);
7107 else if (size_needed
> 2 || (desired_align
> align
&& desired_align
> 2))
7108 promoted_val
= promote_duplicated_reg (SImode
, val
);
7109 else if (size_needed
> 1 || (desired_align
> align
&& desired_align
> 1))
7110 promoted_val
= promote_duplicated_reg (HImode
, val
);
7114 return promoted_val
;
7117 /* Copy the address to a Pmode register. This is used for x32 to
7118 truncate DImode TLS address to a SImode register. */
7121 ix86_copy_addr_to_reg (rtx addr
)
7124 if (GET_MODE (addr
) == Pmode
|| GET_MODE (addr
) == VOIDmode
)
7126 reg
= copy_addr_to_reg (addr
);
7127 REG_POINTER (reg
) = 1;
7132 gcc_assert (GET_MODE (addr
) == DImode
&& Pmode
== SImode
);
7133 reg
= copy_to_mode_reg (DImode
, addr
);
7134 REG_POINTER (reg
) = 1;
7135 return gen_rtx_SUBREG (SImode
, reg
, 0);
7139 /* Expand string move (memcpy) ot store (memset) operation. Use i386 string
7140 operations when profitable. The code depends upon architecture, block size
7141 and alignment, but always has one of the following overall structures:
7143 Aligned move sequence:
7145 1) Prologue guard: Conditional that jumps up to epilogues for small
7146 blocks that can be handled by epilogue alone. This is faster
7147 but also needed for correctness, since prologue assume the block
7148 is larger than the desired alignment.
7150 Optional dynamic check for size and libcall for large
7151 blocks is emitted here too, with -minline-stringops-dynamically.
7153 2) Prologue: copy first few bytes in order to get destination
7154 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
7155 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
7156 copied. We emit either a jump tree on power of two sized
7157 blocks, or a byte loop.
7159 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7160 with specified algorithm.
7162 4) Epilogue: code copying tail of the block that is too small to be
7163 handled by main body (or up to size guarded by prologue guard).
7165 Misaligned move sequence
7167 1) missaligned move prologue/epilogue containing:
7168 a) Prologue handling small memory blocks and jumping to done_label
7169 (skipped if blocks are known to be large enough)
7170 b) Signle move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
7171 needed by single possibly misaligned move
7172 (skipped if alignment is not needed)
7173 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
7175 2) Zero size guard dispatching to done_label, if needed
7177 3) dispatch to library call, if needed,
7179 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7180 with specified algorithm. */
7182 ix86_expand_set_or_cpymem (rtx dst
, rtx src
, rtx count_exp
, rtx val_exp
,
7183 rtx align_exp
, rtx expected_align_exp
,
7184 rtx expected_size_exp
, rtx min_size_exp
,
7185 rtx max_size_exp
, rtx probable_max_size_exp
,
7190 rtx_code_label
*label
= NULL
;
7192 rtx_code_label
*jump_around_label
= NULL
;
7193 HOST_WIDE_INT align
= 1;
7194 unsigned HOST_WIDE_INT count
= 0;
7195 HOST_WIDE_INT expected_size
= -1;
7196 int size_needed
= 0, epilogue_size_needed
;
7197 int desired_align
= 0, align_bytes
= 0;
7198 enum stringop_alg alg
;
7199 rtx promoted_val
= NULL
;
7200 rtx vec_promoted_val
= NULL
;
7201 bool force_loopy_epilogue
= false;
7203 bool need_zero_guard
= false;
7205 machine_mode move_mode
= VOIDmode
;
7206 machine_mode wider_mode
;
7207 int unroll_factor
= 1;
7208 /* TODO: Once value ranges are available, fill in proper data. */
7209 unsigned HOST_WIDE_INT min_size
= 0;
7210 unsigned HOST_WIDE_INT max_size
= -1;
7211 unsigned HOST_WIDE_INT probable_max_size
= -1;
7212 bool misaligned_prologue_used
= false;
7215 if (CONST_INT_P (align_exp
))
7216 align
= INTVAL (align_exp
);
7217 /* i386 can do misaligned access on reasonably increased cost. */
7218 if (CONST_INT_P (expected_align_exp
)
7219 && INTVAL (expected_align_exp
) > align
)
7220 align
= INTVAL (expected_align_exp
);
7221 /* ALIGN is the minimum of destination and source alignment, but we care here
7222 just about destination alignment. */
7224 && MEM_ALIGN (dst
) > (unsigned HOST_WIDE_INT
) align
* BITS_PER_UNIT
)
7225 align
= MEM_ALIGN (dst
) / BITS_PER_UNIT
;
7227 if (CONST_INT_P (count_exp
))
7229 min_size
= max_size
= probable_max_size
= count
= expected_size
7230 = INTVAL (count_exp
);
7231 /* When COUNT is 0, there is nothing to do. */
7238 min_size
= INTVAL (min_size_exp
);
7240 max_size
= INTVAL (max_size_exp
);
7241 if (probable_max_size_exp
)
7242 probable_max_size
= INTVAL (probable_max_size_exp
);
7243 if (CONST_INT_P (expected_size_exp
))
7244 expected_size
= INTVAL (expected_size_exp
);
7247 /* Make sure we don't need to care about overflow later on. */
7248 if (count
> (HOST_WIDE_INT_1U
<< 30))
7251 have_as
= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst
));
7253 have_as
|= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src
));
7255 /* Step 0: Decide on preferred algorithm, desired alignment and
7256 size of chunks to be copied by main loop. */
7257 alg
= decide_alg (count
, expected_size
, min_size
, probable_max_size
,
7259 issetmem
&& val_exp
== const0_rtx
, have_as
,
7260 &dynamic_check
, &noalign
, false);
7263 fprintf (dump_file
, "Selected stringop expansion strategy: %s\n",
7264 stringop_alg_names
[alg
]);
7268 gcc_assert (alg
!= no_stringop
);
7270 /* For now vector-version of memset is generated only for memory zeroing, as
7271 creating of promoted vector value is very cheap in this case. */
7272 if (issetmem
&& alg
== vector_loop
&& val_exp
!= const0_rtx
)
7273 alg
= unrolled_loop
;
7276 count_exp
= copy_to_mode_reg (GET_MODE (count_exp
), count_exp
);
7277 destreg
= ix86_copy_addr_to_reg (XEXP (dst
, 0));
7279 srcreg
= ix86_copy_addr_to_reg (XEXP (src
, 0));
7282 move_mode
= word_mode
;
7290 need_zero_guard
= true;
7294 need_zero_guard
= true;
7297 need_zero_guard
= true;
7298 unroll_factor
= (TARGET_64BIT
? 4 : 2);
7301 need_zero_guard
= true;
7303 /* Find the widest supported mode. */
7304 move_mode
= word_mode
;
7305 while (GET_MODE_WIDER_MODE (move_mode
).exists (&wider_mode
)
7306 && optab_handler (mov_optab
, wider_mode
) != CODE_FOR_nothing
)
7307 move_mode
= wider_mode
;
7309 if (TARGET_AVX256_SPLIT_REGS
&& GET_MODE_BITSIZE (move_mode
) > 128)
7312 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7313 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7314 if (GET_MODE_SIZE (move_mode
) > GET_MODE_SIZE (word_mode
))
7316 int nunits
= GET_MODE_SIZE (move_mode
) / GET_MODE_SIZE (word_mode
);
7317 if (!mode_for_vector (word_mode
, nunits
).exists (&move_mode
)
7318 || optab_handler (mov_optab
, move_mode
) == CODE_FOR_nothing
)
7319 move_mode
= word_mode
;
7321 gcc_assert (optab_handler (mov_optab
, move_mode
) != CODE_FOR_nothing
);
7323 case rep_prefix_8_byte
:
7326 case rep_prefix_4_byte
:
7329 case rep_prefix_1_byte
:
7333 size_needed
= GET_MODE_SIZE (move_mode
) * unroll_factor
;
7334 epilogue_size_needed
= size_needed
;
7336 /* If we are going to call any library calls conditionally, make sure any
7337 pending stack adjustment happen before the first conditional branch,
7338 otherwise they will be emitted before the library call only and won't
7339 happen from the other branches. */
7340 if (dynamic_check
!= -1)
7341 do_pending_stack_adjust ();
7343 desired_align
= decide_alignment (align
, alg
, expected_size
, move_mode
);
7344 if (!TARGET_ALIGN_STRINGOPS
|| noalign
)
7345 align
= desired_align
;
7347 /* Step 1: Prologue guard. */
7349 /* Alignment code needs count to be in register. */
7350 if (CONST_INT_P (count_exp
) && desired_align
> align
)
7352 if (INTVAL (count_exp
) > desired_align
7353 && INTVAL (count_exp
) > size_needed
)
7356 = get_mem_align_offset (dst
, desired_align
* BITS_PER_UNIT
);
7357 if (align_bytes
<= 0)
7360 align_bytes
= desired_align
- align_bytes
;
7362 if (align_bytes
== 0)
7363 count_exp
= force_reg (counter_mode (count_exp
), count_exp
);
7365 gcc_assert (desired_align
>= 1 && align
>= 1);
7367 /* Misaligned move sequences handle both prologue and epilogue at once.
7368 Default code generation results in a smaller code for large alignments
7369 and also avoids redundant job when sizes are known precisely. */
7370 misaligned_prologue_used
7371 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7372 && MAX (desired_align
, epilogue_size_needed
) <= 32
7373 && desired_align
<= epilogue_size_needed
7374 && ((desired_align
> align
&& !align_bytes
)
7375 || (!count
&& epilogue_size_needed
> 1)));
7377 /* Do the cheap promotion to allow better CSE across the
7378 main loop and epilogue (ie one load of the big constant in the
7380 For now the misaligned move sequences do not have fast path
7381 without broadcasting. */
7382 if (issetmem
&& ((CONST_INT_P (val_exp
) || misaligned_prologue_used
)))
7384 if (alg
== vector_loop
)
7386 gcc_assert (val_exp
== const0_rtx
);
7387 vec_promoted_val
= promote_duplicated_reg (move_mode
, val_exp
);
7388 promoted_val
= promote_duplicated_reg_to_size (val_exp
,
7389 GET_MODE_SIZE (word_mode
),
7390 desired_align
, align
);
7394 promoted_val
= promote_duplicated_reg_to_size (val_exp
, size_needed
,
7395 desired_align
, align
);
7398 /* Misaligned move sequences handles both prologues and epilogues at once.
7399 Default code generation results in smaller code for large alignments and
7400 also avoids redundant job when sizes are known precisely. */
7401 if (misaligned_prologue_used
)
7403 /* Misaligned move prologue handled small blocks by itself. */
7404 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
7405 (dst
, src
, &destreg
, &srcreg
,
7406 move_mode
, promoted_val
, vec_promoted_val
,
7409 desired_align
< align
7410 ? MAX (desired_align
, epilogue_size_needed
) : epilogue_size_needed
,
7411 desired_align
, align
, &min_size
, dynamic_check
, issetmem
);
7413 src
= change_address (src
, BLKmode
, srcreg
);
7414 dst
= change_address (dst
, BLKmode
, destreg
);
7415 set_mem_align (dst
, desired_align
* BITS_PER_UNIT
);
7416 epilogue_size_needed
= 0;
7418 && min_size
< (unsigned HOST_WIDE_INT
) size_needed
)
7420 /* It is possible that we copied enough so the main loop will not
7422 gcc_assert (size_needed
> 1);
7423 if (jump_around_label
== NULL_RTX
)
7424 jump_around_label
= gen_label_rtx ();
7425 emit_cmp_and_jump_insns (count_exp
,
7426 GEN_INT (size_needed
),
7427 LTU
, 0, counter_mode (count_exp
), 1, jump_around_label
);
7428 if (expected_size
== -1
7429 || expected_size
< (desired_align
- align
) / 2 + size_needed
)
7430 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
7432 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
7435 /* Ensure that alignment prologue won't copy past end of block. */
7436 else if (size_needed
> 1 || (desired_align
> 1 && desired_align
> align
))
7438 epilogue_size_needed
= MAX (size_needed
- 1, desired_align
- align
);
7439 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
7440 Make sure it is power of 2. */
7441 epilogue_size_needed
= 1 << (floor_log2 (epilogue_size_needed
) + 1);
7443 /* To improve performance of small blocks, we jump around the VAL
7444 promoting mode. This mean that if the promoted VAL is not constant,
7445 we might not use it in the epilogue and have to use byte
7447 if (issetmem
&& epilogue_size_needed
> 2 && !promoted_val
)
7448 force_loopy_epilogue
= true;
7449 if ((count
&& count
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
7450 || max_size
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
7452 /* If main algorithm works on QImode, no epilogue is needed.
7453 For small sizes just don't align anything. */
7454 if (size_needed
== 1)
7455 desired_align
= align
;
7460 && min_size
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
7462 label
= gen_label_rtx ();
7463 emit_cmp_and_jump_insns (count_exp
,
7464 GEN_INT (epilogue_size_needed
),
7465 LTU
, 0, counter_mode (count_exp
), 1, label
);
7466 if (expected_size
== -1 || expected_size
< epilogue_size_needed
)
7467 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
7469 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
7473 /* Emit code to decide on runtime whether library call or inline should be
7475 if (dynamic_check
!= -1)
7477 if (!issetmem
&& CONST_INT_P (count_exp
))
7479 if (UINTVAL (count_exp
) >= (unsigned HOST_WIDE_INT
)dynamic_check
)
7481 emit_block_copy_via_libcall (dst
, src
, count_exp
);
7482 count_exp
= const0_rtx
;
7488 rtx_code_label
*hot_label
= gen_label_rtx ();
7489 if (jump_around_label
== NULL_RTX
)
7490 jump_around_label
= gen_label_rtx ();
7491 emit_cmp_and_jump_insns (count_exp
, GEN_INT (dynamic_check
- 1),
7492 LEU
, 0, counter_mode (count_exp
),
7494 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
7496 set_storage_via_libcall (dst
, count_exp
, val_exp
);
7498 emit_block_copy_via_libcall (dst
, src
, count_exp
);
7499 emit_jump (jump_around_label
);
7500 emit_label (hot_label
);
7504 /* Step 2: Alignment prologue. */
7505 /* Do the expensive promotion once we branched off the small blocks. */
7506 if (issetmem
&& !promoted_val
)
7507 promoted_val
= promote_duplicated_reg_to_size (val_exp
, size_needed
,
7508 desired_align
, align
);
7510 if (desired_align
> align
&& !misaligned_prologue_used
)
7512 if (align_bytes
== 0)
7514 /* Except for the first move in prologue, we no longer know
7515 constant offset in aliasing info. It don't seems to worth
7516 the pain to maintain it for the first move, so throw away
7518 dst
= change_address (dst
, BLKmode
, destreg
);
7520 src
= change_address (src
, BLKmode
, srcreg
);
7521 dst
= expand_set_or_cpymem_prologue (dst
, src
, destreg
, srcreg
,
7522 promoted_val
, vec_promoted_val
,
7523 count_exp
, align
, desired_align
,
7525 /* At most desired_align - align bytes are copied. */
7526 if (min_size
< (unsigned)(desired_align
- align
))
7529 min_size
-= desired_align
- align
;
7533 /* If we know how many bytes need to be stored before dst is
7534 sufficiently aligned, maintain aliasing info accurately. */
7535 dst
= expand_set_or_cpymem_constant_prologue (dst
, &src
, destreg
,
7543 count_exp
= plus_constant (counter_mode (count_exp
),
7544 count_exp
, -align_bytes
);
7545 count
-= align_bytes
;
7546 min_size
-= align_bytes
;
7547 max_size
-= align_bytes
;
7550 && min_size
< (unsigned HOST_WIDE_INT
) size_needed
7551 && (count
< (unsigned HOST_WIDE_INT
) size_needed
7552 || (align_bytes
== 0
7553 && count
< ((unsigned HOST_WIDE_INT
) size_needed
7554 + desired_align
- align
))))
7556 /* It is possible that we copied enough so the main loop will not
7558 gcc_assert (size_needed
> 1);
7559 if (label
== NULL_RTX
)
7560 label
= gen_label_rtx ();
7561 emit_cmp_and_jump_insns (count_exp
,
7562 GEN_INT (size_needed
),
7563 LTU
, 0, counter_mode (count_exp
), 1, label
);
7564 if (expected_size
== -1
7565 || expected_size
< (desired_align
- align
) / 2 + size_needed
)
7566 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
7568 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
7571 if (label
&& size_needed
== 1)
7574 LABEL_NUSES (label
) = 1;
7576 epilogue_size_needed
= 1;
7578 promoted_val
= val_exp
;
7580 else if (label
== NULL_RTX
&& !misaligned_prologue_used
)
7581 epilogue_size_needed
= size_needed
;
7583 /* Step 3: Main loop. */
7594 expand_set_or_cpymem_via_loop (dst
, src
, destreg
, srcreg
, promoted_val
,
7595 count_exp
, move_mode
, unroll_factor
,
7596 expected_size
, issetmem
);
7599 expand_set_or_cpymem_via_loop (dst
, src
, destreg
, srcreg
,
7600 vec_promoted_val
, count_exp
, move_mode
,
7601 unroll_factor
, expected_size
, issetmem
);
7603 case rep_prefix_8_byte
:
7604 case rep_prefix_4_byte
:
7605 case rep_prefix_1_byte
:
7606 expand_set_or_cpymem_via_rep (dst
, src
, destreg
, srcreg
, promoted_val
,
7607 val_exp
, count_exp
, move_mode
, issetmem
);
7610 /* Adjust properly the offset of src and dest memory for aliasing. */
7611 if (CONST_INT_P (count_exp
))
7614 src
= adjust_automodify_address_nv (src
, BLKmode
, srcreg
,
7615 (count
/ size_needed
) * size_needed
);
7616 dst
= adjust_automodify_address_nv (dst
, BLKmode
, destreg
,
7617 (count
/ size_needed
) * size_needed
);
7622 src
= change_address (src
, BLKmode
, srcreg
);
7623 dst
= change_address (dst
, BLKmode
, destreg
);
7626 /* Step 4: Epilogue to copy the remaining bytes. */
7630 /* When the main loop is done, COUNT_EXP might hold original count,
7631 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
7632 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
7633 bytes. Compensate if needed. */
7635 if (size_needed
< epilogue_size_needed
)
7637 tmp
= expand_simple_binop (counter_mode (count_exp
), AND
, count_exp
,
7638 GEN_INT (size_needed
- 1), count_exp
, 1,
7640 if (tmp
!= count_exp
)
7641 emit_move_insn (count_exp
, tmp
);
7644 LABEL_NUSES (label
) = 1;
7647 if (count_exp
!= const0_rtx
&& epilogue_size_needed
> 1)
7649 if (force_loopy_epilogue
)
7650 expand_setmem_epilogue_via_loop (dst
, destreg
, val_exp
, count_exp
,
7651 epilogue_size_needed
);
7655 expand_setmem_epilogue (dst
, destreg
, promoted_val
,
7656 vec_promoted_val
, count_exp
,
7657 epilogue_size_needed
);
7659 expand_cpymem_epilogue (dst
, src
, destreg
, srcreg
, count_exp
,
7660 epilogue_size_needed
);
7663 if (jump_around_label
)
7664 emit_label (jump_around_label
);
7669 /* Expand the appropriate insns for doing strlen if not just doing
7672 out = result, initialized with the start address
7673 align_rtx = alignment of the address.
7674 scratch = scratch register, initialized with the startaddress when
7675 not aligned, otherwise undefined
7677 This is just the body. It needs the initializations mentioned above and
7678 some address computing at the end. These things are done in i386.md. */
7681 ix86_expand_strlensi_unroll_1 (rtx out
, rtx src
, rtx align_rtx
)
7685 rtx_code_label
*align_2_label
= NULL
;
7686 rtx_code_label
*align_3_label
= NULL
;
7687 rtx_code_label
*align_4_label
= gen_label_rtx ();
7688 rtx_code_label
*end_0_label
= gen_label_rtx ();
7690 rtx tmpreg
= gen_reg_rtx (SImode
);
7691 rtx scratch
= gen_reg_rtx (SImode
);
7695 if (CONST_INT_P (align_rtx
))
7696 align
= INTVAL (align_rtx
);
7698 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
7700 /* Is there a known alignment and is it less than 4? */
7703 rtx scratch1
= gen_reg_rtx (Pmode
);
7704 emit_move_insn (scratch1
, out
);
7705 /* Is there a known alignment and is it not 2? */
7708 align_3_label
= gen_label_rtx (); /* Label when aligned to 3-byte */
7709 align_2_label
= gen_label_rtx (); /* Label when aligned to 2-byte */
7711 /* Leave just the 3 lower bits. */
7712 align_rtx
= expand_binop (Pmode
, and_optab
, scratch1
, GEN_INT (3),
7713 NULL_RTX
, 0, OPTAB_WIDEN
);
7715 emit_cmp_and_jump_insns (align_rtx
, const0_rtx
, EQ
, NULL
,
7716 Pmode
, 1, align_4_label
);
7717 emit_cmp_and_jump_insns (align_rtx
, const2_rtx
, EQ
, NULL
,
7718 Pmode
, 1, align_2_label
);
7719 emit_cmp_and_jump_insns (align_rtx
, const2_rtx
, GTU
, NULL
,
7720 Pmode
, 1, align_3_label
);
7724 /* Since the alignment is 2, we have to check 2 or 0 bytes;
7725 check if is aligned to 4 - byte. */
7727 align_rtx
= expand_binop (Pmode
, and_optab
, scratch1
, const2_rtx
,
7728 NULL_RTX
, 0, OPTAB_WIDEN
);
7730 emit_cmp_and_jump_insns (align_rtx
, const0_rtx
, EQ
, NULL
,
7731 Pmode
, 1, align_4_label
);
7734 mem
= change_address (src
, QImode
, out
);
7736 /* Now compare the bytes. */
7738 /* Compare the first n unaligned byte on a byte per byte basis. */
7739 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
,
7740 QImode
, 1, end_0_label
);
7742 /* Increment the address. */
7743 emit_insn (gen_add2_insn (out
, const1_rtx
));
7745 /* Not needed with an alignment of 2 */
7748 emit_label (align_2_label
);
7750 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
, QImode
, 1,
7753 emit_insn (gen_add2_insn (out
, const1_rtx
));
7755 emit_label (align_3_label
);
7758 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
, QImode
, 1,
7761 emit_insn (gen_add2_insn (out
, const1_rtx
));
7764 /* Generate loop to check 4 bytes at a time. It is not a good idea to
7765 align this loop. It gives only huge programs, but does not help to
7767 emit_label (align_4_label
);
7769 mem
= change_address (src
, SImode
, out
);
7770 emit_move_insn (scratch
, mem
);
7771 emit_insn (gen_add2_insn (out
, GEN_INT (4)));
7773 /* This formula yields a nonzero result iff one of the bytes is zero.
7774 This saves three branches inside loop and many cycles. */
7776 emit_insn (gen_addsi3 (tmpreg
, scratch
, GEN_INT (-0x01010101)));
7777 emit_insn (gen_one_cmplsi2 (scratch
, scratch
));
7778 emit_insn (gen_andsi3 (tmpreg
, tmpreg
, scratch
));
7779 emit_insn (gen_andsi3 (tmpreg
, tmpreg
,
7780 gen_int_mode (0x80808080, SImode
)));
7781 emit_cmp_and_jump_insns (tmpreg
, const0_rtx
, EQ
, 0, SImode
, 1,
7786 rtx reg
= gen_reg_rtx (SImode
);
7787 rtx reg2
= gen_reg_rtx (Pmode
);
7788 emit_move_insn (reg
, tmpreg
);
7789 emit_insn (gen_lshrsi3 (reg
, reg
, GEN_INT (16)));
7791 /* If zero is not in the first two bytes, move two bytes forward. */
7792 emit_insn (gen_testsi_ccno_1 (tmpreg
, GEN_INT (0x8080)));
7793 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
7794 tmp
= gen_rtx_EQ (VOIDmode
, tmp
, const0_rtx
);
7795 emit_insn (gen_rtx_SET (tmpreg
,
7796 gen_rtx_IF_THEN_ELSE (SImode
, tmp
,
7799 /* Emit lea manually to avoid clobbering of flags. */
7800 emit_insn (gen_rtx_SET (reg2
, plus_constant (Pmode
, out
, 2)));
7802 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
7803 tmp
= gen_rtx_EQ (VOIDmode
, tmp
, const0_rtx
);
7804 emit_insn (gen_rtx_SET (out
,
7805 gen_rtx_IF_THEN_ELSE (Pmode
, tmp
,
7811 rtx_code_label
*end_2_label
= gen_label_rtx ();
7812 /* Is zero in the first two bytes? */
7814 emit_insn (gen_testsi_ccno_1 (tmpreg
, GEN_INT (0x8080)));
7815 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
7816 tmp
= gen_rtx_NE (VOIDmode
, tmp
, const0_rtx
);
7817 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
7818 gen_rtx_LABEL_REF (VOIDmode
, end_2_label
),
7820 tmp
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
7821 JUMP_LABEL (tmp
) = end_2_label
;
7823 /* Not in the first two. Move two bytes forward. */
7824 emit_insn (gen_lshrsi3 (tmpreg
, tmpreg
, GEN_INT (16)));
7825 emit_insn (gen_add2_insn (out
, const2_rtx
));
7827 emit_label (end_2_label
);
7831 /* Avoid branch in fixing the byte. */
7832 tmpreg
= gen_lowpart (QImode
, tmpreg
);
7833 emit_insn (gen_addqi3_cconly_overflow (tmpreg
, tmpreg
));
7834 tmp
= gen_rtx_REG (CCmode
, FLAGS_REG
);
7835 cmp
= gen_rtx_LTU (VOIDmode
, tmp
, const0_rtx
);
7836 emit_insn (gen_sub3_carry (Pmode
, out
, out
, GEN_INT (3), tmp
, cmp
));
7838 emit_label (end_0_label
);
7841 /* Expand strlen. */
7844 ix86_expand_strlen (rtx out
, rtx src
, rtx eoschar
, rtx align
)
7846 if (TARGET_UNROLL_STRLEN
7847 && TARGET_INLINE_ALL_STRINGOPS
7848 && eoschar
== const0_rtx
7851 /* The generic case of strlen expander is long. Avoid it's
7852 expanding unless TARGET_INLINE_ALL_STRINGOPS. */
7853 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
7854 /* Well it seems that some optimizer does not combine a call like
7855 foo(strlen(bar), strlen(bar));
7856 when the move and the subtraction is done here. It does calculate
7857 the length just once when these instructions are done inside of
7858 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
7859 often used and I use one fewer register for the lifetime of
7860 output_strlen_unroll() this is better. */
7862 emit_move_insn (out
, addr
);
7864 ix86_expand_strlensi_unroll_1 (out
, src
, align
);
7866 /* strlensi_unroll_1 returns the address of the zero at the end of
7867 the string, like memchr(), so compute the length by subtracting
7868 the start address. */
7869 emit_insn (gen_sub2_insn (out
, addr
));
7876 /* For given symbol (function) construct code to compute address of it's PLT
7877 entry in large x86-64 PIC model. */
7880 construct_plt_address (rtx symbol
)
7884 gcc_assert (GET_CODE (symbol
) == SYMBOL_REF
);
7885 gcc_assert (ix86_cmodel
== CM_LARGE_PIC
&& !TARGET_PECOFF
);
7886 gcc_assert (Pmode
== DImode
);
7888 tmp
= gen_reg_rtx (Pmode
);
7889 unspec
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, symbol
), UNSPEC_PLTOFF
);
7891 emit_move_insn (tmp
, gen_rtx_CONST (Pmode
, unspec
));
7892 emit_insn (gen_add2_insn (tmp
, pic_offset_table_rtx
));
7896 /* Additional registers that are clobbered by SYSV calls. */
7898 static int const x86_64_ms_sysv_extra_clobbered_registers
7899 [NUM_X86_64_MS_CLOBBERED_REGS
] =
7903 XMM8_REG
, XMM9_REG
, XMM10_REG
, XMM11_REG
,
7904 XMM12_REG
, XMM13_REG
, XMM14_REG
, XMM15_REG
7908 ix86_expand_call (rtx retval
, rtx fnaddr
, rtx callarg1
,
7910 rtx pop
, bool sibcall
)
7913 rtx use
= NULL
, call
;
7914 unsigned int vec_len
= 0;
7917 if (GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
)
7919 fndecl
= SYMBOL_REF_DECL (XEXP (fnaddr
, 0));
7921 && (lookup_attribute ("interrupt",
7922 TYPE_ATTRIBUTES (TREE_TYPE (fndecl
)))))
7923 error ("interrupt service routine cannot be called directly");
7928 if (pop
== const0_rtx
)
7930 gcc_assert (!TARGET_64BIT
|| !pop
);
7932 if (TARGET_MACHO
&& !TARGET_64BIT
)
7935 if (flag_pic
&& GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
)
7936 fnaddr
= machopic_indirect_call_target (fnaddr
);
7941 /* Static functions and indirect calls don't need the pic register. Also,
7942 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
7943 it an indirect call. */
7944 rtx addr
= XEXP (fnaddr
, 0);
7946 && GET_CODE (addr
) == SYMBOL_REF
7947 && !SYMBOL_REF_LOCAL_P (addr
))
7950 && (SYMBOL_REF_DECL (addr
) == NULL_TREE
7951 || !lookup_attribute ("noplt",
7952 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr
)))))
7955 || (ix86_cmodel
== CM_LARGE_PIC
7956 && DEFAULT_ABI
!= MS_ABI
))
7958 use_reg (&use
, gen_rtx_REG (Pmode
,
7959 REAL_PIC_OFFSET_TABLE_REGNUM
));
7960 if (ix86_use_pseudo_pic_reg ())
7961 emit_move_insn (gen_rtx_REG (Pmode
,
7962 REAL_PIC_OFFSET_TABLE_REGNUM
),
7963 pic_offset_table_rtx
);
7966 else if (!TARGET_PECOFF
&& !TARGET_MACHO
)
7970 fnaddr
= gen_rtx_UNSPEC (Pmode
,
7971 gen_rtvec (1, addr
),
7973 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
7977 fnaddr
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, addr
),
7979 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
7980 fnaddr
= gen_rtx_PLUS (Pmode
, pic_offset_table_rtx
,
7983 fnaddr
= gen_const_mem (Pmode
, fnaddr
);
7984 /* Pmode may not be the same as word_mode for x32, which
7985 doesn't support indirect branch via 32-bit memory slot.
7986 Since x32 GOT slot is 64 bit with zero upper 32 bits,
7987 indirect branch via x32 GOT slot is OK. */
7988 if (GET_MODE (fnaddr
) != word_mode
)
7989 fnaddr
= gen_rtx_ZERO_EXTEND (word_mode
, fnaddr
);
7990 fnaddr
= gen_rtx_MEM (QImode
, fnaddr
);
7995 /* Skip setting up RAX register for -mskip-rax-setup when there are no
7996 parameters passed in vector registers. */
7998 && (INTVAL (callarg2
) > 0
7999 || (INTVAL (callarg2
) == 0
8000 && (TARGET_SSE
|| !flag_skip_rax_setup
))))
8002 rtx al
= gen_rtx_REG (QImode
, AX_REG
);
8003 emit_move_insn (al
, callarg2
);
8007 if (ix86_cmodel
== CM_LARGE_PIC
8010 && GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
8011 && !local_symbolic_operand (XEXP (fnaddr
, 0), VOIDmode
))
8012 fnaddr
= gen_rtx_MEM (QImode
, construct_plt_address (XEXP (fnaddr
, 0)));
8013 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
8014 branch via x32 GOT slot is OK. */
8015 else if (!(TARGET_X32
8017 && GET_CODE (XEXP (fnaddr
, 0)) == ZERO_EXTEND
8018 && GOT_memory_operand (XEXP (XEXP (fnaddr
, 0), 0), Pmode
))
8020 ? !sibcall_insn_operand (XEXP (fnaddr
, 0), word_mode
)
8021 : !call_insn_operand (XEXP (fnaddr
, 0), word_mode
)))
8023 fnaddr
= convert_to_mode (word_mode
, XEXP (fnaddr
, 0), 1);
8024 fnaddr
= gen_rtx_MEM (QImode
, copy_to_mode_reg (word_mode
, fnaddr
));
8027 call
= gen_rtx_CALL (VOIDmode
, fnaddr
, callarg1
);
8030 call
= gen_rtx_SET (retval
, call
);
8031 vec
[vec_len
++] = call
;
8035 pop
= gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, pop
);
8036 pop
= gen_rtx_SET (stack_pointer_rtx
, pop
);
8037 vec
[vec_len
++] = pop
;
8040 if (cfun
->machine
->no_caller_saved_registers
8042 || (!TREE_THIS_VOLATILE (fndecl
)
8043 && !lookup_attribute ("no_caller_saved_registers",
8044 TYPE_ATTRIBUTES (TREE_TYPE (fndecl
))))))
8046 static const char ix86_call_used_regs
[] = CALL_USED_REGISTERS
;
8047 bool is_64bit_ms_abi
= (TARGET_64BIT
8048 && ix86_function_abi (fndecl
) == MS_ABI
);
8049 char c_mask
= CALL_USED_REGISTERS_MASK (is_64bit_ms_abi
);
8051 /* If there are no caller-saved registers, add all registers
8052 that are clobbered by the call which returns. */
8053 for (int i
= 0; i
< FIRST_PSEUDO_REGISTER
; i
++)
8055 && (ix86_call_used_regs
[i
] == 1
8056 || (ix86_call_used_regs
[i
] & c_mask
))
8057 && !STACK_REGNO_P (i
)
8058 && !MMX_REGNO_P (i
))
8060 gen_rtx_REG (GET_MODE (regno_reg_rtx
[i
]), i
));
8062 else if (TARGET_64BIT_MS_ABI
8063 && (!callarg2
|| INTVAL (callarg2
) != -2))
8067 for (i
= 0; i
< NUM_X86_64_MS_CLOBBERED_REGS
; i
++)
8069 int regno
= x86_64_ms_sysv_extra_clobbered_registers
[i
];
8070 machine_mode mode
= SSE_REGNO_P (regno
) ? TImode
: DImode
;
8072 clobber_reg (&use
, gen_rtx_REG (mode
, regno
));
8075 /* Set here, but it may get cleared later. */
8076 if (TARGET_CALL_MS2SYSV_XLOGUES
)
8081 /* Don't break hot-patched functions. */
8082 else if (ix86_function_ms_hook_prologue (current_function_decl
))
8085 /* TODO: Cases not yet examined. */
8086 else if (flag_split_stack
)
8087 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8091 gcc_assert (!reload_completed
);
8092 cfun
->machine
->call_ms2sysv
= true;
8098 call
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec_v (vec_len
, vec
));
8099 rtx_insn
*call_insn
= emit_call_insn (call
);
8101 CALL_INSN_FUNCTION_USAGE (call_insn
) = use
;
8106 /* Split simple return with popping POPC bytes from stack to indirect
8107 branch with stack adjustment . */
8110 ix86_split_simple_return_pop_internal (rtx popc
)
8112 struct machine_function
*m
= cfun
->machine
;
8113 rtx ecx
= gen_rtx_REG (SImode
, CX_REG
);
8116 /* There is no "pascal" calling convention in any 64bit ABI. */
8117 gcc_assert (!TARGET_64BIT
);
8119 insn
= emit_insn (gen_pop (ecx
));
8120 m
->fs
.cfa_offset
-= UNITS_PER_WORD
;
8121 m
->fs
.sp_offset
-= UNITS_PER_WORD
;
8123 rtx x
= plus_constant (Pmode
, stack_pointer_rtx
, UNITS_PER_WORD
);
8124 x
= gen_rtx_SET (stack_pointer_rtx
, x
);
8125 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, x
);
8126 add_reg_note (insn
, REG_CFA_REGISTER
, gen_rtx_SET (ecx
, pc_rtx
));
8127 RTX_FRAME_RELATED_P (insn
) = 1;
8129 x
= gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, popc
);
8130 x
= gen_rtx_SET (stack_pointer_rtx
, x
);
8131 insn
= emit_insn (x
);
8132 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, x
);
8133 RTX_FRAME_RELATED_P (insn
) = 1;
8135 /* Now return address is in ECX. */
8136 emit_jump_insn (gen_simple_return_indirect_internal (ecx
));
8139 /* Errors in the source file can cause expand_expr to return const0_rtx
8140 where we expect a vector. To avoid crashing, use one of the vector
8141 clear instructions. */
8144 safe_vector_operand (rtx x
, machine_mode mode
)
8146 if (x
== const0_rtx
)
8147 x
= CONST0_RTX (mode
);
8151 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
8154 ix86_expand_binop_builtin (enum insn_code icode
, tree exp
, rtx target
)
8157 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8158 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8159 rtx op0
= expand_normal (arg0
);
8160 rtx op1
= expand_normal (arg1
);
8161 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
8162 machine_mode mode0
= insn_data
[icode
].operand
[1].mode
;
8163 machine_mode mode1
= insn_data
[icode
].operand
[2].mode
;
8165 if (VECTOR_MODE_P (mode0
))
8166 op0
= safe_vector_operand (op0
, mode0
);
8167 if (VECTOR_MODE_P (mode1
))
8168 op1
= safe_vector_operand (op1
, mode1
);
8170 if (optimize
|| !target
8171 || GET_MODE (target
) != tmode
8172 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
8173 target
= gen_reg_rtx (tmode
);
8175 if (GET_MODE (op1
) == SImode
&& mode1
== TImode
)
8177 rtx x
= gen_reg_rtx (V4SImode
);
8178 emit_insn (gen_sse2_loadd (x
, op1
));
8179 op1
= gen_lowpart (TImode
, x
);
8182 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
8183 op0
= copy_to_mode_reg (mode0
, op0
);
8184 if (!insn_data
[icode
].operand
[2].predicate (op1
, mode1
))
8185 op1
= copy_to_mode_reg (mode1
, op1
);
8187 pat
= GEN_FCN (icode
) (target
, op0
, op1
);
8196 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
8199 ix86_expand_multi_arg_builtin (enum insn_code icode
, tree exp
, rtx target
,
8200 enum ix86_builtin_func_type m_type
,
8201 enum rtx_code sub_code
)
8206 bool comparison_p
= false;
8208 bool last_arg_constant
= false;
8215 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
8219 case MULTI_ARG_4_DF2_DI_I
:
8220 case MULTI_ARG_4_DF2_DI_I1
:
8221 case MULTI_ARG_4_SF2_SI_I
:
8222 case MULTI_ARG_4_SF2_SI_I1
:
8224 last_arg_constant
= true;
8227 case MULTI_ARG_3_SF
:
8228 case MULTI_ARG_3_DF
:
8229 case MULTI_ARG_3_SF2
:
8230 case MULTI_ARG_3_DF2
:
8231 case MULTI_ARG_3_DI
:
8232 case MULTI_ARG_3_SI
:
8233 case MULTI_ARG_3_SI_DI
:
8234 case MULTI_ARG_3_HI
:
8235 case MULTI_ARG_3_HI_SI
:
8236 case MULTI_ARG_3_QI
:
8237 case MULTI_ARG_3_DI2
:
8238 case MULTI_ARG_3_SI2
:
8239 case MULTI_ARG_3_HI2
:
8240 case MULTI_ARG_3_QI2
:
8244 case MULTI_ARG_2_SF
:
8245 case MULTI_ARG_2_DF
:
8246 case MULTI_ARG_2_DI
:
8247 case MULTI_ARG_2_SI
:
8248 case MULTI_ARG_2_HI
:
8249 case MULTI_ARG_2_QI
:
8253 case MULTI_ARG_2_DI_IMM
:
8254 case MULTI_ARG_2_SI_IMM
:
8255 case MULTI_ARG_2_HI_IMM
:
8256 case MULTI_ARG_2_QI_IMM
:
8258 last_arg_constant
= true;
8261 case MULTI_ARG_1_SF
:
8262 case MULTI_ARG_1_DF
:
8263 case MULTI_ARG_1_SF2
:
8264 case MULTI_ARG_1_DF2
:
8265 case MULTI_ARG_1_DI
:
8266 case MULTI_ARG_1_SI
:
8267 case MULTI_ARG_1_HI
:
8268 case MULTI_ARG_1_QI
:
8269 case MULTI_ARG_1_SI_DI
:
8270 case MULTI_ARG_1_HI_DI
:
8271 case MULTI_ARG_1_HI_SI
:
8272 case MULTI_ARG_1_QI_DI
:
8273 case MULTI_ARG_1_QI_SI
:
8274 case MULTI_ARG_1_QI_HI
:
8278 case MULTI_ARG_2_DI_CMP
:
8279 case MULTI_ARG_2_SI_CMP
:
8280 case MULTI_ARG_2_HI_CMP
:
8281 case MULTI_ARG_2_QI_CMP
:
8283 comparison_p
= true;
8286 case MULTI_ARG_2_SF_TF
:
8287 case MULTI_ARG_2_DF_TF
:
8288 case MULTI_ARG_2_DI_TF
:
8289 case MULTI_ARG_2_SI_TF
:
8290 case MULTI_ARG_2_HI_TF
:
8291 case MULTI_ARG_2_QI_TF
:
8300 if (optimize
|| !target
8301 || GET_MODE (target
) != tmode
8302 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
8303 target
= gen_reg_rtx (tmode
);
8304 else if (memory_operand (target
, tmode
))
8307 gcc_assert (nargs
<= 4);
8309 for (i
= 0; i
< nargs
; i
++)
8311 tree arg
= CALL_EXPR_ARG (exp
, i
);
8312 rtx op
= expand_normal (arg
);
8313 int adjust
= (comparison_p
) ? 1 : 0;
8314 machine_mode mode
= insn_data
[icode
].operand
[i
+adjust
+1].mode
;
8316 if (last_arg_constant
&& i
== nargs
- 1)
8318 if (!insn_data
[icode
].operand
[i
+ 1].predicate (op
, mode
))
8320 enum insn_code new_icode
= icode
;
8323 case CODE_FOR_xop_vpermil2v2df3
:
8324 case CODE_FOR_xop_vpermil2v4sf3
:
8325 case CODE_FOR_xop_vpermil2v4df3
:
8326 case CODE_FOR_xop_vpermil2v8sf3
:
8327 error ("the last argument must be a 2-bit immediate");
8328 return gen_reg_rtx (tmode
);
8329 case CODE_FOR_xop_rotlv2di3
:
8330 new_icode
= CODE_FOR_rotlv2di3
;
8332 case CODE_FOR_xop_rotlv4si3
:
8333 new_icode
= CODE_FOR_rotlv4si3
;
8335 case CODE_FOR_xop_rotlv8hi3
:
8336 new_icode
= CODE_FOR_rotlv8hi3
;
8338 case CODE_FOR_xop_rotlv16qi3
:
8339 new_icode
= CODE_FOR_rotlv16qi3
;
8341 if (CONST_INT_P (op
))
8343 int mask
= GET_MODE_UNIT_BITSIZE (tmode
) - 1;
8344 op
= GEN_INT (INTVAL (op
) & mask
);
8346 (insn_data
[icode
].operand
[i
+ 1].predicate (op
, mode
));
8352 && insn_data
[new_icode
].operand
[0].mode
== tmode
8353 && insn_data
[new_icode
].operand
[1].mode
== tmode
8354 && insn_data
[new_icode
].operand
[2].mode
== mode
8355 && insn_data
[new_icode
].operand
[0].predicate
8356 == insn_data
[icode
].operand
[0].predicate
8357 && insn_data
[new_icode
].operand
[1].predicate
8358 == insn_data
[icode
].operand
[1].predicate
);
8371 if (VECTOR_MODE_P (mode
))
8372 op
= safe_vector_operand (op
, mode
);
8374 /* If we aren't optimizing, only allow one memory operand to be
8376 if (memory_operand (op
, mode
))
8379 gcc_assert (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
);
8382 || !insn_data
[icode
].operand
[i
+adjust
+1].predicate (op
, mode
)
8384 op
= force_reg (mode
, op
);
8388 args
[i
].mode
= mode
;
8394 pat
= GEN_FCN (icode
) (target
, args
[0].op
);
8399 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
,
8400 GEN_INT ((int)sub_code
));
8401 else if (! comparison_p
)
8402 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
);
8405 rtx cmp_op
= gen_rtx_fmt_ee (sub_code
, GET_MODE (target
),
8409 pat
= GEN_FCN (icode
) (target
, cmp_op
, args
[0].op
, args
[1].op
);
8414 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
, args
[2].op
);
8418 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
, args
[2].op
, args
[3].op
);
8432 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
8433 insns with vec_merge. */
8436 ix86_expand_unop_vec_merge_builtin (enum insn_code icode
, tree exp
,
8440 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8441 rtx op1
, op0
= expand_normal (arg0
);
8442 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
8443 machine_mode mode0
= insn_data
[icode
].operand
[1].mode
;
8445 if (optimize
|| !target
8446 || GET_MODE (target
) != tmode
8447 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
8448 target
= gen_reg_rtx (tmode
);
8450 if (VECTOR_MODE_P (mode0
))
8451 op0
= safe_vector_operand (op0
, mode0
);
8453 if ((optimize
&& !register_operand (op0
, mode0
))
8454 || !insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
8455 op0
= copy_to_mode_reg (mode0
, op0
);
8458 if (!insn_data
[icode
].operand
[2].predicate (op1
, mode0
))
8459 op1
= copy_to_mode_reg (mode0
, op1
);
8461 pat
= GEN_FCN (icode
) (target
, op0
, op1
);
8468 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
8471 ix86_expand_sse_compare (const struct builtin_description
*d
,
8472 tree exp
, rtx target
, bool swap
)
8475 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8476 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8477 rtx op0
= expand_normal (arg0
);
8478 rtx op1
= expand_normal (arg1
);
8480 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
8481 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
8482 machine_mode mode1
= insn_data
[d
->icode
].operand
[2].mode
;
8483 enum rtx_code comparison
= d
->comparison
;
8485 if (VECTOR_MODE_P (mode0
))
8486 op0
= safe_vector_operand (op0
, mode0
);
8487 if (VECTOR_MODE_P (mode1
))
8488 op1
= safe_vector_operand (op1
, mode1
);
8490 /* Swap operands if we have a comparison that isn't available in
8493 std::swap (op0
, op1
);
8495 if (optimize
|| !target
8496 || GET_MODE (target
) != tmode
8497 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
8498 target
= gen_reg_rtx (tmode
);
8500 if ((optimize
&& !register_operand (op0
, mode0
))
8501 || !insn_data
[d
->icode
].operand
[1].predicate (op0
, mode0
))
8502 op0
= copy_to_mode_reg (mode0
, op0
);
8503 if ((optimize
&& !register_operand (op1
, mode1
))
8504 || !insn_data
[d
->icode
].operand
[2].predicate (op1
, mode1
))
8505 op1
= copy_to_mode_reg (mode1
, op1
);
8507 op2
= gen_rtx_fmt_ee (comparison
, mode0
, op0
, op1
);
8508 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
, op2
);
8515 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
8518 ix86_expand_sse_comi (const struct builtin_description
*d
, tree exp
,
8522 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8523 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8524 rtx op0
= expand_normal (arg0
);
8525 rtx op1
= expand_normal (arg1
);
8526 machine_mode mode0
= insn_data
[d
->icode
].operand
[0].mode
;
8527 machine_mode mode1
= insn_data
[d
->icode
].operand
[1].mode
;
8528 enum rtx_code comparison
= d
->comparison
;
8530 if (VECTOR_MODE_P (mode0
))
8531 op0
= safe_vector_operand (op0
, mode0
);
8532 if (VECTOR_MODE_P (mode1
))
8533 op1
= safe_vector_operand (op1
, mode1
);
8535 /* Swap operands if we have a comparison that isn't available in
8537 if (d
->flag
& BUILTIN_DESC_SWAP_OPERANDS
)
8538 std::swap (op0
, op1
);
8540 target
= gen_reg_rtx (SImode
);
8541 emit_move_insn (target
, const0_rtx
);
8542 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8544 if ((optimize
&& !register_operand (op0
, mode0
))
8545 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8546 op0
= copy_to_mode_reg (mode0
, op0
);
8547 if ((optimize
&& !register_operand (op1
, mode1
))
8548 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
8549 op1
= copy_to_mode_reg (mode1
, op1
);
8551 pat
= GEN_FCN (d
->icode
) (op0
, op1
);
8555 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8556 gen_rtx_fmt_ee (comparison
, QImode
,
8560 return SUBREG_REG (target
);
8563 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
8566 ix86_expand_sse_round (const struct builtin_description
*d
, tree exp
,
8570 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8571 rtx op1
, op0
= expand_normal (arg0
);
8572 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
8573 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
8575 if (optimize
|| target
== 0
8576 || GET_MODE (target
) != tmode
8577 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
8578 target
= gen_reg_rtx (tmode
);
8580 if (VECTOR_MODE_P (mode0
))
8581 op0
= safe_vector_operand (op0
, mode0
);
8583 if ((optimize
&& !register_operand (op0
, mode0
))
8584 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8585 op0
= copy_to_mode_reg (mode0
, op0
);
8587 op1
= GEN_INT (d
->comparison
);
8589 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
);
8597 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description
*d
,
8598 tree exp
, rtx target
)
8601 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8602 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8603 rtx op0
= expand_normal (arg0
);
8604 rtx op1
= expand_normal (arg1
);
8606 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
8607 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
8608 machine_mode mode1
= insn_data
[d
->icode
].operand
[2].mode
;
8610 if (optimize
|| target
== 0
8611 || GET_MODE (target
) != tmode
8612 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
8613 target
= gen_reg_rtx (tmode
);
8615 op0
= safe_vector_operand (op0
, mode0
);
8616 op1
= safe_vector_operand (op1
, mode1
);
8618 if ((optimize
&& !register_operand (op0
, mode0
))
8619 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8620 op0
= copy_to_mode_reg (mode0
, op0
);
8621 if ((optimize
&& !register_operand (op1
, mode1
))
8622 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
8623 op1
= copy_to_mode_reg (mode1
, op1
);
8625 op2
= GEN_INT (d
->comparison
);
8627 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
, op2
);
8634 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
8637 ix86_expand_sse_ptest (const struct builtin_description
*d
, tree exp
,
8641 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8642 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8643 rtx op0
= expand_normal (arg0
);
8644 rtx op1
= expand_normal (arg1
);
8645 machine_mode mode0
= insn_data
[d
->icode
].operand
[0].mode
;
8646 machine_mode mode1
= insn_data
[d
->icode
].operand
[1].mode
;
8647 enum rtx_code comparison
= d
->comparison
;
8649 if (VECTOR_MODE_P (mode0
))
8650 op0
= safe_vector_operand (op0
, mode0
);
8651 if (VECTOR_MODE_P (mode1
))
8652 op1
= safe_vector_operand (op1
, mode1
);
8654 target
= gen_reg_rtx (SImode
);
8655 emit_move_insn (target
, const0_rtx
);
8656 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8658 if ((optimize
&& !register_operand (op0
, mode0
))
8659 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8660 op0
= copy_to_mode_reg (mode0
, op0
);
8661 if ((optimize
&& !register_operand (op1
, mode1
))
8662 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
8663 op1
= copy_to_mode_reg (mode1
, op1
);
8665 pat
= GEN_FCN (d
->icode
) (op0
, op1
);
8669 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8670 gen_rtx_fmt_ee (comparison
, QImode
,
8674 return SUBREG_REG (target
);
8677 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
8680 ix86_expand_sse_pcmpestr (const struct builtin_description
*d
,
8681 tree exp
, rtx target
)
8684 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8685 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8686 tree arg2
= CALL_EXPR_ARG (exp
, 2);
8687 tree arg3
= CALL_EXPR_ARG (exp
, 3);
8688 tree arg4
= CALL_EXPR_ARG (exp
, 4);
8689 rtx scratch0
, scratch1
;
8690 rtx op0
= expand_normal (arg0
);
8691 rtx op1
= expand_normal (arg1
);
8692 rtx op2
= expand_normal (arg2
);
8693 rtx op3
= expand_normal (arg3
);
8694 rtx op4
= expand_normal (arg4
);
8695 machine_mode tmode0
, tmode1
, modev2
, modei3
, modev4
, modei5
, modeimm
;
8697 tmode0
= insn_data
[d
->icode
].operand
[0].mode
;
8698 tmode1
= insn_data
[d
->icode
].operand
[1].mode
;
8699 modev2
= insn_data
[d
->icode
].operand
[2].mode
;
8700 modei3
= insn_data
[d
->icode
].operand
[3].mode
;
8701 modev4
= insn_data
[d
->icode
].operand
[4].mode
;
8702 modei5
= insn_data
[d
->icode
].operand
[5].mode
;
8703 modeimm
= insn_data
[d
->icode
].operand
[6].mode
;
8705 if (VECTOR_MODE_P (modev2
))
8706 op0
= safe_vector_operand (op0
, modev2
);
8707 if (VECTOR_MODE_P (modev4
))
8708 op2
= safe_vector_operand (op2
, modev4
);
8710 if (!insn_data
[d
->icode
].operand
[2].predicate (op0
, modev2
))
8711 op0
= copy_to_mode_reg (modev2
, op0
);
8712 if (!insn_data
[d
->icode
].operand
[3].predicate (op1
, modei3
))
8713 op1
= copy_to_mode_reg (modei3
, op1
);
8714 if ((optimize
&& !register_operand (op2
, modev4
))
8715 || !insn_data
[d
->icode
].operand
[4].predicate (op2
, modev4
))
8716 op2
= copy_to_mode_reg (modev4
, op2
);
8717 if (!insn_data
[d
->icode
].operand
[5].predicate (op3
, modei5
))
8718 op3
= copy_to_mode_reg (modei5
, op3
);
8720 if (!insn_data
[d
->icode
].operand
[6].predicate (op4
, modeimm
))
8722 error ("the fifth argument must be an 8-bit immediate");
8726 if (d
->code
== IX86_BUILTIN_PCMPESTRI128
)
8728 if (optimize
|| !target
8729 || GET_MODE (target
) != tmode0
8730 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode0
))
8731 target
= gen_reg_rtx (tmode0
);
8733 scratch1
= gen_reg_rtx (tmode1
);
8735 pat
= GEN_FCN (d
->icode
) (target
, scratch1
, op0
, op1
, op2
, op3
, op4
);
8737 else if (d
->code
== IX86_BUILTIN_PCMPESTRM128
)
8739 if (optimize
|| !target
8740 || GET_MODE (target
) != tmode1
8741 || !insn_data
[d
->icode
].operand
[1].predicate (target
, tmode1
))
8742 target
= gen_reg_rtx (tmode1
);
8744 scratch0
= gen_reg_rtx (tmode0
);
8746 pat
= GEN_FCN (d
->icode
) (scratch0
, target
, op0
, op1
, op2
, op3
, op4
);
8750 gcc_assert (d
->flag
);
8752 scratch0
= gen_reg_rtx (tmode0
);
8753 scratch1
= gen_reg_rtx (tmode1
);
8755 pat
= GEN_FCN (d
->icode
) (scratch0
, scratch1
, op0
, op1
, op2
, op3
, op4
);
8765 target
= gen_reg_rtx (SImode
);
8766 emit_move_insn (target
, const0_rtx
);
8767 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8770 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8771 gen_rtx_fmt_ee (EQ
, QImode
,
8772 gen_rtx_REG ((machine_mode
) d
->flag
,
8775 return SUBREG_REG (target
);
8782 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
8785 ix86_expand_sse_pcmpistr (const struct builtin_description
*d
,
8786 tree exp
, rtx target
)
8789 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8790 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8791 tree arg2
= CALL_EXPR_ARG (exp
, 2);
8792 rtx scratch0
, scratch1
;
8793 rtx op0
= expand_normal (arg0
);
8794 rtx op1
= expand_normal (arg1
);
8795 rtx op2
= expand_normal (arg2
);
8796 machine_mode tmode0
, tmode1
, modev2
, modev3
, modeimm
;
8798 tmode0
= insn_data
[d
->icode
].operand
[0].mode
;
8799 tmode1
= insn_data
[d
->icode
].operand
[1].mode
;
8800 modev2
= insn_data
[d
->icode
].operand
[2].mode
;
8801 modev3
= insn_data
[d
->icode
].operand
[3].mode
;
8802 modeimm
= insn_data
[d
->icode
].operand
[4].mode
;
8804 if (VECTOR_MODE_P (modev2
))
8805 op0
= safe_vector_operand (op0
, modev2
);
8806 if (VECTOR_MODE_P (modev3
))
8807 op1
= safe_vector_operand (op1
, modev3
);
8809 if (!insn_data
[d
->icode
].operand
[2].predicate (op0
, modev2
))
8810 op0
= copy_to_mode_reg (modev2
, op0
);
8811 if ((optimize
&& !register_operand (op1
, modev3
))
8812 || !insn_data
[d
->icode
].operand
[3].predicate (op1
, modev3
))
8813 op1
= copy_to_mode_reg (modev3
, op1
);
8815 if (!insn_data
[d
->icode
].operand
[4].predicate (op2
, modeimm
))
8817 error ("the third argument must be an 8-bit immediate");
8821 if (d
->code
== IX86_BUILTIN_PCMPISTRI128
)
8823 if (optimize
|| !target
8824 || GET_MODE (target
) != tmode0
8825 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode0
))
8826 target
= gen_reg_rtx (tmode0
);
8828 scratch1
= gen_reg_rtx (tmode1
);
8830 pat
= GEN_FCN (d
->icode
) (target
, scratch1
, op0
, op1
, op2
);
8832 else if (d
->code
== IX86_BUILTIN_PCMPISTRM128
)
8834 if (optimize
|| !target
8835 || GET_MODE (target
) != tmode1
8836 || !insn_data
[d
->icode
].operand
[1].predicate (target
, tmode1
))
8837 target
= gen_reg_rtx (tmode1
);
8839 scratch0
= gen_reg_rtx (tmode0
);
8841 pat
= GEN_FCN (d
->icode
) (scratch0
, target
, op0
, op1
, op2
);
8845 gcc_assert (d
->flag
);
8847 scratch0
= gen_reg_rtx (tmode0
);
8848 scratch1
= gen_reg_rtx (tmode1
);
8850 pat
= GEN_FCN (d
->icode
) (scratch0
, scratch1
, op0
, op1
, op2
);
8860 target
= gen_reg_rtx (SImode
);
8861 emit_move_insn (target
, const0_rtx
);
8862 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8865 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8866 gen_rtx_fmt_ee (EQ
, QImode
,
8867 gen_rtx_REG ((machine_mode
) d
->flag
,
8870 return SUBREG_REG (target
);
8876 /* Fixup modeless constants to fit required mode. */
8879 fixup_modeless_constant (rtx x
, machine_mode mode
)
8881 if (GET_MODE (x
) == VOIDmode
)
8882 x
= convert_to_mode (mode
, x
, 1);
8886 /* Subroutine of ix86_expand_builtin to take care of insns with
8887 variable number of operands. */
8890 ix86_expand_args_builtin (const struct builtin_description
*d
,
8891 tree exp
, rtx target
)
8893 rtx pat
, real_target
;
8894 unsigned int i
, nargs
;
8895 unsigned int nargs_constant
= 0;
8896 unsigned int mask_pos
= 0;
8903 bool second_arg_count
= false;
8904 enum insn_code icode
= d
->icode
;
8905 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
8906 machine_mode tmode
= insn_p
->operand
[0].mode
;
8907 machine_mode rmode
= VOIDmode
;
8909 enum rtx_code comparison
= d
->comparison
;
8911 switch ((enum ix86_builtin_func_type
) d
->flag
)
8913 case V2DF_FTYPE_V2DF_ROUND
:
8914 case V4DF_FTYPE_V4DF_ROUND
:
8915 case V8DF_FTYPE_V8DF_ROUND
:
8916 case V4SF_FTYPE_V4SF_ROUND
:
8917 case V8SF_FTYPE_V8SF_ROUND
:
8918 case V16SF_FTYPE_V16SF_ROUND
:
8919 case V4SI_FTYPE_V4SF_ROUND
:
8920 case V8SI_FTYPE_V8SF_ROUND
:
8921 case V16SI_FTYPE_V16SF_ROUND
:
8922 return ix86_expand_sse_round (d
, exp
, target
);
8923 case V4SI_FTYPE_V2DF_V2DF_ROUND
:
8924 case V8SI_FTYPE_V4DF_V4DF_ROUND
:
8925 case V16SI_FTYPE_V8DF_V8DF_ROUND
:
8926 return ix86_expand_sse_round_vec_pack_sfix (d
, exp
, target
);
8927 case INT_FTYPE_V8SF_V8SF_PTEST
:
8928 case INT_FTYPE_V4DI_V4DI_PTEST
:
8929 case INT_FTYPE_V4DF_V4DF_PTEST
:
8930 case INT_FTYPE_V4SF_V4SF_PTEST
:
8931 case INT_FTYPE_V2DI_V2DI_PTEST
:
8932 case INT_FTYPE_V2DF_V2DF_PTEST
:
8933 return ix86_expand_sse_ptest (d
, exp
, target
);
8934 case FLOAT128_FTYPE_FLOAT128
:
8935 case FLOAT_FTYPE_FLOAT
:
8937 case UINT_FTYPE_UINT
:
8938 case UINT16_FTYPE_UINT16
:
8939 case UINT64_FTYPE_INT
:
8940 case UINT64_FTYPE_UINT64
:
8941 case INT64_FTYPE_INT64
:
8942 case INT64_FTYPE_V4SF
:
8943 case INT64_FTYPE_V2DF
:
8944 case INT_FTYPE_V16QI
:
8945 case INT_FTYPE_V8QI
:
8946 case INT_FTYPE_V8SF
:
8947 case INT_FTYPE_V4DF
:
8948 case INT_FTYPE_V4SF
:
8949 case INT_FTYPE_V2DF
:
8950 case INT_FTYPE_V32QI
:
8951 case V16QI_FTYPE_V16QI
:
8952 case V8SI_FTYPE_V8SF
:
8953 case V8SI_FTYPE_V4SI
:
8954 case V8HI_FTYPE_V8HI
:
8955 case V8HI_FTYPE_V16QI
:
8956 case V8QI_FTYPE_V8QI
:
8957 case V8SF_FTYPE_V8SF
:
8958 case V8SF_FTYPE_V8SI
:
8959 case V8SF_FTYPE_V4SF
:
8960 case V8SF_FTYPE_V8HI
:
8961 case V4SI_FTYPE_V4SI
:
8962 case V4SI_FTYPE_V16QI
:
8963 case V4SI_FTYPE_V4SF
:
8964 case V4SI_FTYPE_V8SI
:
8965 case V4SI_FTYPE_V8HI
:
8966 case V4SI_FTYPE_V4DF
:
8967 case V4SI_FTYPE_V2DF
:
8968 case V4HI_FTYPE_V4HI
:
8969 case V4DF_FTYPE_V4DF
:
8970 case V4DF_FTYPE_V4SI
:
8971 case V4DF_FTYPE_V4SF
:
8972 case V4DF_FTYPE_V2DF
:
8973 case V4SF_FTYPE_V4SF
:
8974 case V4SF_FTYPE_V4SI
:
8975 case V4SF_FTYPE_V8SF
:
8976 case V4SF_FTYPE_V4DF
:
8977 case V4SF_FTYPE_V8HI
:
8978 case V4SF_FTYPE_V2DF
:
8979 case V2DI_FTYPE_V2DI
:
8980 case V2DI_FTYPE_V16QI
:
8981 case V2DI_FTYPE_V8HI
:
8982 case V2DI_FTYPE_V4SI
:
8983 case V2DF_FTYPE_V2DF
:
8984 case V2DF_FTYPE_V4SI
:
8985 case V2DF_FTYPE_V4DF
:
8986 case V2DF_FTYPE_V4SF
:
8987 case V2DF_FTYPE_V2SI
:
8988 case V2SI_FTYPE_V2SI
:
8989 case V2SI_FTYPE_V4SF
:
8990 case V2SI_FTYPE_V2SF
:
8991 case V2SI_FTYPE_V2DF
:
8992 case V2SF_FTYPE_V2SF
:
8993 case V2SF_FTYPE_V2SI
:
8994 case V32QI_FTYPE_V32QI
:
8995 case V32QI_FTYPE_V16QI
:
8996 case V16HI_FTYPE_V16HI
:
8997 case V16HI_FTYPE_V8HI
:
8998 case V8SI_FTYPE_V8SI
:
8999 case V16HI_FTYPE_V16QI
:
9000 case V8SI_FTYPE_V16QI
:
9001 case V4DI_FTYPE_V16QI
:
9002 case V8SI_FTYPE_V8HI
:
9003 case V4DI_FTYPE_V8HI
:
9004 case V4DI_FTYPE_V4SI
:
9005 case V4DI_FTYPE_V2DI
:
9012 case UHI_FTYPE_V16QI
:
9013 case USI_FTYPE_V32QI
:
9014 case UDI_FTYPE_V64QI
:
9015 case V16QI_FTYPE_UHI
:
9016 case V32QI_FTYPE_USI
:
9017 case V64QI_FTYPE_UDI
:
9018 case V8HI_FTYPE_UQI
:
9019 case V16HI_FTYPE_UHI
:
9020 case V32HI_FTYPE_USI
:
9021 case V4SI_FTYPE_UQI
:
9022 case V8SI_FTYPE_UQI
:
9023 case V4SI_FTYPE_UHI
:
9024 case V8SI_FTYPE_UHI
:
9025 case UQI_FTYPE_V8HI
:
9026 case UHI_FTYPE_V16HI
:
9027 case USI_FTYPE_V32HI
:
9028 case UQI_FTYPE_V4SI
:
9029 case UQI_FTYPE_V8SI
:
9030 case UHI_FTYPE_V16SI
:
9031 case UQI_FTYPE_V2DI
:
9032 case UQI_FTYPE_V4DI
:
9033 case UQI_FTYPE_V8DI
:
9034 case V16SI_FTYPE_UHI
:
9035 case V2DI_FTYPE_UQI
:
9036 case V4DI_FTYPE_UQI
:
9037 case V16SI_FTYPE_INT
:
9038 case V16SF_FTYPE_V8SF
:
9039 case V16SI_FTYPE_V8SI
:
9040 case V16SF_FTYPE_V4SF
:
9041 case V16SI_FTYPE_V4SI
:
9042 case V16SI_FTYPE_V16SF
:
9043 case V16SI_FTYPE_V16SI
:
9044 case V64QI_FTYPE_V64QI
:
9045 case V32HI_FTYPE_V32HI
:
9046 case V16SF_FTYPE_V16SF
:
9047 case V8DI_FTYPE_UQI
:
9048 case V8DI_FTYPE_V8DI
:
9049 case V8DF_FTYPE_V4DF
:
9050 case V8DF_FTYPE_V2DF
:
9051 case V8DF_FTYPE_V8DF
:
9052 case V4DI_FTYPE_V4DI
:
9053 case V16HI_FTYPE_V16SF
:
9054 case V8HI_FTYPE_V8SF
:
9055 case V8HI_FTYPE_V4SF
:
9058 case V4SF_FTYPE_V4SF_VEC_MERGE
:
9059 case V2DF_FTYPE_V2DF_VEC_MERGE
:
9060 return ix86_expand_unop_vec_merge_builtin (icode
, exp
, target
);
9061 case FLOAT128_FTYPE_FLOAT128_FLOAT128
:
9062 case V16QI_FTYPE_V16QI_V16QI
:
9063 case V16QI_FTYPE_V8HI_V8HI
:
9064 case V16SF_FTYPE_V16SF_V16SF
:
9065 case V8QI_FTYPE_V8QI_V8QI
:
9066 case V8QI_FTYPE_V4HI_V4HI
:
9067 case V8HI_FTYPE_V8HI_V8HI
:
9068 case V8HI_FTYPE_V16QI_V16QI
:
9069 case V8HI_FTYPE_V4SI_V4SI
:
9070 case V8SF_FTYPE_V8SF_V8SF
:
9071 case V8SF_FTYPE_V8SF_V8SI
:
9072 case V8DF_FTYPE_V8DF_V8DF
:
9073 case V4SI_FTYPE_V4SI_V4SI
:
9074 case V4SI_FTYPE_V8HI_V8HI
:
9075 case V4SI_FTYPE_V2DF_V2DF
:
9076 case V4HI_FTYPE_V4HI_V4HI
:
9077 case V4HI_FTYPE_V8QI_V8QI
:
9078 case V4HI_FTYPE_V2SI_V2SI
:
9079 case V4DF_FTYPE_V4DF_V4DF
:
9080 case V4DF_FTYPE_V4DF_V4DI
:
9081 case V4SF_FTYPE_V4SF_V4SF
:
9082 case V4SF_FTYPE_V4SF_V4SI
:
9083 case V4SF_FTYPE_V4SF_V2SI
:
9084 case V4SF_FTYPE_V4SF_V2DF
:
9085 case V4SF_FTYPE_V4SF_UINT
:
9086 case V4SF_FTYPE_V4SF_DI
:
9087 case V4SF_FTYPE_V4SF_SI
:
9088 case V2DI_FTYPE_V2DI_V2DI
:
9089 case V2DI_FTYPE_V16QI_V16QI
:
9090 case V2DI_FTYPE_V4SI_V4SI
:
9091 case V2DI_FTYPE_V2DI_V16QI
:
9092 case V2SI_FTYPE_V2SI_V2SI
:
9093 case V2SI_FTYPE_V4HI_V4HI
:
9094 case V2SI_FTYPE_V2SF_V2SF
:
9095 case V2DF_FTYPE_V2DF_V2DF
:
9096 case V2DF_FTYPE_V2DF_V4SF
:
9097 case V2DF_FTYPE_V2DF_V2DI
:
9098 case V2DF_FTYPE_V2DF_DI
:
9099 case V2DF_FTYPE_V2DF_SI
:
9100 case V2DF_FTYPE_V2DF_UINT
:
9101 case V2SF_FTYPE_V2SF_V2SF
:
9102 case V1DI_FTYPE_V1DI_V1DI
:
9103 case V1DI_FTYPE_V8QI_V8QI
:
9104 case V1DI_FTYPE_V2SI_V2SI
:
9105 case V32QI_FTYPE_V16HI_V16HI
:
9106 case V16HI_FTYPE_V8SI_V8SI
:
9107 case V64QI_FTYPE_V64QI_V64QI
:
9108 case V32QI_FTYPE_V32QI_V32QI
:
9109 case V16HI_FTYPE_V32QI_V32QI
:
9110 case V16HI_FTYPE_V16HI_V16HI
:
9111 case V8SI_FTYPE_V4DF_V4DF
:
9112 case V8SI_FTYPE_V8SI_V8SI
:
9113 case V8SI_FTYPE_V16HI_V16HI
:
9114 case V4DI_FTYPE_V4DI_V4DI
:
9115 case V4DI_FTYPE_V8SI_V8SI
:
9116 case V8DI_FTYPE_V64QI_V64QI
:
9117 if (comparison
== UNKNOWN
)
9118 return ix86_expand_binop_builtin (icode
, exp
, target
);
9121 case V4SF_FTYPE_V4SF_V4SF_SWAP
:
9122 case V2DF_FTYPE_V2DF_V2DF_SWAP
:
9123 gcc_assert (comparison
!= UNKNOWN
);
9127 case V16HI_FTYPE_V16HI_V8HI_COUNT
:
9128 case V16HI_FTYPE_V16HI_SI_COUNT
:
9129 case V8SI_FTYPE_V8SI_V4SI_COUNT
:
9130 case V8SI_FTYPE_V8SI_SI_COUNT
:
9131 case V4DI_FTYPE_V4DI_V2DI_COUNT
:
9132 case V4DI_FTYPE_V4DI_INT_COUNT
:
9133 case V8HI_FTYPE_V8HI_V8HI_COUNT
:
9134 case V8HI_FTYPE_V8HI_SI_COUNT
:
9135 case V4SI_FTYPE_V4SI_V4SI_COUNT
:
9136 case V4SI_FTYPE_V4SI_SI_COUNT
:
9137 case V4HI_FTYPE_V4HI_V4HI_COUNT
:
9138 case V4HI_FTYPE_V4HI_SI_COUNT
:
9139 case V2DI_FTYPE_V2DI_V2DI_COUNT
:
9140 case V2DI_FTYPE_V2DI_SI_COUNT
:
9141 case V2SI_FTYPE_V2SI_V2SI_COUNT
:
9142 case V2SI_FTYPE_V2SI_SI_COUNT
:
9143 case V1DI_FTYPE_V1DI_V1DI_COUNT
:
9144 case V1DI_FTYPE_V1DI_SI_COUNT
:
9146 second_arg_count
= true;
9148 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT
:
9149 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT
:
9150 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT
:
9151 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT
:
9152 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT
:
9153 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT
:
9154 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT
:
9155 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT
:
9156 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT
:
9157 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT
:
9158 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT
:
9159 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT
:
9160 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT
:
9161 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT
:
9162 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT
:
9163 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT
:
9164 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT
:
9165 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT
:
9167 second_arg_count
= true;
9169 case UINT64_FTYPE_UINT64_UINT64
:
9170 case UINT_FTYPE_UINT_UINT
:
9171 case UINT_FTYPE_UINT_USHORT
:
9172 case UINT_FTYPE_UINT_UCHAR
:
9173 case UINT16_FTYPE_UINT16_INT
:
9174 case UINT8_FTYPE_UINT8_INT
:
9175 case UQI_FTYPE_UQI_UQI
:
9176 case UHI_FTYPE_UHI_UHI
:
9177 case USI_FTYPE_USI_USI
:
9178 case UDI_FTYPE_UDI_UDI
:
9179 case V16SI_FTYPE_V8DF_V8DF
:
9180 case V32HI_FTYPE_V16SF_V16SF
:
9181 case V16HI_FTYPE_V8SF_V8SF
:
9182 case V8HI_FTYPE_V4SF_V4SF
:
9183 case V16HI_FTYPE_V16SF_UHI
:
9184 case V8HI_FTYPE_V8SF_UQI
:
9185 case V8HI_FTYPE_V4SF_UQI
:
9188 case V2DI_FTYPE_V2DI_INT_CONVERT
:
9193 case V4DI_FTYPE_V4DI_INT_CONVERT
:
9198 case V8DI_FTYPE_V8DI_INT_CONVERT
:
9203 case V8HI_FTYPE_V8HI_INT
:
9204 case V8HI_FTYPE_V8SF_INT
:
9205 case V16HI_FTYPE_V16SF_INT
:
9206 case V8HI_FTYPE_V4SF_INT
:
9207 case V8SF_FTYPE_V8SF_INT
:
9208 case V4SF_FTYPE_V16SF_INT
:
9209 case V16SF_FTYPE_V16SF_INT
:
9210 case V4SI_FTYPE_V4SI_INT
:
9211 case V4SI_FTYPE_V8SI_INT
:
9212 case V4HI_FTYPE_V4HI_INT
:
9213 case V4DF_FTYPE_V4DF_INT
:
9214 case V4DF_FTYPE_V8DF_INT
:
9215 case V4SF_FTYPE_V4SF_INT
:
9216 case V4SF_FTYPE_V8SF_INT
:
9217 case V2DI_FTYPE_V2DI_INT
:
9218 case V2DF_FTYPE_V2DF_INT
:
9219 case V2DF_FTYPE_V4DF_INT
:
9220 case V16HI_FTYPE_V16HI_INT
:
9221 case V8SI_FTYPE_V8SI_INT
:
9222 case V16SI_FTYPE_V16SI_INT
:
9223 case V4SI_FTYPE_V16SI_INT
:
9224 case V4DI_FTYPE_V4DI_INT
:
9225 case V2DI_FTYPE_V4DI_INT
:
9226 case V4DI_FTYPE_V8DI_INT
:
9227 case UQI_FTYPE_UQI_UQI_CONST
:
9228 case UHI_FTYPE_UHI_UQI
:
9229 case USI_FTYPE_USI_UQI
:
9230 case UDI_FTYPE_UDI_UQI
:
9234 case V16QI_FTYPE_V16QI_V16QI_V16QI
:
9235 case V8SF_FTYPE_V8SF_V8SF_V8SF
:
9236 case V4DF_FTYPE_V4DF_V4DF_V4DF
:
9237 case V4SF_FTYPE_V4SF_V4SF_V4SF
:
9238 case V2DF_FTYPE_V2DF_V2DF_V2DF
:
9239 case V32QI_FTYPE_V32QI_V32QI_V32QI
:
9240 case UHI_FTYPE_V16SI_V16SI_UHI
:
9241 case UQI_FTYPE_V8DI_V8DI_UQI
:
9242 case V16HI_FTYPE_V16SI_V16HI_UHI
:
9243 case V16QI_FTYPE_V16SI_V16QI_UHI
:
9244 case V16QI_FTYPE_V8DI_V16QI_UQI
:
9245 case V16SF_FTYPE_V16SF_V16SF_UHI
:
9246 case V16SF_FTYPE_V4SF_V16SF_UHI
:
9247 case V16SI_FTYPE_SI_V16SI_UHI
:
9248 case V16SI_FTYPE_V16HI_V16SI_UHI
:
9249 case V16SI_FTYPE_V16QI_V16SI_UHI
:
9250 case V8SF_FTYPE_V4SF_V8SF_UQI
:
9251 case V4DF_FTYPE_V2DF_V4DF_UQI
:
9252 case V8SI_FTYPE_V4SI_V8SI_UQI
:
9253 case V8SI_FTYPE_SI_V8SI_UQI
:
9254 case V4SI_FTYPE_V4SI_V4SI_UQI
:
9255 case V4SI_FTYPE_SI_V4SI_UQI
:
9256 case V4DI_FTYPE_V2DI_V4DI_UQI
:
9257 case V4DI_FTYPE_DI_V4DI_UQI
:
9258 case V2DI_FTYPE_V2DI_V2DI_UQI
:
9259 case V2DI_FTYPE_DI_V2DI_UQI
:
9260 case V64QI_FTYPE_V64QI_V64QI_UDI
:
9261 case V64QI_FTYPE_V16QI_V64QI_UDI
:
9262 case V64QI_FTYPE_QI_V64QI_UDI
:
9263 case V32QI_FTYPE_V32QI_V32QI_USI
:
9264 case V32QI_FTYPE_V16QI_V32QI_USI
:
9265 case V32QI_FTYPE_QI_V32QI_USI
:
9266 case V16QI_FTYPE_V16QI_V16QI_UHI
:
9267 case V16QI_FTYPE_QI_V16QI_UHI
:
9268 case V32HI_FTYPE_V8HI_V32HI_USI
:
9269 case V32HI_FTYPE_HI_V32HI_USI
:
9270 case V16HI_FTYPE_V8HI_V16HI_UHI
:
9271 case V16HI_FTYPE_HI_V16HI_UHI
:
9272 case V8HI_FTYPE_V8HI_V8HI_UQI
:
9273 case V8HI_FTYPE_HI_V8HI_UQI
:
9274 case V8SF_FTYPE_V8HI_V8SF_UQI
:
9275 case V4SF_FTYPE_V8HI_V4SF_UQI
:
9276 case V8SI_FTYPE_V8SF_V8SI_UQI
:
9277 case V4SI_FTYPE_V4SF_V4SI_UQI
:
9278 case V4DI_FTYPE_V4SF_V4DI_UQI
:
9279 case V2DI_FTYPE_V4SF_V2DI_UQI
:
9280 case V4SF_FTYPE_V4DI_V4SF_UQI
:
9281 case V4SF_FTYPE_V2DI_V4SF_UQI
:
9282 case V4DF_FTYPE_V4DI_V4DF_UQI
:
9283 case V2DF_FTYPE_V2DI_V2DF_UQI
:
9284 case V16QI_FTYPE_V8HI_V16QI_UQI
:
9285 case V16QI_FTYPE_V16HI_V16QI_UHI
:
9286 case V16QI_FTYPE_V4SI_V16QI_UQI
:
9287 case V16QI_FTYPE_V8SI_V16QI_UQI
:
9288 case V8HI_FTYPE_V4SI_V8HI_UQI
:
9289 case V8HI_FTYPE_V8SI_V8HI_UQI
:
9290 case V16QI_FTYPE_V2DI_V16QI_UQI
:
9291 case V16QI_FTYPE_V4DI_V16QI_UQI
:
9292 case V8HI_FTYPE_V2DI_V8HI_UQI
:
9293 case V8HI_FTYPE_V4DI_V8HI_UQI
:
9294 case V4SI_FTYPE_V2DI_V4SI_UQI
:
9295 case V4SI_FTYPE_V4DI_V4SI_UQI
:
9296 case V32QI_FTYPE_V32HI_V32QI_USI
:
9297 case UHI_FTYPE_V16QI_V16QI_UHI
:
9298 case USI_FTYPE_V32QI_V32QI_USI
:
9299 case UDI_FTYPE_V64QI_V64QI_UDI
:
9300 case UQI_FTYPE_V8HI_V8HI_UQI
:
9301 case UHI_FTYPE_V16HI_V16HI_UHI
:
9302 case USI_FTYPE_V32HI_V32HI_USI
:
9303 case UQI_FTYPE_V4SI_V4SI_UQI
:
9304 case UQI_FTYPE_V8SI_V8SI_UQI
:
9305 case UQI_FTYPE_V2DI_V2DI_UQI
:
9306 case UQI_FTYPE_V4DI_V4DI_UQI
:
9307 case V4SF_FTYPE_V2DF_V4SF_UQI
:
9308 case V4SF_FTYPE_V4DF_V4SF_UQI
:
9309 case V16SI_FTYPE_V16SI_V16SI_UHI
:
9310 case V16SI_FTYPE_V4SI_V16SI_UHI
:
9311 case V2DI_FTYPE_V4SI_V2DI_UQI
:
9312 case V2DI_FTYPE_V8HI_V2DI_UQI
:
9313 case V2DI_FTYPE_V16QI_V2DI_UQI
:
9314 case V4DI_FTYPE_V4DI_V4DI_UQI
:
9315 case V4DI_FTYPE_V4SI_V4DI_UQI
:
9316 case V4DI_FTYPE_V8HI_V4DI_UQI
:
9317 case V4DI_FTYPE_V16QI_V4DI_UQI
:
9318 case V4DI_FTYPE_V4DF_V4DI_UQI
:
9319 case V2DI_FTYPE_V2DF_V2DI_UQI
:
9320 case V4SI_FTYPE_V4DF_V4SI_UQI
:
9321 case V4SI_FTYPE_V2DF_V4SI_UQI
:
9322 case V4SI_FTYPE_V8HI_V4SI_UQI
:
9323 case V4SI_FTYPE_V16QI_V4SI_UQI
:
9324 case V4DI_FTYPE_V4DI_V4DI_V4DI
:
9325 case V8DF_FTYPE_V2DF_V8DF_UQI
:
9326 case V8DF_FTYPE_V4DF_V8DF_UQI
:
9327 case V8DF_FTYPE_V8DF_V8DF_UQI
:
9328 case V8SF_FTYPE_V8SF_V8SF_UQI
:
9329 case V8SF_FTYPE_V8SI_V8SF_UQI
:
9330 case V4DF_FTYPE_V4DF_V4DF_UQI
:
9331 case V4SF_FTYPE_V4SF_V4SF_UQI
:
9332 case V2DF_FTYPE_V2DF_V2DF_UQI
:
9333 case V2DF_FTYPE_V4SF_V2DF_UQI
:
9334 case V2DF_FTYPE_V4SI_V2DF_UQI
:
9335 case V4SF_FTYPE_V4SI_V4SF_UQI
:
9336 case V4DF_FTYPE_V4SF_V4DF_UQI
:
9337 case V4DF_FTYPE_V4SI_V4DF_UQI
:
9338 case V8SI_FTYPE_V8SI_V8SI_UQI
:
9339 case V8SI_FTYPE_V8HI_V8SI_UQI
:
9340 case V8SI_FTYPE_V16QI_V8SI_UQI
:
9341 case V8DF_FTYPE_V8SI_V8DF_UQI
:
9342 case V8DI_FTYPE_DI_V8DI_UQI
:
9343 case V16SF_FTYPE_V8SF_V16SF_UHI
:
9344 case V16SI_FTYPE_V8SI_V16SI_UHI
:
9345 case V16HI_FTYPE_V16HI_V16HI_UHI
:
9346 case V8HI_FTYPE_V16QI_V8HI_UQI
:
9347 case V16HI_FTYPE_V16QI_V16HI_UHI
:
9348 case V32HI_FTYPE_V32HI_V32HI_USI
:
9349 case V32HI_FTYPE_V32QI_V32HI_USI
:
9350 case V8DI_FTYPE_V16QI_V8DI_UQI
:
9351 case V8DI_FTYPE_V2DI_V8DI_UQI
:
9352 case V8DI_FTYPE_V4DI_V8DI_UQI
:
9353 case V8DI_FTYPE_V8DI_V8DI_UQI
:
9354 case V8DI_FTYPE_V8HI_V8DI_UQI
:
9355 case V8DI_FTYPE_V8SI_V8DI_UQI
:
9356 case V8HI_FTYPE_V8DI_V8HI_UQI
:
9357 case V8SI_FTYPE_V8DI_V8SI_UQI
:
9358 case V4SI_FTYPE_V4SI_V4SI_V4SI
:
9359 case V16SI_FTYPE_V16SI_V16SI_V16SI
:
9360 case V8DI_FTYPE_V8DI_V8DI_V8DI
:
9361 case V32HI_FTYPE_V32HI_V32HI_V32HI
:
9362 case V2DI_FTYPE_V2DI_V2DI_V2DI
:
9363 case V16HI_FTYPE_V16HI_V16HI_V16HI
:
9364 case V8SI_FTYPE_V8SI_V8SI_V8SI
:
9365 case V8HI_FTYPE_V8HI_V8HI_V8HI
:
9366 case V32HI_FTYPE_V16SF_V16SF_USI
:
9367 case V16HI_FTYPE_V8SF_V8SF_UHI
:
9368 case V8HI_FTYPE_V4SF_V4SF_UQI
:
9369 case V16HI_FTYPE_V16SF_V16HI_UHI
:
9370 case V8HI_FTYPE_V8SF_V8HI_UQI
:
9371 case V8HI_FTYPE_V4SF_V8HI_UQI
:
9372 case V16SF_FTYPE_V16SF_V32HI_V32HI
:
9373 case V8SF_FTYPE_V8SF_V16HI_V16HI
:
9374 case V4SF_FTYPE_V4SF_V8HI_V8HI
:
9377 case V32QI_FTYPE_V32QI_V32QI_INT
:
9378 case V16HI_FTYPE_V16HI_V16HI_INT
:
9379 case V16QI_FTYPE_V16QI_V16QI_INT
:
9380 case V4DI_FTYPE_V4DI_V4DI_INT
:
9381 case V8HI_FTYPE_V8HI_V8HI_INT
:
9382 case V8SI_FTYPE_V8SI_V8SI_INT
:
9383 case V8SI_FTYPE_V8SI_V4SI_INT
:
9384 case V8SF_FTYPE_V8SF_V8SF_INT
:
9385 case V8SF_FTYPE_V8SF_V4SF_INT
:
9386 case V4SI_FTYPE_V4SI_V4SI_INT
:
9387 case V4DF_FTYPE_V4DF_V4DF_INT
:
9388 case V16SF_FTYPE_V16SF_V16SF_INT
:
9389 case V16SF_FTYPE_V16SF_V4SF_INT
:
9390 case V16SI_FTYPE_V16SI_V4SI_INT
:
9391 case V4DF_FTYPE_V4DF_V2DF_INT
:
9392 case V4SF_FTYPE_V4SF_V4SF_INT
:
9393 case V2DI_FTYPE_V2DI_V2DI_INT
:
9394 case V4DI_FTYPE_V4DI_V2DI_INT
:
9395 case V2DF_FTYPE_V2DF_V2DF_INT
:
9396 case UQI_FTYPE_V8DI_V8UDI_INT
:
9397 case UQI_FTYPE_V8DF_V8DF_INT
:
9398 case UQI_FTYPE_V2DF_V2DF_INT
:
9399 case UQI_FTYPE_V4SF_V4SF_INT
:
9400 case UHI_FTYPE_V16SI_V16SI_INT
:
9401 case UHI_FTYPE_V16SF_V16SF_INT
:
9402 case V64QI_FTYPE_V64QI_V64QI_INT
:
9403 case V32HI_FTYPE_V32HI_V32HI_INT
:
9404 case V16SI_FTYPE_V16SI_V16SI_INT
:
9405 case V8DI_FTYPE_V8DI_V8DI_INT
:
9409 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT
:
9414 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT
:
9419 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT
:
9424 case V2DI_FTYPE_V2DI_UINT_UINT
:
9428 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT
:
9433 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT
:
9439 case QI_FTYPE_V8DF_INT_UQI
:
9440 case QI_FTYPE_V4DF_INT_UQI
:
9441 case QI_FTYPE_V2DF_INT_UQI
:
9442 case HI_FTYPE_V16SF_INT_UHI
:
9443 case QI_FTYPE_V8SF_INT_UQI
:
9444 case QI_FTYPE_V4SF_INT_UQI
:
9445 case V4SI_FTYPE_V4SI_V4SI_UHI
:
9446 case V8SI_FTYPE_V8SI_V8SI_UHI
:
9451 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT
:
9457 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT
:
9463 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI
:
9464 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI
:
9465 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI
:
9466 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI
:
9467 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI
:
9468 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI
:
9469 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI
:
9470 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI
:
9471 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI
:
9472 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI
:
9473 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI
:
9474 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI
:
9475 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI
:
9476 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI
:
9477 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI
:
9478 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI
:
9479 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI
:
9480 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI
:
9481 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI
:
9482 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI
:
9483 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI
:
9484 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI
:
9485 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI
:
9486 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI
:
9487 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI
:
9488 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI
:
9489 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI
:
9490 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI
:
9491 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI
:
9492 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI
:
9493 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI
:
9494 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI
:
9495 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI
:
9496 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI
:
9497 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI
:
9498 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI
:
9499 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI
:
9500 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI
:
9501 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI
:
9502 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI
:
9503 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI
:
9504 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI
:
9505 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI
:
9506 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI
:
9507 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI
:
9508 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI
:
9509 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI
:
9510 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI
:
9511 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI
:
9512 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI
:
9513 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI
:
9514 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI
:
9515 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI
:
9516 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI
:
9519 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT
:
9520 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT
:
9521 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT
:
9522 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT
:
9523 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT
:
9527 case UQI_FTYPE_V4DI_V4DI_INT_UQI
:
9528 case UQI_FTYPE_V8SI_V8SI_INT_UQI
:
9529 case QI_FTYPE_V4DF_V4DF_INT_UQI
:
9530 case QI_FTYPE_V8SF_V8SF_INT_UQI
:
9531 case UQI_FTYPE_V2DI_V2DI_INT_UQI
:
9532 case UQI_FTYPE_V4SI_V4SI_INT_UQI
:
9533 case UQI_FTYPE_V2DF_V2DF_INT_UQI
:
9534 case UQI_FTYPE_V4SF_V4SF_INT_UQI
:
9535 case UDI_FTYPE_V64QI_V64QI_INT_UDI
:
9536 case USI_FTYPE_V32QI_V32QI_INT_USI
:
9537 case UHI_FTYPE_V16QI_V16QI_INT_UHI
:
9538 case USI_FTYPE_V32HI_V32HI_INT_USI
:
9539 case UHI_FTYPE_V16HI_V16HI_INT_UHI
:
9540 case UQI_FTYPE_V8HI_V8HI_INT_UQI
:
9545 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT
:
9549 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED
:
9550 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG
:
9551 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI
:
9552 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI
:
9553 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI
:
9556 case UQI_FTYPE_V8DI_V8DI_INT_UQI
:
9557 case UHI_FTYPE_V16SI_V16SI_INT_UHI
:
9562 case V8SF_FTYPE_V8SF_INT_V8SF_UQI
:
9563 case V4SF_FTYPE_V4SF_INT_V4SF_UQI
:
9564 case V2DF_FTYPE_V4DF_INT_V2DF_UQI
:
9565 case V2DI_FTYPE_V4DI_INT_V2DI_UQI
:
9566 case V8SF_FTYPE_V16SF_INT_V8SF_UQI
:
9567 case V8SI_FTYPE_V16SI_INT_V8SI_UQI
:
9568 case V2DF_FTYPE_V8DF_INT_V2DF_UQI
:
9569 case V2DI_FTYPE_V8DI_INT_V2DI_UQI
:
9570 case V4SF_FTYPE_V8SF_INT_V4SF_UQI
:
9571 case V4SI_FTYPE_V8SI_INT_V4SI_UQI
:
9572 case V8HI_FTYPE_V8SF_INT_V8HI_UQI
:
9573 case V8HI_FTYPE_V4SF_INT_V8HI_UQI
:
9574 case V32HI_FTYPE_V32HI_INT_V32HI_USI
:
9575 case V16HI_FTYPE_V16HI_INT_V16HI_UHI
:
9576 case V8HI_FTYPE_V8HI_INT_V8HI_UQI
:
9577 case V4DI_FTYPE_V4DI_INT_V4DI_UQI
:
9578 case V2DI_FTYPE_V2DI_INT_V2DI_UQI
:
9579 case V8SI_FTYPE_V8SI_INT_V8SI_UQI
:
9580 case V4SI_FTYPE_V4SI_INT_V4SI_UQI
:
9581 case V4DF_FTYPE_V4DF_INT_V4DF_UQI
:
9582 case V2DF_FTYPE_V2DF_INT_V2DF_UQI
:
9583 case V8DF_FTYPE_V8DF_INT_V8DF_UQI
:
9584 case V16SF_FTYPE_V16SF_INT_V16SF_UHI
:
9585 case V16HI_FTYPE_V16SF_INT_V16HI_UHI
:
9586 case V16SI_FTYPE_V16SI_INT_V16SI_UHI
:
9587 case V4SI_FTYPE_V16SI_INT_V4SI_UQI
:
9588 case V4DI_FTYPE_V8DI_INT_V4DI_UQI
:
9589 case V4DF_FTYPE_V8DF_INT_V4DF_UQI
:
9590 case V4SF_FTYPE_V16SF_INT_V4SF_UQI
:
9591 case V8DI_FTYPE_V8DI_INT_V8DI_UQI
:
9596 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI
:
9597 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI
:
9598 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI
:
9599 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI
:
9600 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI
:
9601 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI
:
9602 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI
:
9603 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI
:
9604 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI
:
9605 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI
:
9606 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI
:
9607 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI
:
9608 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI
:
9609 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI
:
9610 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI
:
9611 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI
:
9612 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI
:
9613 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI
:
9614 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI
:
9615 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI
:
9616 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI
:
9617 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI
:
9618 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI
:
9619 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI
:
9620 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI
:
9621 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI
:
9622 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI
:
9627 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI
:
9628 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI
:
9629 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI
:
9630 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI
:
9631 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI
:
9632 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI
:
9633 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI
:
9634 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI
:
9635 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI
:
9636 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI
:
9641 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI
:
9642 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI
:
9643 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI
:
9644 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT
:
9645 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT
:
9646 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT
:
9647 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT
:
9648 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT
:
9649 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT
:
9650 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT
:
9651 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT
:
9652 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT
:
9662 gcc_assert (nargs
<= ARRAY_SIZE (args
));
9664 if (comparison
!= UNKNOWN
)
9666 gcc_assert (nargs
== 2);
9667 return ix86_expand_sse_compare (d
, exp
, target
, swap
);
9670 if (rmode
== VOIDmode
|| rmode
== tmode
)
9674 || GET_MODE (target
) != tmode
9675 || !insn_p
->operand
[0].predicate (target
, tmode
))
9676 target
= gen_reg_rtx (tmode
);
9677 else if (memory_operand (target
, tmode
))
9679 real_target
= target
;
9683 real_target
= gen_reg_rtx (tmode
);
9684 target
= lowpart_subreg (rmode
, real_target
, tmode
);
9687 for (i
= 0; i
< nargs
; i
++)
9689 tree arg
= CALL_EXPR_ARG (exp
, i
);
9690 rtx op
= expand_normal (arg
);
9691 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
9692 bool match
= insn_p
->operand
[i
+ 1].predicate (op
, mode
);
9694 if (second_arg_count
&& i
== 1)
9696 /* SIMD shift insns take either an 8-bit immediate or
9697 register as count. But builtin functions take int as
9698 count. If count doesn't match, we put it in register.
9699 The instructions are using 64-bit count, if op is just
9700 32-bit, zero-extend it, as negative shift counts
9701 are undefined behavior and zero-extension is more
9705 if (SCALAR_INT_MODE_P (GET_MODE (op
)))
9706 op
= convert_modes (mode
, GET_MODE (op
), op
, 1);
9708 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
9709 if (!insn_p
->operand
[i
+ 1].predicate (op
, mode
))
9710 op
= copy_to_reg (op
);
9713 else if ((mask_pos
&& (nargs
- i
- mask_pos
) == nargs_constant
) ||
9714 (!mask_pos
&& (nargs
- i
) <= nargs_constant
))
9719 case CODE_FOR_avx_vinsertf128v4di
:
9720 case CODE_FOR_avx_vextractf128v4di
:
9721 error ("the last argument must be an 1-bit immediate");
9724 case CODE_FOR_avx512f_cmpv8di3_mask
:
9725 case CODE_FOR_avx512f_cmpv16si3_mask
:
9726 case CODE_FOR_avx512f_ucmpv8di3_mask
:
9727 case CODE_FOR_avx512f_ucmpv16si3_mask
:
9728 case CODE_FOR_avx512vl_cmpv4di3_mask
:
9729 case CODE_FOR_avx512vl_cmpv8si3_mask
:
9730 case CODE_FOR_avx512vl_ucmpv4di3_mask
:
9731 case CODE_FOR_avx512vl_ucmpv8si3_mask
:
9732 case CODE_FOR_avx512vl_cmpv2di3_mask
:
9733 case CODE_FOR_avx512vl_cmpv4si3_mask
:
9734 case CODE_FOR_avx512vl_ucmpv2di3_mask
:
9735 case CODE_FOR_avx512vl_ucmpv4si3_mask
:
9736 error ("the last argument must be a 3-bit immediate");
9739 case CODE_FOR_sse4_1_roundsd
:
9740 case CODE_FOR_sse4_1_roundss
:
9742 case CODE_FOR_sse4_1_roundpd
:
9743 case CODE_FOR_sse4_1_roundps
:
9744 case CODE_FOR_avx_roundpd256
:
9745 case CODE_FOR_avx_roundps256
:
9747 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix
:
9748 case CODE_FOR_sse4_1_roundps_sfix
:
9749 case CODE_FOR_avx_roundpd_vec_pack_sfix256
:
9750 case CODE_FOR_avx_roundps_sfix256
:
9752 case CODE_FOR_sse4_1_blendps
:
9753 case CODE_FOR_avx_blendpd256
:
9754 case CODE_FOR_avx_vpermilv4df
:
9755 case CODE_FOR_avx_vpermilv4df_mask
:
9756 case CODE_FOR_avx512f_getmantv8df_mask
:
9757 case CODE_FOR_avx512f_getmantv16sf_mask
:
9758 case CODE_FOR_avx512vl_getmantv8sf_mask
:
9759 case CODE_FOR_avx512vl_getmantv4df_mask
:
9760 case CODE_FOR_avx512vl_getmantv4sf_mask
:
9761 case CODE_FOR_avx512vl_getmantv2df_mask
:
9762 case CODE_FOR_avx512dq_rangepv8df_mask_round
:
9763 case CODE_FOR_avx512dq_rangepv16sf_mask_round
:
9764 case CODE_FOR_avx512dq_rangepv4df_mask
:
9765 case CODE_FOR_avx512dq_rangepv8sf_mask
:
9766 case CODE_FOR_avx512dq_rangepv2df_mask
:
9767 case CODE_FOR_avx512dq_rangepv4sf_mask
:
9768 case CODE_FOR_avx_shufpd256_mask
:
9769 error ("the last argument must be a 4-bit immediate");
9772 case CODE_FOR_sha1rnds4
:
9773 case CODE_FOR_sse4_1_blendpd
:
9774 case CODE_FOR_avx_vpermilv2df
:
9775 case CODE_FOR_avx_vpermilv2df_mask
:
9776 case CODE_FOR_xop_vpermil2v2df3
:
9777 case CODE_FOR_xop_vpermil2v4sf3
:
9778 case CODE_FOR_xop_vpermil2v4df3
:
9779 case CODE_FOR_xop_vpermil2v8sf3
:
9780 case CODE_FOR_avx512f_vinsertf32x4_mask
:
9781 case CODE_FOR_avx512f_vinserti32x4_mask
:
9782 case CODE_FOR_avx512f_vextractf32x4_mask
:
9783 case CODE_FOR_avx512f_vextracti32x4_mask
:
9784 case CODE_FOR_sse2_shufpd
:
9785 case CODE_FOR_sse2_shufpd_mask
:
9786 case CODE_FOR_avx512dq_shuf_f64x2_mask
:
9787 case CODE_FOR_avx512dq_shuf_i64x2_mask
:
9788 case CODE_FOR_avx512vl_shuf_i32x4_mask
:
9789 case CODE_FOR_avx512vl_shuf_f32x4_mask
:
9790 error ("the last argument must be a 2-bit immediate");
9793 case CODE_FOR_avx_vextractf128v4df
:
9794 case CODE_FOR_avx_vextractf128v8sf
:
9795 case CODE_FOR_avx_vextractf128v8si
:
9796 case CODE_FOR_avx_vinsertf128v4df
:
9797 case CODE_FOR_avx_vinsertf128v8sf
:
9798 case CODE_FOR_avx_vinsertf128v8si
:
9799 case CODE_FOR_avx512f_vinsertf64x4_mask
:
9800 case CODE_FOR_avx512f_vinserti64x4_mask
:
9801 case CODE_FOR_avx512f_vextractf64x4_mask
:
9802 case CODE_FOR_avx512f_vextracti64x4_mask
:
9803 case CODE_FOR_avx512dq_vinsertf32x8_mask
:
9804 case CODE_FOR_avx512dq_vinserti32x8_mask
:
9805 case CODE_FOR_avx512vl_vinsertv4df
:
9806 case CODE_FOR_avx512vl_vinsertv4di
:
9807 case CODE_FOR_avx512vl_vinsertv8sf
:
9808 case CODE_FOR_avx512vl_vinsertv8si
:
9809 error ("the last argument must be a 1-bit immediate");
9812 case CODE_FOR_avx_vmcmpv2df3
:
9813 case CODE_FOR_avx_vmcmpv4sf3
:
9814 case CODE_FOR_avx_cmpv2df3
:
9815 case CODE_FOR_avx_cmpv4sf3
:
9816 case CODE_FOR_avx_cmpv4df3
:
9817 case CODE_FOR_avx_cmpv8sf3
:
9818 case CODE_FOR_avx512f_cmpv8df3_mask
:
9819 case CODE_FOR_avx512f_cmpv16sf3_mask
:
9820 case CODE_FOR_avx512f_vmcmpv2df3_mask
:
9821 case CODE_FOR_avx512f_vmcmpv4sf3_mask
:
9822 error ("the last argument must be a 5-bit immediate");
9826 switch (nargs_constant
)
9829 if ((mask_pos
&& (nargs
- i
- mask_pos
) == nargs_constant
) ||
9830 (!mask_pos
&& (nargs
- i
) == nargs_constant
))
9832 error ("the next to last argument must be an 8-bit immediate");
9837 error ("the last argument must be an 8-bit immediate");
9847 if (VECTOR_MODE_P (mode
))
9848 op
= safe_vector_operand (op
, mode
);
9850 /* If we aren't optimizing, only allow one memory operand to
9852 if (memory_operand (op
, mode
))
9855 op
= fixup_modeless_constant (op
, mode
);
9857 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
9859 if (optimize
|| !match
|| num_memory
> 1)
9860 op
= copy_to_mode_reg (mode
, op
);
9864 op
= copy_to_reg (op
);
9865 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
9870 args
[i
].mode
= mode
;
9876 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
);
9879 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
, args
[1].op
);
9882 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
, args
[1].op
,
9886 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
, args
[1].op
,
9887 args
[2].op
, args
[3].op
);
9890 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
, args
[1].op
,
9891 args
[2].op
, args
[3].op
, args
[4].op
);
9894 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
, args
[1].op
,
9895 args
[2].op
, args
[3].op
, args
[4].op
,
9909 /* Transform pattern of following layout:
9911 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
9917 ix86_erase_embedded_rounding (rtx pat
)
9919 if (GET_CODE (pat
) == INSN
)
9920 pat
= PATTERN (pat
);
9922 gcc_assert (GET_CODE (pat
) == SET
);
9923 rtx src
= SET_SRC (pat
);
9924 gcc_assert (XVECLEN (src
, 0) == 2);
9925 rtx p0
= XVECEXP (src
, 0, 0);
9926 gcc_assert (GET_CODE (src
) == UNSPEC
9927 && XINT (src
, 1) == UNSPEC_EMBEDDED_ROUNDING
);
9928 rtx res
= gen_rtx_SET (SET_DEST (pat
), p0
);
9932 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
9935 ix86_expand_sse_comi_round (const struct builtin_description
*d
,
9936 tree exp
, rtx target
)
9939 tree arg0
= CALL_EXPR_ARG (exp
, 0);
9940 tree arg1
= CALL_EXPR_ARG (exp
, 1);
9941 tree arg2
= CALL_EXPR_ARG (exp
, 2);
9942 tree arg3
= CALL_EXPR_ARG (exp
, 3);
9943 rtx op0
= expand_normal (arg0
);
9944 rtx op1
= expand_normal (arg1
);
9945 rtx op2
= expand_normal (arg2
);
9946 rtx op3
= expand_normal (arg3
);
9947 enum insn_code icode
= d
->icode
;
9948 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
9949 machine_mode mode0
= insn_p
->operand
[0].mode
;
9950 machine_mode mode1
= insn_p
->operand
[1].mode
;
9952 /* See avxintrin.h for values. */
9953 static const enum rtx_code comparisons
[32] =
9955 EQ
, LT
, LE
, UNORDERED
, NE
, UNGE
, UNGT
, ORDERED
,
9956 UNEQ
, UNLT
, UNLE
, UNORDERED
, LTGT
, GE
, GT
, ORDERED
,
9957 EQ
, LT
, LE
, UNORDERED
, NE
, UNGE
, UNGT
, ORDERED
,
9958 UNEQ
, UNLT
, UNLE
, UNORDERED
, LTGT
, GE
, GT
, ORDERED
9960 static const bool ordereds
[32] =
9962 true, true, true, false, false, false, false, true,
9963 false, false, false, true, true, true, true, false,
9964 true, true, true, false, false, false, false, true,
9965 false, false, false, true, true, true, true, false
9967 static const bool non_signalings
[32] =
9969 true, false, false, true, true, false, false, true,
9970 true, false, false, true, true, false, false, true,
9971 false, true, true, false, false, true, true, false,
9972 false, true, true, false, false, true, true, false
9975 if (!CONST_INT_P (op2
))
9977 error ("the third argument must be comparison constant");
9980 if (INTVAL (op2
) < 0 || INTVAL (op2
) >= 32)
9982 error ("incorrect comparison mode");
9986 if (!insn_p
->operand
[2].predicate (op3
, SImode
))
9988 error ("incorrect rounding operand");
9992 if (VECTOR_MODE_P (mode0
))
9993 op0
= safe_vector_operand (op0
, mode0
);
9994 if (VECTOR_MODE_P (mode1
))
9995 op1
= safe_vector_operand (op1
, mode1
);
9997 enum rtx_code comparison
= comparisons
[INTVAL (op2
)];
9998 bool ordered
= ordereds
[INTVAL (op2
)];
9999 bool non_signaling
= non_signalings
[INTVAL (op2
)];
10000 rtx const_val
= const0_rtx
;
10002 bool check_unordered
= false;
10003 machine_mode mode
= CCFPmode
;
10004 switch (comparison
)
10009 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
10010 if (!non_signaling
)
10016 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
10026 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
10033 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
10034 if (!non_signaling
)
10041 case LE
: /* -> GE */
10042 case LT
: /* -> GT */
10043 case UNGE
: /* -> UNLE */
10044 case UNGT
: /* -> UNLT */
10045 std::swap (op0
, op1
);
10046 comparison
= swap_condition (comparison
);
10054 /* These are supported by CCFPmode. NB: Use ordered/signaling
10055 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
10056 with NAN operands. */
10057 if (ordered
== non_signaling
)
10058 ordered
= !ordered
;
10061 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10062 _CMP_EQ_OQ/_CMP_EQ_OS. */
10063 check_unordered
= true;
10067 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10068 _CMP_NEQ_UQ/_CMP_NEQ_US. */
10069 gcc_assert (!ordered
);
10070 check_unordered
= true;
10072 const_val
= const1_rtx
;
10075 gcc_unreachable ();
10078 target
= gen_reg_rtx (SImode
);
10079 emit_move_insn (target
, const_val
);
10080 target
= gen_rtx_SUBREG (QImode
, target
, 0);
10082 if ((optimize
&& !register_operand (op0
, mode0
))
10083 || !insn_p
->operand
[0].predicate (op0
, mode0
))
10084 op0
= copy_to_mode_reg (mode0
, op0
);
10085 if ((optimize
&& !register_operand (op1
, mode1
))
10086 || !insn_p
->operand
[1].predicate (op1
, mode1
))
10087 op1
= copy_to_mode_reg (mode1
, op1
);
10090 1. COMI: ordered and signaling.
10091 2. UCOMI: unordered and non-signaling.
10094 icode
= (icode
== CODE_FOR_sse_comi_round
10095 ? CODE_FOR_sse_ucomi_round
10096 : CODE_FOR_sse2_ucomi_round
);
10098 pat
= GEN_FCN (icode
) (op0
, op1
, op3
);
10102 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
10103 if (INTVAL (op3
) == NO_ROUND
)
10105 pat
= ix86_erase_embedded_rounding (pat
);
10109 set_dst
= SET_DEST (pat
);
10113 gcc_assert (GET_CODE (pat
) == SET
);
10114 set_dst
= SET_DEST (pat
);
10119 rtx_code_label
*label
= NULL
;
10121 /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
10122 with NAN operands. */
10123 if (check_unordered
)
10125 gcc_assert (comparison
== EQ
|| comparison
== NE
);
10127 rtx flag
= gen_rtx_REG (CCFPmode
, FLAGS_REG
);
10128 label
= gen_label_rtx ();
10129 rtx tmp
= gen_rtx_fmt_ee (UNORDERED
, VOIDmode
, flag
, const0_rtx
);
10130 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
10131 gen_rtx_LABEL_REF (VOIDmode
, label
),
10133 emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
10136 /* NB: Set CCFPmode and check a different CCmode which is in subset
10138 if (GET_MODE (set_dst
) != mode
)
10140 gcc_assert (mode
== CCAmode
|| mode
== CCCmode
10141 || mode
== CCOmode
|| mode
== CCPmode
10142 || mode
== CCSmode
|| mode
== CCZmode
);
10143 set_dst
= gen_rtx_REG (mode
, FLAGS_REG
);
10146 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
10147 gen_rtx_fmt_ee (comparison
, QImode
,
10152 emit_label (label
);
10154 return SUBREG_REG (target
);
10158 ix86_expand_round_builtin (const struct builtin_description
*d
,
10159 tree exp
, rtx target
)
10162 unsigned int i
, nargs
;
10168 enum insn_code icode
= d
->icode
;
10169 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
10170 machine_mode tmode
= insn_p
->operand
[0].mode
;
10171 unsigned int nargs_constant
= 0;
10172 unsigned int redundant_embed_rnd
= 0;
10174 switch ((enum ix86_builtin_func_type
) d
->flag
)
10176 case UINT64_FTYPE_V2DF_INT
:
10177 case UINT64_FTYPE_V4SF_INT
:
10178 case UINT_FTYPE_V2DF_INT
:
10179 case UINT_FTYPE_V4SF_INT
:
10180 case INT64_FTYPE_V2DF_INT
:
10181 case INT64_FTYPE_V4SF_INT
:
10182 case INT_FTYPE_V2DF_INT
:
10183 case INT_FTYPE_V4SF_INT
:
10186 case V4SF_FTYPE_V4SF_UINT_INT
:
10187 case V4SF_FTYPE_V4SF_UINT64_INT
:
10188 case V2DF_FTYPE_V2DF_UINT64_INT
:
10189 case V4SF_FTYPE_V4SF_INT_INT
:
10190 case V4SF_FTYPE_V4SF_INT64_INT
:
10191 case V2DF_FTYPE_V2DF_INT64_INT
:
10192 case V4SF_FTYPE_V4SF_V4SF_INT
:
10193 case V2DF_FTYPE_V2DF_V2DF_INT
:
10194 case V4SF_FTYPE_V4SF_V2DF_INT
:
10195 case V2DF_FTYPE_V2DF_V4SF_INT
:
10198 case V8SF_FTYPE_V8DF_V8SF_QI_INT
:
10199 case V8DF_FTYPE_V8DF_V8DF_QI_INT
:
10200 case V8SI_FTYPE_V8DF_V8SI_QI_INT
:
10201 case V8DI_FTYPE_V8DF_V8DI_QI_INT
:
10202 case V8SF_FTYPE_V8DI_V8SF_QI_INT
:
10203 case V8DF_FTYPE_V8DI_V8DF_QI_INT
:
10204 case V16SF_FTYPE_V16SF_V16SF_HI_INT
:
10205 case V8DI_FTYPE_V8SF_V8DI_QI_INT
:
10206 case V16SF_FTYPE_V16SI_V16SF_HI_INT
:
10207 case V16SI_FTYPE_V16SF_V16SI_HI_INT
:
10208 case V8DF_FTYPE_V8SF_V8DF_QI_INT
:
10209 case V16SF_FTYPE_V16HI_V16SF_HI_INT
:
10210 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT
:
10211 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT
:
10214 case V4SF_FTYPE_V4SF_V4SF_INT_INT
:
10215 case V2DF_FTYPE_V2DF_V2DF_INT_INT
:
10216 nargs_constant
= 2;
10219 case INT_FTYPE_V4SF_V4SF_INT_INT
:
10220 case INT_FTYPE_V2DF_V2DF_INT_INT
:
10221 return ix86_expand_sse_comi_round (d
, exp
, target
);
10222 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT
:
10223 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT
:
10224 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT
:
10225 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT
:
10226 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT
:
10227 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT
:
10228 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT
:
10229 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT
:
10232 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT
:
10233 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT
:
10234 nargs_constant
= 4;
10237 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT
:
10238 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT
:
10239 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT
:
10240 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT
:
10241 nargs_constant
= 3;
10244 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT
:
10245 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT
:
10246 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT
:
10247 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT
:
10248 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT
:
10249 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT
:
10251 nargs_constant
= 4;
10253 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT
:
10254 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT
:
10255 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT
:
10256 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT
:
10258 nargs_constant
= 3;
10261 gcc_unreachable ();
10263 gcc_assert (nargs
<= ARRAY_SIZE (args
));
10267 || GET_MODE (target
) != tmode
10268 || !insn_p
->operand
[0].predicate (target
, tmode
))
10269 target
= gen_reg_rtx (tmode
);
10271 for (i
= 0; i
< nargs
; i
++)
10273 tree arg
= CALL_EXPR_ARG (exp
, i
);
10274 rtx op
= expand_normal (arg
);
10275 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
10276 bool match
= insn_p
->operand
[i
+ 1].predicate (op
, mode
);
10278 if (i
== nargs
- nargs_constant
)
10284 case CODE_FOR_avx512f_getmantv8df_mask_round
:
10285 case CODE_FOR_avx512f_getmantv16sf_mask_round
:
10286 case CODE_FOR_avx512f_vgetmantv2df_round
:
10287 case CODE_FOR_avx512f_vgetmantv2df_mask_round
:
10288 case CODE_FOR_avx512f_vgetmantv4sf_round
:
10289 case CODE_FOR_avx512f_vgetmantv4sf_mask_round
:
10290 error ("the immediate argument must be a 4-bit immediate");
10292 case CODE_FOR_avx512f_cmpv8df3_mask_round
:
10293 case CODE_FOR_avx512f_cmpv16sf3_mask_round
:
10294 case CODE_FOR_avx512f_vmcmpv2df3_mask_round
:
10295 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round
:
10296 error ("the immediate argument must be a 5-bit immediate");
10299 error ("the immediate argument must be an 8-bit immediate");
10304 else if (i
== nargs
-1)
10306 if (!insn_p
->operand
[nargs
].predicate (op
, SImode
))
10308 error ("incorrect rounding operand");
10312 /* If there is no rounding use normal version of the pattern. */
10313 if (INTVAL (op
) == NO_ROUND
)
10314 redundant_embed_rnd
= 1;
10318 if (VECTOR_MODE_P (mode
))
10319 op
= safe_vector_operand (op
, mode
);
10321 op
= fixup_modeless_constant (op
, mode
);
10323 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
10325 if (optimize
|| !match
)
10326 op
= copy_to_mode_reg (mode
, op
);
10330 op
= copy_to_reg (op
);
10331 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
10336 args
[i
].mode
= mode
;
10342 pat
= GEN_FCN (icode
) (target
, args
[0].op
);
10345 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
);
10348 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
,
10352 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
,
10353 args
[2].op
, args
[3].op
);
10356 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
,
10357 args
[2].op
, args
[3].op
, args
[4].op
);
10360 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
,
10361 args
[2].op
, args
[3].op
, args
[4].op
,
10365 gcc_unreachable ();
10371 if (redundant_embed_rnd
)
10372 pat
= ix86_erase_embedded_rounding (pat
);
10378 /* Subroutine of ix86_expand_builtin to take care of special insns
10379 with variable number of operands. */
10382 ix86_expand_special_args_builtin (const struct builtin_description
*d
,
10383 tree exp
, rtx target
)
10387 unsigned int i
, nargs
, arg_adjust
, memory
;
10388 bool aligned_mem
= false;
10394 enum insn_code icode
= d
->icode
;
10395 bool last_arg_constant
= false;
10396 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
10397 machine_mode tmode
= insn_p
->operand
[0].mode
;
10398 enum { load
, store
} klass
;
10400 switch ((enum ix86_builtin_func_type
) d
->flag
)
10402 case VOID_FTYPE_VOID
:
10403 emit_insn (GEN_FCN (icode
) (target
));
10405 case VOID_FTYPE_UINT64
:
10406 case VOID_FTYPE_UNSIGNED
:
10412 case INT_FTYPE_VOID
:
10413 case USHORT_FTYPE_VOID
:
10414 case UINT64_FTYPE_VOID
:
10415 case UINT_FTYPE_VOID
:
10416 case UNSIGNED_FTYPE_VOID
:
10421 case UINT64_FTYPE_PUNSIGNED
:
10422 case V2DI_FTYPE_PV2DI
:
10423 case V4DI_FTYPE_PV4DI
:
10424 case V32QI_FTYPE_PCCHAR
:
10425 case V16QI_FTYPE_PCCHAR
:
10426 case V8SF_FTYPE_PCV4SF
:
10427 case V8SF_FTYPE_PCFLOAT
:
10428 case V4SF_FTYPE_PCFLOAT
:
10429 case V4DF_FTYPE_PCV2DF
:
10430 case V4DF_FTYPE_PCDOUBLE
:
10431 case V2DF_FTYPE_PCDOUBLE
:
10432 case VOID_FTYPE_PVOID
:
10433 case V8DI_FTYPE_PV8DI
:
10439 case CODE_FOR_sse4_1_movntdqa
:
10440 case CODE_FOR_avx2_movntdqa
:
10441 case CODE_FOR_avx512f_movntdqa
:
10442 aligned_mem
= true;
10448 case VOID_FTYPE_PV2SF_V4SF
:
10449 case VOID_FTYPE_PV8DI_V8DI
:
10450 case VOID_FTYPE_PV4DI_V4DI
:
10451 case VOID_FTYPE_PV2DI_V2DI
:
10452 case VOID_FTYPE_PCHAR_V32QI
:
10453 case VOID_FTYPE_PCHAR_V16QI
:
10454 case VOID_FTYPE_PFLOAT_V16SF
:
10455 case VOID_FTYPE_PFLOAT_V8SF
:
10456 case VOID_FTYPE_PFLOAT_V4SF
:
10457 case VOID_FTYPE_PDOUBLE_V8DF
:
10458 case VOID_FTYPE_PDOUBLE_V4DF
:
10459 case VOID_FTYPE_PDOUBLE_V2DF
:
10460 case VOID_FTYPE_PLONGLONG_LONGLONG
:
10461 case VOID_FTYPE_PULONGLONG_ULONGLONG
:
10462 case VOID_FTYPE_PUNSIGNED_UNSIGNED
:
10463 case VOID_FTYPE_PINT_INT
:
10466 /* Reserve memory operand for target. */
10467 memory
= ARRAY_SIZE (args
);
10470 /* These builtins and instructions require the memory
10471 to be properly aligned. */
10472 case CODE_FOR_avx_movntv4di
:
10473 case CODE_FOR_sse2_movntv2di
:
10474 case CODE_FOR_avx_movntv8sf
:
10475 case CODE_FOR_sse_movntv4sf
:
10476 case CODE_FOR_sse4a_vmmovntv4sf
:
10477 case CODE_FOR_avx_movntv4df
:
10478 case CODE_FOR_sse2_movntv2df
:
10479 case CODE_FOR_sse4a_vmmovntv2df
:
10480 case CODE_FOR_sse2_movntidi
:
10481 case CODE_FOR_sse_movntq
:
10482 case CODE_FOR_sse2_movntisi
:
10483 case CODE_FOR_avx512f_movntv16sf
:
10484 case CODE_FOR_avx512f_movntv8df
:
10485 case CODE_FOR_avx512f_movntv8di
:
10486 aligned_mem
= true;
10492 case VOID_FTYPE_PVOID_PCVOID
:
10498 case V4SF_FTYPE_V4SF_PCV2SF
:
10499 case V2DF_FTYPE_V2DF_PCDOUBLE
:
10504 case V8SF_FTYPE_PCV8SF_V8SI
:
10505 case V4DF_FTYPE_PCV4DF_V4DI
:
10506 case V4SF_FTYPE_PCV4SF_V4SI
:
10507 case V2DF_FTYPE_PCV2DF_V2DI
:
10508 case V8SI_FTYPE_PCV8SI_V8SI
:
10509 case V4DI_FTYPE_PCV4DI_V4DI
:
10510 case V4SI_FTYPE_PCV4SI_V4SI
:
10511 case V2DI_FTYPE_PCV2DI_V2DI
:
10512 case VOID_FTYPE_INT_INT64
:
10517 case VOID_FTYPE_PV8DF_V8DF_UQI
:
10518 case VOID_FTYPE_PV4DF_V4DF_UQI
:
10519 case VOID_FTYPE_PV2DF_V2DF_UQI
:
10520 case VOID_FTYPE_PV16SF_V16SF_UHI
:
10521 case VOID_FTYPE_PV8SF_V8SF_UQI
:
10522 case VOID_FTYPE_PV4SF_V4SF_UQI
:
10523 case VOID_FTYPE_PV8DI_V8DI_UQI
:
10524 case VOID_FTYPE_PV4DI_V4DI_UQI
:
10525 case VOID_FTYPE_PV2DI_V2DI_UQI
:
10526 case VOID_FTYPE_PV16SI_V16SI_UHI
:
10527 case VOID_FTYPE_PV8SI_V8SI_UQI
:
10528 case VOID_FTYPE_PV4SI_V4SI_UQI
:
10529 case VOID_FTYPE_PV64QI_V64QI_UDI
:
10530 case VOID_FTYPE_PV32HI_V32HI_USI
:
10531 case VOID_FTYPE_PV32QI_V32QI_USI
:
10532 case VOID_FTYPE_PV16QI_V16QI_UHI
:
10533 case VOID_FTYPE_PV16HI_V16HI_UHI
:
10534 case VOID_FTYPE_PV8HI_V8HI_UQI
:
10537 /* These builtins and instructions require the memory
10538 to be properly aligned. */
10539 case CODE_FOR_avx512f_storev16sf_mask
:
10540 case CODE_FOR_avx512f_storev16si_mask
:
10541 case CODE_FOR_avx512f_storev8df_mask
:
10542 case CODE_FOR_avx512f_storev8di_mask
:
10543 case CODE_FOR_avx512vl_storev8sf_mask
:
10544 case CODE_FOR_avx512vl_storev8si_mask
:
10545 case CODE_FOR_avx512vl_storev4df_mask
:
10546 case CODE_FOR_avx512vl_storev4di_mask
:
10547 case CODE_FOR_avx512vl_storev4sf_mask
:
10548 case CODE_FOR_avx512vl_storev4si_mask
:
10549 case CODE_FOR_avx512vl_storev2df_mask
:
10550 case CODE_FOR_avx512vl_storev2di_mask
:
10551 aligned_mem
= true;
10557 case VOID_FTYPE_PV8SF_V8SI_V8SF
:
10558 case VOID_FTYPE_PV4DF_V4DI_V4DF
:
10559 case VOID_FTYPE_PV4SF_V4SI_V4SF
:
10560 case VOID_FTYPE_PV2DF_V2DI_V2DF
:
10561 case VOID_FTYPE_PV8SI_V8SI_V8SI
:
10562 case VOID_FTYPE_PV4DI_V4DI_V4DI
:
10563 case VOID_FTYPE_PV4SI_V4SI_V4SI
:
10564 case VOID_FTYPE_PV2DI_V2DI_V2DI
:
10565 case VOID_FTYPE_PV8SI_V8DI_UQI
:
10566 case VOID_FTYPE_PV8HI_V8DI_UQI
:
10567 case VOID_FTYPE_PV16HI_V16SI_UHI
:
10568 case VOID_FTYPE_PUDI_V8DI_UQI
:
10569 case VOID_FTYPE_PV16QI_V16SI_UHI
:
10570 case VOID_FTYPE_PV4SI_V4DI_UQI
:
10571 case VOID_FTYPE_PUDI_V2DI_UQI
:
10572 case VOID_FTYPE_PUDI_V4DI_UQI
:
10573 case VOID_FTYPE_PUSI_V2DI_UQI
:
10574 case VOID_FTYPE_PV8HI_V8SI_UQI
:
10575 case VOID_FTYPE_PUDI_V4SI_UQI
:
10576 case VOID_FTYPE_PUSI_V4DI_UQI
:
10577 case VOID_FTYPE_PUHI_V2DI_UQI
:
10578 case VOID_FTYPE_PUDI_V8SI_UQI
:
10579 case VOID_FTYPE_PUSI_V4SI_UQI
:
10580 case VOID_FTYPE_PCHAR_V64QI_UDI
:
10581 case VOID_FTYPE_PCHAR_V32QI_USI
:
10582 case VOID_FTYPE_PCHAR_V16QI_UHI
:
10583 case VOID_FTYPE_PSHORT_V32HI_USI
:
10584 case VOID_FTYPE_PSHORT_V16HI_UHI
:
10585 case VOID_FTYPE_PSHORT_V8HI_UQI
:
10586 case VOID_FTYPE_PINT_V16SI_UHI
:
10587 case VOID_FTYPE_PINT_V8SI_UQI
:
10588 case VOID_FTYPE_PINT_V4SI_UQI
:
10589 case VOID_FTYPE_PINT64_V8DI_UQI
:
10590 case VOID_FTYPE_PINT64_V4DI_UQI
:
10591 case VOID_FTYPE_PINT64_V2DI_UQI
:
10592 case VOID_FTYPE_PDOUBLE_V8DF_UQI
:
10593 case VOID_FTYPE_PDOUBLE_V4DF_UQI
:
10594 case VOID_FTYPE_PDOUBLE_V2DF_UQI
:
10595 case VOID_FTYPE_PFLOAT_V16SF_UHI
:
10596 case VOID_FTYPE_PFLOAT_V8SF_UQI
:
10597 case VOID_FTYPE_PFLOAT_V4SF_UQI
:
10598 case VOID_FTYPE_PV32QI_V32HI_USI
:
10599 case VOID_FTYPE_PV16QI_V16HI_UHI
:
10600 case VOID_FTYPE_PUDI_V8HI_UQI
:
10603 /* Reserve memory operand for target. */
10604 memory
= ARRAY_SIZE (args
);
10606 case V4SF_FTYPE_PCV4SF_V4SF_UQI
:
10607 case V8SF_FTYPE_PCV8SF_V8SF_UQI
:
10608 case V16SF_FTYPE_PCV16SF_V16SF_UHI
:
10609 case V4SI_FTYPE_PCV4SI_V4SI_UQI
:
10610 case V8SI_FTYPE_PCV8SI_V8SI_UQI
:
10611 case V16SI_FTYPE_PCV16SI_V16SI_UHI
:
10612 case V2DF_FTYPE_PCV2DF_V2DF_UQI
:
10613 case V4DF_FTYPE_PCV4DF_V4DF_UQI
:
10614 case V8DF_FTYPE_PCV8DF_V8DF_UQI
:
10615 case V2DI_FTYPE_PCV2DI_V2DI_UQI
:
10616 case V4DI_FTYPE_PCV4DI_V4DI_UQI
:
10617 case V8DI_FTYPE_PCV8DI_V8DI_UQI
:
10618 case V64QI_FTYPE_PCV64QI_V64QI_UDI
:
10619 case V32HI_FTYPE_PCV32HI_V32HI_USI
:
10620 case V32QI_FTYPE_PCV32QI_V32QI_USI
:
10621 case V16QI_FTYPE_PCV16QI_V16QI_UHI
:
10622 case V16HI_FTYPE_PCV16HI_V16HI_UHI
:
10623 case V8HI_FTYPE_PCV8HI_V8HI_UQI
:
10626 /* These builtins and instructions require the memory
10627 to be properly aligned. */
10628 case CODE_FOR_avx512f_loadv16sf_mask
:
10629 case CODE_FOR_avx512f_loadv16si_mask
:
10630 case CODE_FOR_avx512f_loadv8df_mask
:
10631 case CODE_FOR_avx512f_loadv8di_mask
:
10632 case CODE_FOR_avx512vl_loadv8sf_mask
:
10633 case CODE_FOR_avx512vl_loadv8si_mask
:
10634 case CODE_FOR_avx512vl_loadv4df_mask
:
10635 case CODE_FOR_avx512vl_loadv4di_mask
:
10636 case CODE_FOR_avx512vl_loadv4sf_mask
:
10637 case CODE_FOR_avx512vl_loadv4si_mask
:
10638 case CODE_FOR_avx512vl_loadv2df_mask
:
10639 case CODE_FOR_avx512vl_loadv2di_mask
:
10640 case CODE_FOR_avx512bw_loadv64qi_mask
:
10641 case CODE_FOR_avx512vl_loadv32qi_mask
:
10642 case CODE_FOR_avx512vl_loadv16qi_mask
:
10643 case CODE_FOR_avx512bw_loadv32hi_mask
:
10644 case CODE_FOR_avx512vl_loadv16hi_mask
:
10645 case CODE_FOR_avx512vl_loadv8hi_mask
:
10646 aligned_mem
= true;
10652 case V64QI_FTYPE_PCCHAR_V64QI_UDI
:
10653 case V32QI_FTYPE_PCCHAR_V32QI_USI
:
10654 case V16QI_FTYPE_PCCHAR_V16QI_UHI
:
10655 case V32HI_FTYPE_PCSHORT_V32HI_USI
:
10656 case V16HI_FTYPE_PCSHORT_V16HI_UHI
:
10657 case V8HI_FTYPE_PCSHORT_V8HI_UQI
:
10658 case V16SI_FTYPE_PCINT_V16SI_UHI
:
10659 case V8SI_FTYPE_PCINT_V8SI_UQI
:
10660 case V4SI_FTYPE_PCINT_V4SI_UQI
:
10661 case V8DI_FTYPE_PCINT64_V8DI_UQI
:
10662 case V4DI_FTYPE_PCINT64_V4DI_UQI
:
10663 case V2DI_FTYPE_PCINT64_V2DI_UQI
:
10664 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI
:
10665 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI
:
10666 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI
:
10667 case V16SF_FTYPE_PCFLOAT_V16SF_UHI
:
10668 case V8SF_FTYPE_PCFLOAT_V8SF_UQI
:
10669 case V4SF_FTYPE_PCFLOAT_V4SF_UQI
:
10675 gcc_unreachable ();
10678 gcc_assert (nargs
<= ARRAY_SIZE (args
));
10680 if (klass
== store
)
10682 arg
= CALL_EXPR_ARG (exp
, 0);
10683 op
= expand_normal (arg
);
10684 gcc_assert (target
== 0);
10687 op
= ix86_zero_extend_to_Pmode (op
);
10688 target
= gen_rtx_MEM (tmode
, op
);
10689 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
10690 on it. Try to improve it using get_pointer_alignment,
10691 and if the special builtin is one that requires strict
10692 mode alignment, also from it's GET_MODE_ALIGNMENT.
10693 Failure to do so could lead to ix86_legitimate_combined_insn
10694 rejecting all changes to such insns. */
10695 unsigned int align
= get_pointer_alignment (arg
);
10696 if (aligned_mem
&& align
< GET_MODE_ALIGNMENT (tmode
))
10697 align
= GET_MODE_ALIGNMENT (tmode
);
10698 if (MEM_ALIGN (target
) < align
)
10699 set_mem_align (target
, align
);
10702 target
= force_reg (tmode
, op
);
10710 || !register_operand (target
, tmode
)
10711 || GET_MODE (target
) != tmode
)
10712 target
= gen_reg_rtx (tmode
);
10715 for (i
= 0; i
< nargs
; i
++)
10717 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
10720 arg
= CALL_EXPR_ARG (exp
, i
+ arg_adjust
);
10721 op
= expand_normal (arg
);
10722 match
= insn_p
->operand
[i
+ 1].predicate (op
, mode
);
10724 if (last_arg_constant
&& (i
+ 1) == nargs
)
10728 error ("the last argument must be an 8-bit immediate");
10736 /* This must be the memory operand. */
10737 op
= ix86_zero_extend_to_Pmode (op
);
10738 op
= gen_rtx_MEM (mode
, op
);
10739 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
10740 on it. Try to improve it using get_pointer_alignment,
10741 and if the special builtin is one that requires strict
10742 mode alignment, also from it's GET_MODE_ALIGNMENT.
10743 Failure to do so could lead to ix86_legitimate_combined_insn
10744 rejecting all changes to such insns. */
10745 unsigned int align
= get_pointer_alignment (arg
);
10746 if (aligned_mem
&& align
< GET_MODE_ALIGNMENT (mode
))
10747 align
= GET_MODE_ALIGNMENT (mode
);
10748 if (MEM_ALIGN (op
) < align
)
10749 set_mem_align (op
, align
);
10753 /* This must be register. */
10754 if (VECTOR_MODE_P (mode
))
10755 op
= safe_vector_operand (op
, mode
);
10757 op
= fixup_modeless_constant (op
, mode
);
10759 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
10760 op
= copy_to_mode_reg (mode
, op
);
10763 op
= copy_to_reg (op
);
10764 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
10770 args
[i
].mode
= mode
;
10776 pat
= GEN_FCN (icode
) (target
);
10779 pat
= GEN_FCN (icode
) (target
, args
[0].op
);
10782 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
);
10785 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
, args
[2].op
);
10788 gcc_unreachable ();
10794 return klass
== store
? 0 : target
;
10797 /* Return the integer constant in ARG. Constrain it to be in the range
10798 of the subparts of VEC_TYPE; issue an error if not. */
10801 get_element_number (tree vec_type
, tree arg
)
10803 unsigned HOST_WIDE_INT elt
, max
= TYPE_VECTOR_SUBPARTS (vec_type
) - 1;
10805 if (!tree_fits_uhwi_p (arg
)
10806 || (elt
= tree_to_uhwi (arg
), elt
> max
))
10808 error ("selector must be an integer constant in the range "
10816 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10817 ix86_expand_vector_init. We DO have language-level syntax for this, in
10818 the form of (type){ init-list }. Except that since we can't place emms
10819 instructions from inside the compiler, we can't allow the use of MMX
10820 registers unless the user explicitly asks for it. So we do *not* define
10821 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
10822 we have builtins invoked by mmintrin.h that gives us license to emit
10823 these sorts of instructions. */
10826 ix86_expand_vec_init_builtin (tree type
, tree exp
, rtx target
)
10828 machine_mode tmode
= TYPE_MODE (type
);
10829 machine_mode inner_mode
= GET_MODE_INNER (tmode
);
10830 int i
, n_elt
= GET_MODE_NUNITS (tmode
);
10831 rtvec v
= rtvec_alloc (n_elt
);
10833 gcc_assert (VECTOR_MODE_P (tmode
));
10834 gcc_assert (call_expr_nargs (exp
) == n_elt
);
10836 for (i
= 0; i
< n_elt
; ++i
)
10838 rtx x
= expand_normal (CALL_EXPR_ARG (exp
, i
));
10839 RTVEC_ELT (v
, i
) = gen_lowpart (inner_mode
, x
);
10842 if (!target
|| !register_operand (target
, tmode
))
10843 target
= gen_reg_rtx (tmode
);
10845 ix86_expand_vector_init (true, target
, gen_rtx_PARALLEL (tmode
, v
));
10849 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10850 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
10851 had a language-level syntax for referencing vector elements. */
10854 ix86_expand_vec_ext_builtin (tree exp
, rtx target
)
10856 machine_mode tmode
, mode0
;
10861 arg0
= CALL_EXPR_ARG (exp
, 0);
10862 arg1
= CALL_EXPR_ARG (exp
, 1);
10864 op0
= expand_normal (arg0
);
10865 elt
= get_element_number (TREE_TYPE (arg0
), arg1
);
10867 tmode
= TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0
)));
10868 mode0
= TYPE_MODE (TREE_TYPE (arg0
));
10869 gcc_assert (VECTOR_MODE_P (mode0
));
10871 op0
= force_reg (mode0
, op0
);
10873 if (optimize
|| !target
|| !register_operand (target
, tmode
))
10874 target
= gen_reg_rtx (tmode
);
10876 ix86_expand_vector_extract (true, target
, op0
, elt
);
10881 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10882 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
10883 a language-level syntax for referencing vector elements. */
10886 ix86_expand_vec_set_builtin (tree exp
)
10888 machine_mode tmode
, mode1
;
10889 tree arg0
, arg1
, arg2
;
10891 rtx op0
, op1
, target
;
10893 arg0
= CALL_EXPR_ARG (exp
, 0);
10894 arg1
= CALL_EXPR_ARG (exp
, 1);
10895 arg2
= CALL_EXPR_ARG (exp
, 2);
10897 tmode
= TYPE_MODE (TREE_TYPE (arg0
));
10898 mode1
= TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0
)));
10899 gcc_assert (VECTOR_MODE_P (tmode
));
10901 op0
= expand_expr (arg0
, NULL_RTX
, tmode
, EXPAND_NORMAL
);
10902 op1
= expand_expr (arg1
, NULL_RTX
, mode1
, EXPAND_NORMAL
);
10903 elt
= get_element_number (TREE_TYPE (arg0
), arg2
);
10905 if (GET_MODE (op1
) != mode1
&& GET_MODE (op1
) != VOIDmode
)
10906 op1
= convert_modes (mode1
, GET_MODE (op1
), op1
, true);
10908 op0
= force_reg (tmode
, op0
);
10909 op1
= force_reg (mode1
, op1
);
10911 /* OP0 is the source of these builtin functions and shouldn't be
10912 modified. Create a copy, use it and return it as target. */
10913 target
= gen_reg_rtx (tmode
);
10914 emit_move_insn (target
, op0
);
10915 ix86_expand_vector_set (true, target
, op1
, elt
);
10920 /* Expand an expression EXP that calls a built-in function,
10921 with result going to TARGET if that's convenient
10922 (and in mode MODE if that's convenient).
10923 SUBTARGET may be used as the target for computing one of EXP's operands.
10924 IGNORE is nonzero if the value is to be ignored. */
10927 ix86_expand_builtin (tree exp
, rtx target
, rtx subtarget
,
10928 machine_mode mode
, int ignore
)
10931 enum insn_code icode
, icode2
;
10932 tree fndecl
= TREE_OPERAND (CALL_EXPR_FN (exp
), 0);
10933 tree arg0
, arg1
, arg2
, arg3
, arg4
;
10934 rtx op0
, op1
, op2
, op3
, op4
, pat
, pat2
, insn
;
10935 machine_mode mode0
, mode1
, mode2
, mode3
, mode4
;
10936 unsigned int fcode
= DECL_MD_FUNCTION_CODE (fndecl
);
10938 /* For CPU builtins that can be folded, fold first and expand the fold. */
10941 case IX86_BUILTIN_CPU_INIT
:
10943 /* Make it call __cpu_indicator_init in libgcc. */
10944 tree call_expr
, fndecl
, type
;
10945 type
= build_function_type_list (integer_type_node
, NULL_TREE
);
10946 fndecl
= build_fn_decl ("__cpu_indicator_init", type
);
10947 call_expr
= build_call_expr (fndecl
, 0);
10948 return expand_expr (call_expr
, target
, mode
, EXPAND_NORMAL
);
10950 case IX86_BUILTIN_CPU_IS
:
10951 case IX86_BUILTIN_CPU_SUPPORTS
:
10953 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10954 tree fold_expr
= fold_builtin_cpu (fndecl
, &arg0
);
10955 gcc_assert (fold_expr
!= NULL_TREE
);
10956 return expand_expr (fold_expr
, target
, mode
, EXPAND_NORMAL
);
10960 HOST_WIDE_INT isa
= ix86_isa_flags
;
10961 HOST_WIDE_INT isa2
= ix86_isa_flags2
;
10962 HOST_WIDE_INT bisa
= ix86_builtins_isa
[fcode
].isa
;
10963 HOST_WIDE_INT bisa2
= ix86_builtins_isa
[fcode
].isa2
;
10964 /* The general case is we require all the ISAs specified in bisa{,2}
10966 The exceptions are:
10967 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
10968 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
10969 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
10970 where for each such pair it is sufficient if either of the ISAs is
10971 enabled, plus if it is ored with other options also those others.
10972 OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
10973 if (((bisa
& (OPTION_MASK_ISA_SSE
| OPTION_MASK_ISA_3DNOW_A
))
10974 == (OPTION_MASK_ISA_SSE
| OPTION_MASK_ISA_3DNOW_A
))
10975 && (isa
& (OPTION_MASK_ISA_SSE
| OPTION_MASK_ISA_3DNOW_A
)) != 0)
10976 isa
|= (OPTION_MASK_ISA_SSE
| OPTION_MASK_ISA_3DNOW_A
);
10977 if (((bisa
& (OPTION_MASK_ISA_SSE4_2
| OPTION_MASK_ISA_CRC32
))
10978 == (OPTION_MASK_ISA_SSE4_2
| OPTION_MASK_ISA_CRC32
))
10979 && (isa
& (OPTION_MASK_ISA_SSE4_2
| OPTION_MASK_ISA_CRC32
)) != 0)
10980 isa
|= (OPTION_MASK_ISA_SSE4_2
| OPTION_MASK_ISA_CRC32
);
10981 if (((bisa
& (OPTION_MASK_ISA_FMA
| OPTION_MASK_ISA_FMA4
))
10982 == (OPTION_MASK_ISA_FMA
| OPTION_MASK_ISA_FMA4
))
10983 && (isa
& (OPTION_MASK_ISA_FMA
| OPTION_MASK_ISA_FMA4
)) != 0)
10984 isa
|= (OPTION_MASK_ISA_FMA
| OPTION_MASK_ISA_FMA4
);
10985 if ((bisa
& OPTION_MASK_ISA_MMX
) && !TARGET_MMX
&& TARGET_MMX_WITH_SSE
)
10987 bisa
&= ~OPTION_MASK_ISA_MMX
;
10988 bisa
|= OPTION_MASK_ISA_SSE2
;
10990 if ((bisa
& isa
) != bisa
|| (bisa2
& isa2
) != bisa2
)
10992 bool add_abi_p
= bisa
& OPTION_MASK_ISA_64BIT
;
10993 if (TARGET_ABI_X32
)
10994 bisa
|= OPTION_MASK_ABI_X32
;
10996 bisa
|= OPTION_MASK_ABI_64
;
10997 char *opts
= ix86_target_string (bisa
, bisa2
, 0, 0, NULL
, NULL
,
10998 (enum fpmath_unit
) 0,
10999 (enum prefer_vector_width
) 0,
11002 error ("%qE needs unknown isa option", fndecl
);
11005 gcc_assert (opts
!= NULL
);
11006 error ("%qE needs isa option %s", fndecl
, opts
);
11009 return expand_call (exp
, target
, ignore
);
  switch (fcode)
    {
    case IX86_BUILTIN_MASKMOVQ:
    case IX86_BUILTIN_MASKMOVDQU:
      icode = (fcode == IX86_BUILTIN_MASKMOVQ
               ? CODE_FOR_mmx_maskmovq
               : CODE_FOR_sse2_maskmovdqu);
      /* Note the arg order is different from the operand order.  */
      arg1 = CALL_EXPR_ARG (exp, 0);
      arg2 = CALL_EXPR_ARG (exp, 1);
      arg0 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      mode0 = insn_data[icode].operand[0].mode;
      mode1 = insn_data[icode].operand[1].mode;
      mode2 = insn_data[icode].operand[2].mode;

      op0 = ix86_zero_extend_to_Pmode (op0);
      op0 = gen_rtx_MEM (mode1, op0);

      if (!insn_data[icode].operand[0].predicate (op0, mode0))
        op0 = copy_to_mode_reg (mode0, op0);
      if (!insn_data[icode].operand[1].predicate (op1, mode1))
        op1 = copy_to_mode_reg (mode1, op1);
      if (!insn_data[icode].operand[2].predicate (op2, mode2))
        op2 = copy_to_mode_reg (mode2, op2);
      pat = GEN_FCN (icode) (op0, op1, op2);
      if (!pat)
        return 0;
      emit_insn (pat);
      return 0;
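      /* For reference, a minimal user-level sketch of the byte-masked store
         this case expands (hypothetical example, assuming SSE2 and
         <emmintrin.h>):

            #include <emmintrin.h>

            void
            masked_store (__m128i data, __m128i mask, char *dst)
            {
              // Stores only the bytes of DATA whose MASK byte has the
              // high bit set.
              _mm_maskmoveu_si128 (data, mask, dst);
            }

         The intrinsic's argument order (data, mask, address) differs from
         the insn's operand order (address, data, mask), matching the arg
         swap done above.  */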
    case IX86_BUILTIN_LDMXCSR:
      op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
      target = assign_386_stack_local (SImode, SLOT_TEMP);
      emit_move_insn (target, op0);
      emit_insn (gen_sse_ldmxcsr (target));
      return 0;

    case IX86_BUILTIN_STMXCSR:
      target = assign_386_stack_local (SImode, SLOT_TEMP);
      emit_insn (gen_sse_stmxcsr (target));
      return copy_to_mode_reg (SImode, target);
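      /* For reference, these expand the MXCSR control-register intrinsics.
         A hypothetical usage sketch (assuming SSE and <xmmintrin.h>):

            #include <xmmintrin.h>

            void
            enable_ftz (void)
            {
              // Read MXCSR, set the flush-to-zero bit, write it back.
              unsigned int csr = _mm_getcsr ();
              _mm_setcsr (csr | _MM_FLUSH_ZERO_ON);
            }

         Both go through a stack temporary because ldmxcsr/stmxcsr only
         take memory operands.  */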
    case IX86_BUILTIN_CLFLUSH:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_sse2_clflush;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
        op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_sse2_clflush (op0));
      return 0;

    case IX86_BUILTIN_CLWB:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_clwb;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
        op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_clwb (op0));
      return 0;

    case IX86_BUILTIN_CLFLUSHOPT:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_clflushopt;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
        op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_clflushopt (op0));
      return 0;
    case IX86_BUILTIN_MONITOR:
    case IX86_BUILTIN_MONITORX:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      if (!REG_P (op0))
        op0 = ix86_zero_extend_to_Pmode (op0);
      if (!REG_P (op1))
        op1 = copy_to_mode_reg (SImode, op1);
      if (!REG_P (op2))
        op2 = copy_to_mode_reg (SImode, op2);

      emit_insn (fcode == IX86_BUILTIN_MONITOR
                 ? gen_sse3_monitor (Pmode, op0, op1, op2)
                 : gen_monitorx (Pmode, op0, op1, op2));
      return 0;

    case IX86_BUILTIN_MWAIT:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      if (!REG_P (op0))
        op0 = copy_to_mode_reg (SImode, op0);
      if (!REG_P (op1))
        op1 = copy_to_mode_reg (SImode, op1);
      emit_insn (gen_sse3_mwait (op0, op1));
      return 0;

    case IX86_BUILTIN_MWAITX:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      if (!REG_P (op0))
        op0 = copy_to_mode_reg (SImode, op0);
      if (!REG_P (op1))
        op1 = copy_to_mode_reg (SImode, op1);
      if (!REG_P (op2))
        op2 = copy_to_mode_reg (SImode, op2);
      emit_insn (gen_mwaitx (op0, op1, op2));
      return 0;
    case IX86_BUILTIN_UMONITOR:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      op0 = ix86_zero_extend_to_Pmode (op0);
      emit_insn (gen_umonitor (Pmode, op0));
      return 0;
    case IX86_BUILTIN_UMWAIT:
    case IX86_BUILTIN_TPAUSE:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!REG_P (op0))
        op0 = copy_to_mode_reg (SImode, op0);

      op1 = force_reg (DImode, op1);

      if (TARGET_64BIT)
        {
          op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
                                     NULL, 1, OPTAB_DIRECT);
          switch (fcode)
            {
            case IX86_BUILTIN_UMWAIT:
              icode = CODE_FOR_umwait_rex64;
              break;
            case IX86_BUILTIN_TPAUSE:
              icode = CODE_FOR_tpause_rex64;
              break;
            default:
              gcc_unreachable ();
            }

          op2 = gen_lowpart (SImode, op2);
          op1 = gen_lowpart (SImode, op1);
          pat = GEN_FCN (icode) (op0, op1, op2);
        }
      else
        {
          switch (fcode)
            {
            case IX86_BUILTIN_UMWAIT:
              icode = CODE_FOR_umwait;
              break;
            case IX86_BUILTIN_TPAUSE:
              icode = CODE_FOR_tpause;
              break;
            default:
              gcc_unreachable ();
            }
          pat = GEN_FCN (icode) (op0, op1);
        }

      if (!pat)
        return 0;

      emit_insn (pat);

      if (target == 0
          || !register_operand (target, QImode))
        target = gen_reg_rtx (QImode);

      pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
                        const0_rtx);
      emit_insn (gen_rtx_SET (target, pat));
      return target;
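      /* For reference, a hypothetical user-level sketch of the WAITPKG
         intrinsics expanded here (assuming -mwaitpkg and <immintrin.h>;
         the exact intrinsic names are from GCC's waitpkgintrin.h):

            #include <immintrin.h>

            unsigned char
            wait_until (unsigned long long tsc_deadline)
            {
              // Returns the carry flag: nonzero if the wait ended for a
              // reason other than the deadline expiring.
              return _umwait (0, tsc_deadline);
            }

         The 64-bit deadline is split into EDX:EAX by the shift/lowpart
         dance above, and the carry flag is read back via the CCC compare.  */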
    case IX86_BUILTIN_CLZERO:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      if (!REG_P (op0))
        op0 = ix86_zero_extend_to_Pmode (op0);
      emit_insn (gen_clzero (Pmode, op0));
      return 0;

    case IX86_BUILTIN_CLDEMOTE:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_cldemote;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
        op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_cldemote (op0));
      return 0;
    case IX86_BUILTIN_VEC_INIT_V2SI:
    case IX86_BUILTIN_VEC_INIT_V4HI:
    case IX86_BUILTIN_VEC_INIT_V8QI:
      return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);

    case IX86_BUILTIN_VEC_EXT_V2DF:
    case IX86_BUILTIN_VEC_EXT_V2DI:
    case IX86_BUILTIN_VEC_EXT_V4SF:
    case IX86_BUILTIN_VEC_EXT_V4SI:
    case IX86_BUILTIN_VEC_EXT_V8HI:
    case IX86_BUILTIN_VEC_EXT_V2SI:
    case IX86_BUILTIN_VEC_EXT_V4HI:
    case IX86_BUILTIN_VEC_EXT_V16QI:
      return ix86_expand_vec_ext_builtin (exp, target);

    case IX86_BUILTIN_VEC_SET_V2DI:
    case IX86_BUILTIN_VEC_SET_V4SF:
    case IX86_BUILTIN_VEC_SET_V4SI:
    case IX86_BUILTIN_VEC_SET_V8HI:
    case IX86_BUILTIN_VEC_SET_V4HI:
    case IX86_BUILTIN_VEC_SET_V16QI:
      return ix86_expand_vec_set_builtin (exp);

    case IX86_BUILTIN_NANQ:
    case IX86_BUILTIN_NANSQ:
      return expand_call (exp, target, ignore);
    case IX86_BUILTIN_RDPID:
      op0 = gen_reg_rtx (word_mode);

      if (TARGET_64BIT)
        {
          insn = gen_rdpid_rex64 (op0);
          op0 = convert_to_mode (SImode, op0, 1);
        }
      else
        insn = gen_rdpid (op0);

      emit_insn (insn);

      if (target == 0
          || !register_operand (target, SImode))
        target = gen_reg_rtx (SImode);

      emit_move_insn (target, op0);
      return target;
    case IX86_BUILTIN_2INTERSECTD512:
    case IX86_BUILTIN_2INTERSECTQ512:
    case IX86_BUILTIN_2INTERSECTD256:
    case IX86_BUILTIN_2INTERSECTQ256:
    case IX86_BUILTIN_2INTERSECTD128:
    case IX86_BUILTIN_2INTERSECTQ128:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);

      if (!address_operand (op0, VOIDmode))
        {
          op0 = convert_memory_address (Pmode, op0);
          op0 = copy_addr_to_reg (op0);
        }
      if (!address_operand (op1, VOIDmode))
        {
          op1 = convert_memory_address (Pmode, op1);
          op1 = copy_addr_to_reg (op1);
        }

      switch (fcode)
        {
        case IX86_BUILTIN_2INTERSECTD512:
          mode4 = P2HImode;
          icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
          break;
        case IX86_BUILTIN_2INTERSECTQ512:
          mode4 = P2QImode;
          icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
          break;
        case IX86_BUILTIN_2INTERSECTD256:
          mode4 = P2QImode;
          icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
          break;
        case IX86_BUILTIN_2INTERSECTQ256:
          mode4 = P2QImode;
          icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
          break;
        case IX86_BUILTIN_2INTERSECTD128:
          mode4 = P2QImode;
          icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
          break;
        case IX86_BUILTIN_2INTERSECTQ128:
          mode4 = P2QImode;
          icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
          break;
        default:
          gcc_unreachable ();
        }

      mode2 = insn_data[icode].operand[1].mode;
      mode3 = insn_data[icode].operand[2].mode;
      if (!insn_data[icode].operand[1].predicate (op2, mode2))
        op2 = copy_to_mode_reg (mode2, op2);
      if (!insn_data[icode].operand[2].predicate (op3, mode3))
        op3 = copy_to_mode_reg (mode3, op3);

      op4 = gen_reg_rtx (mode4);
      emit_insn (GEN_FCN (icode) (op4, op2, op3));
      mode0 = mode4 == P2HImode ? HImode : QImode;
      emit_move_insn (gen_rtx_MEM (mode0, op0),
                      gen_lowpart (mode0, op4));
      emit_move_insn (gen_rtx_MEM (mode0, op1),
                      gen_highpart (mode0, op4));

      return 0;
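      /* For reference, a hypothetical user-level sketch of the VP2INTERSECT
         intrinsics this expands (assuming -mavx512vp2intersect and
         <immintrin.h>; the intrinsic name and argument order below follow
         my reading of avx512vp2intersectintrin.h and should be treated as
         an assumption):

            #include <immintrin.h>

            void
            intersect (__m512i a, __m512i b, __mmask16 *k1, __mmask16 *k2)
            {
              // Sets bit i of *k1 when a[i] matches some b[j], and bit j
              // of *k2 when b[j] matches some a[i].
              _mm512_2intersect_epi32 (a, b, k1, k2);
            }

         The insn writes a mask-register pair; the low and high parts are
         stored through the two pointer operands above.  */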
    case IX86_BUILTIN_RDPMC:
    case IX86_BUILTIN_RDTSC:
    case IX86_BUILTIN_RDTSCP:
    case IX86_BUILTIN_XGETBV:
      op0 = gen_reg_rtx (DImode);
      op1 = gen_reg_rtx (DImode);

      if (fcode == IX86_BUILTIN_RDPMC)
        {
          arg0 = CALL_EXPR_ARG (exp, 0);
          op2 = expand_normal (arg0);
          if (!register_operand (op2, SImode))
            op2 = copy_to_mode_reg (SImode, op2);

          insn = (TARGET_64BIT
                  ? gen_rdpmc_rex64 (op0, op1, op2)
                  : gen_rdpmc (op0, op2));
          emit_insn (insn);
        }
      else if (fcode == IX86_BUILTIN_XGETBV)
        {
          arg0 = CALL_EXPR_ARG (exp, 0);
          op2 = expand_normal (arg0);
          if (!register_operand (op2, SImode))
            op2 = copy_to_mode_reg (SImode, op2);

          insn = (TARGET_64BIT
                  ? gen_xgetbv_rex64 (op0, op1, op2)
                  : gen_xgetbv (op0, op2));
          emit_insn (insn);
        }
      else if (fcode == IX86_BUILTIN_RDTSC)
        {
          insn = (TARGET_64BIT
                  ? gen_rdtsc_rex64 (op0, op1)
                  : gen_rdtsc (op0));
          emit_insn (insn);
        }
      else
        {
          op2 = gen_reg_rtx (SImode);

          insn = (TARGET_64BIT
                  ? gen_rdtscp_rex64 (op0, op1, op2)
                  : gen_rdtscp (op0, op2));
          emit_insn (insn);

          arg0 = CALL_EXPR_ARG (exp, 0);
          op4 = expand_normal (arg0);
          if (!address_operand (op4, VOIDmode))
            {
              op4 = convert_memory_address (Pmode, op4);
              op4 = copy_addr_to_reg (op4);
            }
          emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
        }

      if (target == 0
          || !register_operand (target, DImode))
        target = gen_reg_rtx (DImode);

      if (TARGET_64BIT)
        {
          op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
                                     op1, 1, OPTAB_DIRECT);
          op0 = expand_simple_binop (DImode, IOR, op0, op1,
                                     op0, 1, OPTAB_DIRECT);
        }

      emit_move_insn (target, op0);
      return target;
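      /* For reference: on 64-bit targets rdtsc/rdpmc return EDX:EAX, so
         the shift/or above reassembles the DImode result.  A hypothetical
         usage sketch (assuming <x86intrin.h>):

            #include <x86intrin.h>

            unsigned long long
            cycles (void)
            {
              return __rdtsc ();   // expands through this case
            }
      */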
    case IX86_BUILTIN_ENQCMD:
    case IX86_BUILTIN_ENQCMDS:
    case IX86_BUILTIN_MOVDIR64B:

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      op0 = ix86_zero_extend_to_Pmode (op0);
      if (!address_operand (op1, VOIDmode))
        {
          op1 = convert_memory_address (Pmode, op1);
          op1 = copy_addr_to_reg (op1);
        }
      op1 = gen_rtx_MEM (XImode, op1);

      if (fcode == IX86_BUILTIN_MOVDIR64B)
        {
          emit_insn (gen_movdir64b (Pmode, op0, op1));
          return 0;
        }
      else
        {
          if (target == 0
              || !register_operand (target, SImode))
            target = gen_reg_rtx (SImode);

          emit_move_insn (target, const0_rtx);
          target = gen_rtx_SUBREG (QImode, target, 0);

          int unspecv = (fcode == IX86_BUILTIN_ENQCMD
                         ? UNSPECV_ENQCMD
                         : UNSPECV_ENQCMDS);
          icode = code_for_enqcmd (unspecv, Pmode);
          emit_insn (GEN_FCN (icode) (op0, op1));

          emit_insn
            (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
                          gen_rtx_fmt_ee (EQ, QImode,
                                          gen_rtx_REG (CCZmode, FLAGS_REG),
                                          const0_rtx)));
          return SUBREG_REG (target);
        }
    case IX86_BUILTIN_FXSAVE:
    case IX86_BUILTIN_FXRSTOR:
    case IX86_BUILTIN_FXSAVE64:
    case IX86_BUILTIN_FXRSTOR64:
    case IX86_BUILTIN_FNSTENV:
    case IX86_BUILTIN_FLDENV:
      mode0 = BLKmode;
      switch (fcode)
        {
        case IX86_BUILTIN_FXSAVE:
          icode = CODE_FOR_fxsave;
          break;
        case IX86_BUILTIN_FXRSTOR:
          icode = CODE_FOR_fxrstor;
          break;
        case IX86_BUILTIN_FXSAVE64:
          icode = CODE_FOR_fxsave64;
          break;
        case IX86_BUILTIN_FXRSTOR64:
          icode = CODE_FOR_fxrstor64;
          break;
        case IX86_BUILTIN_FNSTENV:
          icode = CODE_FOR_fnstenv;
          break;
        case IX86_BUILTIN_FLDENV:
          icode = CODE_FOR_fldenv;
          break;
        default:
          gcc_unreachable ();
        }

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      if (!address_operand (op0, VOIDmode))
        {
          op0 = convert_memory_address (Pmode, op0);
          op0 = copy_addr_to_reg (op0);
        }
      op0 = gen_rtx_MEM (mode0, op0);

      pat = GEN_FCN (icode) (op0);
      if (pat)
        emit_insn (pat);
      return 0;
    case IX86_BUILTIN_XSETBV:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!REG_P (op0))
        op0 = copy_to_mode_reg (SImode, op0);

      op1 = force_reg (DImode, op1);

      if (TARGET_64BIT)
        {
          op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
                                     NULL, 1, OPTAB_DIRECT);

          icode = CODE_FOR_xsetbv_rex64;

          op2 = gen_lowpart (SImode, op2);
          op1 = gen_lowpart (SImode, op1);
          pat = GEN_FCN (icode) (op0, op1, op2);
        }
      else
        {
          icode = CODE_FOR_xsetbv;
          pat = GEN_FCN (icode) (op0, op1);
        }
      if (pat)
        emit_insn (pat);
      return 0;
    case IX86_BUILTIN_XSAVE:
    case IX86_BUILTIN_XRSTOR:
    case IX86_BUILTIN_XSAVE64:
    case IX86_BUILTIN_XRSTOR64:
    case IX86_BUILTIN_XSAVEOPT:
    case IX86_BUILTIN_XSAVEOPT64:
    case IX86_BUILTIN_XSAVES:
    case IX86_BUILTIN_XRSTORS:
    case IX86_BUILTIN_XSAVES64:
    case IX86_BUILTIN_XRSTORS64:
    case IX86_BUILTIN_XSAVEC:
    case IX86_BUILTIN_XSAVEC64:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!address_operand (op0, VOIDmode))
        {
          op0 = convert_memory_address (Pmode, op0);
          op0 = copy_addr_to_reg (op0);
        }
      op0 = gen_rtx_MEM (BLKmode, op0);

      op1 = force_reg (DImode, op1);

      if (TARGET_64BIT)
        {
          op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
                                     NULL, 1, OPTAB_DIRECT);
          switch (fcode)
            {
            case IX86_BUILTIN_XSAVE:
              icode = CODE_FOR_xsave_rex64;
              break;
            case IX86_BUILTIN_XRSTOR:
              icode = CODE_FOR_xrstor_rex64;
              break;
            case IX86_BUILTIN_XSAVE64:
              icode = CODE_FOR_xsave64;
              break;
            case IX86_BUILTIN_XRSTOR64:
              icode = CODE_FOR_xrstor64;
              break;
            case IX86_BUILTIN_XSAVEOPT:
              icode = CODE_FOR_xsaveopt_rex64;
              break;
            case IX86_BUILTIN_XSAVEOPT64:
              icode = CODE_FOR_xsaveopt64;
              break;
            case IX86_BUILTIN_XSAVES:
              icode = CODE_FOR_xsaves_rex64;
              break;
            case IX86_BUILTIN_XRSTORS:
              icode = CODE_FOR_xrstors_rex64;
              break;
            case IX86_BUILTIN_XSAVES64:
              icode = CODE_FOR_xsaves64;
              break;
            case IX86_BUILTIN_XRSTORS64:
              icode = CODE_FOR_xrstors64;
              break;
            case IX86_BUILTIN_XSAVEC:
              icode = CODE_FOR_xsavec_rex64;
              break;
            case IX86_BUILTIN_XSAVEC64:
              icode = CODE_FOR_xsavec64;
              break;
            default:
              gcc_unreachable ();
            }

          op2 = gen_lowpart (SImode, op2);
          op1 = gen_lowpart (SImode, op1);
          pat = GEN_FCN (icode) (op0, op1, op2);
        }
      else
        {
          switch (fcode)
            {
            case IX86_BUILTIN_XSAVE:
              icode = CODE_FOR_xsave;
              break;
            case IX86_BUILTIN_XRSTOR:
              icode = CODE_FOR_xrstor;
              break;
            case IX86_BUILTIN_XSAVEOPT:
              icode = CODE_FOR_xsaveopt;
              break;
            case IX86_BUILTIN_XSAVES:
              icode = CODE_FOR_xsaves;
              break;
            case IX86_BUILTIN_XRSTORS:
              icode = CODE_FOR_xrstors;
              break;
            case IX86_BUILTIN_XSAVEC:
              icode = CODE_FOR_xsavec;
              break;
            default:
              gcc_unreachable ();
            }
          pat = GEN_FCN (icode) (op0, op1);
        }

      if (pat)
        emit_insn (pat);
      return 0;
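      /* For reference, a hypothetical usage sketch of the XSAVE family
         expanded above (assuming -mxsave and <immintrin.h>; the 64-byte
         alignment requirement is architectural):

            #include <immintrin.h>

            void
            save_state (void *buf_64byte_aligned)
            {
              // Save x87/SSE state (feature mask bits 0 and 1).
              _xsave (buf_64byte_aligned, 0x3);
            }

         The 64-bit feature mask ends up in EDX:EAX, which is why the
         expansion splits op1 with a 32-bit logical shift first.  */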
    case IX86_BUILTIN_LLWPCB:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      if (!register_operand (op0, Pmode))
        op0 = ix86_zero_extend_to_Pmode (op0);
      emit_insn (gen_lwp_llwpcb (Pmode, op0));
      return 0;

    case IX86_BUILTIN_SLWPCB:
      if (!target
          || !register_operand (target, Pmode))
        target = gen_reg_rtx (Pmode);
      emit_insn (gen_lwp_slwpcb (Pmode, target));
      return target;

    case IX86_BUILTIN_LWPVAL32:
    case IX86_BUILTIN_LWPVAL64:
    case IX86_BUILTIN_LWPINS32:
    case IX86_BUILTIN_LWPINS64:
      mode = ((fcode == IX86_BUILTIN_LWPVAL32
               || fcode == IX86_BUILTIN_LWPINS32)
              ? SImode : DImode);

      if (fcode == IX86_BUILTIN_LWPVAL32
          || fcode == IX86_BUILTIN_LWPVAL64)
        icode = code_for_lwp_lwpval (mode);
      else
        icode = code_for_lwp_lwpins (mode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      mode0 = insn_data[icode].operand[0].mode;

      if (!insn_data[icode].operand[0].predicate (op0, mode0))
        op0 = copy_to_mode_reg (mode0, op0);
      if (!insn_data[icode].operand[1].predicate (op1, SImode))
        op1 = copy_to_mode_reg (SImode, op1);

      if (!CONST_INT_P (op2))
        {
          error ("the last argument must be a 32-bit immediate");
          return const0_rtx;
        }

      emit_insn (GEN_FCN (icode) (op0, op1, op2));

      if (fcode == IX86_BUILTIN_LWPINS32
          || fcode == IX86_BUILTIN_LWPINS64)
        {
          if (target == 0
              || !nonimmediate_operand (target, QImode))
            target = gen_reg_rtx (QImode);

          pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
                            const0_rtx);
          emit_insn (gen_rtx_SET (target, pat));
          return target;
        }
      else
        return 0;
    case IX86_BUILTIN_BEXTRI32:
    case IX86_BUILTIN_BEXTRI64:
      mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!CONST_INT_P (op1))
        {
          error ("last argument must be an immediate");
          return const0_rtx;
        }
      else
        {
          unsigned char lsb_index = UINTVAL (op1);
          unsigned char length = UINTVAL (op1) >> 8;

          unsigned char bitsize = GET_MODE_BITSIZE (mode);

          icode = code_for_tbm_bextri (mode);

          mode1 = insn_data[icode].operand[1].mode;
          if (!insn_data[icode].operand[1].predicate (op0, mode1))
            op0 = copy_to_mode_reg (mode1, op0);

          mode0 = insn_data[icode].operand[0].mode;
          if (target == 0
              || !register_operand (target, mode0))
            target = gen_reg_rtx (mode0);

          if (length == 0 || lsb_index >= bitsize)
            {
              emit_move_insn (target, const0_rtx);
              return target;
            }

          if (length + lsb_index > bitsize)
            length = bitsize - lsb_index;

          op1 = GEN_INT (length);
          op2 = GEN_INT (lsb_index);

          emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
          return target;
        }
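      /* For reference: the TBM immediate packs the start bit in bits [7:0]
         and the field length in bits [15:8], matching the lsb_index/length
         decode above.  A hypothetical usage sketch (assuming -mtbm and
         <x86intrin.h>):

            #include <x86intrin.h>

            unsigned int
            extract_field (unsigned int x)
            {
              // Extract 8 bits starting at bit 4: length 8 << 8 | start 4.
              return __bextri_u32 (x, (8 << 8) | 4);
            }
      */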
    case IX86_BUILTIN_RDRAND16_STEP:
      mode = HImode;
      goto rdrand_step;

    case IX86_BUILTIN_RDRAND32_STEP:
      mode = SImode;
      goto rdrand_step;

    case IX86_BUILTIN_RDRAND64_STEP:
      mode = DImode;

    rdrand_step:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op1 = expand_normal (arg0);
      if (!address_operand (op1, VOIDmode))
        {
          op1 = convert_memory_address (Pmode, op1);
          op1 = copy_addr_to_reg (op1);
        }

      op0 = gen_reg_rtx (mode);
      emit_insn (gen_rdrand (mode, op0));

      emit_move_insn (gen_rtx_MEM (mode, op1), op0);

      op1 = force_reg (SImode, const1_rtx);

      /* Emit SImode conditional move.  */
      if (mode == HImode)
        {
          if (TARGET_ZERO_EXTEND_WITH_AND
              && optimize_function_for_speed_p (cfun))
            {
              op2 = force_reg (SImode, const0_rtx);

              emit_insn (gen_movstricthi
                         (gen_lowpart (HImode, op2), op0));
            }
          else
            {
              op2 = gen_reg_rtx (SImode);

              emit_insn (gen_zero_extendhisi2 (op2, op0));
            }
        }
      else if (mode == SImode)
        op2 = op0;
      else
        op2 = gen_rtx_SUBREG (SImode, op0, 0);

      if (target == 0
          || !register_operand (target, SImode))
        target = gen_reg_rtx (SImode);

      pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
                         const0_rtx);
      emit_insn (gen_rtx_SET (target,
                              gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
      return target;
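      /* For reference, a hypothetical usage sketch of the *_step builtins
         expanded above (assuming -mrdrnd and <immintrin.h>):

            #include <immintrin.h>

            unsigned int
            get_random (void)
            {
              unsigned int r;
              // Retry: rdrand can transiently fail, clearing CF.
              while (!_rdrand32_step (&r))
                ;
              return r;
            }

         The conditional move above materializes the carry flag as the
         intrinsic's int return value.  */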
    case IX86_BUILTIN_RDSEED16_STEP:
      mode = HImode;
      goto rdseed_step;

    case IX86_BUILTIN_RDSEED32_STEP:
      mode = SImode;
      goto rdseed_step;

    case IX86_BUILTIN_RDSEED64_STEP:
      mode = DImode;

    rdseed_step:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op1 = expand_normal (arg0);
      if (!address_operand (op1, VOIDmode))
        {
          op1 = convert_memory_address (Pmode, op1);
          op1 = copy_addr_to_reg (op1);
        }

      op0 = gen_reg_rtx (mode);
      emit_insn (gen_rdseed (mode, op0));

      emit_move_insn (gen_rtx_MEM (mode, op1), op0);

      op2 = gen_reg_rtx (QImode);

      pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
                         const0_rtx);
      emit_insn (gen_rtx_SET (op2, pat));

      if (target == 0
          || !register_operand (target, SImode))
        target = gen_reg_rtx (SImode);

      emit_insn (gen_zero_extendqisi2 (target, op2));
      return target;
    case IX86_BUILTIN_SBB32:
      icode = CODE_FOR_subborrowsi;
      icode2 = CODE_FOR_subborrowsi_0;
      mode0 = SImode;
      mode1 = DImode;
      mode2 = CCmode;
      goto handlecarry;

    case IX86_BUILTIN_SBB64:
      icode = CODE_FOR_subborrowdi;
      icode2 = CODE_FOR_subborrowdi_0;
      mode0 = DImode;
      mode1 = TImode;
      mode2 = CCmode;
      goto handlecarry;

    case IX86_BUILTIN_ADDCARRYX32:
      icode = CODE_FOR_addcarrysi;
      icode2 = CODE_FOR_addcarrysi_0;
      mode0 = SImode;
      mode1 = DImode;
      mode2 = CCCmode;
      goto handlecarry;

    case IX86_BUILTIN_ADDCARRYX64:
      icode = CODE_FOR_addcarrydi;
      icode2 = CODE_FOR_addcarrydi_0;
      mode0 = DImode;
      mode1 = TImode;
      mode2 = CCCmode;

    handlecarry:
      arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in.  */
      arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1.  */
      arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2.  */
      arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */

      op1 = expand_normal (arg0);
      if (!integer_zerop (arg0))
        op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));

      op2 = expand_normal (arg1);
      if (!register_operand (op2, mode0))
        op2 = copy_to_mode_reg (mode0, op2);

      op3 = expand_normal (arg2);
      if (!register_operand (op3, mode0))
        op3 = copy_to_mode_reg (mode0, op3);

      op4 = expand_normal (arg3);
      if (!address_operand (op4, VOIDmode))
        {
          op4 = convert_memory_address (Pmode, op4);
          op4 = copy_addr_to_reg (op4);
        }

      op0 = gen_reg_rtx (mode0);
      if (integer_zerop (arg0))
        {
          /* If arg0 is 0, optimize right away into add or sub
             instruction that sets CCCmode flags.  */
          op1 = gen_rtx_REG (mode2, FLAGS_REG);
          emit_insn (GEN_FCN (icode2) (op0, op2, op3));
        }
      else
        {
          /* Generate CF from input operand.  */
          emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));

          /* Generate instruction that consumes CF.  */
          op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
          pat = gen_rtx_LTU (mode1, op1, const0_rtx);
          pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
          emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
        }

      /* Return current CF value.  */
      if (target == 0)
        target = gen_reg_rtx (QImode);

      pat = gen_rtx_LTU (QImode, op1, const0_rtx);
      emit_insn (gen_rtx_SET (target, pat));

      /* Store the result.  */
      emit_move_insn (gen_rtx_MEM (mode0, op4), op0);

      return target;
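      /* For reference, a hypothetical multiword-add sketch using the
         intrinsic expanded above (assuming <immintrin.h>/adxintrin.h):

            #include <immintrin.h>

            void
            add_128 (const unsigned int a[4], const unsigned int b[4],
                     unsigned int sum[4])
            {
              unsigned char c = 0;
              // Each step consumes the previous carry and produces the next.
              c = _addcarry_u32 (c, a[0], b[0], &sum[0]);
              c = _addcarry_u32 (c, a[1], b[1], &sum[1]);
              c = _addcarry_u32 (c, a[2], b[2], &sum[2]);
              c = _addcarry_u32 (c, a[3], b[3], &sum[3]);
            }

         When c_in is a literal 0 the expansion above skips regenerating CF
         and emits a plain flag-setting add/sub instead.  */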
    case IX86_BUILTIN_READ_FLAGS:
      emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));

      if (optimize
          || target == NULL_RTX
          || !nonimmediate_operand (target, word_mode)
          || GET_MODE (target) != word_mode)
        target = gen_reg_rtx (word_mode);

      emit_insn (gen_pop (target));
      return target;

    case IX86_BUILTIN_WRITE_FLAGS:

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      if (!general_no_elim_operand (op0, word_mode))
        op0 = copy_to_mode_reg (word_mode, op0);

      emit_insn (gen_push (op0));
      emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
      return 0;
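      /* For reference: the flags register cannot be moved directly, so
         both builtins round-trip it through the stack (pushf/popf).  A
         hypothetical usage sketch (assuming x86-64 and <x86intrin.h>,
         where __readeflags returns unsigned long long):

            #include <x86intrin.h>

            unsigned long long
            flags_with_cf_set (void)
            {
              __writeeflags (__readeflags () | 1);   // bit 0 is CF
              return __readeflags ();
            }
      */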
    case IX86_BUILTIN_KTESTC8:
      icode = CODE_FOR_ktestqi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ8:
      icode = CODE_FOR_ktestqi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KTESTC16:
      icode = CODE_FOR_ktesthi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ16:
      icode = CODE_FOR_ktesthi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KTESTC32:
      icode = CODE_FOR_ktestsi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ32:
      icode = CODE_FOR_ktestsi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KTESTC64:
      icode = CODE_FOR_ktestdi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ64:
      icode = CODE_FOR_ktestdi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC8:
      icode = CODE_FOR_kortestqi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ8:
      icode = CODE_FOR_kortestqi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC16:
      icode = CODE_FOR_kortesthi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ16:
      icode = CODE_FOR_kortesthi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC32:
      icode = CODE_FOR_kortestsi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ32:
      icode = CODE_FOR_kortestsi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC64:
      icode = CODE_FOR_kortestdi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ64:
      icode = CODE_FOR_kortestdi;
      mode3 = CCZmode;

    kortest:
      arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1.  */
      arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2.  */
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      mode0 = insn_data[icode].operand[0].mode;
      mode1 = insn_data[icode].operand[1].mode;

      if (GET_MODE (op0) != VOIDmode)
        op0 = force_reg (GET_MODE (op0), op0);

      op0 = gen_lowpart (mode0, op0);

      if (!insn_data[icode].operand[0].predicate (op0, mode0))
        op0 = copy_to_mode_reg (mode0, op0);

      if (GET_MODE (op1) != VOIDmode)
        op1 = force_reg (GET_MODE (op1), op1);

      op1 = gen_lowpart (mode1, op1);

      if (!insn_data[icode].operand[1].predicate (op1, mode1))
        op1 = copy_to_mode_reg (mode1, op1);

      target = gen_reg_rtx (QImode);

      /* Emit kortest.  */
      emit_insn (GEN_FCN (icode) (op0, op1));
      /* And use setcc to return result from flags.  */
      ix86_expand_setcc (target, EQ,
                         gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
      return target;
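      /* For reference, a hypothetical usage sketch of a kortest intrinsic
         routed through this case (assuming -mavx512f and <immintrin.h>):

            #include <immintrin.h>

            int
            both_masks_empty (__mmask16 a, __mmask16 b)
            {
              // Nonzero iff (a | b) == 0; the Z variant reads ZF,
              // the C variant reads CF.
              return _mm512_kortestz (a, b);
            }
      */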
    case IX86_BUILTIN_GATHERSIV2DF:
      icode = CODE_FOR_avx2_gathersiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4DF:
      icode = CODE_FOR_avx2_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV2DF:
      icode = CODE_FOR_avx2_gatherdiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4DF:
      icode = CODE_FOR_avx2_gatherdiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4SF:
      icode = CODE_FOR_avx2_gathersiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV8SF:
      icode = CODE_FOR_avx2_gathersiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4SF:
      icode = CODE_FOR_avx2_gatherdiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV8SF:
      icode = CODE_FOR_avx2_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV2DI:
      icode = CODE_FOR_avx2_gathersiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4DI:
      icode = CODE_FOR_avx2_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV2DI:
      icode = CODE_FOR_avx2_gatherdiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4DI:
      icode = CODE_FOR_avx2_gatherdiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4SI:
      icode = CODE_FOR_avx2_gathersiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV8SI:
      icode = CODE_FOR_avx2_gathersiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4SI:
      icode = CODE_FOR_avx2_gatherdiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV8SI:
      icode = CODE_FOR_avx2_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTSIV4DF:
      icode = CODE_FOR_avx2_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTDIV8SF:
      icode = CODE_FOR_avx2_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTSIV4DI:
      icode = CODE_FOR_avx2_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTDIV8SI:
      icode = CODE_FOR_avx2_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV16SF:
      icode = CODE_FOR_avx512f_gathersiv16sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8DF:
      icode = CODE_FOR_avx512f_gathersiv8df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV16SF:
      icode = CODE_FOR_avx512f_gatherdiv16sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8DF:
      icode = CODE_FOR_avx512f_gatherdiv8df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV16SI:
      icode = CODE_FOR_avx512f_gathersiv16si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8DI:
      icode = CODE_FOR_avx512f_gathersiv8di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV16SI:
      icode = CODE_FOR_avx512f_gatherdiv16si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8DI:
      icode = CODE_FOR_avx512f_gatherdiv8di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV8DF:
      icode = CODE_FOR_avx512f_gathersiv8df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV16SF:
      icode = CODE_FOR_avx512f_gatherdiv16sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV8DI:
      icode = CODE_FOR_avx512f_gathersiv8di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV16SI:
      icode = CODE_FOR_avx512f_gatherdiv16si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV2DF:
      icode = CODE_FOR_avx512vl_gathersiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4DF:
      icode = CODE_FOR_avx512vl_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV2DF:
      icode = CODE_FOR_avx512vl_gatherdiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4DF:
      icode = CODE_FOR_avx512vl_gatherdiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4SF:
      icode = CODE_FOR_avx512vl_gathersiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8SF:
      icode = CODE_FOR_avx512vl_gathersiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4SF:
      icode = CODE_FOR_avx512vl_gatherdiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8SF:
      icode = CODE_FOR_avx512vl_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV2DI:
      icode = CODE_FOR_avx512vl_gathersiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4DI:
      icode = CODE_FOR_avx512vl_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV2DI:
      icode = CODE_FOR_avx512vl_gatherdiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4DI:
      icode = CODE_FOR_avx512vl_gatherdiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4SI:
      icode = CODE_FOR_avx512vl_gathersiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8SI:
      icode = CODE_FOR_avx512vl_gathersiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4SI:
      icode = CODE_FOR_avx512vl_gatherdiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8SI:
      icode = CODE_FOR_avx512vl_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV4DF:
      icode = CODE_FOR_avx512vl_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV8SF:
      icode = CODE_FOR_avx512vl_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV4DI:
      icode = CODE_FOR_avx512vl_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV8SI:
      icode = CODE_FOR_avx512vl_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_SCATTERSIV16SF:
      icode = CODE_FOR_avx512f_scattersiv16sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8DF:
      icode = CODE_FOR_avx512f_scattersiv8df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV16SF:
      icode = CODE_FOR_avx512f_scatterdiv16sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8DF:
      icode = CODE_FOR_avx512f_scatterdiv8df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV16SI:
      icode = CODE_FOR_avx512f_scattersiv16si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8DI:
      icode = CODE_FOR_avx512f_scattersiv8di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV16SI:
      icode = CODE_FOR_avx512f_scatterdiv16si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8DI:
      icode = CODE_FOR_avx512f_scatterdiv8di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8SF:
      icode = CODE_FOR_avx512vl_scattersiv8sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4SF:
      icode = CODE_FOR_avx512vl_scattersiv4sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4DF:
      icode = CODE_FOR_avx512vl_scattersiv4df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV2DF:
      icode = CODE_FOR_avx512vl_scattersiv2df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8SF:
      icode = CODE_FOR_avx512vl_scatterdiv8sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4SF:
      icode = CODE_FOR_avx512vl_scatterdiv4sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4DF:
      icode = CODE_FOR_avx512vl_scatterdiv4df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV2DF:
      icode = CODE_FOR_avx512vl_scatterdiv2df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8SI:
      icode = CODE_FOR_avx512vl_scattersiv8si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4SI:
      icode = CODE_FOR_avx512vl_scattersiv4si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4DI:
      icode = CODE_FOR_avx512vl_scattersiv4di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV2DI:
      icode = CODE_FOR_avx512vl_scattersiv2di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8SI:
      icode = CODE_FOR_avx512vl_scatterdiv8si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4SI:
      icode = CODE_FOR_avx512vl_scatterdiv4si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4DI:
      icode = CODE_FOR_avx512vl_scatterdiv4di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV2DI:
      icode = CODE_FOR_avx512vl_scatterdiv2di;
      goto scatter_gen;
    case IX86_BUILTIN_GATHERPFDPD:
      icode = CODE_FOR_avx512pf_gatherpfv8sidf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERALTSIV8DF:
      icode = CODE_FOR_avx512f_scattersiv8df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV16SF:
      icode = CODE_FOR_avx512f_scatterdiv16sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV8DI:
      icode = CODE_FOR_avx512f_scattersiv8di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV16SI:
      icode = CODE_FOR_avx512f_scatterdiv16si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV4DF:
      icode = CODE_FOR_avx512vl_scattersiv4df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV8SF:
      icode = CODE_FOR_avx512vl_scatterdiv8sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV4DI:
      icode = CODE_FOR_avx512vl_scattersiv4di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV8SI:
      icode = CODE_FOR_avx512vl_scatterdiv8si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV2DF:
      icode = CODE_FOR_avx512vl_scattersiv2df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV4SF:
      icode = CODE_FOR_avx512vl_scatterdiv4sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV2DI:
      icode = CODE_FOR_avx512vl_scattersiv2di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV4SI:
      icode = CODE_FOR_avx512vl_scatterdiv4si;
      goto scatter_gen;
    case IX86_BUILTIN_GATHERPFDPS:
      icode = CODE_FOR_avx512pf_gatherpfv16sisf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_GATHERPFQPD:
      icode = CODE_FOR_avx512pf_gatherpfv8didf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_GATHERPFQPS:
      icode = CODE_FOR_avx512pf_gatherpfv8disf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFDPD:
      icode = CODE_FOR_avx512pf_scatterpfv8sidf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFDPS:
      icode = CODE_FOR_avx512pf_scatterpfv16sisf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFQPD:
      icode = CODE_FOR_avx512pf_scatterpfv8didf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFQPS:
      icode = CODE_FOR_avx512pf_scatterpfv8disf;
      goto vec_prefetch_gen;
    gather_gen:
      rtx (*gen) (rtx, rtx);
      rtx half;

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      arg4 = CALL_EXPR_ARG (exp, 4);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);
      op4 = expand_normal (arg4);
      /* Note the arg order is different from the operand order.  */
      mode0 = insn_data[icode].operand[1].mode;
      mode2 = insn_data[icode].operand[3].mode;
      mode3 = insn_data[icode].operand[4].mode;
      mode4 = insn_data[icode].operand[5].mode;

      if (target == NULL_RTX
          || GET_MODE (target) != insn_data[icode].operand[0].mode
          || !insn_data[icode].operand[0].predicate (target,
                                                     GET_MODE (target)))
        subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
      else
        subtarget = target;

      switch (fcode)
        {
        case IX86_BUILTIN_GATHER3ALTSIV8DF:
        case IX86_BUILTIN_GATHER3ALTSIV8DI:
          half = gen_reg_rtx (V8SImode);
          if (!nonimmediate_operand (op2, V16SImode))
            op2 = copy_to_mode_reg (V16SImode, op2);
          emit_insn (gen_vec_extract_lo_v16si (half, op2));
          op2 = half;
          break;
        case IX86_BUILTIN_GATHER3ALTSIV4DF:
        case IX86_BUILTIN_GATHER3ALTSIV4DI:
        case IX86_BUILTIN_GATHERALTSIV4DF:
        case IX86_BUILTIN_GATHERALTSIV4DI:
          half = gen_reg_rtx (V4SImode);
          if (!nonimmediate_operand (op2, V8SImode))
            op2 = copy_to_mode_reg (V8SImode, op2);
          emit_insn (gen_vec_extract_lo_v8si (half, op2));
          op2 = half;
          break;
        case IX86_BUILTIN_GATHER3ALTDIV16SF:
        case IX86_BUILTIN_GATHER3ALTDIV16SI:
          half = gen_reg_rtx (mode0);
          if (mode0 == V8SFmode)
            gen = gen_vec_extract_lo_v16sf;
          else
            gen = gen_vec_extract_lo_v16si;
          if (!nonimmediate_operand (op0, GET_MODE (op0)))
            op0 = copy_to_mode_reg (GET_MODE (op0), op0);
          emit_insn (gen (half, op0));
          op0 = half;
          op3 = lowpart_subreg (QImode, op3, HImode);
          break;
        case IX86_BUILTIN_GATHER3ALTDIV8SF:
        case IX86_BUILTIN_GATHER3ALTDIV8SI:
        case IX86_BUILTIN_GATHERALTDIV8SF:
        case IX86_BUILTIN_GATHERALTDIV8SI:
          half = gen_reg_rtx (mode0);
          if (mode0 == V4SFmode)
            gen = gen_vec_extract_lo_v8sf;
          else
            gen = gen_vec_extract_lo_v8si;
          if (!nonimmediate_operand (op0, GET_MODE (op0)))
            op0 = copy_to_mode_reg (GET_MODE (op0), op0);
          emit_insn (gen (half, op0));
          op0 = half;
          if (VECTOR_MODE_P (GET_MODE (op3)))
            {
              half = gen_reg_rtx (mode0);
              if (!nonimmediate_operand (op3, GET_MODE (op3)))
                op3 = copy_to_mode_reg (GET_MODE (op3), op3);
              emit_insn (gen (half, op3));
              op3 = half;
            }
          break;
        default:
          break;
        }

      /* Force memory operand only with base register here.  But we
         don't want to do it on memory operand for other builtin
         functions.  */
      op1 = ix86_zero_extend_to_Pmode (op1);

      if (!insn_data[icode].operand[1].predicate (op0, mode0))
        op0 = copy_to_mode_reg (mode0, op0);
      if (!insn_data[icode].operand[2].predicate (op1, Pmode))
        op1 = copy_to_mode_reg (Pmode, op1);
      if (!insn_data[icode].operand[3].predicate (op2, mode2))
        op2 = copy_to_mode_reg (mode2, op2);

      op3 = fixup_modeless_constant (op3, mode3);

      if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
        {
          if (!insn_data[icode].operand[4].predicate (op3, mode3))
            op3 = copy_to_mode_reg (mode3, op3);
        }
      else
        {
          op3 = copy_to_reg (op3);
          op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
        }
      if (!insn_data[icode].operand[5].predicate (op4, mode4))
        {
          error ("the last argument must be scale 1, 2, 4, 8");
          return const0_rtx;
        }

      /* Optimize.  If mask is known to have all high bits set,
         replace op0 with pc_rtx to signal that the instruction
         overwrites the whole destination and doesn't use its
         previous contents.  */
      if (optimize)
        {
          if (TREE_CODE (arg3) == INTEGER_CST)
            {
              if (integer_all_onesp (arg3))
                op0 = pc_rtx;
            }
          else if (TREE_CODE (arg3) == VECTOR_CST)
            {
              unsigned int negative = 0;
              for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
                {
                  tree cst = VECTOR_CST_ELT (arg3, i);
                  if (TREE_CODE (cst) == INTEGER_CST
                      && tree_int_cst_sign_bit (cst))
                    negative++;
                  else if (TREE_CODE (cst) == REAL_CST
                           && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
                    negative++;
                }
              if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
                op0 = pc_rtx;
            }
          else if (TREE_CODE (arg3) == SSA_NAME
                   && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
            {
              /* Recognize also when mask is like:
                 __v2df src = _mm_setzero_pd ();
                 __v2df mask = _mm_cmpeq_pd (src, src);
                 or
                 __v8sf src = _mm256_setzero_ps ();
                 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
                 as that is a cheaper way to load all ones into
                 a register than having to load a constant from
                 memory.  */
              gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
              if (is_gimple_call (def_stmt))
                {
                  tree fndecl = gimple_call_fndecl (def_stmt);
                  if (fndecl
                      && fndecl_built_in_p (fndecl, BUILT_IN_MD))
                    switch (DECL_MD_FUNCTION_CODE (fndecl))
                      {
                      case IX86_BUILTIN_CMPPD:
                      case IX86_BUILTIN_CMPPS:
                      case IX86_BUILTIN_CMPPD256:
                      case IX86_BUILTIN_CMPPS256:
                        if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
                          break;
                        /* FALLTHRU */
                      case IX86_BUILTIN_CMPEQPD:
                      case IX86_BUILTIN_CMPEQPS:
                        if (initializer_zerop (gimple_call_arg (def_stmt, 0))
                            && initializer_zerop (gimple_call_arg (def_stmt,
                                                                   1)))
                          op0 = pc_rtx;
                        break;
                      default:
                        break;
                      }
                }
            }
        }

      pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
      if (!pat)
        return const0_rtx;
      emit_insn (pat);

      switch (fcode)
        {
        case IX86_BUILTIN_GATHER3DIV16SF:
          if (target == NULL_RTX)
            target = gen_reg_rtx (V8SFmode);
          emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
          break;
        case IX86_BUILTIN_GATHER3DIV16SI:
          if (target == NULL_RTX)
            target = gen_reg_rtx (V8SImode);
          emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
          break;
        case IX86_BUILTIN_GATHER3DIV8SF:
        case IX86_BUILTIN_GATHERDIV8SF:
          if (target == NULL_RTX)
            target = gen_reg_rtx (V4SFmode);
          emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
          break;
        case IX86_BUILTIN_GATHER3DIV8SI:
        case IX86_BUILTIN_GATHERDIV8SI:
          if (target == NULL_RTX)
            target = gen_reg_rtx (V4SImode);
          emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
          break;
        default:
          target = subtarget;
          break;
        }
      return target;
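      /* For reference, a hypothetical usage sketch of an AVX2 gather routed
         through gather_gen (assuming -mavx2 and <immintrin.h>):

            #include <immintrin.h>

            __m256
            gather8 (const float *base, __m256i idx)
            {
              // Loads base[idx[i]] for i = 0..7; the scale argument must
              // be a literal 1, 2, 4 or 8, which is what the
              // "last argument must be scale" diagnostic above enforces.
              return _mm256_i32gather_ps (base, idx, 4);
            }
      */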
    scatter_gen:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      arg4 = CALL_EXPR_ARG (exp, 4);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);
      op4 = expand_normal (arg4);
      mode1 = insn_data[icode].operand[1].mode;
      mode2 = insn_data[icode].operand[2].mode;
      mode3 = insn_data[icode].operand[3].mode;
      mode4 = insn_data[icode].operand[4].mode;

      /* Scatter instruction stores operand op3 to memory with
         indices from op2 and scale from op4 under writemask op1.
         If index operand op2 has more elements than source operand
         op3, only its low half is used; and vice versa.  */
      switch (fcode)
        {
        case IX86_BUILTIN_SCATTERALTSIV8DF:
        case IX86_BUILTIN_SCATTERALTSIV8DI:
          half = gen_reg_rtx (V8SImode);
          if (!nonimmediate_operand (op2, V16SImode))
            op2 = copy_to_mode_reg (V16SImode, op2);
          emit_insn (gen_vec_extract_lo_v16si (half, op2));
          op2 = half;
          break;
        case IX86_BUILTIN_SCATTERALTDIV16SF:
        case IX86_BUILTIN_SCATTERALTDIV16SI:
          half = gen_reg_rtx (mode3);
          if (mode3 == V8SFmode)
            gen = gen_vec_extract_lo_v16sf;
          else
            gen = gen_vec_extract_lo_v16si;
          if (!nonimmediate_operand (op3, GET_MODE (op3)))
            op3 = copy_to_mode_reg (GET_MODE (op3), op3);
          emit_insn (gen (half, op3));
          op3 = half;
          break;
        case IX86_BUILTIN_SCATTERALTSIV4DF:
        case IX86_BUILTIN_SCATTERALTSIV4DI:
          half = gen_reg_rtx (V4SImode);
          if (!nonimmediate_operand (op2, V8SImode))
            op2 = copy_to_mode_reg (V8SImode, op2);
          emit_insn (gen_vec_extract_lo_v8si (half, op2));
          op2 = half;
          break;
        case IX86_BUILTIN_SCATTERALTDIV8SF:
        case IX86_BUILTIN_SCATTERALTDIV8SI:
          half = gen_reg_rtx (mode3);
          if (mode3 == V4SFmode)
            gen = gen_vec_extract_lo_v8sf;
          else
            gen = gen_vec_extract_lo_v8si;
          if (!nonimmediate_operand (op3, GET_MODE (op3)))
            op3 = copy_to_mode_reg (GET_MODE (op3), op3);
          emit_insn (gen (half, op3));
          op3 = half;
          break;
        case IX86_BUILTIN_SCATTERALTSIV2DF:
        case IX86_BUILTIN_SCATTERALTSIV2DI:
          if (!nonimmediate_operand (op2, V4SImode))
            op2 = copy_to_mode_reg (V4SImode, op2);
          break;
        case IX86_BUILTIN_SCATTERALTDIV4SF:
        case IX86_BUILTIN_SCATTERALTDIV4SI:
          if (!nonimmediate_operand (op3, GET_MODE (op3)))
            op3 = copy_to_mode_reg (GET_MODE (op3), op3);
          break;
        default:
          break;
        }

      /* Force memory operand only with base register here.  But we
         don't want to do it on memory operand for other builtin
         functions.  */
      op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));

      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
        op0 = copy_to_mode_reg (Pmode, op0);

      op1 = fixup_modeless_constant (op1, mode1);

      if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
        {
          if (!insn_data[icode].operand[1].predicate (op1, mode1))
            op1 = copy_to_mode_reg (mode1, op1);
        }
      else
        {
          op1 = copy_to_reg (op1);
          op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
        }

      if (!insn_data[icode].operand[2].predicate (op2, mode2))
        op2 = copy_to_mode_reg (mode2, op2);

      if (!insn_data[icode].operand[3].predicate (op3, mode3))
        op3 = copy_to_mode_reg (mode3, op3);

      if (!insn_data[icode].operand[4].predicate (op4, mode4))
        {
          error ("the last argument must be scale 1, 2, 4, 8");
          return const0_rtx;
        }

      pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
      if (!pat)
        return const0_rtx;

      emit_insn (pat);
      return 0;
    vec_prefetch_gen:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      arg4 = CALL_EXPR_ARG (exp, 4);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);
      op4 = expand_normal (arg4);
      mode0 = insn_data[icode].operand[0].mode;
      mode1 = insn_data[icode].operand[1].mode;
      mode3 = insn_data[icode].operand[3].mode;
      mode4 = insn_data[icode].operand[4].mode;

      op0 = fixup_modeless_constant (op0, mode0);

      if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
        {
          if (!insn_data[icode].operand[0].predicate (op0, mode0))
            op0 = copy_to_mode_reg (mode0, op0);
        }
      else
        {
          op0 = copy_to_reg (op0);
          op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
        }

      if (!insn_data[icode].operand[1].predicate (op1, mode1))
        op1 = copy_to_mode_reg (mode1, op1);

      /* Force memory operand only with base register here.  But we
         don't want to do it on memory operand for other builtin
         functions.  */
      op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));

      if (!insn_data[icode].operand[2].predicate (op2, Pmode))
        op2 = copy_to_mode_reg (Pmode, op2);

      if (!insn_data[icode].operand[3].predicate (op3, mode3))
        {
          error ("the fourth argument must be scale 1, 2, 4, 8");
          return const0_rtx;
        }

      if (!insn_data[icode].operand[4].predicate (op4, mode4))
        {
          error ("incorrect hint operand");
          return const0_rtx;
        }

      pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
      if (!pat)
        return const0_rtx;

      emit_insn (pat);
      return 0;
    case IX86_BUILTIN_XABORT:
      icode = CODE_FOR_xabort;
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      mode0 = insn_data[icode].operand[0].mode;
      if (!insn_data[icode].operand[0].predicate (op0, mode0))
        {
          error ("the argument to %<xabort%> intrinsic must "
                 "be an 8-bit immediate");
          return const0_rtx;
        }
      emit_insn (gen_xabort (op0));
      return 0;
    case IX86_BUILTIN_RDSSPD:
    case IX86_BUILTIN_RDSSPQ:
      mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);

      if (target == 0
          || !register_operand (target, mode))
        target = gen_reg_rtx (mode);

      op0 = force_reg (mode, const0_rtx);

      emit_insn (gen_rdssp (mode, target, op0));
      return target;

    case IX86_BUILTIN_INCSSPD:
    case IX86_BUILTIN_INCSSPQ:
      mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      op0 = force_reg (mode, op0);

      emit_insn (gen_incssp (mode, op0));
      return 0;

    case IX86_BUILTIN_RSTORSSP:
    case IX86_BUILTIN_CLRSSBSY:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = (fcode == IX86_BUILTIN_RSTORSSP
               ? CODE_FOR_rstorssp
               : CODE_FOR_clrssbsy);

      if (!address_operand (op0, VOIDmode))
        {
          op0 = convert_memory_address (Pmode, op0);
          op0 = copy_addr_to_reg (op0);
        }
      emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
      return 0;

    case IX86_BUILTIN_WRSSD:
    case IX86_BUILTIN_WRSSQ:
    case IX86_BUILTIN_WRUSSD:
    case IX86_BUILTIN_WRUSSQ:
      mode = ((fcode == IX86_BUILTIN_WRSSD
               || fcode == IX86_BUILTIN_WRUSSD)
              ? SImode : DImode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op1 = expand_normal (arg1);

      op0 = force_reg (mode, op0);

      if (!address_operand (op1, VOIDmode))
        {
          op1 = convert_memory_address (Pmode, op1);
          op1 = copy_addr_to_reg (op1);
        }
      op1 = gen_rtx_MEM (mode, op1);

      icode = ((fcode == IX86_BUILTIN_WRSSD
                || fcode == IX86_BUILTIN_WRSSQ)
               ? code_for_wrss (mode)
               : code_for_wruss (mode));
      emit_insn (GEN_FCN (icode) (op0, op1));

      return 0;

    default:
      break;
    }
  if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
      && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
      return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
                                               target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
      && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
      rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
      rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
      rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
      machine_mode mode, wide_mode, nar_mode;

      nar_mode  = V4SFmode;
      mode      = V16SFmode;
      wide_mode = V64SFmode;
      fcn_mask  = gen_avx5124fmaddps_4fmaddps_mask;
      fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;

      switch (fcode)
        {
        case IX86_BUILTIN_4FMAPS:
          fcn = gen_avx5124fmaddps_4fmaddps;
          goto v4fma_expand;

        case IX86_BUILTIN_4DPWSSD:
          nar_mode  = V4SImode;
          mode      = V16SImode;
          wide_mode = V64SImode;
          fcn = gen_avx5124vnniw_vp4dpwssd;
          goto v4fma_expand;

        case IX86_BUILTIN_4DPWSSDS:
          nar_mode  = V4SImode;
          mode      = V16SImode;
          wide_mode = V64SImode;
          fcn = gen_avx5124vnniw_vp4dpwssds;
          goto v4fma_expand;

        case IX86_BUILTIN_4FNMAPS:
          fcn = gen_avx5124fmaddps_4fnmaddps;
          goto v4fma_expand;

        case IX86_BUILTIN_4FNMAPS_MASK:
          fcn_mask  = gen_avx5124fmaddps_4fnmaddps_mask;
          fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
          goto v4fma_expand;

        case IX86_BUILTIN_4DPWSSD_MASK:
          nar_mode  = V4SImode;
          mode      = V16SImode;
          wide_mode = V64SImode;
          fcn_mask  = gen_avx5124vnniw_vp4dpwssd_mask;
          fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
          goto v4fma_expand;

        case IX86_BUILTIN_4DPWSSDS_MASK:
          nar_mode  = V4SImode;
          mode      = V16SImode;
          wide_mode = V64SImode;
          fcn_mask  = gen_avx5124vnniw_vp4dpwssds_mask;
          fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
          goto v4fma_expand;

        case IX86_BUILTIN_4FMAPS_MASK:
          {
            rtx args[4], ops[4], wide_reg, accum, addr, mem;

          v4fma_expand:
            wide_reg = gen_reg_rtx (wide_mode);
            for (i = 0; i < 4; i++)
              {
                args[i] = CALL_EXPR_ARG (exp, i);
                ops[i] = expand_normal (args[i]);

                emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
                                ops[i]);
              }

            accum = expand_normal (CALL_EXPR_ARG (exp, 4));
            accum = force_reg (mode, accum);

            addr = expand_normal (CALL_EXPR_ARG (exp, 5));
            addr = force_reg (Pmode, addr);

            mem = gen_rtx_MEM (nar_mode, addr);

            target = gen_reg_rtx (mode);

            emit_move_insn (target, accum);

            if (fcn)
              emit_insn (fcn (target, accum, wide_reg, mem));
            else
              {
                rtx merge, mask;
                merge = expand_normal (CALL_EXPR_ARG (exp, 6));

                mask = expand_normal (CALL_EXPR_ARG (exp, 7));

                if (CONST_INT_P (mask))
                  mask = fixup_modeless_constant (mask, HImode);

                mask = force_reg (HImode, mask);

                if (GET_MODE (mask) != HImode)
                  mask = gen_rtx_SUBREG (HImode, mask, 0);

                /* If merge is 0 then we're about to emit z-masked
                   variant.  */
                if (const0_operand (merge, mode))
                  emit_insn (fcn_maskz (target, accum, wide_reg, mem,
                                        merge, mask));
                /* If merge is the same as accum then emit merge-masked
                   variant.  */
                else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
                  {
                    merge = force_reg (mode, merge);
                    emit_insn (fcn_mask (target, wide_reg, mem, merge,
                                         mask));
                  }
                /* Merge with something unknown might happen if we z-mask
                   w/ -O0.  */
                else
                  {
                    target = gen_reg_rtx (mode);
                    emit_move_insn (target, merge);
                    emit_insn (fcn_mask (target, wide_reg, mem, target,
                                         mask));
                  }
              }
            return target;
          }

        case IX86_BUILTIN_4FNMASS:
          fcn = gen_avx5124fmaddps_4fnmaddss;
          goto s4fma_expand;

        case IX86_BUILTIN_4FMASS:
          fcn = gen_avx5124fmaddps_4fmaddss;
          goto s4fma_expand;

        case IX86_BUILTIN_4FNMASS_MASK:
          fcn_mask  = gen_avx5124fmaddps_4fnmaddss_mask;
          fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
          goto s4fma_expand;

        case IX86_BUILTIN_4FMASS_MASK:
          {
            rtx args[4], ops[4], wide_reg, accum, addr, mem, tmp;

            fcn_mask  = gen_avx5124fmaddps_4fmaddss_mask;
            fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;

          s4fma_expand:
            mode = V4SFmode;
            wide_reg = gen_reg_rtx (V64SFmode);
            for (i = 0; i < 4; i++)
              {
                args[i] = CALL_EXPR_ARG (exp, i);
                ops[i] = expand_normal (args[i]);

                tmp = gen_reg_rtx (SFmode);
                emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));

                emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
                                gen_rtx_SUBREG (V16SFmode, tmp, 0));
              }

            accum = expand_normal (CALL_EXPR_ARG (exp, 4));
            accum = force_reg (V4SFmode, accum);

            addr = expand_normal (CALL_EXPR_ARG (exp, 5));
            addr = force_reg (Pmode, addr);

            mem = gen_rtx_MEM (V4SFmode, addr);

            target = gen_reg_rtx (V4SFmode);

            emit_move_insn (target, accum);

            if (fcn)
              emit_insn (fcn (target, accum, wide_reg, mem));
            else
              {
                rtx merge, mask;
                merge = expand_normal (CALL_EXPR_ARG (exp, 6));

                mask = expand_normal (CALL_EXPR_ARG (exp, 7));

                if (CONST_INT_P (mask))
                  mask = fixup_modeless_constant (mask, QImode);

                mask = force_reg (QImode, mask);

                if (GET_MODE (mask) != QImode)
                  mask = gen_rtx_SUBREG (QImode, mask, 0);

                /* If merge is 0 then we're about to emit z-masked
                   variant.  */
                if (const0_operand (merge, mode))
                  emit_insn (fcn_maskz (target, accum, wide_reg, mem,
                                        merge, mask));
                /* If merge is the same as accum then emit merge-masked
                   variant.  */
                else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
                  {
                    merge = force_reg (mode, merge);
                    emit_insn (fcn_mask (target, wide_reg, mem, merge,
                                         mask));
                  }
                /* Merge with something unknown might happen if we z-mask
                   w/ -O0.  */
                else
                  {
                    target = gen_reg_rtx (mode);
                    emit_move_insn (target, merge);
                    emit_insn (fcn_mask (target, wide_reg, mem, target,
                                         mask));
                  }
              }
            return target;
          }

        case IX86_BUILTIN_RDPID:
          return ix86_expand_special_args_builtin (bdesc_args + i, exp,
                                                   target);

        case IX86_BUILTIN_FABSQ:
        case IX86_BUILTIN_COPYSIGNQ:
          if (!TARGET_SSE)
            /* Emit a normal call if SSE isn't available.  */
            return expand_call (exp, target, ignore);
          /* FALLTHRU */
        default:
          return ix86_expand_args_builtin (bdesc_args + i, exp, target);
        }
    }

  if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
      && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
      return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
      && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
      return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
      && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
      return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
      && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
      return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
      && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
      const struct builtin_description *d = bdesc_multi_arg + i;
      return ix86_expand_multi_arg_builtin (d->icode, exp, target,
                                            (enum ix86_builtin_func_type)
                                            d->flag, d->comparison);
    }

  if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
      && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
      return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
                                               target);
    }

  gcc_unreachable ();
}
/* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
   fill target with val via vec_duplicate.  */

static bool
ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
{
  bool ok;
  rtx_insn *insn;
  rtx dup;

  /* First attempt to recognize VAL as-is.  */
  dup = gen_vec_duplicate (mode, val);
  insn = emit_insn (gen_rtx_SET (target, dup));
  if (recog_memoized (insn) < 0)
    {
      rtx_insn *seq;
      machine_mode innermode = GET_MODE_INNER (mode);
      rtx reg;

      /* If that fails, force VAL into a register.  */

      start_sequence ();
      reg = force_reg (innermode, val);
      if (GET_MODE (reg) != innermode)
        reg = gen_lowpart (innermode, reg);
      SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
      seq = get_insns ();
      end_sequence ();
      if (seq)
        emit_insn_before (seq, insn);

      ok = recog_memoized (insn) >= 0;
      gcc_assert (ok);
    }
  return true;
}

/* Get a vector mode of the same size as the original but with elements
   twice as wide.  This is only guaranteed to apply to integral vectors.  */

static machine_mode
get_mode_wider_vector (machine_mode o)
{
  /* ??? Rely on the ordering that genmodes.c gives to vectors.  */
  machine_mode n = GET_MODE_WIDER_MODE (o).require ();
  gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
  gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
  return n;
}

static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   with all elements equal to VAR.  Return true if successful.  */

static bool
ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
				   rtx target, rtx val)
{
  bool ok;

  switch (mode)
    {
    case E_V2SImode:
    case E_V2SFmode:
      if (!mmx_ok)
	return false;
      /* FALLTHRU */

    case E_V4DFmode:
    case E_V4DImode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V2DFmode:
    case E_V2DImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V16SFmode:
    case E_V8DFmode:
      return ix86_vector_duplicate_value (mode, target, val);

    case E_V4HImode:
      if (!mmx_ok)
	return false;
      if (TARGET_SSE || TARGET_3DNOW_A)
	{
	  rtx x;

	  val = gen_lowpart (SImode, val);
	  x = gen_rtx_TRUNCATE (HImode, val);
	  x = gen_rtx_VEC_DUPLICATE (mode, x);
	  emit_insn (gen_rtx_SET (target, x));
	  return true;
	}
      goto widen;

    case E_V8QImode:
      if (!mmx_ok)
	return false;
      goto widen;

    case E_V8HImode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);

      if (TARGET_SSE2)
	{
	  struct expand_vec_perm_d dperm;
	  rtx tmp1, tmp2;

	permute:
	  memset (&dperm, 0, sizeof (dperm));
	  dperm.target = target;
	  dperm.vmode = mode;
	  dperm.nelt = GET_MODE_NUNITS (mode);
	  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
	  dperm.one_operand_p = true;

	  /* Extend to SImode using a paradoxical SUBREG.  */
	  tmp1 = gen_reg_rtx (SImode);
	  emit_move_insn (tmp1, gen_lowpart (SImode, val));

	  /* Insert the SImode value as low element of a V4SImode vector.  */
	  tmp2 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
	  emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));

	  ok = (expand_vec_perm_1 (&dperm)
		|| expand_vec_perm_broadcast_1 (&dperm));
	  gcc_assert (ok);
	  return ok;
	}
      goto widen;

    case E_V16QImode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);

      if (TARGET_SSE2)
	goto permute;
      goto widen;

    widen:
      /* Replicate the value once into the next wider mode and recurse.  */
      {
	machine_mode smode, wsmode, wvmode;
	rtx x;

	smode = GET_MODE_INNER (mode);
	wvmode = get_mode_wider_vector (mode);
	wsmode = GET_MODE_INNER (wvmode);

	val = convert_modes (wsmode, smode, val, true);
	x = expand_simple_binop (wsmode, ASHIFT, val,
				 GEN_INT (GET_MODE_BITSIZE (smode)),
				 NULL_RTX, 1, OPTAB_LIB_WIDEN);
	val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);

	x = gen_reg_rtx (wvmode);
	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
	gcc_assert (ok);
	emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
	return ok;
      }

    case E_V16HImode:
    case E_V32QImode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);
      else
	{
	  machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
	  rtx x = gen_reg_rtx (hvmode);

	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
	  gcc_assert (ok);

	  x = gen_rtx_VEC_CONCAT (mode, x, x);
	  emit_insn (gen_rtx_SET (target, x));
	}
      return true;

    case E_V64QImode:
    case E_V32HImode:
      if (TARGET_AVX512BW)
	return ix86_vector_duplicate_value (mode, target, val);
      else
	{
	  machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
	  rtx x = gen_reg_rtx (hvmode);

	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
	  gcc_assert (ok);

	  x = gen_rtx_VEC_CONCAT (mode, x, x);
	  emit_insn (gen_rtx_SET (target, x));
	}
      return true;

    default:
      return false;
    }
}
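
/* Sketch of the "widen" strategy above (illustrative): two adjacent
   narrow elements are packed into one element of the next wider mode,
   and the wider vector is broadcast instead.  For a QImode value v:

     wide = (v << 8) | v;    /* HImode element holding v twice */
     broadcast wide across V8HImode, view the result as V16QImode

   which is what the ASHIFT/IOR pair followed by the recursive call
   implements.  */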
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   whose ONE_VAR element is VAR, and other elements are zero.  Return true
   if successful.  */

static bool
ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
				     rtx target, rtx var, int one_var)
{
  machine_mode vsimode;
  rtx new_target;
  rtx x, tmp;
  bool use_vector_set = false;
  rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;

  switch (mode)
    {
    case E_V2DImode:
      /* For SSE4.1, we normally use vector set.  But if the second
	 element is zero and inter-unit moves are OK, we use movq
	 instead.  */
      use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
			&& !(TARGET_INTER_UNIT_MOVES_TO_VEC
			     && one_var == 0));
      break;
    case E_V16QImode:
    case E_V4SImode:
    case E_V4SFmode:
      use_vector_set = TARGET_SSE4_1;
      break;
    case E_V8HImode:
      use_vector_set = TARGET_SSE2;
      break;
    case E_V8QImode:
      use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      break;
    case E_V4HImode:
      use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
      break;
    case E_V32QImode:
    case E_V16HImode:
      use_vector_set = TARGET_AVX;
      break;
    case E_V8SImode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv8si_0;
      break;
    case E_V8SFmode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv8sf_0;
      break;
    case E_V4DFmode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv4df_0;
      break;
    case E_V4DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      use_vector_set = TARGET_AVX && TARGET_64BIT;
      gen_vec_set_0 = gen_vec_setv4di_0;
      break;
    case E_V16SImode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16si_0;
      break;
    case E_V16SFmode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16sf_0;
      break;
    case E_V8DFmode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8df_0;
      break;
    case E_V8DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8di_0;
      break;
    default:
      break;
    }

  if (use_vector_set)
    {
      if (gen_vec_set_0 && one_var == 0)
	{
	  var = force_reg (GET_MODE_INNER (mode), var);
	  emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
	  return true;
	}
      emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
      var = force_reg (GET_MODE_INNER (mode), var);
      ix86_expand_vector_set (mmx_ok, target, var, one_var);
      return true;
    }

  switch (mode)
    {
    case E_V2SFmode:
    case E_V2SImode:
      if (!mmx_ok)
	return false;
      /* FALLTHRU */

    case E_V2DFmode:
    case E_V2DImode:
      if (one_var != 0)
	return false;
      var = force_reg (GET_MODE_INNER (mode), var);
      x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
      emit_insn (gen_rtx_SET (target, x));
      return true;

    case E_V4SFmode:
    case E_V4SImode:
      if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
	new_target = gen_reg_rtx (mode);
      else
	new_target = target;
      var = force_reg (GET_MODE_INNER (mode), var);
      x = gen_rtx_VEC_DUPLICATE (mode, var);
      x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
      emit_insn (gen_rtx_SET (new_target, x));
      if (one_var != 0)
	{
	  /* We need to shuffle the value to the correct position, so
	     create a new pseudo to store the intermediate result.  */

	  /* With SSE2, we can use the integer shuffle insns.  */
	  if (mode != V4SFmode && TARGET_SSE2)
	    {
	      emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
					    const1_rtx,
					    GEN_INT (one_var == 1 ? 0 : 1),
					    GEN_INT (one_var == 2 ? 0 : 1),
					    GEN_INT (one_var == 3 ? 0 : 1)));
	      if (target != new_target)
		emit_move_insn (target, new_target);
	      return true;
	    }

	  /* Otherwise convert the intermediate result to V4SFmode and
	     use the SSE1 shuffle instructions.  */
	  if (mode != V4SFmode)
	    {
	      tmp = gen_reg_rtx (V4SFmode);
	      emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
	    }
	  else
	    tmp = new_target;

	  emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
					  const1_rtx,
					  GEN_INT (one_var == 1 ? 0 : 1),
					  GEN_INT (one_var == 2 ? 0+4 : 1+4),
					  GEN_INT (one_var == 3 ? 0+4 : 1+4)));

	  if (mode != V4SFmode)
	    emit_move_insn (target, gen_lowpart (V4SImode, tmp));
	  else if (tmp != target)
	    emit_move_insn (target, tmp);
	}
      else if (target != new_target)
	emit_move_insn (target, new_target);
      return true;

    case E_V8HImode:
    case E_V16QImode:
      vsimode = V4SImode;
      goto widen;

    case E_V4HImode:
    case E_V8QImode:
      if (!mmx_ok)
	return false;
      vsimode = V2SImode;
      goto widen;

    widen:
      if (one_var != 0)
	return false;

      /* Zero extend the variable element to SImode and recurse.  */
      var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);

      x = gen_reg_rtx (vsimode);
      if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
						var, one_var))
	gcc_unreachable ();

      emit_move_insn (target, gen_lowpart (mode, x));
      return true;

    default:
      return false;
    }
}
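
/* Note on the VEC_MERGE idiom above (illustrative): merging
   (vec_duplicate v) with the zero vector under mask 1 keeps the
   duplicated value only in element 0 and zero elsewhere, i.e. the
   movd/movq-style scalar insert:

     target = { v, 0, 0, 0 }   /* for a four-element mode */

   The pshufd/shufps paths then move v from element 0 to ONE_VAR.  */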
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   consisting of the values in VALS.  It is known that all elements
   except ONE_VAR are constants.  Return true if successful.  */

static bool
ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals, int one_var)
{
  rtx var = XVECEXP (vals, 0, one_var);
  machine_mode wmode;
  rtx const_vec, x;

  const_vec = copy_rtx (vals);
  XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
  const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));

  switch (mode)
    {
    case E_V2DFmode:
    case E_V2DImode:
    case E_V2SFmode:
    case E_V2SImode:
      /* For the two element vectors, it's just as easy to use
	 the general case.  */
      return false;

    case E_V4DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      if (!TARGET_64BIT)
	return false;
      /* FALLTHRU */
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V16HImode:
    case E_V32QImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V8HImode:
    case E_V4HImode:
      break;

    case E_V16QImode:
      if (TARGET_SSE4_1)
	break;
      wmode = V8HImode;
      goto widen;

    case E_V8QImode:
      if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
	break;
      wmode = V4HImode;
      goto widen;

    widen:
      /* There's no way to set one QImode entry easily.  Combine
	 the variable value with its adjacent constant value, and
	 promote to an HImode set.  */
      x = XVECEXP (vals, 0, one_var ^ 1);
      if (one_var & 1)
	{
	  var = convert_modes (HImode, QImode, var, true);
	  var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
	  x = GEN_INT (INTVAL (x) & 0xff);
	}
      else
	{
	  var = convert_modes (HImode, QImode, var, true);
	  x = gen_int_mode (UINTVAL (x) << 8, HImode);
	}
      if (x != const0_rtx)
	var = expand_simple_binop (HImode, IOR, var, x, var,
				   1, OPTAB_LIB_WIDEN);

      x = gen_reg_rtx (wmode);
      emit_move_insn (x, gen_lowpart (wmode, const_vec));
      ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);

      emit_move_insn (target, gen_lowpart (mode, x));
      return true;

    default:
      return false;
    }

  emit_move_insn (target, const_vec);
  ix86_expand_vector_set (mmx_ok, target, var, one_var);
  return true;
}
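
/* Worked example (illustrative) for the QImode path above: setting
   element 5 of a V16QImode vector pairs it with constant element 4,
   forms the HImode value (var << 8) | cst, and performs an HImode
   vector-set at index 5 >> 1 == 2.  The one_var ^ 1 / one_var >> 1
   arithmetic above is exactly this pairing.  */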
/* A subroutine of ix86_expand_vector_init_general.  Use vector
   concatenate to handle the most general case: all values variable,
   and none identical.  */

static void
ix86_expand_vector_init_concat (machine_mode mode,
				rtx target, rtx *ops, int n)
{
  machine_mode half_mode = VOIDmode;
  rtx half[2];
  rtvec v;
  int i, j;

  switch (n)
    {
    case 2:
      switch (mode)
	{
	case E_V16SImode: half_mode = V8SImode; break;
	case E_V16SFmode: half_mode = V8SFmode; break;
	case E_V8DImode:  half_mode = V4DImode; break;
	case E_V8DFmode:  half_mode = V4DFmode; break;
	case E_V8SImode:  half_mode = V4SImode; break;
	case E_V8SFmode:  half_mode = V4SFmode; break;
	case E_V4DImode:  half_mode = V2DImode; break;
	case E_V4DFmode:  half_mode = V2DFmode; break;
	case E_V4SImode:  half_mode = V2SImode; break;
	case E_V4SFmode:  half_mode = V2SFmode; break;
	case E_V2DImode:  half_mode = DImode;   break;
	case E_V2SImode:  half_mode = SImode;   break;
	case E_V2DFmode:  half_mode = DFmode;   break;
	case E_V2SFmode:  half_mode = SFmode;   break;
	default:
	  gcc_unreachable ();
	}

      if (!register_operand (ops[1], half_mode))
	ops[1] = force_reg (half_mode, ops[1]);
      if (!register_operand (ops[0], half_mode))
	ops[0] = force_reg (half_mode, ops[0]);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
							  ops[1])));
      break;

    case 4:
      switch (mode)
	{
	case E_V4DImode: half_mode = V2DImode; break;
	case E_V4DFmode: half_mode = V2DFmode; break;
	case E_V4SImode: half_mode = V2SImode; break;
	case E_V4SFmode: half_mode = V2SFmode; break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    case 8:
      switch (mode)
	{
	case E_V8DImode: half_mode = V4DImode; break;
	case E_V8DFmode: half_mode = V4DFmode; break;
	case E_V8SImode: half_mode = V4SImode; break;
	case E_V8SFmode: half_mode = V4SFmode; break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    case 16:
      switch (mode)
	{
	case E_V16SImode: half_mode = V8SImode; break;
	case E_V16SFmode: half_mode = V8SFmode; break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    half:
      /* FIXME: We process inputs backward to help RA.  PR 36222.  */
      i = n - 1;
      for (j = 1; j != -1; j--)
	{
	  half[j] = gen_reg_rtx (half_mode);
	  switch (n >> 1)
	    {
	    case 2:
	      v = gen_rtvec (2, ops[i-1], ops[i]);
	      i -= 2;
	      break;
	    case 4:
	      v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
	      i -= 4;
	      break;
	    case 8:
	      v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
			     ops[i-3], ops[i-2], ops[i-1], ops[i]);
	      i -= 8;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  ix86_expand_vector_init (false, half[j],
				   gen_rtx_PARALLEL (half_mode, v));
	}

      ix86_expand_vector_init_concat (mode, target, half, 2);
      break;

    default:
      gcc_unreachable ();
    }
}
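
/* Recursion sketch (illustrative): for n == 4 in V4SFmode, the loop
   builds two V2SFmode halves from ops[3],ops[2] and ops[1],ops[0]
   (backward, to help RA per PR 36222) and then recurses with n == 2,
   which emits a single VEC_CONCAT of the two halves.  */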
/* A subroutine of ix86_expand_vector_init_general.  Use vector
   interleave to handle the most general case: all values variable,
   and none identical.  */

static void
ix86_expand_vector_init_interleave (machine_mode mode,
				    rtx target, rtx *ops, int n)
{
  machine_mode first_imode, second_imode, third_imode, inner_mode;
  int i, j;
  rtx op0, op1;
  rtx (*gen_load_even) (rtx, rtx, rtx);
  rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
  rtx (*gen_interleave_second_low) (rtx, rtx, rtx);

  switch (mode)
    {
    case E_V8HImode:
      gen_load_even = gen_vec_setv8hi;
      gen_interleave_first_low = gen_vec_interleave_lowv4si;
      gen_interleave_second_low = gen_vec_interleave_lowv2di;
      inner_mode = HImode;
      first_imode = V4SImode;
      second_imode = V2DImode;
      third_imode = VOIDmode;
      break;
    case E_V16QImode:
      gen_load_even = gen_vec_setv16qi;
      gen_interleave_first_low = gen_vec_interleave_lowv8hi;
      gen_interleave_second_low = gen_vec_interleave_lowv4si;
      inner_mode = QImode;
      first_imode = V8HImode;
      second_imode = V4SImode;
      third_imode = V2DImode;
      break;
    default:
      gcc_unreachable ();
    }

  for (i = 0; i < n; i++)
    {
      /* Extend the odd element to SImode using a paradoxical SUBREG.  */
      op0 = gen_reg_rtx (SImode);
      emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));

      /* Insert the SImode value as low element of V4SImode vector.  */
      op1 = gen_reg_rtx (V4SImode);
      op0 = gen_rtx_VEC_MERGE (V4SImode,
			       gen_rtx_VEC_DUPLICATE (V4SImode,
						      op0),
			       CONST0_RTX (V4SImode),
			       const1_rtx);
      emit_insn (gen_rtx_SET (op1, op0));

      /* Cast the V4SImode vector back to a vector in original mode.  */
      op0 = gen_reg_rtx (mode);
      emit_move_insn (op0, gen_lowpart (mode, op1));

      /* Load even elements into the second position.  */
      emit_insn (gen_load_even (op0,
				force_reg (inner_mode,
					   ops [i + i + 1]),
				const1_rtx));

      /* Cast vector to FIRST_IMODE vector.  */
      ops[i] = gen_reg_rtx (first_imode);
      emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
    }

  /* Interleave low FIRST_IMODE vectors.  */
  for (i = j = 0; i < n; i += 2, j++)
    {
      op0 = gen_reg_rtx (first_imode);
      emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));

      /* Cast FIRST_IMODE vector to SECOND_IMODE vector.  */
      ops[j] = gen_reg_rtx (second_imode);
      emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
    }

  /* Interleave low SECOND_IMODE vectors.  */
  switch (second_imode)
    {
    case E_V4SImode:
      for (i = j = 0; i < n / 2; i += 2, j++)
	{
	  op0 = gen_reg_rtx (second_imode);
	  emit_insn (gen_interleave_second_low (op0, ops[i],
						ops[i + 1]));

	  /* Cast the SECOND_IMODE vector to the THIRD_IMODE
	     vector.  */
	  ops[j] = gen_reg_rtx (third_imode);
	  emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
	}
      second_imode = V2DImode;
      gen_interleave_second_low = gen_vec_interleave_lowv2di;
      /* FALLTHRU */

    case E_V2DImode:
      op0 = gen_reg_rtx (second_imode);
      emit_insn (gen_interleave_second_low (op0, ops[0],
					    ops[1]));

      /* Cast the SECOND_IMODE vector back to a vector on original
	 mode.  */
      emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
      break;

    default:
      gcc_unreachable ();
    }
}
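
/* Illustrative V16QImode flow for the code above: n pairs of QI values
   are packed into V16QImode registers, viewed as V8HImode, interleaved
   into V4SImode, then V2DImode; each pass halves the number of partial
   vectors until a single full-width result remains in TARGET.  */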
/* A subroutine of ix86_expand_vector_init.  Handle the most general case:
   all values variable, and none identical.  */

static void
ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals)
{
  rtx ops[64], op0, op1, op2, op3, op4, op5;
  machine_mode half_mode = VOIDmode;
  machine_mode quarter_mode = VOIDmode;
  int n, i;

  switch (mode)
    {
    case E_V2SFmode:
    case E_V2SImode:
      if (!mmx_ok && !TARGET_SSE)
	break;
      /* FALLTHRU */

    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DFmode:
    case E_V8DImode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V4DFmode:
    case E_V4DImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V2DFmode:
    case E_V2DImode:
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      ix86_expand_vector_init_concat (mode, target, ops, n);
      return;

    case E_V2TImode:
      for (i = 0; i < 2; i++)
	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
      op0 = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
      emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
      return;

    case E_V4TImode:
      for (i = 0; i < 4; i++)
	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
      ops[4] = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
      ops[5] = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
      op0 = gen_reg_rtx (V8DImode);
      ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
      emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
      return;

    case E_V32QImode:
      half_mode = V16QImode;
      goto half;

    case E_V16HImode:
      half_mode = V8HImode;
      goto half;

    half:
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      op0 = gen_reg_rtx (half_mode);
      op1 = gen_reg_rtx (half_mode);
      ix86_expand_vector_init_interleave (half_mode, op0, ops,
					  n >> 2);
      ix86_expand_vector_init_interleave (half_mode, op1,
					  &ops [n >> 1], n >> 2);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
      return;

    case E_V64QImode:
      quarter_mode = V16QImode;
      half_mode = V32QImode;
      goto quarter;

    case E_V32HImode:
      quarter_mode = V8HImode;
      half_mode = V16HImode;
      goto quarter;

    quarter:
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      op0 = gen_reg_rtx (quarter_mode);
      op1 = gen_reg_rtx (quarter_mode);
      op2 = gen_reg_rtx (quarter_mode);
      op3 = gen_reg_rtx (quarter_mode);
      op4 = gen_reg_rtx (half_mode);
      op5 = gen_reg_rtx (half_mode);
      ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
					  n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op1,
					  &ops [n >> 2], n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op2,
					  &ops [n >> 1], n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op3,
					  &ops [(n >> 1) | (n >> 2)], n >> 3);
      emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
      emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
      return;

    case E_V16QImode:
      if (!TARGET_SSE4_1)
	break;
      /* FALLTHRU */

    case E_V8HImode:
      if (!TARGET_SSE2)
	break;

      /* Don't use ix86_expand_vector_init_interleave if we can't
	 move from GPR to SSE register directly.  */
      if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
	break;

      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
      return;

    case E_V4HImode:
    case E_V8QImode:
      break;

    default:
      gcc_unreachable ();
    }

    {
      int i, j, n_elts, n_words, n_elt_per_word;
      machine_mode inner_mode;
      rtx words[4], shift;

      inner_mode = GET_MODE_INNER (mode);
      n_elts = GET_MODE_NUNITS (mode);
      n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
      n_elt_per_word = n_elts / n_words;
      shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));

      for (i = 0; i < n_words; ++i)
	{
	  rtx word = NULL_RTX;

	  for (j = 0; j < n_elt_per_word; ++j)
	    {
	      rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
	      elt = convert_modes (word_mode, inner_mode, elt, true);

	      if (j == 0)
		word = elt;
	      else
		{
		  word = expand_simple_binop (word_mode, ASHIFT, word, shift,
					      word, 1, OPTAB_LIB_WIDEN);
		  word = expand_simple_binop (word_mode, IOR, word, elt,
					      word, 1, OPTAB_LIB_WIDEN);
		}
	    }

	  words[i] = word;
	}

      if (n_words == 1)
	emit_move_insn (target, gen_lowpart (mode, words[0]));
      else if (n_words == 2)
	{
	  rtx tmp = gen_reg_rtx (mode);
	  emit_clobber (tmp);
	  emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
	  emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
	  emit_move_insn (target, tmp);
	}
      else if (n_words == 4)
	{
	  rtx tmp = gen_reg_rtx (V4SImode);
	  gcc_assert (word_mode == SImode);
	  vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
	  ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
	  emit_move_insn (target, gen_lowpart (mode, tmp));
	}
      else
	gcc_unreachable ();
    }
}
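
/* Word-building sketch for the fallback above (illustrative): for
   V4HImode on a 32-bit target, each SImode word packs two HI elements,
   higher-indexed element in the high bits:

     words[0] = (elt1 << 16) | elt0;
     words[1] = (elt3 << 16) | elt2;

   The words are then moved into the vector register through its low
   and high word_mode parts.  */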
/* Initialize vector TARGET via VALS.  Suppress the use of MMX
   instructions unless MMX_OK is true.  */

void
ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int n_elts = GET_MODE_NUNITS (mode);
  int n_var = 0, one_var = -1;
  bool all_same = true, all_const_zero = true;
  int i;
  rtx x;

  /* Handle first initialization from vector elts.  */
  if (n_elts != XVECLEN (vals, 0))
    {
      rtx subtarget = target;
      x = XVECEXP (vals, 0, 0);
      gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
      if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
	{
	  rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
	  if (inner_mode == QImode || inner_mode == HImode)
	    {
	      unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
	      mode = mode_for_vector (SImode, n_bits / 4).require ();
	      inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
	      ops[0] = gen_lowpart (inner_mode, ops[0]);
	      ops[1] = gen_lowpart (inner_mode, ops[1]);
	      subtarget = gen_reg_rtx (mode);
	    }
	  ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
	  if (subtarget != target)
	    emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
	  return;
	}
      gcc_unreachable ();
    }

  for (i = 0; i < n_elts; ++i)
    {
      x = XVECEXP (vals, 0, i);
      if (!(CONST_SCALAR_INT_P (x)
	    || CONST_DOUBLE_P (x)
	    || CONST_FIXED_P (x)))
	n_var++, one_var = i;
      else if (x != CONST0_RTX (inner_mode))
	all_const_zero = false;
      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
	all_same = false;
    }

  /* Constants are best loaded from the constant pool.  */
  if (n_var == 0)
    {
      emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
      return;
    }

  /* If all values are identical, broadcast the value.  */
  if (all_same
      && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
					    XVECEXP (vals, 0, 0)))
    return;

  /* Values where only one field is non-constant are best loaded from
     the pool and overwritten via move later.  */
  if (n_var == 1)
    {
      if (all_const_zero
	  && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
						  XVECEXP (vals, 0, one_var),
						  one_var))
	return;

      if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals,
					   one_var))
	return;
    }

  ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
}
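
/* Example of the strategy order above (illustrative): { 1, 2, 3, 4 }
   is a single constant-pool load; { x, x, x, x } becomes a broadcast;
   { x, 0, 0, 0 } takes the init_one_nonzero path; { x, 1, 2, 3 } loads
   the constants from the pool and inserts x; { x, y, 1, 2 } falls
   through to ix86_expand_vector_init_general.  */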
void
ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  machine_mode half_mode;
  bool use_vec_merge = false;
  rtx tmp;
  static rtx (*gen_extract[6][2]) (rtx, rtx)
    = {
	{ gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
	{ gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
	{ gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
	{ gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
	{ gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
	{ gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
      };
  static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
    = {
	{ gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
	{ gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
	{ gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
	{ gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
	{ gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
	{ gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
      };
  machine_mode mmode = VOIDmode;
  rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
  int i, j, n;

  switch (mode)
    {
    case E_V2SImode:
      use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      if (use_vec_merge)
	break;
      /* FALLTHRU */

    case E_V2SFmode:
      if (mmx_ok)
	{
	  tmp = gen_reg_rtx (GET_MODE_INNER (mode));
	  ix86_expand_vector_extract (true, tmp, target, 1 - elt);
	  if (elt == 0)
	    tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
	  else
	    tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
	  emit_insn (gen_rtx_SET (target, tmp));
	  return;
	}
      break;

    case E_V2DImode:
      use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
      if (use_vec_merge)
	break;

      tmp = gen_reg_rtx (GET_MODE_INNER (mode));
      ix86_expand_vector_extract (false, tmp, target, 1 - elt);
      if (elt == 0)
	tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
      else
	tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
      emit_insn (gen_rtx_SET (target, tmp));
      return;

    case E_V2DFmode:
      /* NB: For ELT == 0, use standard scalar operation patterns which
	 preserve the rest of the vector for combiner:

	 (vec_merge:V2DF
	   (vec_duplicate:V2DF (reg:DF))
	   (reg:V2DF)
	   (const_int 1))
       */
      if (elt == 0)
	goto do_vec_merge;

      {
	rtx op0, op1;

	/* For the two element vectors, we implement a VEC_CONCAT with
	   the extraction of the other element.  */

	tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
	tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);

	if (elt == 0)
	  op0 = val, op1 = tmp;
	else
	  op0 = tmp, op1 = val;

	tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
	emit_insn (gen_rtx_SET (target, tmp));
      }
      return;

    case E_V4SFmode:
      use_vec_merge = TARGET_SSE4_1;
      if (use_vec_merge)
	break;

      switch (elt)
	{
	case 0:
	  use_vec_merge = true;
	  break;

	case 1:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* target = A A B B */
	  emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
	  /* target = X A B B */
	  ix86_expand_vector_set (false, target, val, 0);
	  /* target = A X C D */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const1_rtx, const0_rtx,
					  GEN_INT (2+4), GEN_INT (3+4)));
	  return;

	case 2:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* tmp = X B C D */
	  ix86_expand_vector_set (false, tmp, val, 0);
	  /* target = A B X D */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const0_rtx, const1_rtx,
					  GEN_INT (0+4), GEN_INT (3+4)));
	  return;

	case 3:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* tmp = X B C D */
	  ix86_expand_vector_set (false, tmp, val, 0);
	  /* target = A B C X */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const0_rtx, const1_rtx,
					  GEN_INT (2+4), GEN_INT (0+4)));
	  return;

	default:
	  gcc_unreachable ();
	}
      break;

    case E_V4SImode:
      use_vec_merge = TARGET_SSE4_1;
      if (use_vec_merge)
	break;

      /* Element 0 handled by vec_merge below.  */
      if (elt == 0)
	{
	  use_vec_merge = true;
	  break;
	}

      if (TARGET_SSE2)
	{
	  /* With SSE2, use integer shuffles to swap element 0 and ELT,
	     store into element 0, then shuffle them back.  */
	  rtx order[4];

	  order[0] = GEN_INT (elt);
	  order[1] = const1_rtx;
	  order[2] = const2_rtx;
	  order[3] = GEN_INT (3);
	  order[elt] = const0_rtx;

	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
					order[1], order[2], order[3]));

	  ix86_expand_vector_set (false, target, val, 0);

	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
					order[1], order[2], order[3]));
	}
      else
	{
	  /* For SSE1, we have to reuse the V4SF code.  */
	  rtx t = gen_reg_rtx (V4SFmode);
	  emit_move_insn (t, gen_lowpart (V4SFmode, target));
	  ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
	  emit_move_insn (target, gen_lowpart (mode, t));
	}
      return;

    case E_V8HImode:
      use_vec_merge = TARGET_SSE2;
      break;

    case E_V4HImode:
      use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
      use_vec_merge = TARGET_SSE4_1;
      break;

    case E_V8QImode:
      use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      break;

    case E_V32QImode:
      half_mode = V16QImode;
      j = 0;
      n = 16;
      goto half;

    case E_V16HImode:
      half_mode = V8HImode;
      j = 1;
      n = 8;
      goto half;

    case E_V8SImode:
      half_mode = V4SImode;
      j = 2;
      n = 4;
      goto half;

    case E_V4DImode:
      half_mode = V2DImode;
      j = 3;
      n = 2;
      goto half;

    case E_V8SFmode:
      half_mode = V4SFmode;
      j = 4;
      n = 4;
      goto half;

    case E_V4DFmode:
      half_mode = V2DFmode;
      j = 5;
      n = 2;
      goto half;

    half:
      /* Compute offset.  */
      i = elt / n;
      elt %= n;

      gcc_assert (i <= 1);

      /* Extract the half.  */
      tmp = gen_reg_rtx (half_mode);
      emit_insn (gen_extract[j][i] (tmp, target));

      /* Put val in tmp at elt.  */
      ix86_expand_vector_set (false, tmp, val, elt);

      /* Put it back.  */
      emit_insn (gen_insert[j][i] (target, target, tmp));
      return;

    case E_V8DFmode:
      if (TARGET_AVX512F)
	{
	  mmode = QImode;
	  gen_blendm = gen_avx512f_blendmv8df;
	}
      break;

    case E_V8DImode:
      if (TARGET_AVX512F)
	{
	  mmode = QImode;
	  gen_blendm = gen_avx512f_blendmv8di;
	}
      break;

    case E_V16SFmode:
      if (TARGET_AVX512F)
	{
	  mmode = HImode;
	  gen_blendm = gen_avx512f_blendmv16sf;
	}
      break;

    case E_V16SImode:
      if (TARGET_AVX512F)
	{
	  mmode = HImode;
	  gen_blendm = gen_avx512f_blendmv16si;
	}
      break;

    case E_V32HImode:
      if (TARGET_AVX512BW)
	{
	  mmode = SImode;
	  gen_blendm = gen_avx512bw_blendmv32hi;
	}
      else if (TARGET_AVX512F)
	{
	  half_mode = E_V8HImode;
	  n = 8;
	  goto quarter;
	}
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
	{
	  mmode = DImode;
	  gen_blendm = gen_avx512bw_blendmv64qi;
	}
      else if (TARGET_AVX512F)
	{
	  half_mode = E_V16QImode;
	  n = 16;
	  goto quarter;
	}
      break;

    quarter:
      /* Compute offset.  */
      i = elt / n;
      elt %= n;

      gcc_assert (i <= 3);

      {
	/* Extract the quarter.  */
	tmp = gen_reg_rtx (V4SImode);
	rtx tmp2 = gen_lowpart (V16SImode, target);
	rtx mask = gen_reg_rtx (QImode);

	emit_move_insn (mask, constm1_rtx);
	emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
						   tmp, mask));

	tmp2 = gen_reg_rtx (half_mode);
	emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
	tmp = tmp2;

	/* Put val in tmp at elt.  */
	ix86_expand_vector_set (false, tmp, val, elt);

	/* Put it back.  */
	tmp2 = gen_reg_rtx (V16SImode);
	rtx tmp3 = gen_lowpart (V16SImode, target);
	mask = gen_reg_rtx (HImode);
	emit_move_insn (mask, constm1_rtx);
	tmp = gen_lowpart (V4SImode, tmp);
	emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
						  tmp3, mask));
	emit_move_insn (target, gen_lowpart (mode, tmp2));
      }
      return;

    default:
      break;
    }

  if (mmode != VOIDmode)
    {
      tmp = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
      /* The avx512*_blendm<mode> expanders have different operand order
	 from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
	 elements where the mask is set and second input operand otherwise,
	 in {sse,avx}*_*blend* the first input operand is used for elements
	 where the mask is clear and second input operand otherwise.  */
      emit_insn (gen_blendm (target, target, tmp,
			     force_reg (mmode,
					gen_int_mode (HOST_WIDE_INT_1U << elt,
						      mmode))));
    }
  else if (use_vec_merge)
    {
    do_vec_merge:
      tmp = gen_rtx_VEC_DUPLICATE (mode, val);
      tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
			       GEN_INT (HOST_WIDE_INT_1U << elt));
      emit_insn (gen_rtx_SET (target, tmp));
    }
  else
    {
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, target);

      tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
      emit_move_insn (tmp, val);

      emit_move_insn (target, mem);
    }
}
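
/* Fallback sketch (illustrative): when no insn-level insert exists,
   the vector is spilled to a stack temporary, the element is stored
   through an adjusted address, and the whole vector is reloaded:

     mem = target; mem[elt] = val; target = mem;

   Correct for any mode, at the cost of a store-forwarding stall.  */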
void
ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
{
  machine_mode mode = GET_MODE (vec);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  bool use_vec_extr = false;
  rtx tmp;

  switch (mode)
    {
    case E_V2SImode:
      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      if (use_vec_extr)
	break;
      /* FALLTHRU */

    case E_V2SFmode:
      if (!mmx_ok)
	break;
      /* FALLTHRU */

    case E_V2DFmode:
    case E_V2DImode:
      use_vec_extr = true;
      break;

    case E_V4SFmode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
	break;

      switch (elt)
	{
	case 0:
	  tmp = vec;
	  break;

	case 1:
	case 3:
	  tmp = gen_reg_rtx (mode);
	  emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
					  GEN_INT (elt), GEN_INT (elt),
					  GEN_INT (elt+4), GEN_INT (elt+4)));
	  break;

	case 2:
	  tmp = gen_reg_rtx (mode);
	  emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
	  break;

	default:
	  gcc_unreachable ();
	}
      vec = tmp;
      use_vec_extr = true;
      elt = 0;
      break;

    case E_V4SImode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
	break;

      if (TARGET_SSE2)
	{
	  switch (elt)
	    {
	    case 0:
	      tmp = vec;
	      break;

	    case 1:
	    case 3:
	      tmp = gen_reg_rtx (mode);
	      emit_insn (gen_sse2_pshufd_1 (tmp, vec,
					    GEN_INT (elt), GEN_INT (elt),
					    GEN_INT (elt), GEN_INT (elt)));
	      break;

	    case 2:
	      tmp = gen_reg_rtx (mode);
	      emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
	      break;

	    default:
	      gcc_unreachable ();
	    }
	  vec = tmp;
	  use_vec_extr = true;
	  elt = 0;
	}
      else
	{
	  /* For SSE1, we have to reuse the V4SF code.  */
	  ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
				      gen_lowpart (V4SFmode, vec), elt);
	  return;
	}
      break;

    case E_V8HImode:
      use_vec_extr = TARGET_SSE2;
      break;

    case E_V4HImode:
      use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr
	  && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
	break;

      if (TARGET_SSE2 && (elt & 3) == 0)
	{
	  tmp = gen_reg_rtx (SImode);
	  ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
				      elt >> 2);
	  emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
	  return;
	}
      break;

    case E_V8SFmode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V4SFmode);
	  if (elt < 4)
	    emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
	  return;
	}
      break;

    case E_V4DFmode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V2DFmode);
	  if (elt < 2)
	    emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
	  return;
	}
      break;

    case E_V32QImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V16QImode);
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V16HImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V8HImode);
	  if (elt < 8)
	    emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
	  return;
	}
      break;

    case E_V8SImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V4SImode);
	  if (elt < 4)
	    emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
	  return;
	}
      break;

    case E_V4DImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V2DImode);
	  if (elt < 2)
	    emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
	  return;
	}
      break;

    case E_V32HImode:
      if (TARGET_AVX512BW)
	{
	  tmp = gen_reg_rtx (V16HImode);
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
	{
	  tmp = gen_reg_rtx (V32QImode);
	  if (elt < 32)
	    emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 31);
	  return;
	}
      break;

    case E_V16SFmode:
      tmp = gen_reg_rtx (V8SFmode);
      if (elt < 8)
	emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DFmode:
      tmp = gen_reg_rtx (V4DFmode);
      if (elt < 4)
	emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V16SImode:
      tmp = gen_reg_rtx (V8SImode);
      if (elt < 8)
	emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DImode:
      tmp = gen_reg_rtx (V4DImode);
      if (elt < 4)
	emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V8QImode:
      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      /* ??? Could extract the appropriate HImode element and shift.  */
      break;

    default:
      break;
    }

  if (use_vec_extr)
    {
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
      tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);

      /* Let the rtl optimizers know about the zero extension performed.  */
      if (inner_mode == QImode || inner_mode == HImode)
	{
	  tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
	  target = gen_lowpart (SImode, target);
	}

      emit_insn (gen_rtx_SET (target, tmp));
    }
  else
    {
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, vec);

      tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
      emit_move_insn (target, tmp);
    }
}
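
/* Note (illustrative): for QI/HI elements the VEC_SELECT above is
   wrapped in ZERO_EXTEND to SImode because pextrb/pextrw zero the
   upper GPR bits anyway; exposing that lets the RTL optimizers delete
   later explicit zero extensions of the extracted value.  */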
/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
   to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
   The upper bits of DEST are undefined, though they shouldn't cause
   exceptions (some bits from src or all zeros are ok).  */

static void
emit_reduc_half (rtx dest, rtx src, int i)
{
  rtx tem, d = dest;

  switch (GET_MODE (src))
    {
    case E_V4SFmode:
      if (i == 128)
	tem = gen_sse_movhlps (dest, src, src);
      else
	tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
				   GEN_INT (1 + 4), GEN_INT (1 + 4));
      break;

    case E_V2DFmode:
      tem = gen_vec_interleave_highv2df (dest, src, src);
      break;

    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
      d = gen_reg_rtx (V1TImode);
      tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
				GEN_INT (i / 2));
      break;

    case E_V8SFmode:
      if (i == 256)
	tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
      else
	tem = gen_avx_shufps256 (dest, src, src,
				 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
      break;

    case E_V4DFmode:
      if (i == 256)
	tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
      else
	tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
      break;

    case E_V32QImode:
    case E_V16HImode:
    case E_V8SImode:
    case E_V4DImode:
      if (i == 256)
	{
	  if (GET_MODE (dest) != V4DImode)
	    d = gen_reg_rtx (V4DImode);
	  tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
				   gen_lowpart (V4DImode, src),
				   const1_rtx);
	}
      else
	{
	  d = gen_reg_rtx (V2TImode);
	  tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
				    GEN_INT (i / 2));
	}
      break;

    case E_V64QImode:
    case E_V32HImode:
      if (i < 64)
	{
	  d = gen_reg_rtx (V4TImode);
	  tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
					GEN_INT (i / 2));
	  break;
	}
      /* FALLTHRU */

    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DImode:
    case E_V8DFmode:
      if (i > 128)
	tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
					gen_lowpart (V16SImode, src),
					gen_lowpart (V16SImode, src),
					GEN_INT (0x4 + (i == 512 ? 4 : 0)),
					GEN_INT (0x5 + (i == 512 ? 4 : 0)),
					GEN_INT (0x6 + (i == 512 ? 4 : 0)),
					GEN_INT (0x7 + (i == 512 ? 4 : 0)),
					GEN_INT (0xC), GEN_INT (0xD),
					GEN_INT (0xE), GEN_INT (0xF),
					GEN_INT (0x10), GEN_INT (0x11),
					GEN_INT (0x12), GEN_INT (0x13),
					GEN_INT (0x14), GEN_INT (0x15),
					GEN_INT (0x16), GEN_INT (0x17));
      else
	tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
				    gen_lowpart (V16SImode, src),
				    GEN_INT (i == 128 ? 0x2 : 0x1),
				    GEN_INT (0x3), GEN_INT (0x3), GEN_INT (0x3),
				    GEN_INT (i == 128 ? 0x6 : 0x5),
				    GEN_INT (0x7), GEN_INT (0x7), GEN_INT (0x7),
				    GEN_INT (i == 128 ? 0xA : 0x9),
				    GEN_INT (0xB), GEN_INT (0xB), GEN_INT (0xB),
				    GEN_INT (i == 128 ? 0xE : 0xD),
				    GEN_INT (0xF), GEN_INT (0xF), GEN_INT (0xF));
      break;

    default:
      gcc_unreachable ();
    }

  emit_insn (tem);
  if (d != dest)
    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
}
/* Expand a vector reduction.  FN is the binary pattern to reduce;
   DEST is the destination; IN is the input vector.  */

void
ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
{
  rtx half, dst, vec = in;
  machine_mode mode = GET_MODE (in);
  int i;

  /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
  if (TARGET_SSE4_1
      && mode == V8HImode
      && fn == gen_uminv8hi3)
    {
      emit_insn (gen_sse4_1_phminposuw (dest, in));
      return;
    }

  for (i = GET_MODE_BITSIZE (mode);
       i > GET_MODE_UNIT_BITSIZE (mode);
       i >>= 1)
    {
      half = gen_reg_rtx (mode);
      emit_reduc_half (half, vec, i);
      if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
	dst = dest;
      else
	dst = gen_reg_rtx (mode);
      emit_insn (fn (dst, half, vec));
      vec = dst;
    }
}
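
/* Worked example (illustrative): a V4SImode ADD reduction halves the
   live data on every iteration:

     { a, b, c, d }       i = 128: half = { c, d, ?, ? }
     { a+c, b+d, ?, ? }   i = 64:  half = { b+d, ?, ?, ? }
     { a+b+c+d, ?, ?, ? } scalar result in element 0

   log2(nelts) combining steps instead of nelts - 1.  */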
/* Output code to perform a conditional jump to LABEL, if C2 flag in
   FP status register is set.  */

static void
ix86_emit_fp_unordered_jump (rtx label)
{
  rtx reg = gen_reg_rtx (HImode);
  rtx_insn *insn;
  rtx temp;

  emit_insn (gen_x86_fnstsw_1 (reg));

  if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
    {
      emit_insn (gen_x86_sahf_1 (reg));

      temp = gen_rtx_REG (CCmode, FLAGS_REG);
      temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
    }
  else
    {
      emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));

      temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
    }

  temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
			       gen_rtx_LABEL_REF (VOIDmode, label),
			       pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  JUMP_LABEL (insn) = label;
}
/* Output code to perform a sinh XFmode calculation.  */

void ix86_emit_i387_sinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (|op1|) */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 1.0) + e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_divxf3 (e2, e1, e2));
  emit_insn (gen_addxf3 (e2, e2, e1));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
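
/* Derivation for the sequence above (illustrative): with
   u = expm1(|x|) = e^|x| - 1,

     sinh(|x|) = (e^|x| - e^-|x|) / 2
	       = (u + u / (u + 1)) / 2

   since e^-|x| = 1/(u+1).  Using expm1 keeps precision for small x;
   the sign is patched afterwards from the fxam condition bits.  */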
/* Output code to perform a cosh XFmode calculation.  */

void ix86_emit_i387_cosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1;

  /* e1 = exp (op1) */
  emit_insn (gen_expxf2 (e1, op1));

  /* e2 = e1 + 1.0 / e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_divxf3 (e2, cst1, e1));
  emit_insn (gen_addxf3 (e2, e1, e2));

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
/* Output code to perform a tanh XFmode calculation.  */

void ix86_emit_i387_tanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst2, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (-|2 * op1|) */
  emit_insn (gen_addxf3 (e2, op1, op1));
  emit_insn (gen_absxf2 (e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 2.0) */
  cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst2));
  emit_insn (gen_divxf3 (e2, e1, e2));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
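
/* Identity used above (illustrative): with u = expm1(-2|x|),

     tanh(-|x|) = (e^(-2|x|) - 1) / (e^(-2|x|) + 1) = u / (u + 2)

   so the division yields tanh of the negated magnitude, and the
   branch flips the sign back for positive inputs.  */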
/* Output code to perform an asinh XFmode calculation.  */

void ix86_emit_i387_asinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
  emit_insn (gen_mulxf3 (e1, op1, op1));
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));
  emit_insn (gen_addxf3 (e2, e2, cst1));

  /* e1 = e1 / e2 */
  emit_insn (gen_divxf3 (e1, e1, e2));

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = e1 + |op1| */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_addxf3 (e1, e1, e2));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
/* Output code to perform an acosh XFmode calculation.  */

void ix86_emit_i387_acosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));

  /* e2 = sqrt (op1 + 1.0) */
  emit_insn (gen_addxf3 (e2, op1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));

  /* e1 = sqrt (op1 - 1.0) */
  emit_insn (gen_subxf3 (e1, op1, cst1));
  emit_insn (gen_sqrtxf2 (e1, e1));

  /* e1 = e1 * e2 */
  emit_insn (gen_mulxf3 (e1, e1, e2));

  /* e1 = e1 + op1 */
  emit_insn (gen_addxf3 (e1, e1, op1));

  /* op0 = log (e1) */
  emit_insn (gen_logxf2 (op0, e1));
}
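
/* Identity (illustrative): acosh(x) = log (x + sqrt(x-1) * sqrt(x+1)),
   algebraically equal to log (x + sqrt(x^2 - 1)) but better
   conditioned near x = 1, where forming x^2 - 1 directly would
   cancel.  */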
/* Output code to perform an atanh XFmode calculation.  */

void ix86_emit_i387_atanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e2 = |op1| */
  emit_insn (gen_absxf2 (e2, op1));

  /* e1 = -(e2 + e2) / (e2 + 1.0) */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e1, e2, cst1));
  emit_insn (gen_addxf3 (e2, e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_divxf3 (e1, e2, e1));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
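
/* Identity (illustrative): atanh(-|x|) = 0.5 * log1p (-2|x| / (|x|+1)),
   since 1 - 2|x|/(|x|+1) = (1-|x|)/(1+|x|).  log1p keeps precision for
   small |x|, and the branch restores the sign for positive inputs.  */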
/* Output code to perform a log1p XFmode calculation.  */

void ix86_emit_i387_log1p (rtx op0, rtx op1)
{
  rtx_code_label *label1 = gen_label_rtx ();
  rtx_code_label *label2 = gen_label_rtx ();

  rtx tmp = gen_reg_rtx (XFmode);
  rtx res = gen_reg_rtx (XFmode);
  rtx cst, cstln2, cst1;
  rtx_insn *insn;

  cst = const_double_from_real_value
    (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
  cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */

  emit_insn (gen_absxf2 (tmp, op1));

  cst = force_reg (XFmode, cst);
  ix86_expand_branch (GE, tmp, cst, label1);
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  insn = get_last_insn ();
  JUMP_LABEL (insn) = label1;

  emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
  emit_jump (label2);

  emit_label (label1);
  LABEL_NUSES (label1) = 1;

  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
  emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));

  emit_label (label2);
  LABEL_NUSES (label2) = 1;

  emit_move_insn (op0, res);
}
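
/* Threshold note (illustrative): fyl2xp1 is only specified for
   |x| < 1 - sqrt(2)/2, and 0.29289321... is exactly 1 - sqrt(2)/2.
   Inside that range it computes y * log2(x + 1) without the
   cancellation of forming 1 + x; outside, the fyl2x fallback on
   1 + x loses nothing, so:

     log1p(x) = ln2 * (|x| < 1-sqrt(2)/2 ? log2p1 (x) : log2 (1+x))  */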
/* Emit code for round calculation.  */
void ix86_emit_i387_round (rtx op0, rtx op1)
{
  machine_mode inmode = GET_MODE (op1);
  machine_mode outmode = GET_MODE (op0);
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx res = gen_reg_rtx (outmode);
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx (*floor_insn) (rtx, rtx);
  rtx (*neg_insn) (rtx, rtx);
  rtx_insn *insn;
  rtx tmp;

  switch (inmode)
    {
    case E_SFmode:
    case E_DFmode:
      tmp = gen_reg_rtx (XFmode);

      emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
      op1 = tmp;
      break;
    case E_XFmode:
      break;
    default:
      gcc_unreachable ();
    }

  switch (outmode)
    {
    case E_SFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negsf2;
      break;
    case E_DFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negdf2;
      break;
    case E_XFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negxf2;
      break;
    case E_HImode:
      floor_insn = gen_lfloorxfhi2;
      neg_insn = gen_neghi2;
      break;
    case E_SImode:
      floor_insn = gen_lfloorxfsi2;
      neg_insn = gen_negsi2;
      break;
    case E_DImode:
      floor_insn = gen_lfloorxfdi2;
      neg_insn = gen_negdi2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */

  /* scratch = fxam(op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = fabs(op1) */
  emit_insn (gen_absxf2 (e1, op1));

  /* e2 = e1 + 0.5 */
  half = force_reg (XFmode, half);
  emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));

  /* res = floor(e2) */
  if (outmode == SFmode || outmode == DFmode)
    {
      tmp = gen_reg_rtx (XFmode);

      emit_insn (floor_insn (tmp, e2));
      emit_insn (gen_rtx_SET (res,
			      gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
					      UNSPEC_TRUNC_NOOP)));
    }
  else
    emit_insn (floor_insn (res, e2));

  /* flags = signbit(a) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then res = -res */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (neg_insn (res, res));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, res);
}
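
/* Example (illustrative): round(-2.5) -> |a| = 2.5, + 0.5 = 3.0,
   floor = 3, sign restored -> -3.  Computing on |a| makes halfway
   cases round away from zero, matching C round(), which frndint
   under the default round-to-nearest-even mode would not.  */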
/* Output code to perform a Newton-Raphson approximation of a single precision
   floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */

void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
{
  rtx x0, x1, e0, e1;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  x1 = gen_reg_rtx (mode);

  /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */

  b = force_reg (mode, b);

  /* x0 = rcp(b) estimate */
  if (mode == V16SFmode || mode == V8DFmode)
    {
      if (TARGET_AVX512ER)
	{
	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						      UNSPEC_RCP28)));
	  /* res = a * x0 */
	  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
	  return;
	}
      else
	emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						    UNSPEC_RCP14)));
    }
  else
    emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						UNSPEC_RCP)));

  /* e0 = x0 * b */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));

  /* e0 = x0 * e0 */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));

  /* e1 = x0 + x0 */
  emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));

  /* x1 = e1 - e0 */
  emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));

  /* res = a * x1 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
}
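
/* Derivation (illustrative): one Newton-Raphson step for
   f(x) = 1/x - b refines an estimate x0 of 1/b as

     x1 = x0 * (2 - b * x0) = (x0 + x0) - b * x0 * x0

   The rewritten form above avoids materializing the constant 2 and
   roughly doubles the ~12 valid bits of the hardware rcp estimate.  */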
/* Output code to perform a Newton-Raphson approximation of a
   single precision floating point [reciprocal] square root.  */

void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
{
  rtx x0, e0, e1, e2, e3, mthree, mhalf;
  REAL_VALUE_TYPE r;
  int unspec;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  e2 = gen_reg_rtx (mode);
  e3 = gen_reg_rtx (mode);

  if (TARGET_AVX512ER && mode == V16SFmode)
    {
      if (recip)
	/* res = rsqrt28(a) estimate */
	emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
						     UNSPEC_RSQRT28)));
      else
	{
	  /* x0 = rsqrt28(a) estimate */
	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
						      UNSPEC_RSQRT28)));
	  /* res = rcp28(x0) estimate */
	  emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
						       UNSPEC_RCP28)));
	}
      return;
    }

  real_from_integer (&r, VOIDmode, -3, SIGNED);
  mthree = const_double_from_real_value (r, SFmode);

  real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
  mhalf = const_double_from_real_value (r, SFmode);
  unspec = UNSPEC_RSQRT;

  if (VECTOR_MODE_P (mode))
    {
      mthree = ix86_build_const_vector (mode, true, mthree);
      mhalf = ix86_build_const_vector (mode, true, mhalf);
      /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
      if (GET_MODE_SIZE (mode) == 64)
	unspec = UNSPEC_RSQRT14;
    }

  /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
     rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */

  a = force_reg (mode, a);

  /* x0 = rsqrt(a) estimate */
  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
					      unspec)));

  /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0).  */
  if (!recip)
    {
      rtx zero = force_reg (mode, CONST0_RTX (mode));
      rtx mask;

      /* Handle masked compare.  */
      if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
	{
	  mask = gen_reg_rtx (HImode);
	  /* Imm value 0x4 corresponds to not-equal comparison.  */
	  emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
	  emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
	}
      else
	{
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
	  emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
	}
    }

  mthree = force_reg (mode, mthree);

  /* e0 = x0 * a */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));

  unsigned vector_size = GET_MODE_SIZE (mode);
  if (TARGET_FMA
      || (TARGET_AVX512F && vector_size == 64)
      || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
    emit_insn (gen_rtx_SET (e2,
			    gen_rtx_FMA (mode, e0, x0, mthree)));
  else
    {
      /* e1 = e0 * x0 */
      emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));

      /* e2 = e1 - 3.0 (mthree holds -3.0, so this is an add) */
      emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
    }

  mhalf = force_reg (mode, mhalf);
  if (recip)
    /* e3 = -.5 * x0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
  else
    /* e3 = -.5 * e0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));

  /* ret = e2 * e3 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
}
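
/* Derivation (illustrative): the Newton-Raphson step for
   f(x) = 1/x^2 - a refines an rsqrt estimate x0 as

     x1 = 0.5 * x0 * (3 - a * x0 * x0)
	= -0.5 * x0 * (a * x0 * x0 - 3)

   For sqrt, multiply through by a once more, since
   sqrt(a) = a * rsqrt(a); that is why the non-recip path scales
   e0 = a * x0 rather than x0 by -0.5.  */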
/* Expand fabs (OP0) and return a new rtx that holds the result.  The
   mask for masking out the sign-bit is stored in *SMASK, if that is
   non-null.  */

static rtx
ix86_expand_sse_fabs (rtx op0, rtx *smask)
{
  machine_mode vmode, mode = GET_MODE (op0);
  rtx xa, mask;

  xa = gen_reg_rtx (mode);
  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    vmode = mode;

  mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
  if (!VECTOR_MODE_P (mode))
    {
      /* We need to generate a scalar mode mask in this case.  */
      rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
      tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
      mask = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (mask, tmp));
    }
  emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));

  if (smask)
    *smask = mask;

  return xa;
}
/* Expands a comparison of OP0 with OP1 using comparison code CODE,
   swapping the operands if SWAP_OPERANDS is true.  The expanded
   code is a forward jump to a newly created label in case the
   comparison is true.  The generated label rtx is returned.  */
static rtx_code_label *
ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
				  bool swap_operands)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx_code_label *label;
  rtx tmp, reg;

  if (swap_operands)
    std::swap (op0, op1);

  label = gen_label_rtx ();
  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
  reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
  emit_insn (gen_rtx_SET (reg, tmp));
  tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
			      gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
  tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  JUMP_LABEL (tmp) = label;

  return label;
}
/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
   using comparison code CODE.  Operands are swapped for the comparison if
   SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */

static rtx
ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
			      bool swap_operands)
{
  rtx (*insn)(rtx, rtx, rtx, rtx);
  machine_mode mode = GET_MODE (op0);
  rtx mask = gen_reg_rtx (mode);

  if (swap_operands)
    std::swap (op0, op1);

  insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;

  emit_insn (insn (mask, op0, op1,
		   gen_rtx_fmt_ee (code, mode, op0, op1)));
  return mask;
}
/* Expand copysign from SIGN to the positive value ABS_VALUE
   storing in RESULT.  If MASK is non-null, it shall be a mask to mask out
   the sign-bit.  */

static void
ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
{
  machine_mode mode = GET_MODE (sign);
  rtx sgn = gen_reg_rtx (mode);
  if (mask == NULL_RTX)
    {
      machine_mode vmode;

      if (mode == SFmode)
	vmode = V4SFmode;
      else if (mode == DFmode)
	vmode = V2DFmode;
      else
	vmode = mode;

      mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
      if (!VECTOR_MODE_P (mode))
	{
	  /* We need to generate a scalar mode mask in this case.  */
	  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
	  tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, tmp));
	}
    }
  else
    mask = gen_rtx_NOT (mode, mask);
  emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
  emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
}
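
/* Bit trick (illustrative): with the SFmode sign mask m = 0x80000000,

     copysign_to_positive (r, x, s) :  r = x | (s & m)

   valid because x is known positive, so its own sign bit is clear.
   When MASK arrives from ix86_expand_sse_fabs it is the complement,
   hence the NOT above.  */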
/* Expand SSE sequence for computing lround from OP1 storing
   into OP0.  */

void
ix86_expand_lround (rtx op0, rtx op1)
{
  /* C code for the stuff we're doing below:
	tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
	return (long)tmp;
   */
  machine_mode mode = GET_MODE (op1);
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx adj;

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* adj = copysign (0.5, op1) */
  adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
  ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);

  /* adj = op1 + adj */
  adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);

  /* op0 = (imode)adj */
  expand_fix (op0, adj, 0);
}
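
/* Rationale (illustrative): adding exactly 0.5 before truncating
   could push a value just below one half across an integer boundary
   after rounding of the sum; nextafter (0.5, 0.0) = 0.5 - 2^(-p-1)
   is the largest constant that never does, so the truncation still
   implements round-half-away-from-zero.  */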
/* Expand SSE2 sequence for computing lfloor or lceil
   from OPERAND1 storing into OPERAND0.  */

void
ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
{
  /* C code for the stuff we're doing below (for do_floor):
	xi = (long)op1;
	xi -= (double)xi > op1 ? 1 : 0;
	return xi;
   */
  machine_mode fmode = GET_MODE (op1);
  machine_mode imode = GET_MODE (op0);
  rtx ireg, freg, tmp;
  rtx_code_label *label;

  /* reg = (long)op1 */
  ireg = gen_reg_rtx (imode);
  expand_fix (ireg, op1, 0);

  /* freg = (double)reg */
  freg = gen_reg_rtx (fmode);
  expand_float (freg, ireg, 0);

  /* ireg = (freg > op1) ? ireg - 1 : ireg */
  label = ix86_expand_sse_compare_and_jump (UNLE,
					    freg, op1, !do_floor);
  tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
			     ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (ireg, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (op0, ireg);
}
/* Generate and return a rtx of mode MODE for 2**n where n is the number
   of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */

static rtx
ix86_gen_TWO52 (machine_mode mode)
{
  REAL_VALUE_TYPE TWO52r;
  rtx TWO52;

  real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
  TWO52 = const_double_from_real_value (TWO52r, mode);
  TWO52 = force_reg (mode, TWO52);

  return TWO52;
}
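/* 2**52 (2**23 for SFmode) is the key to the rounding expansions that
   follow: every double with magnitude >= 2**52 is already an integer,
   and for |x| < 2**52 the fraction bits of x + 2**52 hold x rounded to
   an integer, so (x + 2**52) - 2**52 recovers exactly that integer.
   For example, 3.7 + 2**52 rounds to 4503599627370500.0, and
   subtracting 2**52 leaves 4.0.  */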
/* Expand rint rounding OPERAND1 and storing the result in OPERAND0.  */

void
ix86_expand_rint (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	xa = fabs (operand1);
	if (!isless (xa, 2**52))
	  return operand1;
	two52 = 2**52;
	if (flag_rounding_math)
	  {
	    two52 = copysign (two52, operand1);
	    xa = operand1;
	  }
	xa = xa + two52 - two52;
	return copysign (xa, operand1);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, xa, TWO52, two52, mask;
  rtx_code_label *label;

  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  TWO52 = ix86_gen_TWO52 (mode);
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  two52 = TWO52;
  if (flag_rounding_math)
    {
      two52 = gen_reg_rtx (mode);
      ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
      xa = res;
    }

  xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);

  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
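/* The flag_rounding_math variant gives 2**52 the sign of the input and
   adds it to the signed value itself instead of to fabs (operand1);
   otherwise a dynamically selected rounding direction (FE_UPWARD,
   FE_DOWNWARD) would be applied to the magnitude and come out wrong
   for negative inputs once the sign is copied back.  */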
/* Expand SSE2 sequence for computing floor or ceil
   from OPERAND1 storing into OPERAND0.  */
void
ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	x2 = (double)(long)x;
     Compensate.  Floor:
	if (x2 > x)
	  x2 -= 1;
     Compensate.  Ceil:
	if (x2 < x)
	  x2 += 1;
	if (HONOR_SIGNED_ZEROS (mode))
	  return copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x */
  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (res, tmp);

  if (HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
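/* A worked example for do_floor with x = -2.5: the truncating
   conversion gives xi = -2, converting back gives xa = -2.0, the UNGT
   mask from -2.0 > -2.5 is all-ones, so 1.0 survives the AND and is
   subtracted, yielding -3.0.  For ceil the comparison operands are
   swapped (!do_floor) and 1.0 is added instead.  */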
/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */
void
ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	xa = xa + TWO52 - TWO52;
	x2 = copysign (xa, x);
     Compensate.  Floor:
	if (x2 > x)
	  x2 -= 1;
     Compensate.  Ceil:
	if (x2 < x)
	  x2 += 1;
	if (HONOR_SIGNED_ZEROS (mode))
	  x2 = copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = xa + TWO52 - TWO52; */
  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* xa = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (xa, xa, res, mask);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (!do_floor && HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing trunc
   from OPERAND1 storing into OPERAND0.  */
void
ix86_expand_trunc (rtx operand0, rtx operand1)
{
  /* C code for SSE variant we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	x2 = (double)(long)x;
	if (HONOR_SIGNED_ZEROS (mode))
	  return copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* x = (double)(long)x */
  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
  expand_fix (xi, res, 0);
  expand_float (res, xi, 0);

  if (HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing trunc from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */
void
ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
{
  machine_mode mode = GET_MODE (operand0);
  rtx xa, mask, TWO52, one, res, smask, tmp;
  rtx_code_label *label;

  /* C code for SSE variant we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	xa2 = xa + TWO52 - TWO52;
     Compensate:
	if (xa2 > xa)
	  xa2 -= 1.0;
	x2 = copysign (xa2, x);
	return x2;
   */

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &smask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* res = xa + TWO52 - TWO52; */
  tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
  emit_move_insn (res, tmp);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: res = xa2 - (res > xa ? 1 : 0)  */
  mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
  emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
  tmp = expand_simple_binop (mode, MINUS,
			     res, mask, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (res, tmp);

  /* res = copysign (res, operand1) */
  ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing round
   from OPERAND1 storing into OPERAND0.  */
void
ix86_expand_round (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	double xa = fabs (x);
	if (!isless (xa, TWO52))
	  return x;
	xa = (double)(long)(xa + nextafter (0.5, 0.0));
	return copysign (xa, x);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, TWO52, xa, xi, half, mask;
  rtx_code_label *label;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  TWO52 = ix86_gen_TWO52 (mode);
  xa = ix86_expand_sse_fabs (res, &mask);
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* xa = xa + 0.5 */
  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
  xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);

  /* xa = (double)(int64_t)xa */
  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
  expand_fix (xi, xa, 0);
  expand_float (xa, xi, 0);

  /* res = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
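/* As in ix86_expand_lround, nextafter (0.5, 0.0) rather than 0.5 is
   added so that values just below 0.5 are not rounded away from zero.
   Working on fabs (x) and copysigning the result back also makes
   round (-0.25) come out as -0.0.  */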
/* Expand SSE sequence for computing round from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */
void
ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), xa2, x2;
	if (!isless (xa, TWO52))
	  return x;
     Using the absolute value and copying back sign makes
     -0.0 -> -0.0 correct.
	xa2 = xa + TWO52 - TWO52;
     Compensate.
	dxa = xa2 - xa;
	if (dxa <= -0.5)
	  xa2 += 1;
	else if (dxa > 0.5)
	  xa2 -= 1;
	x2 = copysign (xa2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa2 = xa + TWO52 - TWO52; */
  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);

  /* dxa = xa2 - xa; */
  dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);

  /* generate 0.5, 1.0 and -0.5 */
  half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
  one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
  mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
			       0, OPTAB_DIRECT);

  /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);

  /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = copysign (xa2, operand1) */
  ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing round
   from OP1 storing into OP0 using sse4 round insn.  */
void
ix86_expand_round_sse4 (rtx op0, rtx op1)
{
  machine_mode mode = GET_MODE (op0);
  rtx e1, e2, res, half;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx (*gen_copysign) (rtx, rtx, rtx);
  rtx (*gen_round) (rtx, rtx, rtx);

  switch (mode)
    {
    case E_SFmode:
      gen_copysign = gen_copysignsf3;
      gen_round = gen_sse4_1_roundsf2;
      break;
    case E_DFmode:
      gen_copysign = gen_copysigndf3;
      gen_round = gen_sse4_1_rounddf2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round (a) = trunc (a + copysign (0.5, a)) */

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
  half = const_double_from_real_value (pred_half, mode);

  /* e1 = copysign (0.5, op1) */
  e1 = gen_reg_rtx (mode);
  emit_insn (gen_copysign (e1, half, op1));

  /* e2 = op1 + e1 */
  e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = trunc (e2) */
  res = gen_reg_rtx (mode);
  emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));

  emit_move_insn (op0, res);
}
/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
   insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
   insn every time.  */

static GTY(()) rtx_insn *vselect_insn;

/* Initialize vselect_insn.  */

static void
init_vselect_insn (void)
{
  unsigned i;
  rtx x;

  x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
  for (i = 0; i < MAX_VECT_LEN; ++i)
    XVECEXP (x, 0, i) = const0_rtx;
  x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
							const0_rtx), x);
  x = gen_rtx_SET (const0_rtx, x);
  start_sequence ();
  vselect_insn = emit_insn (x);
  end_sequence ();
}
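/* The insn is created inside a start_sequence/end_sequence pair, so it
   never lands in a real instruction stream; expand_vselect below only
   patches this scratch insn and asks recog whether the result matches
   an enabled pattern.  */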
/* Construct (set target (vec_select op0 (parallel perm))) and
   return true if that's a valid instruction in the active ISA.  */

static bool
expand_vselect (rtx target, rtx op0, const unsigned char *perm,
		unsigned nelt, bool testing_p)
{
  unsigned int i;
  rtx x, save_vconcat;
  int icode;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
  PUT_NUM_ELEM (XVEC (x, 0), nelt);
  for (i = 0; i < nelt; ++i)
    XVECEXP (x, 0, i) = GEN_INT (perm[i]);
  save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
  PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
  SET_DEST (PATTERN (vselect_insn)) = target;
  icode = recog_memoized (vselect_insn);

  if (icode >= 0 && !testing_p)
    emit_insn (copy_rtx (PATTERN (vselect_insn)));

  SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
  INSN_CODE (vselect_insn) = -1;

  return icode >= 0;
}
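/* The protocol above: patch the cached insn with the candidate
   selector, operand and target, let recog_memoized decide whether an
   enabled pattern matches, emit a copy of the pattern only on success,
   then restore the scratch insn and invalidate its cached INSN_CODE
   for the next query.  */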
/* Similar, but generate a vec_concat from op0 and op1 as well.  */

static bool
expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
			const unsigned char *perm, unsigned nelt,
			bool testing_p)
{
  machine_mode v2mode;
  rtx x;
  bool ok;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
    return false;
  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  PUT_MODE (x, v2mode);
  XEXP (x, 0) = op0;
  XEXP (x, 1) = op1;
  ok = expand_vselect (target, x, perm, nelt, testing_p);
  XEXP (x, 0) = const0_rtx;
  XEXP (x, 1) = const0_rtx;
  return ok;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   using movss or movsd.  */

static bool
expand_vec_perm_movs (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  rtx x;

  if (d->one_operand_p)
    return false;

  if (!(TARGET_SSE && vmode == V4SFmode)
      && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
      && !(TARGET_SSE2 && vmode == V2DFmode))
    return false;

  /* Only the first element is changed.  */
  if (d->perm[0] != nelt && d->perm[0] != 0)
    return false;
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != i + nelt - d->perm[0])
      return false;

  if (d->testing_p)
    return true;

  if (d->perm[0] == nelt)
    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
  else
    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));

  emit_insn (gen_rtx_SET (d->target, x));

  return true;
}
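/* For instance, the V4SFmode permutation {4, 1, 2, 3} takes element 0
   from op1 and elements 1-3 from op0, which is exactly what movss
   does; the VEC_MERGE with mask 1 above expresses that.  */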
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */

static bool
expand_vec_perm_blend (struct expand_vec_perm_d *d)
{
  machine_mode mmode, vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  unsigned HOST_WIDE_INT mask;
  rtx target, op0, op1, maskop, x;
  rtx rperm[32], vperm;

  if (d->one_operand_p)
    return false;
  if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
      && (TARGET_AVX512BW
	  || GET_MODE_UNIT_SIZE (vmode) >= 4))
    ;
  else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
    ;
  else
    return false;

  /* This is a blend, not a permute.  Elements must stay in their
     respective lanes.  */
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (!(e == i || e == i + nelt))
	return false;
    }

  if (d->testing_p)
    return true;

  /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
     decision should be extracted elsewhere, so that we only try that
     sequence once all budget==3 options have been tried.  */
  target = d->target;
  op0 = d->op0;
  op1 = d->op1;
  mask = 0;

  switch (vmode)
    {
    case E_V8DFmode:
    case E_V16SFmode:
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V2DFmode:
    case E_V4SFmode:
    case E_V8HImode:
    case E_V8SImode:
    case E_V32HImode:
    case E_V64QImode:
    case E_V16SImode:
    case E_V8DImode:
      for (i = 0; i < nelt; ++i)
	mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
      break;

    case E_V2DImode:
      for (i = 0; i < 2; ++i)
	mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
      vmode = V8HImode;
      goto do_subreg;

    case E_V4SImode:
      for (i = 0; i < 4; ++i)
	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
      vmode = V8HImode;
      goto do_subreg;

    case E_V16QImode:
      /* See if bytes move in pairs so we can use pblendw with
	 an immediate argument, rather than pblendvb with a vector
	 argument.  */
      for (i = 0; i < 16; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  {
	  use_pblendvb:
	    for (i = 0; i < nelt; ++i)
	      rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);

	  finish_pblendvb:
	    vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
	    vperm = force_reg (vmode, vperm);

	    if (GET_MODE_SIZE (vmode) == 16)
	      emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
	    else
	      emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
	    if (target != d->target)
	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
	    return true;
	  }

      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8HImode;
      /* FALLTHRU */

    do_subreg:
      target = gen_reg_rtx (vmode);
      op0 = gen_lowpart (vmode, op0);
      op1 = gen_lowpart (vmode, op1);
      break;

    case E_V32QImode:
      /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
      for (i = 0; i < 32; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  goto use_pblendvb;
      /* See if bytes move in quadruplets.  If yes, vpblendd
	 with immediate can be used.  */
      for (i = 0; i < 32; i += 4)
	if (d->perm[i] + 2 != d->perm[i + 2])
	  break;
      if (i < 32)
	{
	  /* See if bytes move the same in both lanes.  If yes,
	     vpblendw with immediate can be used.  */
	  for (i = 0; i < 16; i += 2)
	    if (d->perm[i] + 16 != d->perm[i + 16])
	      goto use_pblendvb;

	  /* Use vpblendw.  */
	  for (i = 0; i < 16; ++i)
	    mask |= (d->perm[i * 2] >= 32) << i;
	  vmode = V16HImode;
	  goto do_subreg;
	}

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 4] >= 32) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V16HImode:
      /* See if words move in pairs.  If yes, vpblendd can be used.  */
      for (i = 0; i < 16; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  break;
      if (i < 16)
	{
	  /* See if words move the same in both lanes.  If not,
	     vpblendvb must be used.  */
	  for (i = 0; i < 8; i++)
	    if (d->perm[i] + 8 != d->perm[i + 8])
	      {
		/* Use vpblendvb.  */
		for (i = 0; i < 32; ++i)
		  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);

		vmode = V32QImode;
		nelt = 32;
		target = gen_reg_rtx (vmode);
		op0 = gen_lowpart (vmode, op0);
		op1 = gen_lowpart (vmode, op1);
		goto finish_pblendvb;
	      }

	  /* Use vpblendw.  */
	  for (i = 0; i < 16; ++i)
	    mask |= (d->perm[i] >= 16) << i;
	  break;
	}

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V4DImode:
      /* Use vpblendd.  */
      for (i = 0; i < 4; ++i)
	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
      vmode = V8SImode;
      goto do_subreg;

    default:
      gcc_unreachable ();
    }

  switch (vmode)
    {
    case E_V8DFmode:
    case E_V8DImode:
      mmode = QImode;
      break;
    case E_V16SFmode:
    case E_V16SImode:
      mmode = HImode;
      break;
    case E_V32HImode:
      mmode = SImode;
      break;
    case E_V64QImode:
      mmode = DImode;
      break;
    default:
      mmode = VOIDmode;
    }

  if (mmode != VOIDmode)
    maskop = force_reg (mmode, gen_int_mode (mask, mmode));
  else
    maskop = GEN_INT (mask);

  /* This matches five different patterns with the different modes.  */
  x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
  x = gen_rtx_SET (target, x);
  emit_insn (x);
  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}
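/* For instance, the V8SFmode permutation {0, 9, 2, 11, 4, 13, 6, 15}
   keeps even elements from op0 and takes odd elements from op1; the
   d->perm[i] >= nelt tests above turn that into the immediate 0xaa
   for vblendps.  */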
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of the variable form of vpermilps.

   Note that we will have already failed the immediate input vpermilps,
   which requires that the high and low part shuffle be identical; the
   variable form doesn't require that.  */

static bool
expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
{
  rtx rperm[8], vperm;
  unsigned i;

  if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
    return false;

  /* We can only permute within the 128-bit lane.  */
  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];
      if (i < 4 ? e >= 4 : e < 4)
	return false;
    }

  if (d->testing_p)
    return true;

  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];

      /* Within each 128-bit lane, the elements of op0 are numbered
	 from 0 and the elements of op1 are numbered from 4.  */
      if (e >= 8 + 4)
	e -= (8 + 4);
      else if (e >= 4)
	e -= 4;

      rperm[i] = GEN_INT (e);
    }

  vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
  vperm = force_reg (V8SImode, vperm);
  emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));

  return true;
}
/* Return true if permutation D can be performed as VMODE permutation
   instead.  */

static bool
valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
{
  unsigned int i, j, chunk;

  if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
      || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
      || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
    return false;

  if (GET_MODE_NUNITS (vmode) >= d->nelt)
    return true;

  chunk = d->nelt / GET_MODE_NUNITS (vmode);
  for (i = 0; i < d->nelt; i += chunk)
    if (d->perm[i] & (chunk - 1))
      return false;
    else
      for (j = 1; j < chunk; ++j)
	if (d->perm[i] + j != d->perm[i + j])
	  return false;

  return true;
}
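/* For instance, the V16QImode permutation {2, 3, 0, 1, 6, 7, 4, 5,
   10, 11, 8, 9, 14, 15, 12, 13} moves whole byte pairs, so it is also
   expressible as the V8HImode permutation {1, 0, 3, 2, 5, 4, 7, 6}.  */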
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.  */

static bool
expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
{
  unsigned i, nelt, eltsz, mask;
  unsigned char perm[64];
  machine_mode vmode = V16QImode;
  rtx rperm[64], vperm, target, op0, op1;

  nelt = d->nelt;

  if (!d->one_operand_p)
    {
      if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
	{
	  if (TARGET_AVX2
	      && valid_perm_using_mode_p (V2TImode, d))
	    {
	      if (d->testing_p)
		return true;

	      /* Use vperm2i128 insn.  The pattern uses
		 V4DImode instead of V2TImode.  */
	      target = d->target;
	      if (d->vmode != V4DImode)
		target = gen_reg_rtx (V4DImode);
	      op0 = gen_lowpart (V4DImode, d->op0);
	      op1 = gen_lowpart (V4DImode, d->op1);
	      rperm[0]
		= GEN_INT ((d->perm[0] / (nelt / 2))
			   | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
	      emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
	      if (target != d->target)
		emit_move_insn (d->target, gen_lowpart (d->vmode, target));
	      return true;
	    }
	  return false;
	}
    }
  else
    {
      if (GET_MODE_SIZE (d->vmode) == 16)
	{
	  if (!TARGET_SSSE3)
	    return false;
	}
      else if (GET_MODE_SIZE (d->vmode) == 32)
	{
	  if (!TARGET_AVX2)
	    return false;

	  /* V4DImode should be already handled through
	     expand_vselect by vpermq instruction.  */
	  gcc_assert (d->vmode != V4DImode);

	  vmode = V32QImode;
	  if (d->vmode == V8SImode
	      || d->vmode == V16HImode
	      || d->vmode == V32QImode)
	    {
	      /* First see if vpermq can be used for
		 V8SImode/V16HImode/V32QImode.  */
	      if (valid_perm_using_mode_p (V4DImode, d))
		{
		  for (i = 0; i < 4; i++)
		    perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
		  if (d->testing_p)
		    return true;
		  target = gen_reg_rtx (V4DImode);
		  if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
				      perm, 4, false))
		    {
		      emit_move_insn (d->target,
				      gen_lowpart (d->vmode, target));
		      return true;
		    }
		  return false;
		}

	      /* Next see if vpermd can be used.  */
	      if (valid_perm_using_mode_p (V8SImode, d))
		vmode = V8SImode;
	    }
	  /* Or if vpermps can be used.  */
	  else if (d->vmode == V8SFmode)
	    vmode = V8SImode;

	  if (vmode == V32QImode)
	    {
	      /* vpshufb only works intra lanes, it is not
		 possible to shuffle bytes in between the lanes.  */
	      for (i = 0; i < nelt; ++i)
		if ((d->perm[i] ^ i) & (nelt / 2))
		  return false;
	    }
	}
      else if (GET_MODE_SIZE (d->vmode) == 64)
	{
	  if (!TARGET_AVX512BW)
	    return false;

	  /* If vpermq didn't work, vpshufb won't work either.  */
	  if (d->vmode == V8DFmode || d->vmode == V8DImode)
	    return false;

	  vmode = V64QImode;
	  if (d->vmode == V16SImode
	      || d->vmode == V32HImode
	      || d->vmode == V64QImode)
	    {
	      /* First see if vpermq can be used for
		 V16SImode/V32HImode/V64QImode.  */
	      if (valid_perm_using_mode_p (V8DImode, d))
		{
		  for (i = 0; i < 8; i++)
		    perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
		  if (d->testing_p)
		    return true;
		  target = gen_reg_rtx (V8DImode);
		  if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
				      perm, 8, false))
		    {
		      emit_move_insn (d->target,
				      gen_lowpart (d->vmode, target));
		      return true;
		    }
		  return false;
		}

	      /* Next see if vpermd can be used.  */
	      if (valid_perm_using_mode_p (V16SImode, d))
		vmode = V16SImode;
	    }
	  /* Or if vpermps can be used.  */
	  else if (d->vmode == V16SFmode)
	    vmode = V16SImode;
	  if (vmode == V64QImode)
	    {
	      /* vpshufb only works intra lanes, it is not
		 possible to shuffle bytes in between the lanes.  */
	      for (i = 0; i < nelt; ++i)
		if ((d->perm[i] ^ i) & (3 * nelt / 4))
		  return false;
	    }
	}
      else
	return false;
    }

  if (d->testing_p)
    return true;

  if (vmode == V8SImode)
    for (i = 0; i < 8; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
  else if (vmode == V16SImode)
    for (i = 0; i < 16; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
  else
    {
      eltsz = GET_MODE_UNIT_SIZE (d->vmode);
      if (!d->one_operand_p)
	mask = 2 * nelt - 1;
      else if (vmode == V16QImode)
	mask = nelt - 1;
      else if (vmode == V64QImode)
	mask = nelt / 4 - 1;
      else
	mask = nelt / 2 - 1;

      for (i = 0; i < nelt; ++i)
	{
	  unsigned j, e = d->perm[i] & mask;
	  for (j = 0; j < eltsz; ++j)
	    rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
	}
    }

  vperm = gen_rtx_CONST_VECTOR (vmode,
				gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
  vperm = force_reg (vmode, vperm);

  target = d->target;
  if (d->vmode != vmode)
    target = gen_reg_rtx (vmode);
  op0 = gen_lowpart (vmode, d->op0);
  if (d->one_operand_p)
    {
      if (vmode == V16QImode)
	emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
      else if (vmode == V32QImode)
	emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
      else if (vmode == V64QImode)
	emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
      else if (vmode == V8SFmode)
	emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
      else if (vmode == V8SImode)
	emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
      else if (vmode == V16SFmode)
	emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
      else if (vmode == V16SImode)
	emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
      else
	gcc_unreachable ();
    }
  else
    {
      op1 = gen_lowpart (vmode, d->op1);
      emit_insn (gen_xop_pperm (target, op0, op1, vperm));
    }
  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}
/* For V*[QHS]Imode permutations, check if the same permutation
   can't be performed in a 2x, 4x or 8x wider inner mode.  */

static bool
canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
			      struct expand_vec_perm_d *nd)
{
  int i;
  machine_mode mode = VOIDmode;

  switch (d->vmode)
    {
    case E_V16QImode: mode = V8HImode; break;
    case E_V32QImode: mode = V16HImode; break;
    case E_V64QImode: mode = V32HImode; break;
    case E_V8HImode: mode = V4SImode; break;
    case E_V16HImode: mode = V8SImode; break;
    case E_V32HImode: mode = V16SImode; break;
    case E_V4SImode: mode = V2DImode; break;
    case E_V8SImode: mode = V4DImode; break;
    case E_V16SImode: mode = V8DImode; break;
    default: return false;
    }
  for (i = 0; i < d->nelt; i += 2)
    if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
      return false;
  nd->vmode = mode;
  nd->nelt = d->nelt / 2;
  for (i = 0; i < nd->nelt; i++)
    nd->perm[i] = d->perm[2 * i] / 2;
  if (GET_MODE_INNER (mode) != DImode)
    canonicalize_vector_int_perm (nd, nd);
  if (nd != d)
    {
      nd->one_operand_p = d->one_operand_p;
      nd->testing_p = d->testing_p;
      if (d->op0 == d->op1)
	nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
      else
	{
	  nd->op0 = gen_lowpart (nd->vmode, d->op0);
	  nd->op1 = gen_lowpart (nd->vmode, d->op1);
	}
      if (d->testing_p)
	nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
      else
	nd->target = gen_reg_rtx (nd->vmode);
    }
  return true;
}
/* Try to expand one-operand permutation with constant mask.  */

static bool
ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
{
  machine_mode mode = GET_MODE (d->op0);
  machine_mode maskmode = mode;
  rtx (*gen) (rtx, rtx, rtx) = NULL;
  rtx target, op0, mask;
  rtx vec[64];

  if (!rtx_equal_p (d->op0, d->op1))
    return false;

  if (!TARGET_AVX512F)
    return false;

  switch (mode)
    {
    case E_V16SImode:
      gen = gen_avx512f_permvarv16si;
      break;
    case E_V16SFmode:
      gen = gen_avx512f_permvarv16sf;
      maskmode = V16SImode;
      break;
    case E_V8DImode:
      gen = gen_avx512f_permvarv8di;
      break;
    case E_V8DFmode:
      gen = gen_avx512f_permvarv8df;
      maskmode = V8DImode;
      break;
    default:
      return false;
    }

  target = d->target;
  op0 = d->op0;
  for (int i = 0; i < d->nelt; ++i)
    vec[i] = GEN_INT (d->perm[i]);
  mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
  emit_insn (gen (target, op0, force_reg (maskmode, mask)));
  return true;
}
static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to instantiate D
   in a single instruction.  */

static bool
expand_vec_perm_1 (struct expand_vec_perm_d *d)
{
  unsigned i, nelt = d->nelt;
  struct expand_vec_perm_d nd;

  /* Check plain VEC_SELECT first, because AVX has instructions that could
     match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
     input where SEL+CONCAT may not.  */
  if (d->one_operand_p)
    {
      int mask = nelt - 1;
      bool identity_perm = true;
      bool broadcast_perm = true;

      for (i = 0; i < nelt; i++)
	{
	  nd.perm[i] = d->perm[i] & mask;
	  if (nd.perm[i] != i)
	    identity_perm = false;
	  if (nd.perm[i])
	    broadcast_perm = false;
	}

      if (identity_perm)
	{
	  if (!d->testing_p)
	    emit_move_insn (d->target, d->op0);
	  return true;
	}
      else if (broadcast_perm && TARGET_AVX2)
	{
	  /* Use vpbroadcast{b,w,d}.  */
	  rtx (*gen) (rtx, rtx) = NULL;
	  switch (d->vmode)
	    {
	    case E_V64QImode:
	      if (TARGET_AVX512BW)
		gen = gen_avx512bw_vec_dupv64qi_1;
	      break;
	    case E_V32QImode:
	      gen = gen_avx2_pbroadcastv32qi_1;
	      break;
	    case E_V32HImode:
	      if (TARGET_AVX512BW)
		gen = gen_avx512bw_vec_dupv32hi_1;
	      break;
	    case E_V16HImode:
	      gen = gen_avx2_pbroadcastv16hi_1;
	      break;
	    case E_V16SImode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv16si_1;
	      break;
	    case E_V8SImode:
	      gen = gen_avx2_pbroadcastv8si_1;
	      break;
	    case E_V16QImode:
	      gen = gen_avx2_pbroadcastv16qi;
	      break;
	    case E_V8HImode:
	      gen = gen_avx2_pbroadcastv8hi;
	      break;
	    case E_V16SFmode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv16sf_1;
	      break;
	    case E_V8SFmode:
	      gen = gen_avx2_vec_dupv8sf_1;
	      break;
	    case E_V8DFmode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv8df_1;
	      break;
	    case E_V8DImode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv8di_1;
	      break;
	    /* For other modes prefer other shuffles this function creates.  */
	    default: break;
	    }
	  if (gen != NULL)
	    {
	      if (!d->testing_p)
		emit_insn (gen (d->target, d->op0));
	      return true;
	    }
	}

      if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
	return true;

      /* There are plenty of patterns in sse.md that are written for
	 SEL+CONCAT and are not replicated for a single op.  Perhaps
	 that should be changed, to avoid the nastiness here.  */

      /* Recognize interleave style patterns, which means incrementing
	 every other permutation operand.  */
      for (i = 0; i < nelt; i += 2)
	{
	  nd.perm[i] = d->perm[i] & mask;
	  nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
	}
      if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
				  d->testing_p))
	return true;

      /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
      if (nelt >= 4)
	{
	  for (i = 0; i < nelt; i += 4)
	    {
	      nd.perm[i + 0] = d->perm[i + 0] & mask;
	      nd.perm[i + 1] = d->perm[i + 1] & mask;
	      nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
	      nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
	    }

	  if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
				      d->testing_p))
	    return true;
	}
    }

  /* Try movss/movsd instructions.  */
  if (expand_vec_perm_movs (d))
    return true;

  /* Finally, try the fully general two operand permute.  */
  if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
			      d->testing_p))
    return true;

  /* Recognize interleave style patterns with reversed operands.  */
  if (!d->one_operand_p)
    {
      for (i = 0; i < nelt; ++i)
	{
	  unsigned e = d->perm[i];
	  if (e >= nelt)
	    e -= nelt;
	  else
	    e += nelt;
	  nd.perm[i] = e;
	}

      if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
				  d->testing_p))
	return true;
    }

  /* Try the SSE4.1 blend variable merge instructions.  */
  if (expand_vec_perm_blend (d))
    return true;

  /* Try one of the AVX vpermil variable permutations.  */
  if (expand_vec_perm_vpermil (d))
    return true;

  /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
     vpshufb, vpermd, vpermps or vpermq variable permutation.  */
  if (expand_vec_perm_pshufb (d))
    return true;

  /* Try the AVX2 vpalignr instruction.  */
  if (expand_vec_perm_palignr (d, true))
    return true;

  /* Try the AVX512F vperm{s,d} instructions.  */
  if (ix86_expand_vec_one_operand_perm_avx512 (d))
    return true;

  /* Try the AVX512F vpermt2/vpermi2 instructions.  */
  if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode.  */
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }
  return false;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of a pair of pshuflw + pshufhw instructions.  */

static bool
expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
{
  unsigned char perm2[MAX_VECT_LEN];
  unsigned i;
  bool ok;

  if (d->vmode != V8HImode || !d->one_operand_p)
    return false;

  /* The two permutations only operate in 64-bit lanes.  */
  for (i = 0; i < 4; ++i)
    if (d->perm[i] >= 4)
      return false;
  for (i = 4; i < 8; ++i)
    if (d->perm[i] < 4)
      return false;

  if (d->testing_p)
    return true;

  /* Emit the pshuflw.  */
  memcpy (perm2, d->perm, 4);
  for (i = 4; i < 8; ++i)
    perm2[i] = i;
  ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
  gcc_assert (ok);

  /* Emit the pshufhw.  */
  memcpy (perm2 + 4, d->perm + 4, 4);
  for (i = 0; i < 4; ++i)
    perm2[i] = i;
  ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
  gcc_assert (ok);

  return true;
}
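/* For instance, {3, 2, 1, 0, 7, 6, 5, 4} reverses the words within
   each 64-bit half and becomes pshuflw with selector 0x1b followed by
   pshufhw with selector 0x1b.  */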
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSSE3 palignr instruction.  This succeeds
   when all of the elements in PERM fit within one vector and we merely
   need to shift them down so that a single vector permutation has a
   chance to succeed.  If SINGLE_INSN_ONLY_P, succeed if only
   the vpalignr instruction itself can perform the requested permutation.  */

static bool
expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
{
  unsigned i, nelt = d->nelt;
  unsigned min, max, minswap, maxswap;
  bool in_order, ok, swap = false;
  rtx shift, target;
  struct expand_vec_perm_d dcopy;

  /* Even with AVX, palignr only operates on 128-bit vectors,
     in AVX2 palignr operates on both 128-bit lanes.  */
  if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
      && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
    return false;

  min = 2 * nelt;
  max = 0;
  minswap = 2 * nelt;
  maxswap = 0;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      unsigned eswap = d->perm[i] ^ nelt;
      if (GET_MODE_SIZE (d->vmode) == 32)
	{
	  e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
	  eswap = e ^ (nelt / 2);
	}
      if (e < min)
	min = e;
      if (e > max)
	max = e;
      if (eswap < minswap)
	minswap = eswap;
      if (eswap > maxswap)
	maxswap = eswap;
    }
  if (min == 0
      || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
    {
      if (d->one_operand_p
	  || minswap == 0
	  || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
				   ? nelt / 2 : nelt))
	return false;
      swap = true;
      min = minswap;
      max = maxswap;
    }

  /* Given that we have SSSE3, we know we'll be able to implement the
     single operand permutation after the palignr with pshufb for
     128-bit vectors.  If SINGLE_INSN_ONLY_P, in_order has to be computed
     first.  */
  if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
    return true;

  dcopy = *d;
  if (swap)
    {
      dcopy.op0 = d->op1;
      dcopy.op1 = d->op0;
      for (i = 0; i < nelt; ++i)
	dcopy.perm[i] ^= nelt;
    }

  in_order = true;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = dcopy.perm[i];
      if (GET_MODE_SIZE (d->vmode) == 32
	  && e >= nelt
	  && (e & (nelt / 2 - 1)) < min)
	e = e - min - (nelt / 2);
      else
	e = e - min;
      if (e != i)
	in_order = false;
      dcopy.perm[i] = e;
    }
  dcopy.one_operand_p = true;

  if (single_insn_only_p && !in_order)
    return false;

  /* For AVX2, test whether we can permute the result in one instruction.  */
  if (d->testing_p)
    {
      if (in_order)
	return true;
      dcopy.op1 = dcopy.op0;
      return expand_vec_perm_1 (&dcopy);
    }

  shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      target = gen_reg_rtx (TImode);
      emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
				      gen_lowpart (TImode, dcopy.op0), shift));
    }
  else
    {
      target = gen_reg_rtx (V2TImode);
      emit_insn (gen_avx2_palignrv2ti (target,
				       gen_lowpart (V2TImode, dcopy.op1),
				       gen_lowpart (V2TImode, dcopy.op0),
				       shift));
    }

  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);

  /* Test for the degenerate case where the alignment by itself
     produces the desired permutation.  */
  if (in_order)
    {
      emit_move_insn (d->target, dcopy.op0);
      return true;
    }

  ok = expand_vec_perm_1 (&dcopy);
  gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);

  return ok;
}
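/* For instance, for V16QImode and perm {3, 4, ..., 18} the selected
   bytes span positions 3..18 of the 32-byte op1:op0 concatenation;
   palignr by 3 bytes shifts them down to 0..15, the adjusted
   permutation becomes the identity and the in_order path above emits
   just the move.  */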
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSE4_1 pblendv instruction.  Potentially
   reduces permutation from 2 pshufb and or to 1 pshufb and pblendv.  */

static bool
expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
{
  unsigned i, which, nelt = d->nelt;
  struct expand_vec_perm_d dcopy, dcopy1;
  machine_mode vmode = d->vmode;
  bool ok;

  /* Use the same checks as in expand_vec_perm_blend.  */
  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
    ;
  else
    return false;

  /* Figure out where permutation elements stay not in their
     respective lanes.  */
  for (i = 0, which = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (e != i)
	which |= (e < nelt ? 1 : 2);
    }
  /* We can pblend the part where elements stay not in their
     respective lanes only when these elements are all in one
     half of a permutation.
     {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective
     lanes, but both 8 and 9 >= 8
     {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
     respective lanes and 8 >= 8, but 2 not.  */
  if (which != 1 && which != 2)
    return false;
  if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
    return true;

  /* First we apply one operand permutation to the part where
     elements stay not in their respective lanes.  */
  dcopy = *d;
  if (which == 2)
    dcopy.op0 = dcopy.op1 = d->op1;
  else
    dcopy.op0 = dcopy.op1 = d->op0;
  if (!d->testing_p)
    dcopy.target = gen_reg_rtx (vmode);
  dcopy.one_operand_p = true;

  for (i = 0; i < nelt; ++i)
    dcopy.perm[i] = d->perm[i] & (nelt - 1);

  ok = expand_vec_perm_1 (&dcopy);
  if (GET_MODE_SIZE (vmode) != 16 && !ok)
    return false;
  else
    gcc_assert (ok);
  if (d->testing_p)
    return true;

  /* Next we put permuted elements into their positions.  */
  dcopy1 = *d;
  if (which == 2)
    dcopy1.op1 = dcopy.target;
  else
    dcopy1.op0 = dcopy.target;

  for (i = 0; i < nelt; ++i)
    dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);

  ok = expand_vec_perm_blend (&dcopy1);
  gcc_assert (ok);

  return true;
}
static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a two vector permutation into a single vector permutation by using
   an interleave operation to merge the vectors.  */

static bool
expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dremap, dfinal;
  unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
  unsigned HOST_WIDE_INT contents;
  unsigned char remap[2 * MAX_VECT_LEN];
  rtx_insn *seq;
  bool ok, same_halves = false;

  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      if (d->one_operand_p)
	return false;
    }
  else if (GET_MODE_SIZE (d->vmode) == 32)
    {
      if (!TARGET_AVX)
	return false;
      /* For 32-byte modes allow even d->one_operand_p.
	 The lack of cross-lane shuffling in some instructions
	 might prevent a single insn shuffle.  */
      dfinal = *d;
      dfinal.testing_p = true;
      /* If expand_vec_perm_interleave3 can expand this into
	 a 3 insn sequence, give up and let it be expanded as
	 3 insn sequence.  While that is one insn longer,
	 it doesn't need a memory operand and in the common
	 case that both interleave low and high permutations
	 with the same operands are adjacent needs 4 insns
	 for both after CSE.  */
      if (expand_vec_perm_interleave3 (&dfinal))
	return false;
    }
  else
    return false;

  /* Examine from whence the elements come.  */
  contents = 0;
  for (i = 0; i < nelt; ++i)
    contents |= HOST_WIDE_INT_1U << d->perm[i];

  memset (remap, 0xff, sizeof (remap));
  dremap = *d;

  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      unsigned HOST_WIDE_INT h1, h2, h3, h4;

      /* Split the two input vectors into 4 halves.  */
      h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
      h2 = h1 << nelt2;
      h3 = h2 << nelt2;
      h4 = h3 << nelt2;

      /* If the elements from the low halves use interleave low, and similarly
	 for interleave high.  If the elements are from mis-matched halves, we
	 can use shufps for V4SF/V4SI or do a DImode shuffle.  */
      if ((contents & (h1 | h3)) == contents)
	{
	  /* punpckl* */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i] = i * 2;
	      remap[i + nelt] = i * 2 + 1;
	      dremap.perm[i * 2] = i;
	      dremap.perm[i * 2 + 1] = i + nelt;
	    }
	  if (!TARGET_SSE2 && d->vmode == V4SImode)
	    dremap.vmode = V4SFmode;
	}
      else if ((contents & (h2 | h4)) == contents)
	{
	  /* punpckh* */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nelt2] = i * 2;
	      remap[i + nelt + nelt2] = i * 2 + 1;
	      dremap.perm[i * 2] = i + nelt2;
	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
	    }
	  if (!TARGET_SSE2 && d->vmode == V4SImode)
	    dremap.vmode = V4SFmode;
	}
      else if ((contents & (h1 | h4)) == contents)
	{
	  /* shufps */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i] = i;
	      remap[i + nelt + nelt2] = i + nelt2;
	      dremap.perm[i] = i;
	      dremap.perm[i + nelt2] = i + nelt + nelt2;
	    }
	  if (nelt != 4)
	    {
	      /* shufpd */
	      dremap.vmode = V2DImode;
	      dremap.nelt = 2;
	      dremap.perm[0] = 0;
	      dremap.perm[1] = 3;
	    }
	}
      else if ((contents & (h2 | h3)) == contents)
	{
	  /* shufps */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nelt2] = i;
	      remap[i + nelt] = i + nelt2;
	      dremap.perm[i] = i + nelt2;
	      dremap.perm[i + nelt2] = i + nelt;
	    }
	  if (nelt != 4)
	    {
	      /* shufpd */
	      dremap.vmode = V2DImode;
	      dremap.nelt = 2;
	      dremap.perm[0] = 1;
	      dremap.perm[1] = 2;
	    }
	}
      else
	return false;
    }
  else
    {
      unsigned int nelt4 = nelt / 4, nzcnt = 0;
      unsigned HOST_WIDE_INT q[8];
      unsigned int nonzero_halves[4];

      /* Split the two input vectors into 8 quarters.  */
      q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
      for (i = 1; i < 8; ++i)
	q[i] = q[0] << (nelt4 * i);
      for (i = 0; i < 4; ++i)
	if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
	  {
	    nonzero_halves[nzcnt] = i;
	    ++nzcnt;
	  }

      if (nzcnt == 1)
	{
	  gcc_assert (d->one_operand_p);
	  nonzero_halves[1] = nonzero_halves[0];
	  same_halves = true;
	}
      else if (d->one_operand_p)
	{
	  gcc_assert (nonzero_halves[0] == 0);
	  gcc_assert (nonzero_halves[1] == 1);
	}

      if (nzcnt <= 2)
	{
	  if (d->perm[0] / nelt2 == nonzero_halves[1])
	    {
	      /* Attempt to increase the likelihood that dfinal
		 shuffle will be intra-lane.  */
	      std::swap (nonzero_halves[0], nonzero_halves[1]);
	    }

	  /* vperm2f128 or vperm2i128.  */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
	      remap[i + nonzero_halves[0] * nelt2] = i;
	      dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
	      dremap.perm[i] = i + nonzero_halves[0] * nelt2;
	    }

	  if (d->vmode != V8SFmode
	      && d->vmode != V4DFmode
	      && d->vmode != V8SImode)
	    {
	      dremap.vmode = V8SImode;
	      dremap.nelt = 8;
	      for (i = 0; i < 4; ++i)
		{
		  dremap.perm[i] = i + nonzero_halves[0] * 4;
		  dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
		}
	    }
	}
      else if (d->one_operand_p)
	return false;
      else if (TARGET_AVX2
	       && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
	{
	  /* vpunpckl* */
	  for (i = 0; i < nelt4; ++i)
	    {
	      remap[i] = i * 2;
	      remap[i + nelt] = i * 2 + 1;
	      remap[i + nelt2] = i * 2 + nelt2;
	      remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
	      dremap.perm[i * 2] = i;
	      dremap.perm[i * 2 + 1] = i + nelt;
	      dremap.perm[i * 2 + nelt2] = i + nelt2;
	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
	    }
	}
      else if (TARGET_AVX2
	       && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
	{
	  /* vpunpckh* */
	  for (i = 0; i < nelt4; ++i)
	    {
	      remap[i + nelt4] = i * 2;
	      remap[i + nelt + nelt4] = i * 2 + 1;
	      remap[i + nelt2 + nelt4] = i * 2 + nelt2;
	      remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
	      dremap.perm[i * 2] = i + nelt4;
	      dremap.perm[i * 2 + 1] = i + nelt + nelt4;
	      dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
	    }
	}
      else
	return false;
    }

  /* Use the remapping array set up above to move the elements from their
     swizzled locations into their final destinations.  */
  dfinal = *d;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = remap[d->perm[i]];
      gcc_assert (e < nelt);
      /* If same_halves is true, both halves of the remapped vector are the
	 same.  Avoid cross-lane accesses if possible.  */
      if (same_halves && i >= nelt2)
	{
	  gcc_assert (e < nelt2);
	  dfinal.perm[i] = e + nelt2;
	}
      else
	dfinal.perm[i] = e;
    }
  if (!d->testing_p)
    {
      dremap.target = gen_reg_rtx (dremap.vmode);
      dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
    }
  dfinal.op1 = dfinal.op0;
  dfinal.one_operand_p = true;

  /* Test if the final remap can be done with a single insn.  For V4SFmode or
     V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
  start_sequence ();
  ok = expand_vec_perm_1 (&dfinal);
  seq = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  if (dremap.vmode != dfinal.vmode)
    {
      dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
      dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
    }

  ok = expand_vec_perm_1 (&dremap);
  gcc_assert (ok);

  emit_insn (seq);
  return true;
}
17763 a single vector cross-lane permutation into vpermq followed
17764 by any of the single insn permutations. */
17767 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d
*d
)
17769 struct expand_vec_perm_d dremap
, dfinal
;
17770 unsigned i
, j
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, nelt4
= nelt
/ 4;
17771 unsigned contents
[2];
17775 && (d
->vmode
== V32QImode
|| d
->vmode
== V16HImode
)
17776 && d
->one_operand_p
))
17781 for (i
= 0; i
< nelt2
; ++i
)
17783 contents
[0] |= 1u << (d
->perm
[i
] / nelt4
);
17784 contents
[1] |= 1u << (d
->perm
[i
+ nelt2
] / nelt4
);
17787 for (i
= 0; i
< 2; ++i
)
17789 unsigned int cnt
= 0;
17790 for (j
= 0; j
< 4; ++j
)
17791 if ((contents
[i
] & (1u << j
)) != 0 && ++cnt
> 2)
17799 dremap
.vmode
= V4DImode
;
17801 dremap
.target
= gen_reg_rtx (V4DImode
);
17802 dremap
.op0
= gen_lowpart (V4DImode
, d
->op0
);
17803 dremap
.op1
= dremap
.op0
;
17804 dremap
.one_operand_p
= true;
17805 for (i
= 0; i
< 2; ++i
)
17807 unsigned int cnt
= 0;
17808 for (j
= 0; j
< 4; ++j
)
17809 if ((contents
[i
] & (1u << j
)) != 0)
17810 dremap
.perm
[2 * i
+ cnt
++] = j
;
17811 for (; cnt
< 2; ++cnt
)
17812 dremap
.perm
[2 * i
+ cnt
] = 0;
17816 dfinal
.op0
= gen_lowpart (dfinal
.vmode
, dremap
.target
);
17817 dfinal
.op1
= dfinal
.op0
;
17818 dfinal
.one_operand_p
= true;
17819 for (i
= 0, j
= 0; i
< nelt
; ++i
)
17823 dfinal
.perm
[i
] = (d
->perm
[i
] & (nelt4
- 1)) | (j
? nelt2
: 0);
17824 if ((d
->perm
[i
] / nelt4
) == dremap
.perm
[j
])
17826 else if ((d
->perm
[i
] / nelt4
) == dremap
.perm
[j
+ 1])
17827 dfinal
.perm
[i
] |= nelt4
;
17829 gcc_unreachable ();
17832 ok
= expand_vec_perm_1 (&dremap
);
17835 ok
= expand_vec_perm_1 (&dfinal
);
17841 static bool canonicalize_perm (struct expand_vec_perm_d
*d
);
17843 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
17844 a vector permutation using two instructions, vperm2f128 resp.
17845 vperm2i128 followed by any single in-lane permutation. */
17848 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d
*d
)
17850 struct expand_vec_perm_d dfirst
, dsecond
;
17851 unsigned i
, j
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, perm
;
17855 || GET_MODE_SIZE (d
->vmode
) != 32
17856 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
&& !TARGET_AVX2
))
17860 dsecond
.one_operand_p
= false;
17861 dsecond
.testing_p
= true;
17863 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
17864 immediate. For perm < 16 the second permutation uses
17865 d->op0 as first operand, for perm >= 16 it uses d->op1
17866 as first operand. The second operand is the result of
17868 for (perm
= 0; perm
< 32; perm
++)
17870 /* Ignore permutations which do not move anything cross-lane. */
17873 /* The second shuffle for e.g. V4DFmode has
17874 0123 and ABCD operands.
17875 Ignore AB23, as 23 is already in the second lane
17876 of the first operand. */
17877 if ((perm
& 0xc) == (1 << 2)) continue;
17878 /* And 01CD, as 01 is in the first lane of the first
17880 if ((perm
& 3) == 0) continue;
17881 /* And 4567, as then the vperm2[fi]128 doesn't change
17882 anything on the original 4567 second operand. */
17883 if ((perm
& 0xf) == ((3 << 2) | 2)) continue;
17887 /* The second shuffle for e.g. V4DFmode has
17888 4567 and ABCD operands.
17889 Ignore AB67, as 67 is already in the second lane
17890 of the first operand. */
17891 if ((perm
& 0xc) == (3 << 2)) continue;
17892 /* And 45CD, as 45 is in the first lane of the first
17894 if ((perm
& 3) == 2) continue;
17895 /* And 0123, as then the vperm2[fi]128 doesn't change
17896 anything on the original 0123 first operand. */
17897 if ((perm
& 0xf) == (1 << 2)) continue;
17900 for (i
= 0; i
< nelt
; i
++)
17902 j
= d
->perm
[i
] / nelt2
;
17903 if (j
== ((perm
>> (2 * (i
>= nelt2
))) & 3))
17904 dsecond
.perm
[i
] = nelt
+ (i
& nelt2
) + (d
->perm
[i
] & (nelt2
- 1));
17905 else if (j
== (unsigned) (i
>= nelt2
) + 2 * (perm
>= 16))
17906 dsecond
.perm
[i
] = d
->perm
[i
] & (nelt
- 1);
17914 ok
= expand_vec_perm_1 (&dsecond
);
17925 /* Found a usable second shuffle. dfirst will be
17926 vperm2f128 on d->op0 and d->op1. */
17927 dsecond
.testing_p
= false;
17929 dfirst
.target
= gen_reg_rtx (d
->vmode
);
17930 for (i
= 0; i
< nelt
; i
++)
17931 dfirst
.perm
[i
] = (i
& (nelt2
- 1))
17932 + ((perm
>> (2 * (i
>= nelt2
))) & 3) * nelt2
;
17934 canonicalize_perm (&dfirst
);
17935 ok
= expand_vec_perm_1 (&dfirst
);
17938 /* And dsecond is some single insn shuffle, taking
17939 d->op0 and result of vperm2f128 (if perm < 16) or
17940 d->op1 and result of vperm2f128 (otherwise). */
17942 dsecond
.op0
= dsecond
.op1
;
17943 dsecond
.op1
= dfirst
.target
;
17945 ok
= expand_vec_perm_1 (&dsecond
);
17951 /* For one operand, the only useful vperm2f128 permutation is 0x01
17953 if (d
->one_operand_p
)
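/* The vperm2[fi]128 immediate searched for above holds one 2-bit lane
   selector in each nibble (bits 0-1 pick the destination's low lane,
   bits 4-5 its high lane); values 0 and 1 name the low and high lane
   of the first source, 2 and 3 those of the second, which is what the
   ((perm << 2) | perm) & 0x33 mapping encodes.  */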
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a two vector permutation using 2 intra-lane interleave insns
   and cross-lane shuffle for 32-byte vectors.  */

static bool
expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
{
  unsigned i, nelt;
  rtx (*gen) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
    ;
  else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
    ;
  else
    return false;

  nelt = d->nelt;
  if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
    return false;
  for (i = 0; i < nelt; i += 2)
    if (d->perm[i] != d->perm[0] + i / 2
	|| d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
      return false;

  if (d->testing_p)
    return true;

  switch (d->vmode)
    {
    case E_V32QImode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv32qi;
      else
	gen = gen_vec_interleave_lowv32qi;
      break;
    case E_V16HImode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv16hi;
      else
	gen = gen_vec_interleave_lowv16hi;
      break;
    case E_V8SImode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv8si;
      else
	gen = gen_vec_interleave_lowv8si;
      break;
    case E_V4DImode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv4di;
      else
	gen = gen_vec_interleave_lowv4di;
      break;
    case E_V8SFmode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv8sf;
      else
	gen = gen_vec_interleave_lowv8sf;
      break;
    case E_V4DFmode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv4df;
      else
	gen = gen_vec_interleave_lowv4df;
      break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (d->target, d->op0, d->op1));
  return true;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a single vector permutation using a single intra-lane vector
   permutation, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */

static bool
expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
  rtx_insn *seq;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  if (!TARGET_AVX
      || TARGET_AVX2
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || !d->one_operand_p)
    return false;

  dfirst = *d;
  for (i = 0; i < nelt; i++)
    dfirst.perm[i] = 0xff;
  for (i = 0, msk = 0; i < nelt; i++)
    {
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
	return false;
      dfirst.perm[j] = d->perm[i];
      if (j != i)
	msk |= (1 << i);
    }
  for (i = 0; i < nelt; i++)
    if (dfirst.perm[i] == 0xff)
      dfirst.perm[i] = i;

  if (!d->testing_p)
    dfirst.target = gen_reg_rtx (dfirst.vmode);

  start_sequence ();
  ok = expand_vec_perm_1 (&dfirst);
  seq = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq);

  dsecond = *d;
  dsecond.op0 = dfirst.target;
  dsecond.op1 = dfirst.target;
  dsecond.one_operand_p = true;
  dsecond.target = gen_reg_rtx (dsecond.vmode);
  for (i = 0; i < nelt; i++)
    dsecond.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dsecond);
  gcc_assert (ok);

  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
  return true;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
   permutation using two vperm2f128, followed by a vshufpd insn blending
   the two vectors together.  */

static bool
expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond, dthird;
  bool ok;

  if (!TARGET_AVX || (d->vmode != V4DFmode))
    return false;

  if (d->testing_p)
    return true;

  dfirst = *d;
  dsecond = *d;
  dthird = *d;

  dfirst.perm[0] = (d->perm[0] & ~1);
  dfirst.perm[1] = (d->perm[0] & ~1) + 1;
  dfirst.perm[2] = (d->perm[2] & ~1);
  dfirst.perm[3] = (d->perm[2] & ~1) + 1;
  dsecond.perm[0] = (d->perm[1] & ~1);
  dsecond.perm[1] = (d->perm[1] & ~1) + 1;
  dsecond.perm[2] = (d->perm[3] & ~1);
  dsecond.perm[3] = (d->perm[3] & ~1) + 1;
  dthird.perm[0] = (d->perm[0] % 2);
  dthird.perm[1] = (d->perm[1] % 2) + 4;
  dthird.perm[2] = (d->perm[2] % 2) + 2;
  dthird.perm[3] = (d->perm[3] % 2) + 6;

  dfirst.target = gen_reg_rtx (dfirst.vmode);
  dsecond.target = gen_reg_rtx (dsecond.vmode);
  dthird.op0 = dfirst.target;
  dthird.op1 = dsecond.target;
  dthird.one_operand_p = false;

  canonicalize_perm (&dfirst);
  canonicalize_perm (&dsecond);

  ok = expand_vec_perm_1 (&dfirst)
       && expand_vec_perm_1 (&dsecond)
       && expand_vec_perm_1 (&dthird);

  gcc_assert (ok);

  return true;
}
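/* Worked example (for illustration): for d->perm { 1, 4, 3, 6 } the
   code above builds dfirst.perm { 0, 1, 2, 3 } and dsecond.perm
   { 4, 5, 6, 7 } (each keeps whole 128-bit pairs together, so both are
   single vperm2f128 insns), and dthird.perm { 1, 4, 3, 6 }, which
   picks one double from each pair and is a single vshufpd.  */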
static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a two vector permutation using two intra-lane vector
   permutations, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */

static bool
expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond, dthird;
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
  rtx_insn *seq1, *seq2;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  if (!TARGET_AVX
      || TARGET_AVX2
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || d->one_operand_p)
    return false;

  dfirst = *d;
  dsecond = *d;
  for (i = 0; i < nelt; i++)
    {
      dfirst.perm[i] = 0xff;
      dsecond.perm[i] = 0xff;
    }
  for (i = 0, msk = 0; i < nelt; i++)
    {
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      if (j == i)
        {
          dfirst.perm[j] = d->perm[i];
          which1 |= (d->perm[i] < nelt ? 1 : 2);
        }
      else
        {
          dsecond.perm[j] = d->perm[i];
          which2 |= (d->perm[i] < nelt ? 1 : 2);
          msk |= (1U << i);
        }
    }
  if (msk == 0 || msk == (1U << nelt) - 1)
    return false;

  if (!d->testing_p)
    {
      dfirst.target = gen_reg_rtx (dfirst.vmode);
      dsecond.target = gen_reg_rtx (dsecond.vmode);
    }

  for (i = 0; i < nelt; i++)
    {
      if (dfirst.perm[i] == 0xff)
        dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
      if (dsecond.perm[i] == 0xff)
        dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
    }

  canonicalize_perm (&dfirst);
  start_sequence ();
  ok = ix86_expand_vec_perm_const_1 (&dfirst);
  seq1 = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  canonicalize_perm (&dsecond);
  start_sequence ();
  ok = ix86_expand_vec_perm_const_1 (&dsecond);
  seq2 = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq1);
  emit_insn (seq2);

  dthird = *d;
  dthird.op0 = dsecond.target;
  dthird.op1 = dsecond.target;
  dthird.one_operand_p = true;
  dthird.target = gen_reg_rtx (dthird.vmode);
  for (i = 0; i < nelt; i++)
    dthird.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dthird);
  gcc_assert (ok);

  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
  return true;
}
/* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
   permutation with two pshufb insns and an ior.  We should have already
   failed all two instruction sequences.  */

static bool
expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
{
  rtx rperm[2][16], vperm, l, h, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
    return false;
  gcc_assert (!d->one_operand_p);

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the given vector it is shuffled into the proper lane.  If the required
     element is in the other vector, force a zero into the lane by setting
     bit 7 in the permutation mask.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i];
      unsigned which = (e >= nelt);
      if (e >= nelt)
        e -= nelt;

      for (j = 0; j < eltsz; ++j)
        {
          rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
          rperm[1-which][i*eltsz + j] = m128;
        }
    }

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
  vperm = force_reg (V16QImode, vperm);

  l = gen_reg_rtx (V16QImode);
  op = gen_lowpart (V16QImode, d->op0);
  emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
  vperm = force_reg (V16QImode, vperm);

  h = gen_reg_rtx (V16QImode);
  op = gen_lowpart (V16QImode, d->op1);
  emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));

  op = d->target;
  if (d->vmode != V16QImode)
    op = gen_reg_rtx (V16QImode);
  emit_insn (gen_iorv16qi3 (op, l, h));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
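/* Worked example (for illustration): for V8HImode (eltsz == 2) with
   d->perm[0] == 9, result element 0 comes from element 1 of op1, so
   bytes 0-1 of the second mask get the byte indices 2 and 3 while
   bytes 0-1 of the first mask get -128; the pshufb of op0 therefore
   forces zeros there and the final ior merges in the bytes fetched
   from op1.  */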
/* Implement arbitrary permutation of one V32QImode or V16HImode operand
   with two vpshufb insns, vpermq and vpor.  We should have already failed
   all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, hp, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || !d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from
     the other lane, force a zero by setting bit 7 in the permutation
     mask.  The other mask has non-negative elements where an element is
     requested from the other lane, but also moved to the other lane, so
     that the result of vpshufb can have its two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;

      for (j = 0; j < eltsz; ++j)
        {
          rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
          rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
        }
    }

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  /* Swap the 128-bit lanes of h into hp.  */
  hp = gen_reg_rtx (V4DImode);
  op = gen_lowpart (V4DImode, h);
  emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
                                  const1_rtx));

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V32QImode or V16HImode operands
   with two vpshufb insns, vpor and vpermq.  We should have already
   failed all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, ior, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  for (i = 0; i < d->nelt; ++i)
    if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
      return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  In the first permutation mask
     the first quarter will contain indexes for the first half
     of the op0, the second quarter will contain bit 7 set, third quarter
     will contain indexes for the second half of the op0 and the
     last quarter bit 7 set.  In the second permutation mask
     the first quarter will contain bit 7 set, the second quarter
     indexes for the first half of the op1, the third quarter bit 7 set
     and last quarter indexes for the second half of the op1.
     I.e. the first mask e.g. for V32QImode extract even will be:
     0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
     (all values masked with 0xf except for -128) and second mask
     for extract even will be
     -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = d->perm[i] >= nelt;
      unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;

      for (j = 0; j < eltsz; ++j)
        {
          rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
          rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
        }
    }

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op1);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  ior = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (ior, l, h));

  /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
  op = gen_reg_rtx (V4DImode);
  ior = gen_lowpart (V4DImode, ior);
  emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
                                  const1_rtx, GEN_INT (3)));
  emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
   with two "and" and "pack" or two "shift" and "pack" insns.  We should
   have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
{
  rtx op, dop0, dop1, t;
  unsigned i, odd, c, s, nelt = d->nelt;
  bool end_perm = false;
  machine_mode half_mode;
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_pack) (rtx, rtx, rtx);
  rtx (*gen_shift) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;

  switch (d->vmode)
    {
    case E_V8HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
        return false;
      c = 0xffff;
      s = 16;
      half_mode = V4SImode;
      gen_and = gen_andv4si3;
      gen_pack = gen_sse4_1_packusdw;
      gen_shift = gen_lshrv4si3;
      break;
    case E_V16QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V8HImode;
      gen_and = gen_andv8hi3;
      gen_pack = gen_sse2_packuswb;
      gen_shift = gen_lshrv8hi3;
      break;
    case E_V16HImode:
      if (!TARGET_AVX2)
        return false;
      c = 0xffff;
      s = 16;
      half_mode = V8SImode;
      gen_and = gen_andv8si3;
      gen_pack = gen_avx2_packusdw;
      gen_shift = gen_lshrv8si3;
      end_perm = true;
      break;
    case E_V32QImode:
      if (!TARGET_AVX2)
        return false;
      c = 0xff;
      s = 8;
      half_mode = V16HImode;
      gen_and = gen_andv16hi3;
      gen_pack = gen_avx2_packuswb;
      gen_shift = gen_lshrv16hi3;
      end_perm = true;
      break;
    default:
      /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
         general shuffles.  */
      return false;
    }

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  dop0 = gen_reg_rtx (half_mode);
  dop1 = gen_reg_rtx (half_mode);
  if (odd == 0)
    {
      t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
      t = force_reg (half_mode, t);
      emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
      emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
    }
  else
    {
      emit_insn (gen_shift (dop0,
                            gen_lowpart (half_mode, d->op0),
                            GEN_INT (s)));
      emit_insn (gen_shift (dop1,
                            gen_lowpart (half_mode, d->op1),
                            GEN_INT (s)));
    }
  /* In AVX2 for 256 bit case we need to permute pack result.  */
  if (TARGET_AVX2 && end_perm)
    {
      op = gen_reg_rtx (d->vmode);
      t = gen_reg_rtx (V4DImode);
      emit_insn (gen_pack (op, dop0, dop1));
      emit_insn (gen_avx2_permv4di_1 (t,
                                      gen_lowpart (V4DImode, op),
                                      const0_rtx,
                                      const2_rtx,
                                      const1_rtx,
                                      GEN_INT (3)));
      emit_move_insn (d->target, gen_lowpart (d->vmode, t));
    }
  else
    emit_insn (gen_pack (d->target, dop0, dop1));

  return true;
}
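/* Worked example (for illustration): extracting the even bytes of two
   V16QImode vectors (odd == 0) masks each word with 0x00ff and packs
   with packuswb, giving { a0, a2, ..., a14, b0, b2, ..., b14 }; for
   the odd bytes each word is first shifted right by s == 8, so the
   same pack picks { a1, a3, ..., a15, b1, b3, ..., b15 }.  */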
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V64QI operands
   with two "shift", two "trunc" and one "concat" insn for "odd"
   and two "trunc" and one "concat" insn for "even".
   We should have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
{
  rtx t1, t2, t3, t4;
  unsigned i, odd, nelt = d->nelt;

  if (!TARGET_AVX512BW
      || d->one_operand_p
      || d->vmode != V64QImode)
    return false;

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  if (odd)
    {
      t1 = gen_reg_rtx (V32HImode);
      t2 = gen_reg_rtx (V32HImode);
      emit_insn (gen_lshrv32hi3 (t1,
                                 gen_lowpart (V32HImode, d->op0),
                                 GEN_INT (8)));
      emit_insn (gen_lshrv32hi3 (t2,
                                 gen_lowpart (V32HImode, d->op1),
                                 GEN_INT (8)));
    }
  else
    {
      t1 = gen_lowpart (V32HImode, d->op0);
      t2 = gen_lowpart (V32HImode, d->op1);
    }

  t3 = gen_reg_rtx (V32QImode);
  t4 = gen_reg_rtx (V32QImode);
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
  emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));

  return true;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement extract-even
   and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
{
  rtx t1, t2, t3, t4, t5;

  switch (d->vmode)
    {
    case E_V4DFmode:
      if (d->testing_p)
        break;
      t1 = gen_reg_rtx (V4DFmode);
      t2 = gen_reg_rtx (V4DFmode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an unpck[lh]pd will produce the result required.  */
      if (odd)
        t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
      else
        t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SFmode:
      {
        int mask = odd ? 0xdd : 0x88;

        if (d->testing_p)
          break;
        t1 = gen_reg_rtx (V8SFmode);
        t2 = gen_reg_rtx (V8SFmode);
        t3 = gen_reg_rtx (V8SFmode);

        /* Shuffle within the 128-bit lanes to produce:
           { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
        emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
                                      GEN_INT (mask)));

        /* Shuffle the lanes around to produce:
           { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
        emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
                                            GEN_INT (0x3)));

        /* Shuffle within the 128-bit lanes to produce:
           { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
        emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));

        /* Shuffle within the 128-bit lanes to produce:
           { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
        emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));

        /* Shuffle the lanes around to produce:
           { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
        emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
                                            GEN_INT (0x20)));
      }
      break;

    case E_V2DFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V2SImode:
    case E_V4SImode:
      /* These are always directly implementable by expand_vec_perm_1.  */
      gcc_unreachable ();

    case E_V2SFmode:
      gcc_assert (TARGET_MMX_WITH_SSE);
      /* We have no suitable instructions.  */
      if (d->testing_p)
        return false;
      break;

    case E_V4HImode:
      if (d->testing_p)
        break;
      /* We need 2*log2(N)-1 operations to achieve odd/even
         with interleave. */
      t1 = gen_reg_rtx (V4HImode);
      emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
      emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
      if (odd)
        t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
      else
        t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
      emit_insn (t2);
      break;

    case E_V8HImode:
      if (TARGET_SSE4_1)
        return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
        return expand_vec_perm_pshufb2 (d);
      else
        {
          if (d->testing_p)
            break;
          /* We need 2*log2(N)-1 operations to achieve odd/even
             with interleave. */
          t1 = gen_reg_rtx (V8HImode);
          t2 = gen_reg_rtx (V8HImode);
          emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
          emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
          emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
          emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
          if (odd)
            t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
          else
            t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
          emit_insn (t3);
        }
      break;

    case E_V16QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V16HImode:
    case E_V32QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V64QImode:
      return expand_vec_perm_even_odd_trunc (d);

    case E_V4DImode:
      if (!TARGET_AVX2)
        {
          struct expand_vec_perm_d d_copy = *d;
          d_copy.vmode = V4DFmode;
          if (d->testing_p)
            d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
          else
            d_copy.target = gen_reg_rtx (V4DFmode);
          d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
          d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
          if (expand_vec_perm_even_odd_1 (&d_copy, odd))
            {
              if (!d->testing_p)
                emit_move_insn (d->target,
                                gen_lowpart (V4DImode, d_copy.target));
              return true;
            }
          return false;
        }

      if (d->testing_p)
        break;

      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now a vpunpck[lh]qdq will produce the result required.  */
      if (odd)
        t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
      else
        t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SImode:
      if (!TARGET_AVX2)
        {
          struct expand_vec_perm_d d_copy = *d;
          d_copy.vmode = V8SFmode;
          if (d->testing_p)
            d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
          else
            d_copy.target = gen_reg_rtx (V8SFmode);
          d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
          d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
          if (expand_vec_perm_even_odd_1 (&d_copy, odd))
            {
              if (!d->testing_p)
                emit_move_insn (d->target,
                                gen_lowpart (V8SImode, d_copy.target));
              return true;
            }
          return false;
        }

      if (d->testing_p)
        break;

      t1 = gen_reg_rtx (V8SImode);
      t2 = gen_reg_rtx (V8SImode);
      t3 = gen_reg_rtx (V4DImode);
      t4 = gen_reg_rtx (V4DImode);
      t5 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into
         { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
      emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
                                    gen_lowpart (V4DImode, d->op1),
                                    GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
                                    gen_lowpart (V4DImode, d->op1),
                                    GEN_INT (0x31)));

      /* Swap the 2nd and 3rd position in each lane into
         { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
      emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
                                    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
      emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
                                    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));

      /* Now a vpunpck[lh]qdq will produce
         { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
      if (odd)
        t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
                                           gen_lowpart (V4DImode, t2));
      else
        t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
                                          gen_lowpart (V4DImode, t2));
      emit_insn (t3);
      emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
      break;

    default:
      gcc_unreachable ();
    }

  return true;
}
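/* A note on the two shufps immediates used for V8SFmode above (worked
   example): 0x88 encodes the 2-bit selectors { 0, 2, 0, 2 } and so
   picks the even elements of both sources within each 128-bit lane,
   while 0xdd encodes { 1, 3, 1, 3 } and picks the odd elements.  */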
/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   extract-even and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
{
  unsigned i, odd, nelt = d->nelt;

  odd = d->perm[0];
  if (odd != 0 && odd != 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  return expand_vec_perm_even_odd_1 (d, odd);
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement broadcast
   permutations.  We assume that expand_vec_perm_1 has already failed.  */

static bool
expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
{
  unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
  machine_mode vmode = d->vmode;
  unsigned char perm2[4];
  rtx op0 = d->op0, dest;
  bool ok;

  switch (vmode)
    {
    case E_V4DFmode:
    case E_V8SFmode:
      /* These are special-cased in sse.md so that we can optionally
         use the vbroadcast instruction.  They expand to two insns
         if the input happens to be in a register.  */
      gcc_unreachable ();

    case E_V2DFmode:
    case E_V2SFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V2SImode:
    case E_V4SImode:
      /* These are always implementable using standard shuffle patterns.  */
      gcc_unreachable ();

    case E_V8HImode:
    case E_V16QImode:
      /* These can be implemented via interleave.  We save one insn by
         stopping once we have promoted to V4SImode and then use pshufd.  */
      if (d->testing_p)
        return true;
      do
        {
          rtx (*gen) (rtx, rtx, rtx)
            = vmode == V16QImode ? gen_vec_interleave_lowv16qi
                                 : gen_vec_interleave_lowv8hi;

          if (elt >= nelt2)
            {
              gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
                                       : gen_vec_interleave_highv8hi;
              elt -= nelt2;
            }
          nelt2 /= 2;

          dest = gen_reg_rtx (vmode);
          emit_insn (gen (dest, op0, op0));
          vmode = get_mode_wider_vector (vmode);
          op0 = gen_lowpart (vmode, dest);
        }
      while (vmode != V4SImode);

      memset (perm2, elt, 4);
      dest = gen_reg_rtx (V4SImode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);
      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V64QImode:
    case E_V32QImode:
    case E_V16HImode:
    case E_V8SImode:
    case E_V4DImode:
      /* For AVX2 broadcasts of the first element vpbroadcast* or
         vpermq should be used by expand_vec_perm_1.  */
      gcc_assert (!TARGET_AVX2 || d->perm[0]);
      return false;

    default:
      gcc_unreachable ();
    }
}
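/* Worked example (for illustration): broadcasting byte 5 of a
   V16QImode operand first emits vec_interleave_lowv16qi (bytes 0-7
   duplicated into words); on the next round nelt2 == 4 so, since
   5 >= 4, vec_interleave_highv8hi is used and elt becomes 1, after
   which the V4SImode pshufd { 1, 1, 1, 1 } finishes the broadcast.  */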
/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   broadcast permutations.  */

static bool
expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
{
  unsigned i, elt, nelt = d->nelt;

  if (!d->one_operand_p)
    return false;

  elt = d->perm[0];
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != elt)
      return false;

  return expand_vec_perm_broadcast_1 (d);
}
/* Implement arbitrary permutations of two V64QImode operands
   with 2 vperm[it]2w, 2 vpshufb and one vpor instruction.  */
static bool
expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
{
  if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
    return false;

  if (d->testing_p)
    return true;

  struct expand_vec_perm_d ds[2];
  rtx rperm[128], vperm, target0, target1;
  unsigned int i, nelt;
  machine_mode vmode;

  nelt = d->nelt;
  vmode = V64QImode;

  for (i = 0; i < 2; i++)
    {
      ds[i] = *d;
      ds[i].vmode = V32HImode;
      ds[i].nelt = 32;
      ds[i].target = gen_reg_rtx (V32HImode);
      ds[i].op0 = gen_lowpart (V32HImode, d->op0);
      ds[i].op1 = gen_lowpart (V32HImode, d->op1);
    }

  /* Prepare permutations such that the first one takes care of
     putting the even bytes into the right positions or one higher
     positions (ds[0]) and the second one takes care of
     putting the odd bytes into the right positions or one below
     (ds[1]).  */

  for (i = 0; i < nelt; i++)
    {
      ds[i & 1].perm[i / 2] = d->perm[i] / 2;
      if (i & 1)
        {
          rperm[i] = constm1_rtx;
          rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
        }
      else
        {
          rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
          rperm[i + 64] = constm1_rtx;
        }
    }

  bool ok = expand_vec_perm_1 (&ds[0]);
  gcc_assert (ok);
  ds[0].target = gen_lowpart (V64QImode, ds[0].target);

  ok = expand_vec_perm_1 (&ds[1]);
  gcc_assert (ok);
  ds[1].target = gen_lowpart (V64QImode, ds[1].target);

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
  vperm = force_reg (vmode, vperm);
  target0 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
  vperm = force_reg (vmode, vperm);
  target1 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));

  emit_insn (gen_iorv64qi3 (d->target, target0, target1));
  return true;
}
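/* Worked example (for illustration): with d->perm[0] == 7 the
   even-byte permutation ds[0] places word 3 (bytes 6-7) at word 0,
   and rperm[0] == (0 & 14) + (7 & 1) == 1 makes the following vpshufb
   select byte 1 of that word pair, i.e. the original byte 7, for
   result byte 0.  */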
/* Implement arbitrary permutation of two V32QImode or V16HImode operands
   with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
   all the shorter instruction sequences.  */

static bool
expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
{
  rtx rperm[4][32], vperm, l[2], h[2], op, m128;
  unsigned int i, nelt, eltsz;
  bool used[4];

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate 4 permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from
     the other lane, force a zero by setting bit 7 in the permutation
     mask.  The other mask has non-negative elements where an element is
     requested from the other lane, but also moved to the other lane, so
     that the result of vpshufb can have its two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < 32; ++i)
    {
      rperm[0][i] = m128;
      rperm[1][i] = m128;
      rperm[2][i] = m128;
      rperm[3][i] = m128;
    }
  used[0] = false;
  used[1] = false;
  used[2] = false;
  used[3] = false;
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
      unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);

      for (j = 0; j < eltsz; ++j)
        rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
      used[which] = true;
    }

  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i + 1])
        {
          h[i] = NULL_RTX;
          continue;
        }
      vperm = gen_rtx_CONST_VECTOR (V32QImode,
                                    gen_rtvec_v (32, rperm[2 * i + 1]));
      vperm = force_reg (V32QImode, vperm);
      h[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
    }

  /* Swap the 128-bit lanes of h[X].  */
  for (i = 0; i < 2; ++i)
    {
      if (h[i] == NULL_RTX)
        continue;
      op = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
                                      const2_rtx, GEN_INT (3), const0_rtx,
                                      const1_rtx));
      h[i] = gen_lowpart (V32QImode, op);
    }

  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i])
        {
          l[i] = NULL_RTX;
          continue;
        }
      vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
      vperm = force_reg (V32QImode, vperm);
      l[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
    }

  for (i = 0; i < 2; ++i)
    {
      if (h[i] && l[i])
        {
          op = gen_reg_rtx (V32QImode);
          emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
          l[i] = op;
        }
      else if (h[i])
        l[i] = h[i];
    }

  gcc_assert (l[0] && l[1]);
  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
   taken care of, perform the expansion in D and return true on success.  */

static bool
ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* Try a single instruction expansion.  */
  if (expand_vec_perm_1 (d))
    return true;

  /* Try sequences of two instructions.  */

  if (expand_vec_perm_pshuflw_pshufhw (d))
    return true;

  if (expand_vec_perm_palignr (d, false))
    return true;

  if (expand_vec_perm_interleave2 (d))
    return true;

  if (expand_vec_perm_broadcast (d))
    return true;

  if (expand_vec_perm_vpermq_perm_1 (d))
    return true;

  if (expand_vec_perm_vperm2f128 (d))
    return true;

  if (expand_vec_perm_pblendv (d))
    return true;

  /* Try sequences of three instructions.  */

  if (expand_vec_perm_even_odd_pack (d))
    return true;

  if (expand_vec_perm_2vperm2f128_vshuf (d))
    return true;

  if (expand_vec_perm_pshufb2 (d))
    return true;

  if (expand_vec_perm_interleave3 (d))
    return true;

  if (expand_vec_perm_vperm2f128_vblend (d))
    return true;

  /* Try sequences of four instructions.  */

  if (expand_vec_perm_even_odd_trunc (d))
    return true;
  if (expand_vec_perm_vpshufb2_vpermq (d))
    return true;

  if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
    return true;

  if (expand_vec_perm_vpermt2_vpshub2 (d))
    return true;

  /* ??? Look for narrow permutations whose element orderings would
     allow the promotion to a wider mode.  */

  /* ??? Look for sequences of interleave or a wider permute that place
     the data into the correct lanes for a half-vector shuffle like
     pshuf[lh]w or vpermilps.  */

  /* ??? Look for sequences of interleave that produce the desired results.
     The combinatorics of punpck[lh] get pretty ugly... */

  if (expand_vec_perm_even_odd (d))
    return true;

  /* Even longer sequences.  */
  if (expand_vec_perm_vpshufb4_vpermq2 (d))
    return true;

  /* See if we can get the same permutation in a different vector integer
     mode.  */
  struct expand_vec_perm_d nd;
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
        emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
  if (expand_vec_perm2_vperm2f128_vblend (d))
    return true;

  return false;
}
/* If a permutation only uses one operand, make it clear.  Returns true
   if the permutation references both operands.  */

static bool
canonicalize_perm (struct expand_vec_perm_d *d)
{
  int i, which, nelt = d->nelt;

  for (i = which = 0; i < nelt; ++i)
    which |= (d->perm[i] < nelt ? 1 : 2);

  d->one_operand_p = true;
  switch (which)
    {
    default:
      gcc_unreachable ();

    case 3:
      if (!rtx_equal_p (d->op0, d->op1))
        {
          d->one_operand_p = false;
          break;
        }
      /* The elements of PERM do not suggest that only the first operand
         is used, but both operands are identical.  Allow easier matching
         of the permutation by folding the permutation into the single
         input vector.  */
      /* FALLTHRU */

    case 2:
      for (i = 0; i < nelt; ++i)
        d->perm[i] &= nelt - 1;
      d->op0 = d->op1;
      break;

    case 1:
      d->op1 = d->op0;
      break;
    }

  return (which == 3);
}
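/* Worked example (for illustration): for nelt == 4 the selector
   { 5, 1, 6, 3 } references both halves (which == 3), but if op0 and
   op1 are the same register the perm is folded with "& 3" to
   { 1, 1, 2, 3 } and the permutation can be matched as a one-operand
   shuffle.  */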
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */

static bool
ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
                               rtx op1, const vec_perm_indices &sel)
{
  struct expand_vec_perm_d d;
  unsigned char perm[MAX_VECT_LEN];
  unsigned int i, nelt, which;
  bool two_args;

  d.target = target;
  d.op0 = op0;
  d.op1 = op1;

  d.vmode = vmode;
  gcc_assert (VECTOR_MODE_P (d.vmode));
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.testing_p = !target;

  gcc_assert (sel.length () == nelt);
  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));

  /* Given sufficient ISA support we can just return true here
     for selected vector modes.  */
  switch (d.vmode)
    {
    case E_V16SFmode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V8DFmode:
      if (!TARGET_AVX512F)
        return false;
      /* All implementable with a single vperm[it]2 insn.  */
      if (d.testing_p)
        return true;
      break;
    case E_V32HImode:
      if (!TARGET_AVX512BW)
        return false;
      if (d.testing_p)
        /* All implementable with a single vperm[it]2 insn.  */
        return true;
      break;
    case E_V64QImode:
      if (!TARGET_AVX512BW)
        return false;
      if (d.testing_p)
        /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
        return true;
      break;
    case E_V8SImode:
    case E_V8SFmode:
    case E_V4DFmode:
    case E_V4DImode:
      if (d.testing_p && TARGET_AVX512VL)
        /* All implementable with a single vperm[it]2 insn.  */
        return true;
      break;
    case E_V16HImode:
      if (d.testing_p && TARGET_AVX2)
        /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
        return true;
      break;
    case E_V32QImode:
      if (d.testing_p && TARGET_AVX2)
        /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
        return true;
      break;
    case E_V8QImode:
      if (!TARGET_MMX_WITH_SSE)
        return false;
      /* Fall through.  */
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V4SFmode:
      /* All implementable with a single vpperm insn.  */
      if (d.testing_p && TARGET_XOP)
        return true;
      /* All implementable with 2 pshufb + 1 ior.  */
      if (d.testing_p && TARGET_SSSE3)
        return true;
      break;
    case E_V2SFmode:
    case E_V2SImode:
    case E_V4HImode:
      if (!TARGET_MMX_WITH_SSE)
        return false;
      /* Fall through.  */
    case E_V2DImode:
    case E_V2DFmode:
      /* All implementable with shufpd or unpck[lh]pd.  */
      if (d.testing_p)
        return true;
      break;
    default:
      return false;
    }

  /* Extract the values from the vector CST into the permutation
     array in D.  */
  for (i = which = 0; i < nelt; ++i)
    {
      unsigned char e = sel[i];
      gcc_assert (e < 2 * nelt);
      d.perm[i] = e;
      perm[i] = e;
      which |= (e < nelt ? 1 : 2);
    }

  if (d.testing_p)
    {
      /* For all elements from second vector, fold the elements to first.  */
      if (which == 2)
        for (i = 0; i < nelt; ++i)
          d.perm[i] -= nelt;

      /* Check whether the mask can be applied to the vector type.  */
      d.one_operand_p = (which != 3);

      /* Implementable with shufps or pshufd.  */
      if (d.one_operand_p
          && (d.vmode == V4SFmode || d.vmode == V2SFmode
              || d.vmode == V4SImode || d.vmode == V2SImode))
        return true;

      /* Otherwise we have to go through the motions and see if we can
         figure out how to generate the requested permutation.  */
      d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
      d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
      if (!d.one_operand_p)
        d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);

      start_sequence ();
      bool ret = ix86_expand_vec_perm_const_1 (&d);
      end_sequence ();

      return ret;
    }

  two_args = canonicalize_perm (&d);

  if (ix86_expand_vec_perm_const_1 (&d))
    return true;

  /* If the selector says both arguments are needed, but the operands are the
     same, the above tried to expand with one_operand_p and flattened selector.
     If that didn't work, retry without one_operand_p; we succeeded with that
     during testing.  */
  if (two_args && d.one_operand_p)
    {
      d.one_operand_p = false;
      memcpy (d.perm, perm, sizeof (perm));
      return ix86_expand_vec_perm_const_1 (&d);
    }

  return false;
}
void
ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  for (i = 0; i < nelt; ++i)
    d.perm[i] = i * 2 + odd;

  /* We'll either be able to implement the permutation directly...  */
  if (expand_vec_perm_1 (&d))
    return;

  /* ... or we use the special-case patterns.  */
  expand_vec_perm_even_odd_1 (&d, odd);
}
void
ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt, base;
  bool ok;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  base = high_p ? nelt / 2 : 0;
  for (i = 0; i < nelt / 2; ++i)
    {
      d.perm[i * 2] = i + base;
      d.perm[i * 2 + 1] = i + base + nelt;
    }

  /* Note that for AVX this isn't one instruction.  */
  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);
}
/* Optimize vector MUL generation for V8QI, V16QI and V32QI
   under TARGET_AVX512BW.  I.e. for v16qi a * b it emits

   vpmovzxbw ymm2, xmm0
   vpmovzxbw ymm3, xmm1
   vpmullw   ymm4, ymm2, ymm3
   vpmovwb   xmm0, ymm4

   which takes fewer instructions than ix86_expand_vecop_qihi.
   Return true on success.  */

bool
ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
{
  machine_mode himode, qimode = GET_MODE (dest);
  rtx hop1, hop2, hdest;
  rtx (*gen_extend) (rtx, rtx);
  rtx (*gen_truncate) (rtx, rtx);

  /* There's no V64HImode multiplication instruction.  */
  if (qimode == E_V64QImode)
    return false;

  /* vpmovwb only available under AVX512BW.  */
  if (!TARGET_AVX512BW)
    return false;
  if ((qimode == V8QImode || qimode == V16QImode)
      && !TARGET_AVX512VL)
    return false;
  /* Don't generate zmm instructions when a 128/256-bit vector width
     is preferred.  */
  if (qimode == V32QImode
      && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
    return false;

  switch (qimode)
    {
    case E_V8QImode:
      himode = V8HImode;
      gen_extend = gen_zero_extendv8qiv8hi2;
      gen_truncate = gen_truncv8hiv8qi2;
      break;
    case E_V16QImode:
      himode = V16HImode;
      gen_extend = gen_zero_extendv16qiv16hi2;
      gen_truncate = gen_truncv16hiv16qi2;
      break;
    case E_V32QImode:
      himode = V32HImode;
      gen_extend = gen_zero_extendv32qiv32hi2;
      gen_truncate = gen_truncv32hiv32qi2;
      break;
    default:
      gcc_unreachable ();
    }

  hop1 = gen_reg_rtx (himode);
  hop2 = gen_reg_rtx (himode);
  hdest = gen_reg_rtx (himode);
  emit_insn (gen_extend (hop1, op1));
  emit_insn (gen_extend (hop2, op2));
  emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (MULT, himode,
                                                      hop1, hop2)));
  emit_insn (gen_truncate (dest, hdest));
  return true;
}
/* Expand a vector operation shift by constant for a V*QImode in terms of the
   same operation on V*HImode.  Return true on success.  */
bool
ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode, himode;
  HOST_WIDE_INT and_constant, xor_constant;
  HOST_WIDE_INT shift_amount;
  rtx vec_const_and, vec_const_xor;
  rtx tmp, op1_subreg;
  rtx (*gen_shift) (rtx, rtx, rtx);
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_xor) (rtx, rtx, rtx);
  rtx (*gen_sub) (rtx, rtx, rtx);

  /* Only optimize shift by constant.  */
  if (!CONST_INT_P (op2))
    return false;

  qimode = GET_MODE (dest);
  shift_amount = INTVAL (op2);
  /* Do nothing when the shift amount is greater than or equal to 8.  */
  if (shift_amount > 7)
    return false;

  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
  /* Record sign bit.  */
  xor_constant = 1 << (8 - shift_amount - 1);

  /* Zero the upper/lower bits shifted in from the adjacent element.  */
  and_constant
    = (code == ASHIFT ? 256 - (1 << shift_amount)
       : (1 << (8 - shift_amount)) - 1);

  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      gen_shift =
        ((code == ASHIFT)
         ? gen_ashlv8hi3
         : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
      gen_and = gen_andv16qi3;
      gen_xor = gen_xorv16qi3;
      gen_sub = gen_subv16qi3;
      break;
    case E_V32QImode:
      himode = V16HImode;
      gen_shift =
        ((code == ASHIFT)
         ? gen_ashlv16hi3
         : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
      gen_and = gen_andv32qi3;
      gen_xor = gen_xorv32qi3;
      gen_sub = gen_subv32qi3;
      break;
    case E_V64QImode:
      himode = V32HImode;
      gen_shift =
        ((code == ASHIFT)
         ? gen_ashlv32hi3
         : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
      gen_and = gen_andv64qi3;
      gen_xor = gen_xorv64qi3;
      gen_sub = gen_subv64qi3;
      break;
    default:
      gcc_unreachable ();
    }

  tmp = gen_reg_rtx (himode);
  vec_const_and = gen_reg_rtx (qimode);
  op1_subreg = lowpart_subreg (himode, op1, qimode);

  /* For ASHIFT and LSHIFTRT, perform operation like
     vpsllw/vpsrlw $shift_amount, %op1, %dest.
     vpand %vec_const_and, %dest.  */
  emit_insn (gen_shift (tmp, op1_subreg, op2));
  emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
  emit_move_insn (vec_const_and,
                  ix86_build_const_vector (qimode, true,
                                           gen_int_mode (and_constant, QImode)));
  emit_insn (gen_and (dest, dest, vec_const_and));

  /* For ASHIFTRT, perform extra operation like
     vpxor %vec_const_xor, %dest, %dest
     vpsubb %vec_const_xor, %dest, %dest  */
  if (code == ASHIFTRT)
    {
      vec_const_xor = gen_reg_rtx (qimode);
      emit_move_insn (vec_const_xor,
                      ix86_build_const_vector (qimode, true,
                                               gen_int_mode (xor_constant, QImode)));
      emit_insn (gen_xor (dest, dest, vec_const_xor));
      emit_insn (gen_sub (dest, dest, vec_const_xor));
    }

  return true;
}
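/* Worked example (for illustration): an arithmetic right shift of
   V16QImode by 2 uses and_constant == 0x3f and xor_constant == 0x20.
   For the byte 0x80 (-128) the word-wise shift and the mask give
   0x20, the xor makes it 0x00 and the subtract yields 0xe0 (-32),
   matching -128 >> 2.  */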
/* Expand a vector operation CODE for a V*QImode in terms of the
   same operation on V*HImode.  */

void
ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  machine_mode himode;
  rtx (*gen_il) (rtx, rtx, rtx);
  rtx (*gen_ih) (rtx, rtx, rtx);
  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
  struct expand_vec_perm_d d;
  bool ok, full_interleave;
  bool uns_p = false;
  int i;

  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      gen_il = gen_vec_interleave_lowv16qi;
      gen_ih = gen_vec_interleave_highv16qi;
      break;
    case E_V32QImode:
      himode = V16HImode;
      gen_il = gen_avx2_interleave_lowv32qi;
      gen_ih = gen_avx2_interleave_highv32qi;
      break;
    case E_V64QImode:
      himode = V32HImode;
      gen_il = gen_avx512bw_interleave_lowv64qi;
      gen_ih = gen_avx512bw_interleave_highv64qi;
      break;
    default:
      gcc_unreachable ();
    }

  op2_l = op2_h = op2;
  switch (code)
    {
    case MULT:
      /* Unpack data such that we've got a source byte in each low byte of
         each word.  We don't care what goes into the high byte of each word.
         Rather than trying to get zero in there, most convenient is to let
         it be a copy of the low byte.  */
      op2_l = gen_reg_rtx (qimode);
      op2_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op2_l, op2, op2));
      emit_insn (gen_ih (op2_h, op2, op2));

      op1_l = gen_reg_rtx (qimode);
      op1_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op1_l, op1, op1));
      emit_insn (gen_ih (op1_h, op1, op1));
      full_interleave = qimode == V16QImode;
      break;

    case ASHIFT:
    case LSHIFTRT:
      uns_p = true;
      /* FALLTHRU */
    case ASHIFTRT:
      op1_l = gen_reg_rtx (himode);
      op1_h = gen_reg_rtx (himode);
      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
      full_interleave = true;
      break;
    default:
      gcc_unreachable ();
    }

  /* Perform the operation.  */
  res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
                               1, OPTAB_DIRECT);
  res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
                               1, OPTAB_DIRECT);
  gcc_assert (res_l && res_h);

  /* Merge the data back into the right place.  */
  d.target = dest;
  d.op0 = gen_lowpart (qimode, res_l);
  d.op1 = gen_lowpart (qimode, res_h);
  d.vmode = qimode;
  d.nelt = GET_MODE_NUNITS (qimode);
  d.one_operand_p = false;
  d.testing_p = false;

  if (full_interleave)
    {
      /* For SSE2, we used a full interleave, so the desired
         results are in the even elements.  */
      for (i = 0; i < d.nelt; ++i)
        d.perm[i] = i * 2;
    }
  else
    {
      /* For AVX, the interleave used above was not cross-lane.  So the
         extraction is evens but with the second and third quarter swapped.
         Happily, that is even one insn shorter than even extraction.
         For AVX512BW we have 4 lanes.  We extract evens from within a lane,
         always first from the first and then from the second source operand,
         the index bits above the low 4 bits remain the same.
         Thus, for d.nelt == 32 we want permutation
         0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
         and for d.nelt == 64 we want permutation
         0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
         32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
      for (i = 0; i < d.nelt; ++i)
        d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
    }

  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
                       gen_rtx_fmt_ee (code, qimode, op1, op2));
}
/* Helper function of ix86_expand_mul_widen_evenodd.  Return true
   if op is CONST_VECTOR with all odd elements equal to their
   preceding element.  */

static bool
const_vector_equal_evenodd_p (rtx op)
{
  machine_mode mode = GET_MODE (op);
  int i, nunits = GET_MODE_NUNITS (mode);
  if (GET_CODE (op) != CONST_VECTOR
      || nunits != CONST_VECTOR_NUNITS (op))
    return false;
  for (i = 0; i < nunits; i += 2)
    if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
      return false;
  return true;
}
void
ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
                               bool uns_p, bool odd_p)
{
  machine_mode mode = GET_MODE (op1);
  machine_mode wmode = GET_MODE (dest);
  rtx x;
  rtx orig_op1 = op1, orig_op2 = op2;

  if (!nonimmediate_operand (op1, mode))
    op1 = force_reg (mode, op1);
  if (!nonimmediate_operand (op2, mode))
    op2 = force_reg (mode, op2);

  /* We only play even/odd games with vectors of SImode.  */
  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);

  /* If we're looking for the odd results, shift those members down to
     the even slots.  For some cpus this is faster than a PSHUFD.  */
  if (odd_p)
    {
      /* For XOP use vpmacsdqh, but only for smult, as it is only
         signed.  */
      if (TARGET_XOP && mode == V4SImode && !uns_p)
        {
          x = force_reg (wmode, CONST0_RTX (wmode));
          emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
          return;
        }

      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
      if (!const_vector_equal_evenodd_p (orig_op1))
        op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
                            x, NULL, 1, OPTAB_DIRECT);
      if (!const_vector_equal_evenodd_p (orig_op2))
        op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
                            x, NULL, 1, OPTAB_DIRECT);
      op1 = gen_lowpart (mode, op1);
      op2 = gen_lowpart (mode, op2);
    }

  if (mode == V16SImode)
    {
      if (uns_p)
        x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
      else
        x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
    }
  else if (mode == V8SImode)
    {
      if (uns_p)
        x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
      else
        x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
    }
  else if (uns_p)
    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
  else if (TARGET_SSE4_1)
    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
  else
    {
      rtx s1, s2, t0, t1, t2;

      /* The easiest way to implement this without PMULDQ is to go through
         the motions as if we are performing a full 64-bit multiply.  With
         the exception that we need to do less shuffling of the elements.  */

      /* Compute the sign-extension, aka highparts, of the two operands.  */
      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
                                op1, pc_rtx, pc_rtx);
      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
                                op2, pc_rtx, pc_rtx);

      /* Multiply LO(A) * HI(B), and vice-versa.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));

      /* Multiply LO(A) * LO(B).  */
      t0 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));

      /* Combine and shift the highparts into place.  */
      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
                         1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
      return;
    }
  emit_insn (x);
}
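/* Worked example (for illustration): for V4SImode operands
   { a0, a1, a2, a3 } and { b0, b1, b2, b3 } the even multiply produces
   the V2DImode vector { a0*b0, a2*b2 }; with odd_p the operands are
   first shifted right by 32 bits within each doubleword pair, so the
   same even multiply yields { a1*b1, a3*b3 }.  */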
void
ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
                            bool uns_p, bool high_p)
{
  machine_mode wmode = GET_MODE (dest);
  machine_mode mode = GET_MODE (op1);
  rtx t1, t2, t3, t4, mask;

  switch (mode)
    {
    case E_V4SImode:
      t1 = gen_reg_rtx (mode);
      t2 = gen_reg_rtx (mode);
      if (TARGET_XOP && !uns_p)
        {
          /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
             shuffle the elements once so that all elements are in the right
             place for immediate use: { A C B D }.  */
          emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
                                        const1_rtx, GEN_INT (3)));
          emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
                                        const1_rtx, GEN_INT (3)));
        }
      else
        {
          /* Put the elements into place for the multiply.  */
          ix86_expand_vec_interleave (t1, op1, op1, high_p);
          ix86_expand_vec_interleave (t2, op2, op2, high_p);
          high_p = false;
        }
      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
      break;

    case E_V8SImode:
      /* Shuffle the elements between the lanes.  After this we
         have { A B E F | C D G H } for each operand.  */
      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
                                      const0_rtx, const2_rtx,
                                      const1_rtx, GEN_INT (3)));
      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
                                      const0_rtx, const2_rtx,
                                      const1_rtx, GEN_INT (3)));

      /* Shuffle the elements within the lanes.  After this we
         have { A A B B | C C D D } or { E E F F | G G H H }.  */
      t3 = gen_reg_rtx (V8SImode);
      t4 = gen_reg_rtx (V8SImode);
      mask = GEN_INT (high_p
                      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
                      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));

      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
      break;

    case E_V8HImode:
    case E_V16HImode:
      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
                         uns_p, OPTAB_DIRECT);
      t2 = expand_binop (mode,
                         uns_p ? umul_highpart_optab : smul_highpart_optab,
                         op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
      gcc_assert (t1 && t2);

      t3 = gen_reg_rtx (mode);
      ix86_expand_vec_interleave (t3, t1, t2, high_p);
      emit_move_insn (dest, gen_lowpart (wmode, t3));
      break;

    case E_V16QImode:
    case E_V32QImode:
    case E_V32HImode:
    case E_V64QImode:
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);

      emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
      break;

    default:
      gcc_unreachable ();
    }
}
void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
  rtx res_1, res_2, res_3, res_4;

  res_1 = gen_reg_rtx (V4SImode);
  res_2 = gen_reg_rtx (V4SImode);
  res_3 = gen_reg_rtx (V2DImode);
  res_4 = gen_reg_rtx (V2DImode);
  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);

  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  Then we can merge the parts
     back together with an interleave.

     Note that two other sequences were tried:
     (1) Use interleaves at the start instead of psrldq, which allows
     us to use a single shufps to merge things back at the end.
     (2) Use shufps here to combine the two vectors, then pshufd to
     put the elements in the correct order.
     In both cases the cost of the reformatting stall was too high
     and the overall sequence slower.  */

  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
                                const0_rtx, const2_rtx,
                                const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
                                const0_rtx, const2_rtx,
                                const0_rtx, const0_rtx));
  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));

  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
}
void
ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
{
  machine_mode mode = GET_MODE (op0);
  rtx t1, t2, t3, t4, t5, t6;

  if (TARGET_AVX512DQ && mode == V8DImode)
    emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
    emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
    emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
  else if (TARGET_XOP && mode == V2DImode)
    {
      /* op1: A,B,C,D, op2: E,F,G,H */
      op1 = gen_lowpart (V4SImode, op1);
      op2 = gen_lowpart (V4SImode, op2);

      t1 = gen_reg_rtx (V4SImode);
      t2 = gen_reg_rtx (V4SImode);
      t3 = gen_reg_rtx (V2DImode);
      t4 = gen_reg_rtx (V2DImode);

      /* t1: B,A,D,C */
      emit_insn (gen_sse2_pshufd_1 (t1, op1,
                                    GEN_INT (1),
                                    GEN_INT (0),
                                    GEN_INT (3),
                                    GEN_INT (2)));

      /* t2: (B*E),(A*F),(D*G),(C*H) */
      emit_insn (gen_mulv4si3 (t2, t1, op2));

      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
      emit_insn (gen_xop_phadddq (t3, t2));

      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));

      /* Multiply lower parts and add all.  */
      t5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_widen_umult_even_v4si (t5,
                                                gen_lowpart (V4SImode, op1),
                                                gen_lowpart (V4SImode, op2)));
      force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
    }
  else
    {
      machine_mode nmode;
      rtx (*umul) (rtx, rtx, rtx);

      if (mode == V2DImode)
        {
          umul = gen_vec_widen_umult_even_v4si;
          nmode = V4SImode;
        }
      else if (mode == V4DImode)
        {
          umul = gen_vec_widen_umult_even_v8si;
          nmode = V8SImode;
        }
      else if (mode == V8DImode)
        {
          umul = gen_vec_widen_umult_even_v16si;
          nmode = V16SImode;
        }
      else
        gcc_unreachable ();

      /* Multiply low parts.  */
      t1 = gen_reg_rtx (mode);
      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));

      /* Shift input vectors right 32 bits so we can multiply high parts.  */
      t6 = GEN_INT (32);
      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);

      /* Multiply high parts by low parts.  */
      t4 = gen_reg_rtx (mode);
      t5 = gen_reg_rtx (mode);
      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));

      /* Combine and shift the highparts back.  */
      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
    }

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
                       gen_rtx_MULT (mode, op1, op2));
}
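/* The generic path above implements the schoolbook identity for
   64-bit lanes a = ah*2^32 + al and b = bh*2^32 + bl:
     a*b mod 2^64 == al*bl + ((ah*bl + al*bh) << 32),
   using one widening unsigned multiply for al*bl and two more for the
   cross products after shifting the inputs right by 32 bits.  */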
/* Return 1 if control transfer instruction INSN
   should be encoded with notrack prefix.  */

bool
ix86_notrack_prefixed_insn_p (rtx_insn *insn)
{
  if (!insn || !((flag_cf_protection & CF_BRANCH)))
    return false;

  if (CALL_P (insn))
    {
      rtx call = get_call_rtx_from (insn);
      gcc_assert (call != NULL_RTX);
      rtx addr = XEXP (call, 0);

      /* Do not emit 'notrack' if it's not an indirect call.  */
      if (MEM_P (addr)
          && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
        return false;
      else
        return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
    }

  if (JUMP_P (insn) && !flag_cet_switch)
    {
      rtx target = JUMP_LABEL (insn);
      if (target == NULL_RTX || ANY_RETURN_P (target))
        return false;

      /* Check the jump is a switch table.  */
      rtx_insn *label = as_a<rtx_insn *> (target);
      rtx_insn *table = next_insn (label);
      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
        return false;
      else
        return true;
    }
  return false;
}
/* Calculate integer abs() using only SSE2 instructions.  */

void
ix86_expand_sse2_abs (rtx target, rtx input)
{
  machine_mode mode = GET_MODE (target);
  rtx tmp0, tmp1, x;

  switch (mode)
    {
    case E_V2DImode:
    case E_V4DImode:
      /* For 64-bit signed integer X, with SSE4.2 use
         pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
         Otherwise handle it similarly to V4SImode, except use 64 as W
         instead of 32 and use logical instead of arithmetic right shift
         (which is unimplemented) and subtract.  */
      if (TARGET_SSE4_2)
        {
          tmp0 = gen_reg_rtx (mode);
          tmp1 = gen_reg_rtx (mode);
          emit_move_insn (tmp1, CONST0_RTX (mode));
          if (mode == E_V2DImode)
            emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
          else
            emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
        }
      else
        {
          tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
                                      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
                                               - 1), NULL, 0, OPTAB_DIRECT);
          tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
        }

      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
                                  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
                               target, 0, OPTAB_DIRECT);
      break;

    case E_V4SImode:
      /* For 32-bit signed integer X, the best way to calculate the absolute
         value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
      tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
                                  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
                                  NULL, 0, OPTAB_DIRECT);
      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
                                  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
                               target, 0, OPTAB_DIRECT);
      break;

    case E_V8HImode:
      /* For 16-bit signed integer X, the best way to calculate the absolute
         value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (mode, SMAX, tmp0, input,
                               target, 0, OPTAB_DIRECT);
      break;

    case E_V16QImode:
      /* For 8-bit signed integer X, the best way to calculate the absolute
         value of X is min ((unsigned char) X, (unsigned char) (-X)),
         as SSE2 provides the PMINUB insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
                               target, 0, OPTAB_DIRECT);
      break;

    default:
      gcc_unreachable ();
    }

  if (x != target)
    emit_move_insn (target, x);
}
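/* Worked example of the shift-based pattern (for illustration): for
   x == -5 in V4SImode, tmp0 == x >> 31 == -1, so
   (x ^ tmp0) - tmp0 == 4 - (-1) == 5.  */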
/* Expand an extract from a vector register through pextr insn.
   Return true if successful.  */

bool
ix86_expand_pextr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[1];

  unsigned int size = INTVAL (operands[2]);
  unsigned int pos = INTVAL (operands[3]);

  if (SUBREG_P (dst))
    {
      /* Reject non-lowpart subregs.  */
      if (SUBREG_BYTE (dst) > 0)
        return false;
      dst = SUBREG_REG (dst);
    }

  if (SUBREG_P (src))
    {
      pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
      src = SUBREG_REG (src);
    }

  switch (GET_MODE (src))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
    case E_TImode:
      {
        machine_mode srcmode, dstmode;
        rtx d, pat;

        if (!int_mode_for_size (size, 0).exists (&dstmode))
          return false;

        switch (dstmode)
          {
          case E_QImode:
            if (!TARGET_SSE4_1)
              return false;
            srcmode = V16QImode;
            break;

          case E_HImode:
            if (!TARGET_SSE2)
              return false;
            srcmode = V8HImode;
            break;

          case E_SImode:
            if (!TARGET_SSE4_1)
              return false;
            srcmode = V4SImode;
            break;

          case E_DImode:
            gcc_assert (TARGET_64BIT);
            if (!TARGET_SSE4_1)
              return false;
            srcmode = V2DImode;
            break;

          default:
            return false;
          }

        /* Reject extractions from misaligned positions.  */
        if (pos & (size-1))
          return false;

        if (GET_MODE (dst) == dstmode)
          d = dst;
        else
          d = gen_reg_rtx (dstmode);

        /* Construct insn pattern.  */
        pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
        pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);

        /* Let the rtl optimizers know about the zero extension performed.  */
        if (dstmode == QImode || dstmode == HImode)
          {
            pat = gen_rtx_ZERO_EXTEND (SImode, pat);
            d = gen_lowpart (SImode, d);
          }

        emit_insn (gen_rtx_SET (d, pat));

        if (d != dst)
          emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
        return true;
      }

    default:
      return false;
    }
}
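/* Worked example (for illustration): extracting a 16-bit field at bit
   position 32 of a DImode value living in an SSE register maps to
   dstmode == HImode, srcmode == V8HImode and lane pos / size == 2,
   i.e. a single pextrw of element 2, with the zero extension made
   explicit for the rtl optimizers.  */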
/* Expand an insert into a vector register through pinsr insn.
   Return true if successful.  */

bool
ix86_expand_pinsr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[3];

  unsigned int size = INTVAL (operands[1]);
  unsigned int pos = INTVAL (operands[2]);

  if (SUBREG_P (dst))
    {
      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
      dst = SUBREG_REG (dst);
    }

  switch (GET_MODE (dst))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
    case E_TImode:
      {
        machine_mode srcmode, dstmode;
        rtx (*pinsr)(rtx, rtx, rtx, rtx);
        rtx d;

        if (!int_mode_for_size (size, 0).exists (&srcmode))
          return false;

        switch (srcmode)
          {
          case E_QImode:
            if (!TARGET_SSE4_1)
              return false;
            dstmode = V16QImode;
            pinsr = gen_sse4_1_pinsrb;
            break;

          case E_HImode:
            if (!TARGET_SSE2)
              return false;
            dstmode = V8HImode;
            pinsr = gen_sse2_pinsrw;
            break;

          case E_SImode:
            if (!TARGET_SSE4_1)
              return false;
            dstmode = V4SImode;
            pinsr = gen_sse4_1_pinsrd;
            break;

          case E_DImode:
            gcc_assert (TARGET_64BIT);
            if (!TARGET_SSE4_1)
              return false;
            dstmode = V2DImode;
            pinsr = gen_sse4_1_pinsrq;
            break;

          default:
            return false;
          }

        /* Reject insertions to misaligned positions.  */
        if (pos & (size-1))
          return false;

        if (SUBREG_P (src))
          {
            unsigned int srcpos = SUBREG_BYTE (src);

            if (srcpos > 0)
              {
                rtx extr_ops[4];

                extr_ops[0] = gen_reg_rtx (srcmode);
                extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
                extr_ops[2] = GEN_INT (size);
                extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);

                if (!ix86_expand_pextr (extr_ops))
                  return false;

                src = extr_ops[0];
              }
            else
              src = gen_lowpart (srcmode, SUBREG_REG (src));
          }

        if (GET_MODE (dst) == dstmode)
          d = dst;
        else
          d = gen_reg_rtx (dstmode);

        emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
                          gen_lowpart (srcmode, src),
                          GEN_INT (1 << (pos / size))));
        if (d != dst)
          emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
        return true;
      }

    default:
      return false;
    }
}
/* All CPUs prefer to avoid cross-lane operations so perform reductions
   upper against lower halves up to SSE reg size.  */

static machine_mode
ix86_split_reduction (machine_mode mode)
{
  /* Reduce lowpart against highpart until we reach SSE reg width to
     avoid cross-lane operations.  */
  switch (mode)
    {
    case E_V8DImode:
    case E_V4DImode:
      return V2DImode;
    case E_V16SImode:
    case E_V8SImode:
      return V4SImode;
    case E_V32HImode:
    case E_V16HImode:
      return V8HImode;
    case E_V64QImode:
    case E_V32QImode:
      return V16QImode;
    case E_V16SFmode:
    case E_V8SFmode:
      return V4SFmode;
    case E_V8DFmode:
    case E_V4DFmode:
      return V2DFmode;
    default:
      return mode;
    }
}
/* Generate call to __divmoddi4.  */

void
ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
                            rtx op0, rtx op1,
                            rtx *quot_p, rtx *rem_p)
{
  rtx rem = assign_386_stack_local (mode, SLOT_TEMP);

  rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
                                      mode, op0, mode, op1, mode,
                                      XEXP (rem, 0), Pmode);
  *quot_p = quot;
  *rem_p = rem;
}

#include "gt-i386-expand.h"