/* Copyright (C) 1988-2019 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "insn-attr.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "tree-pass.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "tree-iterator.h"
#include "case-cfn-macros.h"
#include "fold-const-call.h"
#include "tree-ssanames.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "symbol-summary.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "dwarf2out.h"
#include "i386-options.h"
#include "i386-builtins.h"
#include "i386-expand.h"
/* Split one or more double-mode RTL references into pairs of half-mode
   references.  The RTL can be REG, offsettable MEM, integer constant, or
   CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
   split and "num" is its length.  lo_half and hi_half are output arrays
   that parallel "operands".  */
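/* Illustrative note (not part of the original sources): splitting a DImode
   pseudo on a 32-bit target yields its two SImode word subregs, the low
   word at byte offset 0 and the high word at byte offset 4; an offsettable
   DImode MEM likewise yields two SImode MEMs at those offsets.  */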
split_double_mode (machine_mode mode, rtx operands[],
                   int num, rtx lo_half[], rtx hi_half[])
  machine_mode half_mode;
  rtx mem_op = NULL_RTX;

  byte = GET_MODE_SIZE (half_mode);

      rtx op = operands[num];
      /* simplify_subreg refuses to split volatile memory addresses,
         but we still have to handle it.  */
      if (mem_op && rtx_equal_p (op, mem_op))
          lo_half[num] = lo_half[mem_num];
          hi_half[num] = hi_half[mem_num];

          lo_half[num] = adjust_address (op, half_mode, 0);
          hi_half[num] = adjust_address (op, half_mode, byte);

          lo_half[num] = simplify_gen_subreg (half_mode, op,
                                              GET_MODE (op) == VOIDmode
                                              ? mode : GET_MODE (op), 0);
          hi_half[num] = simplify_gen_subreg (half_mode, op,
                                              GET_MODE (op) == VOIDmode
                                              ? mode : GET_MODE (op), byte);
/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
ix86_expand_clear (rtx dest)
  /* We play register width games, which are only valid after reload.  */
  gcc_assert (reload_completed);

  /* Avoid HImode and its attendant prefix byte.  */
  if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
    dest = gen_rtx_REG (SImode, REGNO (dest));
  tmp = gen_rtx_SET (dest, const0_rtx);

  if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
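/* Expand a scalar move of mode MODE from operands[1] to operands[0].
   Symbolic operands are legitimized here (TLS, GOT and PIC references),
   and operands are forced into registers where the move patterns
   require it.  */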
ix86_expand_move (machine_mode mode, rtx operands[])
  rtx tmp, addend = NULL_RTX;
  enum tls_model model;

  switch (GET_CODE (op1))

      if (GET_CODE (tmp) != PLUS
          || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)

      addend = XEXP (tmp, 1);

      model = SYMBOL_REF_TLS_MODEL (op1);

        op1 = legitimize_tls_address (op1, model, true);
      else if (ix86_force_load_from_GOT_p (op1))

          /* Load the external function address via GOT slot to avoid PLT.  */
          op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),

          op1 = gen_rtx_CONST (Pmode, op1);
          op1 = gen_const_mem (Pmode, op1);
          set_mem_alias_set (op1, ix86_GOT_alias_set ());

          tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);

          op1 = force_operand (op1, NULL_RTX);
          op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
                                     op0, 1, OPTAB_DIRECT);

        op1 = force_operand (op1, op0);

        op1 = convert_to_mode (mode, op1, 1);

  if ((flag_pic || MACHOPIC_INDIRECT)
      && symbolic_operand (op1, mode))

      if (TARGET_MACHO && !TARGET_64BIT)

          if (MACHOPIC_INDIRECT)

              rtx temp = (op0 && REG_P (op0) && mode == Pmode)
                         ? op0 : gen_reg_rtx (Pmode);
              op1 = machopic_indirect_data_reference (op1, temp);

            op1 = machopic_legitimize_pic_address (op1, mode,
                                                   temp == op1 ? 0 : temp);

          if (op0 != op1 && GET_CODE (op0) != MEM)

              rtx insn = gen_rtx_SET (op0, op1);

          if (GET_CODE (op0) == MEM)
            op1 = force_reg (Pmode, op1);

              if (GET_CODE (temp) != REG)
                temp = gen_reg_rtx (Pmode);
              temp = legitimize_pic_address (op1, temp);

            op1 = force_reg (mode, op1);

      else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))

          rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;

          op1 = legitimize_pic_address (op1, reg);

            op1 = convert_to_mode (mode, op1, 1);

          && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
              || !push_operand (op0, mode))

        op1 = force_reg (mode, op1);

      if (push_operand (op0, mode)
          && ! general_no_elim_operand (op1, mode))
        op1 = copy_to_mode_reg (mode, op1);

      /* Force large constants in 64bit compilation into register
         to get them CSEed.  */
      if (can_create_pseudo_p ()
          && (mode == DImode) && TARGET_64BIT
          && immediate_operand (op1, mode)
          && !x86_64_zext_immediate_operand (op1, VOIDmode)
          && !register_operand (op0, mode)
        op1 = copy_to_mode_reg (mode, op1);

      if (can_create_pseudo_p ()
          && CONST_DOUBLE_P (op1))

          /* If we are loading a floating point constant to a register,
             force the value to memory now, since we'll get better code

          op1 = validize_mem (force_const_mem (mode, op1));
          if (!register_operand (op0, mode))

              rtx temp = gen_reg_rtx (mode);
              emit_insn (gen_rtx_SET (temp, op1));
              emit_move_insn (op0, temp);

  emit_insn (gen_rtx_SET (op0, op1));
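/* Expand a vector move of mode MODE from operands[1] to operands[0].
   Vector constants other than zero are forced into memory, and moves
   with insufficiently aligned memory operands are routed through
   ix86_expand_vector_move_misalign.  */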
ix86_expand_vector_move (machine_mode mode, rtx operands[])
  rtx op0 = operands[0], op1 = operands[1];
  /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
     psABI, since the biggest alignment is 4 bytes for the IA MCU psABI.  */
  unsigned int align = (TARGET_IAMCU
                        ? GET_MODE_BITSIZE (mode)
                        : GET_MODE_ALIGNMENT (mode));

  if (push_operand (op0, VOIDmode))
    op0 = emit_move_resolve_push (mode, op0);

  /* Force constants other than zero into memory.  We do not know how
     the instructions used to build constants modify the upper 64 bits
     of the register, once we have that information we may be able
     to handle some of them more efficiently.  */
  if (can_create_pseudo_p ()
               && CONSTANT_P (SUBREG_REG (op1))))
      && ((register_operand (op0, mode)
           && !standard_sse_constant_p (op1, mode))
          /* ix86_expand_vector_move_misalign() does not like constants.  */
          || (SSE_REG_MODE_P (mode)
              && MEM_ALIGN (op0) < align)))

          machine_mode imode = GET_MODE (SUBREG_REG (op1));
          rtx r = force_const_mem (imode, SUBREG_REG (op1));
            r = validize_mem (r);
            r = force_reg (imode, SUBREG_REG (op1));
          op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));

        op1 = validize_mem (force_const_mem (mode, op1));

  /* We need to check memory alignment for SSE mode since attribute
     can make operands unaligned.  */
  if (can_create_pseudo_p ()
      && SSE_REG_MODE_P (mode)
      && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
          || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))

      /* ix86_expand_vector_move_misalign() does not like both
         arguments in memory.  */
      if (!register_operand (op0, mode)
          && !register_operand (op1, mode))
        op1 = force_reg (mode, op1);

      tmp[0] = op0; tmp[1] = op1;
      ix86_expand_vector_move_misalign (mode, tmp);

  /* Make operand1 a register if it isn't already.  */
  if (can_create_pseudo_p ()
      && !register_operand (op0, mode)
      && !register_operand (op1, mode))

      emit_move_insn (op0, force_reg (GET_MODE (op0), op1));

  emit_insn (gen_rtx_SET (op0, op1));
/* Split 32-byte AVX unaligned load and store if needed.  */
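/* Concretely (see the code below): an unaligned 32-byte load is done as
   two 16-byte loads whose results are combined with a VEC_CONCAT, and an
   unaligned 32-byte store is done as two vextractf128 stores of the low
   and high halves.  */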
428 ix86_avx256_split_vector_move_misalign (rtx op0
, rtx op1
)
431 rtx (*extract
) (rtx
, rtx
, rtx
);
434 if ((MEM_P (op1
) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD
)
435 || (MEM_P (op0
) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE
))
437 emit_insn (gen_rtx_SET (op0
, op1
));
441 rtx orig_op0
= NULL_RTX
;
442 mode
= GET_MODE (op0
);
443 switch (GET_MODE_CLASS (mode
))
445 case MODE_VECTOR_INT
:
447 if (mode
!= V32QImode
)
452 op0
= gen_reg_rtx (V32QImode
);
455 op0
= gen_lowpart (V32QImode
, op0
);
456 op1
= gen_lowpart (V32QImode
, op1
);
460 case MODE_VECTOR_FLOAT
:
471 extract
= gen_avx_vextractf128v32qi
;
475 extract
= gen_avx_vextractf128v8sf
;
479 extract
= gen_avx_vextractf128v4df
;
486 rtx r
= gen_reg_rtx (mode
);
487 m
= adjust_address (op1
, mode
, 0);
488 emit_move_insn (r
, m
);
489 m
= adjust_address (op1
, mode
, 16);
490 r
= gen_rtx_VEC_CONCAT (GET_MODE (op0
), r
, m
);
491 emit_move_insn (op0
, r
);
493 else if (MEM_P (op0
))
495 m
= adjust_address (op0
, mode
, 0);
496 emit_insn (extract (m
, op1
, const0_rtx
));
497 m
= adjust_address (op0
, mode
, 16);
498 emit_insn (extract (m
, copy_rtx (op1
), const1_rtx
));
504 emit_move_insn (orig_op0
, gen_lowpart (GET_MODE (orig_op0
), op0
));
507 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
508 straight to ix86_expand_vector_move. */
509 /* Code generation for scalar reg-reg moves of single and double precision data:
510 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
514 if (x86_sse_partial_reg_dependency == true)
519 Code generation for scalar loads of double precision data:
520 if (x86_sse_split_regs == true)
521 movlpd mem, reg (gas syntax)
525 Code generation for unaligned packed loads of single precision data
526 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
527 if (x86_sse_unaligned_move_optimal)
530 if (x86_sse_partial_reg_dependency == true)
542 Code generation for unaligned packed loads of double precision data
543 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
544 if (x86_sse_unaligned_move_optimal)
547 if (x86_sse_split_regs == true)
560 ix86_expand_vector_move_misalign (machine_mode mode
, rtx operands
[])
567 /* Use unaligned load/store for AVX512 or when optimizing for size. */
568 if (GET_MODE_SIZE (mode
) == 64 || optimize_insn_for_size_p ())
570 emit_insn (gen_rtx_SET (op0
, op1
));
576 if (GET_MODE_SIZE (mode
) == 32)
577 ix86_avx256_split_vector_move_misalign (op0
, op1
);
579 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
580 emit_insn (gen_rtx_SET (op0
, op1
));
584 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
585 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
)
587 emit_insn (gen_rtx_SET (op0
, op1
));
591 /* ??? If we have typed data, then it would appear that using
592 movdqu is the only way to get unaligned data loaded with
594 if (TARGET_SSE2
&& GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
)
596 emit_insn (gen_rtx_SET (op0
, op1
));
602 if (TARGET_SSE2
&& mode
== V2DFmode
)
606 /* When SSE registers are split into halves, we can avoid
607 writing to the top half twice. */
608 if (TARGET_SSE_SPLIT_REGS
)
615 /* ??? Not sure about the best option for the Intel chips.
616 The following would seem to satisfy; the register is
617 entirely cleared, breaking the dependency chain. We
618 then store to the upper half, with a dependency depth
619 of one. A rumor has it that Intel recommends two movsd
620 followed by an unpacklpd, but this is unconfirmed. And
621 given that the dependency depth of the unpacklpd would
622 still be one, I'm not sure why this would be better. */
623 zero
= CONST0_RTX (V2DFmode
);
626 m
= adjust_address (op1
, DFmode
, 0);
627 emit_insn (gen_sse2_loadlpd (op0
, zero
, m
));
628 m
= adjust_address (op1
, DFmode
, 8);
629 emit_insn (gen_sse2_loadhpd (op0
, op0
, m
));
635 if (mode
!= V4SFmode
)
636 t
= gen_reg_rtx (V4SFmode
);
640 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY
)
641 emit_move_insn (t
, CONST0_RTX (V4SFmode
));
645 m
= adjust_address (op1
, V2SFmode
, 0);
646 emit_insn (gen_sse_loadlps (t
, t
, m
));
647 m
= adjust_address (op1
, V2SFmode
, 8);
648 emit_insn (gen_sse_loadhps (t
, t
, m
));
649 if (mode
!= V4SFmode
)
650 emit_move_insn (op0
, gen_lowpart (mode
, t
));
653 else if (MEM_P (op0
))
655 if (TARGET_SSE2
&& mode
== V2DFmode
)
657 m
= adjust_address (op0
, DFmode
, 0);
658 emit_insn (gen_sse2_storelpd (m
, op1
));
659 m
= adjust_address (op0
, DFmode
, 8);
660 emit_insn (gen_sse2_storehpd (m
, op1
));
664 if (mode
!= V4SFmode
)
665 op1
= gen_lowpart (V4SFmode
, op1
);
667 m
= adjust_address (op0
, V2SFmode
, 0);
668 emit_insn (gen_sse_storelps (m
, op1
));
669 m
= adjust_address (op0
, V2SFmode
, 8);
670 emit_insn (gen_sse_storehps (m
, copy_rtx (op1
)));
677 /* Move bits 64:95 to bits 32:63. */
680 ix86_move_vector_high_sse_to_mmx (rtx op
)
682 rtx mask
= gen_rtx_PARALLEL (VOIDmode
,
683 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
684 GEN_INT (0), GEN_INT (0)));
685 rtx dest
= lowpart_subreg (V4SImode
, op
, GET_MODE (op
));
686 op
= gen_rtx_VEC_SELECT (V4SImode
, dest
, mask
);
687 rtx insn
= gen_rtx_SET (dest
, op
);
691 /* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
694 ix86_split_mmx_pack (rtx operands
[], enum rtx_code code
)
696 rtx op0
= operands
[0];
697 rtx op1
= operands
[1];
698 rtx op2
= operands
[2];
700 machine_mode dmode
= GET_MODE (op0
);
701 machine_mode smode
= GET_MODE (op1
);
702 machine_mode inner_dmode
= GET_MODE_INNER (dmode
);
703 machine_mode inner_smode
= GET_MODE_INNER (smode
);
705 /* Get the corresponding SSE mode for destination. */
706 int nunits
= 16 / GET_MODE_SIZE (inner_dmode
);
707 machine_mode sse_dmode
= mode_for_vector (GET_MODE_INNER (dmode
),
709 machine_mode sse_half_dmode
= mode_for_vector (GET_MODE_INNER (dmode
),
710 nunits
/ 2).require ();
712 /* Get the corresponding SSE mode for source. */
713 nunits
= 16 / GET_MODE_SIZE (inner_smode
);
714 machine_mode sse_smode
= mode_for_vector (GET_MODE_INNER (smode
),
717 /* Generate SSE pack with signed/unsigned saturation. */
718 rtx dest
= lowpart_subreg (sse_dmode
, op0
, GET_MODE (op0
));
719 op1
= lowpart_subreg (sse_smode
, op1
, GET_MODE (op1
));
720 op2
= lowpart_subreg (sse_smode
, op2
, GET_MODE (op2
));
722 op1
= gen_rtx_fmt_e (code
, sse_half_dmode
, op1
);
723 op2
= gen_rtx_fmt_e (code
, sse_half_dmode
, op2
);
724 rtx insn
= gen_rtx_SET (dest
, gen_rtx_VEC_CONCAT (sse_dmode
,
728 ix86_move_vector_high_sse_to_mmx (op0
);
731 /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
734 ix86_split_mmx_punpck (rtx operands
[], bool high_p
)
736 rtx op0
= operands
[0];
737 rtx op1
= operands
[1];
738 rtx op2
= operands
[2];
739 machine_mode mode
= GET_MODE (op0
);
741 /* The corresponding SSE mode. */
742 machine_mode sse_mode
, double_sse_mode
;
747 sse_mode
= V16QImode
;
748 double_sse_mode
= V32QImode
;
749 mask
= gen_rtx_PARALLEL (VOIDmode
,
751 GEN_INT (0), GEN_INT (16),
752 GEN_INT (1), GEN_INT (17),
753 GEN_INT (2), GEN_INT (18),
754 GEN_INT (3), GEN_INT (19),
755 GEN_INT (4), GEN_INT (20),
756 GEN_INT (5), GEN_INT (21),
757 GEN_INT (6), GEN_INT (22),
758 GEN_INT (7), GEN_INT (23)));
763 double_sse_mode
= V16HImode
;
764 mask
= gen_rtx_PARALLEL (VOIDmode
,
766 GEN_INT (0), GEN_INT (8),
767 GEN_INT (1), GEN_INT (9),
768 GEN_INT (2), GEN_INT (10),
769 GEN_INT (3), GEN_INT (11)));
774 double_sse_mode
= V8SImode
;
775 mask
= gen_rtx_PARALLEL (VOIDmode
,
777 GEN_INT (0), GEN_INT (4),
778 GEN_INT (1), GEN_INT (5)));
785 /* Generate SSE punpcklXX. */
786 rtx dest
= lowpart_subreg (sse_mode
, op0
, GET_MODE (op0
));
787 op1
= lowpart_subreg (sse_mode
, op1
, GET_MODE (op1
));
788 op2
= lowpart_subreg (sse_mode
, op2
, GET_MODE (op2
));
790 op1
= gen_rtx_VEC_CONCAT (double_sse_mode
, op1
, op2
);
791 op2
= gen_rtx_VEC_SELECT (sse_mode
, op1
, mask
);
792 rtx insn
= gen_rtx_SET (dest
, op2
);
797 /* Move bits 64:127 to bits 0:63. */
798 mask
= gen_rtx_PARALLEL (VOIDmode
,
799 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
800 GEN_INT (0), GEN_INT (0)));
801 dest
= lowpart_subreg (V4SImode
, dest
, GET_MODE (dest
));
802 op1
= gen_rtx_VEC_SELECT (V4SImode
, dest
, mask
);
803 insn
= gen_rtx_SET (dest
, op1
);
808 /* Helper function of ix86_fixup_binary_operands to canonicalize
809 operand order. Returns true if the operands should be swapped. */
812 ix86_swap_binary_operands_p (enum rtx_code code
, machine_mode mode
,
815 rtx dst
= operands
[0];
816 rtx src1
= operands
[1];
817 rtx src2
= operands
[2];
819 /* If the operation is not commutative, we can't do anything. */
820 if (GET_RTX_CLASS (code
) != RTX_COMM_ARITH
821 && GET_RTX_CLASS (code
) != RTX_COMM_COMPARE
)
824 /* Highest priority is that src1 should match dst. */
825 if (rtx_equal_p (dst
, src1
))
827 if (rtx_equal_p (dst
, src2
))
830 /* Next highest priority is that immediate constants come second. */
831 if (immediate_operand (src2
, mode
))
833 if (immediate_operand (src1
, mode
))
836 /* Lowest priority is that memory references should come second. */
846 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
847 destination to use for the operation. If different from the true
848 destination in operands[0], a copy operation will be required. */
851 ix86_fixup_binary_operands (enum rtx_code code
, machine_mode mode
,
854 rtx dst
= operands
[0];
855 rtx src1
= operands
[1];
856 rtx src2
= operands
[2];
858 /* Canonicalize operand order. */
859 if (ix86_swap_binary_operands_p (code
, mode
, operands
))
861 /* It is invalid to swap operands of different modes. */
862 gcc_assert (GET_MODE (src1
) == GET_MODE (src2
));
864 std::swap (src1
, src2
);
867 /* Both source operands cannot be in memory. */
868 if (MEM_P (src1
) && MEM_P (src2
))
870 /* Optimization: Only read from memory once. */
871 if (rtx_equal_p (src1
, src2
))
873 src2
= force_reg (mode
, src2
);
876 else if (rtx_equal_p (dst
, src1
))
877 src2
= force_reg (mode
, src2
);
879 src1
= force_reg (mode
, src1
);
882 /* If the destination is memory, and we do not have matching source
883 operands, do things in registers. */
884 if (MEM_P (dst
) && !rtx_equal_p (dst
, src1
))
885 dst
= gen_reg_rtx (mode
);
887 /* Source 1 cannot be a constant. */
888 if (CONSTANT_P (src1
))
889 src1
= force_reg (mode
, src1
);
891 /* Source 1 cannot be a non-matching memory. */
892 if (MEM_P (src1
) && !rtx_equal_p (dst
, src1
))
893 src1
= force_reg (mode
, src1
);
895 /* Improve address combine. */
897 && GET_MODE_CLASS (mode
) == MODE_INT
899 src2
= force_reg (mode
, src2
);
906 /* Similarly, but assume that the destination has already been
910 ix86_fixup_binary_operands_no_copy (enum rtx_code code
,
911 machine_mode mode
, rtx operands
[])
913 rtx dst
= ix86_fixup_binary_operands (code
, mode
, operands
);
914 gcc_assert (dst
== operands
[0]);
/* Attempt to expand a binary operator.  Make the expansion closer to the
   actual machine than just general_operand, which will allow 3 separate
   memory references (one output, two input) in a single insn.  */
922 ix86_expand_binary_operator (enum rtx_code code
, machine_mode mode
,
925 rtx src1
, src2
, dst
, op
, clob
;
927 dst
= ix86_fixup_binary_operands (code
, mode
, operands
);
931 /* Emit the instruction. */
933 op
= gen_rtx_SET (dst
, gen_rtx_fmt_ee (code
, mode
, src1
, src2
));
937 && !rtx_equal_p (dst
, src1
))
939 /* This is going to be an LEA; avoid splitting it later. */
944 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
945 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, op
, clob
)));
948 /* Fix up the destination if needed. */
949 if (dst
!= operands
[0])
950 emit_move_insn (operands
[0], dst
);
953 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
954 the given OPERANDS. */
957 ix86_expand_vector_logical_operator (enum rtx_code code
, machine_mode mode
,
960 rtx op1
= NULL_RTX
, op2
= NULL_RTX
;
961 if (SUBREG_P (operands
[1]))
966 else if (SUBREG_P (operands
[2]))
971 /* Optimize (__m128i) d | (__m128i) e and similar code
972 when d and e are float vectors into float vector logical
973 insn. In C/C++ without using intrinsics there is no other way
974 to express vector logical operation on float vectors than
975 to cast them temporarily to integer vectors. */
977 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
978 && (SUBREG_P (op2
) || GET_CODE (op2
) == CONST_VECTOR
)
979 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1
))) == MODE_VECTOR_FLOAT
980 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1
))) == GET_MODE_SIZE (mode
)
981 && SUBREG_BYTE (op1
) == 0
982 && (GET_CODE (op2
) == CONST_VECTOR
983 || (GET_MODE (SUBREG_REG (op1
)) == GET_MODE (SUBREG_REG (op2
))
984 && SUBREG_BYTE (op2
) == 0))
985 && can_create_pseudo_p ())
988 switch (GET_MODE (SUBREG_REG (op1
)))
996 dst
= gen_reg_rtx (GET_MODE (SUBREG_REG (op1
)));
997 if (GET_CODE (op2
) == CONST_VECTOR
)
999 op2
= gen_lowpart (GET_MODE (dst
), op2
);
1000 op2
= force_reg (GET_MODE (dst
), op2
);
1005 op2
= SUBREG_REG (operands
[2]);
1006 if (!vector_operand (op2
, GET_MODE (dst
)))
1007 op2
= force_reg (GET_MODE (dst
), op2
);
1009 op1
= SUBREG_REG (op1
);
1010 if (!vector_operand (op1
, GET_MODE (dst
)))
1011 op1
= force_reg (GET_MODE (dst
), op1
);
1012 emit_insn (gen_rtx_SET (dst
,
1013 gen_rtx_fmt_ee (code
, GET_MODE (dst
),
1015 emit_move_insn (operands
[0], gen_lowpart (mode
, dst
));
1021 if (!vector_operand (operands
[1], mode
))
1022 operands
[1] = force_reg (mode
, operands
[1]);
1023 if (!vector_operand (operands
[2], mode
))
1024 operands
[2] = force_reg (mode
, operands
[2]);
1025 ix86_fixup_binary_operands_no_copy (code
, mode
, operands
);
1026 emit_insn (gen_rtx_SET (operands
[0],
1027 gen_rtx_fmt_ee (code
, mode
, operands
[1],
1031 /* Return TRUE or FALSE depending on whether the binary operator meets the
1032 appropriate constraints. */
1035 ix86_binary_operator_ok (enum rtx_code code
, machine_mode mode
,
1038 rtx dst
= operands
[0];
1039 rtx src1
= operands
[1];
1040 rtx src2
= operands
[2];
1042 /* Both source operands cannot be in memory. */
1043 if (MEM_P (src1
) && MEM_P (src2
))
1046 /* Canonicalize operand order for commutative operators. */
1047 if (ix86_swap_binary_operands_p (code
, mode
, operands
))
1048 std::swap (src1
, src2
);
1050 /* If the destination is memory, we must have a matching source operand. */
1051 if (MEM_P (dst
) && !rtx_equal_p (dst
, src1
))
1054 /* Source 1 cannot be a constant. */
1055 if (CONSTANT_P (src1
))
1058 /* Source 1 cannot be a non-matching memory. */
1059 if (MEM_P (src1
) && !rtx_equal_p (dst
, src1
))
1060 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1064 || (TARGET_64BIT
&& mode
== DImode
))
1065 && satisfies_constraint_L (src2
));
/* Attempt to expand a unary operator.  Make the expansion closer to the
   actual machine than just general_operand, which will allow 2 separate
   memory references (one output, one input) in a single insn.  */
1075 ix86_expand_unary_operator (enum rtx_code code
, machine_mode mode
,
1078 bool matching_memory
= false;
1079 rtx src
, dst
, op
, clob
;
1084 /* If the destination is memory, and we do not have matching source
1085 operands, do things in registers. */
1088 if (rtx_equal_p (dst
, src
))
1089 matching_memory
= true;
1091 dst
= gen_reg_rtx (mode
);
1094 /* When source operand is memory, destination must match. */
1095 if (MEM_P (src
) && !matching_memory
)
1096 src
= force_reg (mode
, src
);
1098 /* Emit the instruction. */
1100 op
= gen_rtx_SET (dst
, gen_rtx_fmt_e (code
, mode
, src
));
1106 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1107 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, op
, clob
)));
1110 /* Fix up the destination if needed. */
1111 if (dst
!= operands
[0])
1112 emit_move_insn (operands
[0], dst
);
1115 /* Predict just emitted jump instruction to be taken with probability PROB. */
1118 predict_jump (int prob
)
1120 rtx_insn
*insn
= get_last_insn ();
1121 gcc_assert (JUMP_P (insn
));
1122 add_reg_br_prob_note (insn
, profile_probability::from_reg_br_prob_base (prob
));
/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
   divisor are within the range [0-255].  */
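/* The fast path below ORs dividend and divisor together and tests the
   result against ~0xff; when both fit in 8 bits, a single HImode/QImode
   udivmod leaves the quotient in AL and the remainder in AH, which are
   then extracted into the destination operands.  */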
1129 ix86_split_idivmod (machine_mode mode
, rtx operands
[],
1132 rtx_code_label
*end_label
, *qimode_label
;
1135 rtx scratch
, tmp0
, tmp1
, tmp2
;
1136 rtx (*gen_divmod4_1
) (rtx
, rtx
, rtx
, rtx
);
1141 if (GET_MODE (operands
[0]) == SImode
)
1143 if (GET_MODE (operands
[1]) == SImode
)
1144 gen_divmod4_1
= unsigned_p
? gen_udivmodsi4_1
: gen_divmodsi4_1
;
1147 = unsigned_p
? gen_udivmodsi4_zext_2
: gen_divmodsi4_zext_2
;
1151 = unsigned_p
? gen_udivmodsi4_zext_1
: gen_divmodsi4_zext_1
;
1155 gen_divmod4_1
= unsigned_p
? gen_udivmoddi4_1
: gen_divmoddi4_1
;
1162 end_label
= gen_label_rtx ();
1163 qimode_label
= gen_label_rtx ();
1165 scratch
= gen_reg_rtx (mode
);
  /* Use 8bit unsigned divmod if dividend and divisor are within
     the range [0-255].  */
1169 emit_move_insn (scratch
, operands
[2]);
1170 scratch
= expand_simple_binop (mode
, IOR
, scratch
, operands
[3],
1171 scratch
, 1, OPTAB_DIRECT
);
1172 emit_insn (gen_test_ccno_1 (mode
, scratch
, GEN_INT (-0x100)));
1173 tmp0
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
1174 tmp0
= gen_rtx_EQ (VOIDmode
, tmp0
, const0_rtx
);
1175 tmp0
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp0
,
1176 gen_rtx_LABEL_REF (VOIDmode
, qimode_label
),
1178 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp0
));
1179 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
1180 JUMP_LABEL (insn
) = qimode_label
;
  /* Generate original signed/unsigned divmod.  */
1183 div
= gen_divmod4_1 (operands
[0], operands
[1],
1184 operands
[2], operands
[3]);
1187 /* Branch to the end. */
1188 emit_jump_insn (gen_jump (end_label
));
1191 /* Generate 8bit unsigned divide. */
1192 emit_label (qimode_label
);
1193 /* Don't use operands[0] for result of 8bit divide since not all
1194 registers support QImode ZERO_EXTRACT. */
1195 tmp0
= lowpart_subreg (HImode
, scratch
, mode
);
1196 tmp1
= lowpart_subreg (HImode
, operands
[2], mode
);
1197 tmp2
= lowpart_subreg (QImode
, operands
[3], mode
);
1198 emit_insn (gen_udivmodhiqi3 (tmp0
, tmp1
, tmp2
));
1202 div
= gen_rtx_UDIV (mode
, operands
[2], operands
[3]);
1203 mod
= gen_rtx_UMOD (mode
, operands
[2], operands
[3]);
1207 div
= gen_rtx_DIV (mode
, operands
[2], operands
[3]);
1208 mod
= gen_rtx_MOD (mode
, operands
[2], operands
[3]);
1212 if (GET_MODE (operands
[0]) != SImode
)
1213 div
= gen_rtx_ZERO_EXTEND (DImode
, div
);
1214 if (GET_MODE (operands
[1]) != SImode
)
1215 mod
= gen_rtx_ZERO_EXTEND (DImode
, mod
);
1218 /* Extract remainder from AH. */
1219 tmp1
= gen_rtx_ZERO_EXTRACT (GET_MODE (operands
[1]),
1220 tmp0
, GEN_INT (8), GEN_INT (8));
1221 if (REG_P (operands
[1]))
1222 insn
= emit_move_insn (operands
[1], tmp1
);
1225 /* Need a new scratch register since the old one has result
1227 scratch
= gen_reg_rtx (GET_MODE (operands
[1]));
1228 emit_move_insn (scratch
, tmp1
);
1229 insn
= emit_move_insn (operands
[1], scratch
);
1231 set_unique_reg_note (insn
, REG_EQUAL
, mod
);
1233 /* Zero extend quotient from AL. */
1234 tmp1
= gen_lowpart (QImode
, tmp0
);
1235 insn
= emit_insn (gen_extend_insn
1237 GET_MODE (operands
[0]), QImode
, 1));
1238 set_unique_reg_note (insn
, REG_EQUAL
, div
);
1240 emit_label (end_label
);
1243 /* Emit x86 binary operand CODE in mode MODE, where the first operand
1244 matches destination. RTX includes clobber of FLAGS_REG. */
1247 ix86_emit_binop (enum rtx_code code
, machine_mode mode
,
1252 op
= gen_rtx_SET (dst
, gen_rtx_fmt_ee (code
, mode
, dst
, src
));
1253 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1255 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, op
, clob
)));
1258 /* Return true if regno1 def is nearest to the insn. */
1261 find_nearest_reg_def (rtx_insn
*insn
, int regno1
, int regno2
)
1263 rtx_insn
*prev
= insn
;
1264 rtx_insn
*start
= BB_HEAD (BLOCK_FOR_INSN (insn
));
1268 while (prev
&& prev
!= start
)
1270 if (!INSN_P (prev
) || !NONDEBUG_INSN_P (prev
))
1272 prev
= PREV_INSN (prev
);
1275 if (insn_defines_reg (regno1
, INVALID_REGNUM
, prev
))
1277 else if (insn_defines_reg (regno2
, INVALID_REGNUM
, prev
))
1279 prev
= PREV_INSN (prev
);
1282 /* None of the regs is defined in the bb. */
1286 /* Split lea instructions into a sequence of instructions
1287 which are executed on ALU to avoid AGU stalls.
1288 It is assumed that it is allowed to clobber flags register
1292 ix86_split_lea_for_addr (rtx_insn
*insn
, rtx operands
[], machine_mode mode
)
1294 unsigned int regno0
, regno1
, regno2
;
1295 struct ix86_address parts
;
1299 ok
= ix86_decompose_address (operands
[1], &parts
);
1302 target
= gen_lowpart (mode
, operands
[0]);
1304 regno0
= true_regnum (target
);
1305 regno1
= INVALID_REGNUM
;
1306 regno2
= INVALID_REGNUM
;
1310 parts
.base
= gen_lowpart (mode
, parts
.base
);
1311 regno1
= true_regnum (parts
.base
);
1316 parts
.index
= gen_lowpart (mode
, parts
.index
);
1317 regno2
= true_regnum (parts
.index
);
1321 parts
.disp
= gen_lowpart (mode
, parts
.disp
);
1323 if (parts
.scale
> 1)
1325 /* Case r1 = r1 + ... */
1326 if (regno1
== regno0
)
1328 /* If we have a case r1 = r1 + C * r2 then we
1329 should use multiplication which is very
1330 expensive. Assume cost model is wrong if we
1331 have such case here. */
1332 gcc_assert (regno2
!= regno0
);
1334 for (adds
= parts
.scale
; adds
> 0; adds
--)
1335 ix86_emit_binop (PLUS
, mode
, target
, parts
.index
);
1339 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1340 if (regno0
!= regno2
)
1341 emit_insn (gen_rtx_SET (target
, parts
.index
));
1343 /* Use shift for scaling. */
1344 ix86_emit_binop (ASHIFT
, mode
, target
,
1345 GEN_INT (exact_log2 (parts
.scale
)));
1348 ix86_emit_binop (PLUS
, mode
, target
, parts
.base
);
1350 if (parts
.disp
&& parts
.disp
!= const0_rtx
)
1351 ix86_emit_binop (PLUS
, mode
, target
, parts
.disp
);
1354 else if (!parts
.base
&& !parts
.index
)
1356 gcc_assert(parts
.disp
);
1357 emit_insn (gen_rtx_SET (target
, parts
.disp
));
1363 if (regno0
!= regno2
)
1364 emit_insn (gen_rtx_SET (target
, parts
.index
));
1366 else if (!parts
.index
)
1368 if (regno0
!= regno1
)
1369 emit_insn (gen_rtx_SET (target
, parts
.base
));
1373 if (regno0
== regno1
)
1375 else if (regno0
== regno2
)
1381 /* Find better operand for SET instruction, depending
1382 on which definition is farther from the insn. */
1383 if (find_nearest_reg_def (insn
, regno1
, regno2
))
1384 tmp
= parts
.index
, tmp1
= parts
.base
;
1386 tmp
= parts
.base
, tmp1
= parts
.index
;
1388 emit_insn (gen_rtx_SET (target
, tmp
));
1390 if (parts
.disp
&& parts
.disp
!= const0_rtx
)
1391 ix86_emit_binop (PLUS
, mode
, target
, parts
.disp
);
1393 ix86_emit_binop (PLUS
, mode
, target
, tmp1
);
1397 ix86_emit_binop (PLUS
, mode
, target
, tmp
);
1400 if (parts
.disp
&& parts
.disp
!= const0_rtx
)
1401 ix86_emit_binop (PLUS
, mode
, target
, parts
.disp
);
1405 /* Post-reload splitter for converting an SF or DFmode value in an
1406 SSE register into an unsigned SImode. */
1409 ix86_split_convert_uns_si_sse (rtx operands
[])
1411 machine_mode vecmode
;
1412 rtx value
, large
, zero_or_two31
, input
, two31
, x
;
1414 large
= operands
[1];
1415 zero_or_two31
= operands
[2];
1416 input
= operands
[3];
1417 two31
= operands
[4];
1418 vecmode
= GET_MODE (large
);
1419 value
= gen_rtx_REG (vecmode
, REGNO (operands
[0]));
1421 /* Load up the value into the low element. We must ensure that the other
1422 elements are valid floats -- zero is the easiest such value. */
1425 if (vecmode
== V4SFmode
)
1426 emit_insn (gen_vec_setv4sf_0 (value
, CONST0_RTX (V4SFmode
), input
));
1428 emit_insn (gen_sse2_loadlpd (value
, CONST0_RTX (V2DFmode
), input
));
1432 input
= gen_rtx_REG (vecmode
, REGNO (input
));
1433 emit_move_insn (value
, CONST0_RTX (vecmode
));
1434 if (vecmode
== V4SFmode
)
1435 emit_insn (gen_sse_movss (value
, value
, input
));
1437 emit_insn (gen_sse2_movsd (value
, value
, input
));
1440 emit_move_insn (large
, two31
);
1441 emit_move_insn (zero_or_two31
, MEM_P (two31
) ? large
: two31
);
1443 x
= gen_rtx_fmt_ee (LE
, vecmode
, large
, value
);
1444 emit_insn (gen_rtx_SET (large
, x
));
1446 x
= gen_rtx_AND (vecmode
, zero_or_two31
, large
);
1447 emit_insn (gen_rtx_SET (zero_or_two31
, x
));
1449 x
= gen_rtx_MINUS (vecmode
, value
, zero_or_two31
);
1450 emit_insn (gen_rtx_SET (value
, x
));
1452 large
= gen_rtx_REG (V4SImode
, REGNO (large
));
1453 emit_insn (gen_ashlv4si3 (large
, large
, GEN_INT (31)));
1455 x
= gen_rtx_REG (V4SImode
, REGNO (value
));
1456 if (vecmode
== V4SFmode
)
1457 emit_insn (gen_fix_truncv4sfv4si2 (x
, value
));
1459 emit_insn (gen_sse2_cvttpd2dq (x
, value
));
1462 emit_insn (gen_xorv4si3 (value
, value
, large
));
1465 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok
,
1466 machine_mode mode
, rtx target
,
1467 rtx var
, int one_var
);
1469 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1470 Expects the 64-bit DImode to be supplied in a pair of integral
1471 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1472 -mfpmath=sse, !optimize_size only. */
1475 ix86_expand_convert_uns_didf_sse (rtx target
, rtx input
)
1477 REAL_VALUE_TYPE bias_lo_rvt
, bias_hi_rvt
;
1478 rtx int_xmm
, fp_xmm
;
1479 rtx biases
, exponents
;
1482 int_xmm
= gen_reg_rtx (V4SImode
);
1483 if (TARGET_INTER_UNIT_MOVES_TO_VEC
)
1484 emit_insn (gen_movdi_to_sse (int_xmm
, input
));
1485 else if (TARGET_SSE_SPLIT_REGS
)
1487 emit_clobber (int_xmm
);
1488 emit_move_insn (gen_lowpart (DImode
, int_xmm
), input
);
1492 x
= gen_reg_rtx (V2DImode
);
1493 ix86_expand_vector_init_one_nonzero (false, V2DImode
, x
, input
, 0);
1494 emit_move_insn (int_xmm
, gen_lowpart (V4SImode
, x
));
1497 x
= gen_rtx_CONST_VECTOR (V4SImode
,
1498 gen_rtvec (4, GEN_INT (0x43300000UL
),
1499 GEN_INT (0x45300000UL
),
1500 const0_rtx
, const0_rtx
));
1501 exponents
= validize_mem (force_const_mem (V4SImode
, x
));
1503 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1504 emit_insn (gen_vec_interleave_lowv4si (int_xmm
, int_xmm
, exponents
));
  /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
     yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
     Similarly (0x45300000UL ## fp_value_hi_xmm) yields
     (0x1.0p84 + double(fp_value_hi_xmm)).
     Note these exponents differ by 32.  */
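  /* In IEEE-754 terms: a biased exponent field of 0x433 encodes
     2^52 * (1 + lo/2^52) = 2^52 + lo, and 0x453 encodes
     2^84 * (1 + hi/2^52) = 2^84 + hi * 2^32.  Subtracting the 0x1.0p52
     and 0x1.0p84 biases below and adding the two halves then gives the
     correctly rounded DFmode value of lo + hi * 2^32.  */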
1512 fp_xmm
= copy_to_mode_reg (V2DFmode
, gen_lowpart (V2DFmode
, int_xmm
));
1514 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1515 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1516 real_ldexp (&bias_lo_rvt
, &dconst1
, 52);
1517 real_ldexp (&bias_hi_rvt
, &dconst1
, 84);
1518 biases
= const_double_from_real_value (bias_lo_rvt
, DFmode
);
1519 x
= const_double_from_real_value (bias_hi_rvt
, DFmode
);
1520 biases
= gen_rtx_CONST_VECTOR (V2DFmode
, gen_rtvec (2, biases
, x
));
1521 biases
= validize_mem (force_const_mem (V2DFmode
, biases
));
1522 emit_insn (gen_subv2df3 (fp_xmm
, fp_xmm
, biases
));
1524 /* Add the upper and lower DFmode values together. */
1526 emit_insn (gen_sse3_haddv2df3 (fp_xmm
, fp_xmm
, fp_xmm
));
1529 x
= copy_to_mode_reg (V2DFmode
, fp_xmm
);
1530 emit_insn (gen_vec_interleave_highv2df (fp_xmm
, fp_xmm
, fp_xmm
));
1531 emit_insn (gen_addv2df3 (fp_xmm
, fp_xmm
, x
));
1534 ix86_expand_vector_extract (false, target
, fp_xmm
, 0);
1537 /* Not used, but eases macroization of patterns. */
1539 ix86_expand_convert_uns_sixf_sse (rtx
, rtx
)
/* Convert an unsigned SImode value into a DFmode.  Only currently used
   for SSE, but applicable anywhere.  */
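/* The input is first biased by -2^31 so that it fits in a signed SImode,
   converted with floatsidf, and then 0x1.0p31 is added back to the DFmode
   result.  */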
1548 ix86_expand_convert_uns_sidf_sse (rtx target
, rtx input
)
1550 REAL_VALUE_TYPE TWO31r
;
1553 x
= expand_simple_binop (SImode
, PLUS
, input
, GEN_INT (-2147483647 - 1),
1554 NULL
, 1, OPTAB_DIRECT
);
1556 fp
= gen_reg_rtx (DFmode
);
1557 emit_insn (gen_floatsidf2 (fp
, x
));
1559 real_ldexp (&TWO31r
, &dconst1
, 31);
1560 x
= const_double_from_real_value (TWO31r
, DFmode
);
1562 x
= expand_simple_binop (DFmode
, PLUS
, fp
, x
, target
, 0, OPTAB_DIRECT
);
1564 emit_move_insn (target
, x
);
1567 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1568 32-bit mode; otherwise we have a direct convert instruction. */
1571 ix86_expand_convert_sign_didf_sse (rtx target
, rtx input
)
1573 REAL_VALUE_TYPE TWO32r
;
1574 rtx fp_lo
, fp_hi
, x
;
1576 fp_lo
= gen_reg_rtx (DFmode
);
1577 fp_hi
= gen_reg_rtx (DFmode
);
1579 emit_insn (gen_floatsidf2 (fp_hi
, gen_highpart (SImode
, input
)));
1581 real_ldexp (&TWO32r
, &dconst1
, 32);
1582 x
= const_double_from_real_value (TWO32r
, DFmode
);
1583 fp_hi
= expand_simple_binop (DFmode
, MULT
, fp_hi
, x
, fp_hi
, 0, OPTAB_DIRECT
);
1585 ix86_expand_convert_uns_sidf_sse (fp_lo
, gen_lowpart (SImode
, input
));
1587 x
= expand_simple_binop (DFmode
, PLUS
, fp_hi
, fp_lo
, target
,
1590 emit_move_insn (target
, x
);
/* Convert an unsigned SImode value into a SFmode, using only SSE.
   For x86_32, -mfpmath=sse, !optimize_size only.  */
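/* The value is split into its low and high 16-bit halves, each half is
   converted exactly with floatsisf, and the result is recombined as
   hi * 0x1.0p16 + lo.  */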
1596 ix86_expand_convert_uns_sisf_sse (rtx target
, rtx input
)
1598 REAL_VALUE_TYPE ONE16r
;
1599 rtx fp_hi
, fp_lo
, int_hi
, int_lo
, x
;
1601 real_ldexp (&ONE16r
, &dconst1
, 16);
1602 x
= const_double_from_real_value (ONE16r
, SFmode
);
1603 int_lo
= expand_simple_binop (SImode
, AND
, input
, GEN_INT(0xffff),
1604 NULL
, 0, OPTAB_DIRECT
);
1605 int_hi
= expand_simple_binop (SImode
, LSHIFTRT
, input
, GEN_INT(16),
1606 NULL
, 0, OPTAB_DIRECT
);
1607 fp_hi
= gen_reg_rtx (SFmode
);
1608 fp_lo
= gen_reg_rtx (SFmode
);
1609 emit_insn (gen_floatsisf2 (fp_hi
, int_hi
));
1610 emit_insn (gen_floatsisf2 (fp_lo
, int_lo
));
1611 fp_hi
= expand_simple_binop (SFmode
, MULT
, fp_hi
, x
, fp_hi
,
1613 fp_hi
= expand_simple_binop (SFmode
, PLUS
, fp_hi
, fp_lo
, target
,
1615 if (!rtx_equal_p (target
, fp_hi
))
1616 emit_move_insn (target
, fp_hi
);
1619 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1620 a vector of unsigned ints VAL to vector of floats TARGET. */
1623 ix86_expand_vector_convert_uns_vsivsf (rtx target
, rtx val
)
1626 REAL_VALUE_TYPE TWO16r
;
1627 machine_mode intmode
= GET_MODE (val
);
1628 machine_mode fltmode
= GET_MODE (target
);
1629 rtx (*cvt
) (rtx
, rtx
);
1631 if (intmode
== V4SImode
)
1632 cvt
= gen_floatv4siv4sf2
;
1634 cvt
= gen_floatv8siv8sf2
;
1635 tmp
[0] = ix86_build_const_vector (intmode
, 1, GEN_INT (0xffff));
1636 tmp
[0] = force_reg (intmode
, tmp
[0]);
1637 tmp
[1] = expand_simple_binop (intmode
, AND
, val
, tmp
[0], NULL_RTX
, 1,
1639 tmp
[2] = expand_simple_binop (intmode
, LSHIFTRT
, val
, GEN_INT (16),
1640 NULL_RTX
, 1, OPTAB_DIRECT
);
1641 tmp
[3] = gen_reg_rtx (fltmode
);
1642 emit_insn (cvt (tmp
[3], tmp
[1]));
1643 tmp
[4] = gen_reg_rtx (fltmode
);
1644 emit_insn (cvt (tmp
[4], tmp
[2]));
1645 real_ldexp (&TWO16r
, &dconst1
, 16);
1646 tmp
[5] = const_double_from_real_value (TWO16r
, SFmode
);
1647 tmp
[5] = force_reg (fltmode
, ix86_build_const_vector (fltmode
, 1, tmp
[5]));
1648 tmp
[6] = expand_simple_binop (fltmode
, MULT
, tmp
[4], tmp
[5], NULL_RTX
, 1,
1650 tmp
[7] = expand_simple_binop (fltmode
, PLUS
, tmp
[3], tmp
[6], target
, 1,
1652 if (tmp
[7] != target
)
1653 emit_move_insn (target
, tmp
[7]);
1656 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1657 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1658 This is done by doing just signed conversion if < 0x1p31, and otherwise by
1659 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
1662 ix86_expand_adjust_ufix_to_sfix_si (rtx val
, rtx
*xorp
)
1664 REAL_VALUE_TYPE TWO31r
;
1666 machine_mode mode
= GET_MODE (val
);
1667 machine_mode scalarmode
= GET_MODE_INNER (mode
);
1668 machine_mode intmode
= GET_MODE_SIZE (mode
) == 32 ? V8SImode
: V4SImode
;
1669 rtx (*cmp
) (rtx
, rtx
, rtx
, rtx
);
1672 for (i
= 0; i
< 3; i
++)
1673 tmp
[i
] = gen_reg_rtx (mode
);
1674 real_ldexp (&TWO31r
, &dconst1
, 31);
1675 two31r
= const_double_from_real_value (TWO31r
, scalarmode
);
1676 two31r
= ix86_build_const_vector (mode
, 1, two31r
);
1677 two31r
= force_reg (mode
, two31r
);
1680 case E_V8SFmode
: cmp
= gen_avx_maskcmpv8sf3
; break;
1681 case E_V4SFmode
: cmp
= gen_sse_maskcmpv4sf3
; break;
1682 case E_V4DFmode
: cmp
= gen_avx_maskcmpv4df3
; break;
1683 case E_V2DFmode
: cmp
= gen_sse2_maskcmpv2df3
; break;
1684 default: gcc_unreachable ();
1686 tmp
[3] = gen_rtx_LE (mode
, two31r
, val
);
1687 emit_insn (cmp (tmp
[0], two31r
, val
, tmp
[3]));
1688 tmp
[1] = expand_simple_binop (mode
, AND
, tmp
[0], two31r
, tmp
[1],
1690 if (intmode
== V4SImode
|| TARGET_AVX2
)
1691 *xorp
= expand_simple_binop (intmode
, ASHIFT
,
1692 gen_lowpart (intmode
, tmp
[0]),
1693 GEN_INT (31), NULL_RTX
, 0,
1697 rtx two31
= GEN_INT (HOST_WIDE_INT_1U
<< 31);
1698 two31
= ix86_build_const_vector (intmode
, 1, two31
);
1699 *xorp
= expand_simple_binop (intmode
, AND
,
1700 gen_lowpart (intmode
, tmp
[0]),
1704 return expand_simple_binop (mode
, MINUS
, val
, tmp
[1], tmp
[2],
1708 /* Generate code for floating point ABS or NEG. */
1711 ix86_expand_fp_absneg_operator (enum rtx_code code
, machine_mode mode
,
1715 bool use_sse
= false;
1716 bool vector_mode
= VECTOR_MODE_P (mode
);
1717 machine_mode vmode
= mode
;
1722 else if (mode
== TFmode
)
1724 else if (TARGET_SSE_MATH
)
1726 use_sse
= SSE_FLOAT_MODE_P (mode
);
1729 else if (mode
== DFmode
)
1736 set
= gen_rtx_fmt_e (code
, mode
, src
);
1737 set
= gen_rtx_SET (dst
, set
);
1741 rtx mask
, use
, clob
;
1743 /* NEG and ABS performed with SSE use bitwise mask operations.
1744 Create the appropriate mask now. */
1745 mask
= ix86_build_signbit_mask (vmode
, vector_mode
, code
== ABS
);
1746 use
= gen_rtx_USE (VOIDmode
, mask
);
1748 par
= gen_rtvec (2, set
, use
);
1751 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1752 par
= gen_rtvec (3, set
, use
, clob
);
1759 /* Changing of sign for FP values is doable using integer unit too. */
1760 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1761 par
= gen_rtvec (2, set
, clob
);
1764 emit_insn (gen_rtx_PARALLEL (VOIDmode
, par
));
1767 /* Deconstruct a floating point ABS or NEG operation
1768 with integer registers into integer operations. */
1771 ix86_split_fp_absneg_operator (enum rtx_code code
, machine_mode mode
,
1774 enum rtx_code absneg_op
;
1777 gcc_assert (operands_match_p (operands
[0], operands
[1]));
1782 dst
= gen_lowpart (SImode
, operands
[0]);
1786 set
= gen_int_mode (0x7fffffff, SImode
);
1791 set
= gen_int_mode (0x80000000, SImode
);
1794 set
= gen_rtx_fmt_ee (absneg_op
, SImode
, dst
, set
);
1800 dst
= gen_lowpart (DImode
, operands
[0]);
1801 dst
= gen_rtx_ZERO_EXTRACT (DImode
, dst
, const1_rtx
, GEN_INT (63));
1806 set
= gen_rtx_NOT (DImode
, dst
);
1810 dst
= gen_highpart (SImode
, operands
[0]);
1814 set
= gen_int_mode (0x7fffffff, SImode
);
1819 set
= gen_int_mode (0x80000000, SImode
);
1822 set
= gen_rtx_fmt_ee (absneg_op
, SImode
, dst
, set
);
1827 dst
= gen_rtx_REG (SImode
,
1828 REGNO (operands
[0]) + (TARGET_64BIT
? 1 : 2));
1831 set
= GEN_INT (0x7fff);
1836 set
= GEN_INT (0x8000);
1839 set
= gen_rtx_fmt_ee (absneg_op
, SImode
, dst
, set
);
1846 set
= gen_rtx_SET (dst
, set
);
1848 rtx clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1849 rtvec par
= gen_rtvec (2, set
, clob
);
1851 emit_insn (gen_rtx_PARALLEL (VOIDmode
, par
));
1854 /* Expand a copysign operation. Special case operand 0 being a constant. */
1857 ix86_expand_copysign (rtx operands
[])
1859 machine_mode mode
, vmode
;
1860 rtx dest
, op0
, op1
, mask
;
1866 mode
= GET_MODE (dest
);
1870 else if (mode
== DFmode
)
1872 else if (mode
== TFmode
)
1877 mask
= ix86_build_signbit_mask (vmode
, 0, 0);
1879 if (CONST_DOUBLE_P (op0
))
1881 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0
)))
1882 op0
= simplify_unary_operation (ABS
, mode
, op0
, mode
);
1884 if (mode
== SFmode
|| mode
== DFmode
)
1886 if (op0
== CONST0_RTX (mode
))
1887 op0
= CONST0_RTX (vmode
);
1890 rtx v
= ix86_build_const_vector (vmode
, false, op0
);
1892 op0
= force_reg (vmode
, v
);
1895 else if (op0
!= CONST0_RTX (mode
))
1896 op0
= force_reg (mode
, op0
);
1898 emit_insn (gen_copysign3_const (mode
, dest
, op0
, op1
, mask
));
1902 rtx nmask
= ix86_build_signbit_mask (vmode
, 0, 1);
1904 emit_insn (gen_copysign3_var
1905 (mode
, dest
, NULL_RTX
, op0
, op1
, nmask
, mask
));
/* Deconstruct a copysign operation into bit masks.  Operand 0 is known to
   be a constant, and so has already been expanded into a vector constant.  */
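/* The sequence below computes dest = (dest & mask) | op0; MASK has only
   the sign bit set, so only the sign of the value already in DEST
   survives, and OP0 (the vectorized constant) supplies the magnitude
   bits.  */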
1913 ix86_split_copysign_const (rtx operands
[])
1915 machine_mode mode
, vmode
;
1916 rtx dest
, op0
, mask
, x
;
1922 mode
= GET_MODE (dest
);
1923 vmode
= GET_MODE (mask
);
1925 dest
= lowpart_subreg (vmode
, dest
, mode
);
1926 x
= gen_rtx_AND (vmode
, dest
, mask
);
1927 emit_insn (gen_rtx_SET (dest
, x
));
1929 if (op0
!= CONST0_RTX (vmode
))
1931 x
= gen_rtx_IOR (vmode
, dest
, op0
);
1932 emit_insn (gen_rtx_SET (dest
, x
));
1936 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
1937 so we have to do two masks. */
1940 ix86_split_copysign_var (rtx operands
[])
1942 machine_mode mode
, vmode
;
1943 rtx dest
, scratch
, op0
, op1
, mask
, nmask
, x
;
1946 scratch
= operands
[1];
1949 nmask
= operands
[4];
1952 mode
= GET_MODE (dest
);
1953 vmode
= GET_MODE (mask
);
1955 if (rtx_equal_p (op0
, op1
))
1957 /* Shouldn't happen often (it's useless, obviously), but when it does
1958 we'd generate incorrect code if we continue below. */
1959 emit_move_insn (dest
, op0
);
1963 if (REG_P (mask
) && REGNO (dest
) == REGNO (mask
)) /* alternative 0 */
1965 gcc_assert (REGNO (op1
) == REGNO (scratch
));
1967 x
= gen_rtx_AND (vmode
, scratch
, mask
);
1968 emit_insn (gen_rtx_SET (scratch
, x
));
1971 op0
= lowpart_subreg (vmode
, op0
, mode
);
1972 x
= gen_rtx_NOT (vmode
, dest
);
1973 x
= gen_rtx_AND (vmode
, x
, op0
);
1974 emit_insn (gen_rtx_SET (dest
, x
));
1978 if (REGNO (op1
) == REGNO (scratch
)) /* alternative 1,3 */
1980 x
= gen_rtx_AND (vmode
, scratch
, mask
);
1982 else /* alternative 2,4 */
1984 gcc_assert (REGNO (mask
) == REGNO (scratch
));
1985 op1
= lowpart_subreg (vmode
, op1
, mode
);
1986 x
= gen_rtx_AND (vmode
, scratch
, op1
);
1988 emit_insn (gen_rtx_SET (scratch
, x
));
1990 if (REGNO (op0
) == REGNO (dest
)) /* alternative 1,2 */
1992 dest
= lowpart_subreg (vmode
, op0
, mode
);
1993 x
= gen_rtx_AND (vmode
, dest
, nmask
);
1995 else /* alternative 3,4 */
1997 gcc_assert (REGNO (nmask
) == REGNO (dest
));
1999 op0
= lowpart_subreg (vmode
, op0
, mode
);
2000 x
= gen_rtx_AND (vmode
, dest
, op0
);
2002 emit_insn (gen_rtx_SET (dest
, x
));
2005 x
= gen_rtx_IOR (vmode
, dest
, scratch
);
2006 emit_insn (gen_rtx_SET (dest
, x
));
2009 /* Expand an xorsign operation. */
2012 ix86_expand_xorsign (rtx operands
[])
2014 machine_mode mode
, vmode
;
2015 rtx dest
, op0
, op1
, mask
;
2021 mode
= GET_MODE (dest
);
2025 else if (mode
== DFmode
)
2030 mask
= ix86_build_signbit_mask (vmode
, 0, 0);
2032 emit_insn (gen_xorsign3_1 (mode
, dest
, op0
, op1
, mask
));
2035 /* Deconstruct an xorsign operation into bit masks. */
2038 ix86_split_xorsign (rtx operands
[])
2040 machine_mode mode
, vmode
;
2041 rtx dest
, op0
, mask
, x
;
2047 mode
= GET_MODE (dest
);
2048 vmode
= GET_MODE (mask
);
2050 dest
= lowpart_subreg (vmode
, dest
, mode
);
2051 x
= gen_rtx_AND (vmode
, dest
, mask
);
2052 emit_insn (gen_rtx_SET (dest
, x
));
2054 op0
= lowpart_subreg (vmode
, op0
, mode
);
2055 x
= gen_rtx_XOR (vmode
, dest
, op0
);
2056 emit_insn (gen_rtx_SET (dest
, x
));
static rtx
ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
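/* Expand a conditional branch to LABEL on comparison CODE of OP0 and OP1.
   Vector equality comparisons are lowered to a ptest of the XOR of the
   operands; double-word integer comparisons are split into word-sized
   compare-and-branch sequences.  */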
ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
  machine_mode mode = GET_MODE (op0);
  /* Handle special case - vector comparison with boolean result, transform
     it using the ptest instruction.  */
2069 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
)
2071 rtx flag
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
2072 machine_mode p_mode
= GET_MODE_SIZE (mode
) == 32 ? V4DImode
: V2DImode
;
2074 gcc_assert (code
== EQ
|| code
== NE
);
2075 /* Generate XOR since we can't check that one operand is zero vector. */
2076 tmp
= gen_reg_rtx (mode
);
2077 emit_insn (gen_rtx_SET (tmp
, gen_rtx_XOR (mode
, op0
, op1
)));
2078 tmp
= gen_lowpart (p_mode
, tmp
);
2079 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode
, FLAGS_REG
),
2080 gen_rtx_UNSPEC (CCmode
,
2081 gen_rtvec (2, tmp
, tmp
),
2083 tmp
= gen_rtx_fmt_ee (code
, VOIDmode
, flag
, const0_rtx
);
2084 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
2085 gen_rtx_LABEL_REF (VOIDmode
, label
),
2087 emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
2100 tmp
= ix86_expand_compare (code
, op0
, op1
);
2101 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
2102 gen_rtx_LABEL_REF (VOIDmode
, label
),
2104 emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
2110 /* For 32-bit target DI comparison may be performed on
2111 SSE registers. To allow this we should avoid split
2112 to SI mode which is achieved by doing xor in DI mode
2113 and then comparing with zero (which is recognized by
2114 STV pass). We don't compare using xor when optimizing
2116 if (!optimize_insn_for_size_p ()
2118 && (code
== EQ
|| code
== NE
))
2120 op0
= force_reg (mode
, gen_rtx_XOR (mode
, op0
, op1
));
2125 /* Expand DImode branch into multiple compare+branch. */
2128 rtx_code_label
*label2
;
2129 enum rtx_code code1
, code2
, code3
;
2130 machine_mode submode
;
2132 if (CONSTANT_P (op0
) && !CONSTANT_P (op1
))
2134 std::swap (op0
, op1
);
2135 code
= swap_condition (code
);
2138 split_double_mode (mode
, &op0
, 1, lo
+0, hi
+0);
2139 split_double_mode (mode
, &op1
, 1, lo
+1, hi
+1);
2141 submode
= mode
== DImode
? SImode
: DImode
;
2143 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2144 avoid two branches. This costs one extra insn, so disable when
2145 optimizing for size. */
2147 if ((code
== EQ
|| code
== NE
)
2148 && (!optimize_insn_for_size_p ()
2149 || hi
[1] == const0_rtx
|| lo
[1] == const0_rtx
))
2154 if (hi
[1] != const0_rtx
)
2155 xor1
= expand_binop (submode
, xor_optab
, xor1
, hi
[1],
2156 NULL_RTX
, 0, OPTAB_WIDEN
);
2159 if (lo
[1] != const0_rtx
)
2160 xor0
= expand_binop (submode
, xor_optab
, xor0
, lo
[1],
2161 NULL_RTX
, 0, OPTAB_WIDEN
);
2163 tmp
= expand_binop (submode
, ior_optab
, xor1
, xor0
,
2164 NULL_RTX
, 0, OPTAB_WIDEN
);
2166 ix86_expand_branch (code
, tmp
, const0_rtx
, label
);
2170 /* Otherwise, if we are doing less-than or greater-or-equal-than,
2171 op1 is a constant and the low word is zero, then we can just
2172 examine the high word. Similarly for low word -1 and
2173 less-or-equal-than or greater-than. */
2175 if (CONST_INT_P (hi
[1]))
2178 case LT
: case LTU
: case GE
: case GEU
:
2179 if (lo
[1] == const0_rtx
)
2181 ix86_expand_branch (code
, hi
[0], hi
[1], label
);
2185 case LE
: case LEU
: case GT
: case GTU
:
2186 if (lo
[1] == constm1_rtx
)
2188 ix86_expand_branch (code
, hi
[0], hi
[1], label
);
2196 /* Emulate comparisons that do not depend on Zero flag with
2197 double-word subtraction. Note that only Overflow, Sign
2198 and Carry flags are valid, so swap arguments and condition
2199 of comparisons that would otherwise test Zero flag. */
2203 case LE
: case LEU
: case GT
: case GTU
:
2204 std::swap (lo
[0], lo
[1]);
2205 std::swap (hi
[0], hi
[1]);
2206 code
= swap_condition (code
);
2209 case LT
: case LTU
: case GE
: case GEU
:
2211 bool uns
= (code
== LTU
|| code
== GEU
);
2212 rtx (*sbb_insn
) (machine_mode
, rtx
, rtx
, rtx
)
2213 = uns
? gen_sub3_carry_ccc
: gen_sub3_carry_ccgz
;
2215 if (!nonimmediate_operand (lo
[0], submode
))
2216 lo
[0] = force_reg (submode
, lo
[0]);
2217 if (!x86_64_general_operand (lo
[1], submode
))
2218 lo
[1] = force_reg (submode
, lo
[1]);
2220 if (!register_operand (hi
[0], submode
))
2221 hi
[0] = force_reg (submode
, hi
[0]);
2222 if ((uns
&& !nonimmediate_operand (hi
[1], submode
))
2223 || (!uns
&& !x86_64_general_operand (hi
[1], submode
)))
2224 hi
[1] = force_reg (submode
, hi
[1]);
2226 emit_insn (gen_cmp_1 (submode
, lo
[0], lo
[1]));
2228 tmp
= gen_rtx_SCRATCH (submode
);
2229 emit_insn (sbb_insn (submode
, tmp
, hi
[0], hi
[1]));
2231 tmp
= gen_rtx_REG (uns
? CCCmode
: CCGZmode
, FLAGS_REG
);
2232 ix86_expand_branch (code
, tmp
, const0_rtx
, label
);
2240 /* Otherwise, we need two or three jumps. */
2242 label2
= gen_label_rtx ();
2245 code2
= swap_condition (code
);
2246 code3
= unsigned_condition (code
);
2250 case LT
: case GT
: case LTU
: case GTU
:
2253 case LE
: code1
= LT
; code2
= GT
; break;
2254 case GE
: code1
= GT
; code2
= LT
; break;
2255 case LEU
: code1
= LTU
; code2
= GTU
; break;
2256 case GEU
: code1
= GTU
; code2
= LTU
; break;
2258 case EQ
: code1
= UNKNOWN
; code2
= NE
; break;
2259 case NE
: code2
= UNKNOWN
; break;
2267 * if (hi(a) < hi(b)) goto true;
2268 * if (hi(a) > hi(b)) goto false;
2269 * if (lo(a) < lo(b)) goto true;
2273 if (code1
!= UNKNOWN
)
2274 ix86_expand_branch (code1
, hi
[0], hi
[1], label
);
2275 if (code2
!= UNKNOWN
)
2276 ix86_expand_branch (code2
, hi
[0], hi
[1], label2
);
2278 ix86_expand_branch (code3
, lo
[0], lo
[1], label
);
2280 if (code2
!= UNKNOWN
)
2281 emit_label (label2
);
2286 gcc_assert (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_CC
);
2291 /* Figure out whether to use unordered fp comparisons. */
2294 ix86_unordered_fp_compare (enum rtx_code code
)
2296 if (!TARGET_IEEE_FP
)
/* Return a comparison we can do that is equivalent to
   swap_condition (code), apart possibly from orderedness.
   But never change orderedness if TARGET_IEEE_FP, returning
   UNKNOWN in that case if necessary.  */
2330 static enum rtx_code
2331 ix86_fp_swap_condition (enum rtx_code code
)
2335 case GT
: /* GTU - CF=0 & ZF=0 */
2336 return TARGET_IEEE_FP
? UNKNOWN
: UNLT
;
2337 case GE
: /* GEU - CF=0 */
2338 return TARGET_IEEE_FP
? UNKNOWN
: UNLE
;
2339 case UNLT
: /* LTU - CF=1 */
2340 return TARGET_IEEE_FP
? UNKNOWN
: GT
;
2341 case UNLE
: /* LEU - CF=1 | ZF=1 */
2342 return TARGET_IEEE_FP
? UNKNOWN
: GE
;
2344 return swap_condition (code
);
2348 /* Return cost of comparison CODE using the best strategy for performance.
2349 All following functions do use number of instructions as a cost metrics.
2350 In future this should be tweaked to compute bytes for optimize_size and
2351 take into account performance of various instructions on various CPUs. */
2354 ix86_fp_comparison_cost (enum rtx_code code
)
2358 /* The cost of code using bit-twiddling on %ah. */
2375 arith_cost
= TARGET_IEEE_FP
? 5 : 4;
2379 arith_cost
= TARGET_IEEE_FP
? 6 : 4;
2385 switch (ix86_fp_comparison_strategy (code
))
2387 case IX86_FPCMP_COMI
:
2388 return arith_cost
> 4 ? 3 : 2;
2389 case IX86_FPCMP_SAHF
:
2390 return arith_cost
> 4 ? 4 : 3;
2396 /* Swap, force into registers, or otherwise massage the two operands
2397 to a fp comparison. The operands are updated in place; the new
2398 comparison code is returned. */
2400 static enum rtx_code
2401 ix86_prepare_fp_compare_args (enum rtx_code code
, rtx
*pop0
, rtx
*pop1
)
2403 bool unordered_compare
= ix86_unordered_fp_compare (code
);
2404 rtx op0
= *pop0
, op1
= *pop1
;
2405 machine_mode op_mode
= GET_MODE (op0
);
2406 bool is_sse
= TARGET_SSE_MATH
&& SSE_FLOAT_MODE_P (op_mode
);
2408 /* All of the unordered compare instructions only work on registers.
2409 The same is true of the fcomi compare instructions. The XFmode
2410 compare instructions require registers except when comparing
2411 against zero or when converting operand 1 from fixed point to
2415 && (unordered_compare
2416 || (op_mode
== XFmode
2417 && ! (standard_80387_constant_p (op0
) == 1
2418 || standard_80387_constant_p (op1
) == 1)
2419 && GET_CODE (op1
) != FLOAT
)
2420 || ix86_fp_comparison_strategy (code
) == IX86_FPCMP_COMI
))
2422 op0
= force_reg (op_mode
, op0
);
2423 op1
= force_reg (op_mode
, op1
);
      /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
	 things around if they appear profitable, otherwise force op0
	 into a register.  */

      if (standard_80387_constant_p (op0) == 0
	  || (MEM_P (op0)
	      && ! (standard_80387_constant_p (op1) == 0
		    || MEM_P (op1))))
	{
	  enum rtx_code new_code = ix86_fp_swap_condition (code);
	  if (new_code != UNKNOWN)
	    {
	      std::swap (op0, op1);
	      code = new_code;
	    }
	}

      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);

      if (CONSTANT_P (op1))
	{
	  int tmp = standard_80387_constant_p (op1);
	  if (tmp == 0)
	    op1 = validize_mem (force_const_mem (op_mode, op1));
	  else if (tmp == 1)
	    {
	      if (TARGET_CMOVE)
		op1 = force_reg (op_mode, op1);
	    }
	  else
	    op1 = force_reg (op_mode, op1);
	}
    }
  /* Try to rearrange the comparison to make it cheaper.  */
  if (ix86_fp_comparison_cost (code)
      > ix86_fp_comparison_cost (swap_condition (code))
      && (REG_P (op1) || can_create_pseudo_p ()))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);
    }

  *pop0 = op0;
  *pop1 = op1;
  return code;
}
/* Generate insn patterns to do a floating point compare of OPERANDS.  */

static rtx
ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  machine_mode cmp_mode;
  rtx tmp, scratch;

  code = ix86_prepare_fp_compare_args (code, &op0, &op1);

  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
  /* Do fcomi/sahf based test when profitable.  */
  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      cmp_mode = CCFPmode;
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
      break;

    case IX86_FPCMP_SAHF:
      cmp_mode = CCFPmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));
      emit_insn (gen_x86_sahf_1 (scratch));
      break;
    case IX86_FPCMP_ARITH:
      cmp_mode = CCNOmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));

      /* In the unordered case, we have to check C2 for NaN's, which
	 doesn't happen to work out to anything nice combination-wise.
	 So do some bit twiddling on the value we've got in AH to come
	 up with an appropriate set of condition codes.  */
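      /* For reference, the masks below follow from the x87 status word
	 layout: after fnstsw the condition bits land in AH as C0 = 0x01,
	 C1 = 0x02, C2 = 0x04 and C3 = 0x40, and comparing a against b
	 leaves

	    C3 C2 C0 = 0 0 0  when a > b
		       0 0 1  when a < b
		       1 0 0  when a == b
		       1 1 1  when unordered (NaN)

	 so constants such as 0x45, 0x44, 0x40, 0x05 and 0x01 are simply
	 combinations of those bits.  */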
2524 if (code
== GT
|| !TARGET_IEEE_FP
)
2526 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x45)));
2531 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2532 emit_insn (gen_addqi_ext_1 (scratch
, scratch
, constm1_rtx
));
2533 emit_insn (gen_cmpqi_ext_3 (scratch
, GEN_INT (0x44)));
2540 if (code
== LT
&& TARGET_IEEE_FP
)
2542 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2543 emit_insn (gen_cmpqi_ext_3 (scratch
, const1_rtx
));
2549 emit_insn (gen_testqi_ext_1_ccno (scratch
, const1_rtx
));
2555 if (code
== GE
|| !TARGET_IEEE_FP
)
2557 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x05)));
2562 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2563 emit_insn (gen_xorqi_ext_1_cc (scratch
, scratch
, const1_rtx
));
2569 if (code
== LE
&& TARGET_IEEE_FP
)
2571 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2572 emit_insn (gen_addqi_ext_1 (scratch
, scratch
, constm1_rtx
));
2573 emit_insn (gen_cmpqi_ext_3 (scratch
, GEN_INT (0x40)));
2579 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x45)));
2585 if (code
== EQ
&& TARGET_IEEE_FP
)
2587 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2588 emit_insn (gen_cmpqi_ext_3 (scratch
, GEN_INT (0x40)));
2594 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x40)));
2600 if (code
== NE
&& TARGET_IEEE_FP
)
2602 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2603 emit_insn (gen_xorqi_ext_1_cc (scratch
, scratch
,
2609 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x40)));
2615 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x04)));
2619 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x04)));
  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode,
			 gen_rtx_REG (cmp_mode, FLAGS_REG),
			 const0_rtx);
}
/* Generate insn patterns to do an integer compare of OPERANDS.  */

static rtx
ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
{
  machine_mode cmpmode;
  rtx tmp, flags;

  cmpmode = SELECT_CC_MODE (code, op0, op1);
  flags = gen_rtx_REG (cmpmode, FLAGS_REG);

  /* This is very simple, but making the interface the same as in the
     FP case makes the rest of the code easier.  */
  tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
  emit_insn (gen_rtx_SET (flags, tmp));

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
}
rtx
ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
{
  rtx ret;

  if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
    ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
  else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
    {
      gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
      ret = ix86_expand_fp_compare (code, op0, op1);
    }
  else
    ret = ix86_expand_int_compare (code, op0, op1);

  return ret;
}
void
ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
{
  rtx ret;

  gcc_assert (GET_MODE (dest) == QImode);

  ret = ix86_expand_compare (code, op0, op1);
  PUT_MODE (ret, QImode);
  emit_insn (gen_rtx_SET (dest, ret));
}
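/* As a usage sketch, for a QImode destination and an unsigned greater-than
   test the two calls above emit RTL of roughly the shape

     (set (reg:CC flags) (compare:CC (reg:SI a) (reg:SI b)))
     (set (reg:QI dest) (gtu:QI (reg:CC flags) (const_int 0)))

   which the setcc patterns match as a compare followed by seta.  */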
/* Expand comparison setting or clearing carry flag.  Return true when
   successful and set pop for the operation.  */

static bool
ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1,
				rtx *pop)
{
  machine_mode mode
    = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);

  /* Do not handle double-mode compares that go through special path.  */
  if (mode == (TARGET_64BIT ? TImode : DImode))
    return false;

  if (SCALAR_FLOAT_MODE_P (mode))
    {
      rtx compare_op;
      rtx_insn *compare_seq;

      gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
      /* Shortcut:  following common codes never translate
	 into carry flag compares.  */
      if (code == EQ || code == NE || code == UNEQ || code == LTGT
	  || code == ORDERED || code == UNORDERED)
	return false;

      /* These comparisons require zero flag; swap operands so they won't.  */
      if ((code == GT || code == UNLE || code == LE || code == UNGT)
	  && !TARGET_IEEE_FP)
	{
	  std::swap (op0, op1);
	  code = swap_condition (code);
	}

      /* Try to expand the comparison and verify that we end up with
	 carry flag based comparison.  This fails to be true only when
	 we decide to expand comparison using arithmetic that is not
	 too common scenario.  */
      start_sequence ();
      compare_op = ix86_expand_fp_compare (code, op0, op1);
      compare_seq = get_insns ();
      end_sequence ();

      if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
	code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
      else
	code = GET_CODE (compare_op);

      if (code != LTU && code != GEU)
	return false;

      emit_insn (compare_seq);
      *pop = compare_op;
      return true;
    }
  if (!INTEGRAL_MODE_P (mode))
    return false;

  switch (code)
    {
    case LTU:
    case GEU:
      break;

    /* Convert a==0 into (unsigned)a<1.  */
    case EQ:
    case NE:
      if (op1 != const0_rtx)
	return false;
      op1 = const1_rtx;
      code = (code == EQ ? LTU : GEU);
      break;

    /* Convert a>b into b<a or a>=b-1.  */
    case GTU:
    case LEU:
      if (CONST_INT_P (op1))
	{
	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
	  /* Bail out on overflow.  We still can swap operands but that
	     would force loading of the constant into register.  */
	  if (op1 == const0_rtx
	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))
	    return false;
	  code = (code == GTU ? GEU : LTU);
	}
      else
	{
	  std::swap (op0, op1);
	  code = (code == GTU ? LTU : GEU);
	}
      break;
    /* Convert a>=0 into (unsigned)a<0x80000000.  */
    case LT:
    case GE:
      if (mode == DImode || op1 != const0_rtx)
	return false;
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LT ? GEU : LTU);
      break;
    case LE:
    case GT:
      if (mode == DImode || op1 != constm1_rtx)
	return false;
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LE ? GEU : LTU);
      break;

    default:
      return false;
    }
  /* Swapping operands may cause constant to appear as first operand.  */
  if (!nonimmediate_operand (op0, VOIDmode))
    {
      if (!can_create_pseudo_p ())
	return false;
      op0 = force_reg (mode, op0);
    }
  *pop = ix86_expand_compare (code, op0, op1);
  gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
  return true;
}
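/* To summarize the conversions above with concrete examples:

      a == 0              becomes  (unsigned) a < 1
      (unsigned) a > 7    becomes  (unsigned) a >= 8
      a >= 0              becomes  (unsigned) a < 0x80000000

   after which the resulting LTU/GEU test can feed an adc or sbb directly.  */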
/* Expand conditional increment or decrement using adc/sbb instructions.
   The default case using setcc followed by the conditional move can be
   done by generic code.  */

bool
ix86_expand_int_addcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]);
  rtx flags;
  rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
  rtx compare_op;
  rtx val = const0_rtx;
  bool fpcmp = false;
  machine_mode mode;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (operands[3] != const1_rtx
      && operands[3] != constm1_rtx)
    return false;
  if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
    return false;
  code = GET_CODE (compare_op);

  flags = XEXP (compare_op, 0);

  if (GET_MODE (flags) == CCFPmode)
    {
      fpcmp = true;
      code = ix86_fp_compare_code_to_integer (code);
    }

  if (code != LTU)
    {
      val = constm1_rtx;
      if (fpcmp)
	PUT_CODE (compare_op,
		  reverse_condition_maybe_unordered
		  (GET_CODE (compare_op)));
      else
	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
    }

  mode = GET_MODE (operands[0]);

  /* Construct either adc or sbb insn.  */
  if ((code == LTU) == (operands[3] == constm1_rtx))
    insn = gen_sub3_carry;
  else
    insn = gen_add3_carry;

  emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));

  return true;
}
bool
ix86_expand_int_movcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]), compare_code;
  rtx_insn *compare_seq;
  rtx compare_op;
  machine_mode mode = GET_MODE (operands[0]);
  bool sign_bit_compare_p = false;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode
	  && !TARGET_64BIT))
    return false;

  start_sequence ();
  compare_op = ix86_expand_compare (code, op0, op1);
  compare_seq = get_insns ();
  end_sequence ();

  compare_code = GET_CODE (compare_op);

  if ((op1 == const0_rtx && (code == GE || code == LT))
      || (op1 == constm1_rtx && (code == GT || code == LE)))
    sign_bit_compare_p = true;

  /* Don't attempt mode expansion here -- if we had to expand 5 or 6
     HImode insns, we'd be swallowed in word prefix ops.  */

  if ((mode != HImode || TARGET_FAST_PREFIX)
      && (mode != (TARGET_64BIT ? TImode : DImode))
      && CONST_INT_P (operands[2])
      && CONST_INT_P (operands[3]))
    {
      rtx out = operands[0];
      HOST_WIDE_INT ct = INTVAL (operands[2]);
      HOST_WIDE_INT cf = INTVAL (operands[3]);
      HOST_WIDE_INT diff;
2911 /* Sign bit compares are better done using shifts than we do by using
2913 if (sign_bit_compare_p
2914 || ix86_expand_carry_flag_compare (code
, op0
, op1
, &compare_op
))
2916 /* Detect overlap between destination and compare sources. */
2919 if (!sign_bit_compare_p
)
2924 compare_code
= GET_CODE (compare_op
);
2926 flags
= XEXP (compare_op
, 0);
2928 if (GET_MODE (flags
) == CCFPmode
)
2932 = ix86_fp_compare_code_to_integer (compare_code
);
2935 /* To simplify rest of code, restrict to the GEU case. */
2936 if (compare_code
== LTU
)
2939 compare_code
= reverse_condition (compare_code
);
2940 code
= reverse_condition (code
);
2945 PUT_CODE (compare_op
,
2946 reverse_condition_maybe_unordered
2947 (GET_CODE (compare_op
)));
2949 PUT_CODE (compare_op
,
2950 reverse_condition (GET_CODE (compare_op
)));
2954 if (reg_overlap_mentioned_p (out
, op0
)
2955 || reg_overlap_mentioned_p (out
, op1
))
2956 tmp
= gen_reg_rtx (mode
);
2959 emit_insn (gen_x86_movdicc_0_m1 (tmp
, flags
, compare_op
));
2961 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode
, tmp
),
2962 flags
, compare_op
));
2966 if (code
== GT
|| code
== GE
)
2967 code
= reverse_condition (code
);
2973 tmp
= emit_store_flag (tmp
, code
, op0
, op1
, VOIDmode
, 0, -1);
2986 tmp
= expand_simple_binop (mode
, PLUS
,
2988 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
2999 tmp
= expand_simple_binop (mode
, IOR
,
3001 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3003 else if (diff
== -1 && ct
)
3013 tmp
= expand_simple_unop (mode
, NOT
, tmp
, copy_rtx (tmp
), 1);
3015 tmp
= expand_simple_binop (mode
, PLUS
,
3016 copy_rtx (tmp
), GEN_INT (cf
),
3017 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3025 * andl cf - ct, dest
3035 tmp
= expand_simple_unop (mode
, NOT
, tmp
, copy_rtx (tmp
), 1);
3038 tmp
= expand_simple_binop (mode
, AND
,
3040 gen_int_mode (cf
- ct
, mode
),
3041 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3043 tmp
= expand_simple_binop (mode
, PLUS
,
3044 copy_rtx (tmp
), GEN_INT (ct
),
3045 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3048 if (!rtx_equal_p (tmp
, out
))
3049 emit_move_insn (copy_rtx (out
), copy_rtx (tmp
));
3056 machine_mode cmp_mode
= GET_MODE (op0
);
3057 enum rtx_code new_code
;
3059 if (SCALAR_FLOAT_MODE_P (cmp_mode
))
3061 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode
));
3063 /* We may be reversing unordered compare to normal compare, that
3064 is not valid in general (we may convert non-trapping condition
3065 to trapping one), however on i386 we currently emit all
3066 comparisons unordered. */
3067 new_code
= reverse_condition_maybe_unordered (code
);
3070 new_code
= ix86_reverse_condition (code
, cmp_mode
);
3071 if (new_code
!= UNKNOWN
)
3079 compare_code
= UNKNOWN
;
3080 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
3081 && CONST_INT_P (op1
))
3083 if (op1
== const0_rtx
3084 && (code
== LT
|| code
== GE
))
3085 compare_code
= code
;
3086 else if (op1
== constm1_rtx
)
3090 else if (code
== GT
)
3095 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3096 if (compare_code
!= UNKNOWN
3097 && GET_MODE (op0
) == GET_MODE (out
)
3098 && (cf
== -1 || ct
== -1))
3100 /* If lea code below could be used, only optimize
3101 if it results in a 2 insn sequence. */
3103 if (! (diff
== 1 || diff
== 2 || diff
== 4 || diff
== 8
3104 || diff
== 3 || diff
== 5 || diff
== 9)
3105 || (compare_code
== LT
&& ct
== -1)
3106 || (compare_code
== GE
&& cf
== -1))
3109 * notl op1 (if necessary)
3117 code
= reverse_condition (code
);
3120 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, -1);
3122 out
= expand_simple_binop (mode
, IOR
,
3124 out
, 1, OPTAB_DIRECT
);
3125 if (out
!= operands
[0])
3126 emit_move_insn (operands
[0], out
);
3133 if ((diff
== 1 || diff
== 2 || diff
== 4 || diff
== 8
3134 || diff
== 3 || diff
== 5 || diff
== 9)
3135 && ((mode
!= QImode
&& mode
!= HImode
) || !TARGET_PARTIAL_REG_STALL
)
3137 || x86_64_immediate_operand (GEN_INT (cf
), VOIDmode
)))
3143 * lea cf(dest*(ct-cf)),dest
3147 * This also catches the degenerate setcc-only case.
3153 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, 1);
3156 /* On x86_64 the lea instruction operates on Pmode, so we need
3157 to get arithmetics done in proper mode to match. */
3159 tmp
= copy_rtx (out
);
3163 out1
= copy_rtx (out
);
3164 tmp
= gen_rtx_MULT (mode
, out1
, GEN_INT (diff
& ~1));
3168 tmp
= gen_rtx_PLUS (mode
, tmp
, out1
);
3174 tmp
= gen_rtx_PLUS (mode
, tmp
, GEN_INT (cf
));
3177 if (!rtx_equal_p (tmp
, out
))
3180 out
= force_operand (tmp
, copy_rtx (out
));
3182 emit_insn (gen_rtx_SET (copy_rtx (out
), copy_rtx (tmp
)));
3184 if (!rtx_equal_p (out
, operands
[0]))
3185 emit_move_insn (operands
[0], copy_rtx (out
));
      /*
       * General case:			Jumpful:
       *   xorl dest,dest		cmpl op1, op2
       *   cmpl op1, op2		movl ct, dest
       *   setcc dest			jcc 1f
       *   decl dest			movl cf, dest
       *   andl (cf-ct),dest		1:
       *   addl ct,dest
       *
       * Size 15.			Size 14.
       *
       * This is reasonably steep, but branch mispredict costs are
       * high on modern cpus, so consider failing only if optimizing
       * for space.
       */
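      /* For instance, with ct = 5 and cf = 9 the branchless form computes

	    setcc dest		; dest = cond ? 1 : 0
	    decl  dest		; dest = cond ? 0 : -1
	    andl  $4, dest	; dest = cond ? 0 : cf - ct
	    addl  $5, dest	; dest = cond ? ct : cf

	 producing either constant without a conditional jump.  */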
3206 if ((!TARGET_CMOVE
|| (mode
== QImode
&& TARGET_PARTIAL_REG_STALL
))
3207 && BRANCH_COST (optimize_insn_for_speed_p (),
3212 machine_mode cmp_mode
= GET_MODE (op0
);
3213 enum rtx_code new_code
;
3215 if (SCALAR_FLOAT_MODE_P (cmp_mode
))
3217 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode
));
3219 /* We may be reversing unordered compare to normal compare,
3220 that is not valid in general (we may convert non-trapping
3221 condition to trapping one), however on i386 we currently
3222 emit all comparisons unordered. */
3223 new_code
= reverse_condition_maybe_unordered (code
);
3227 new_code
= ix86_reverse_condition (code
, cmp_mode
);
3228 if (compare_code
!= UNKNOWN
&& new_code
!= UNKNOWN
)
3229 compare_code
= reverse_condition (compare_code
);
3232 if (new_code
!= UNKNOWN
)
3240 if (compare_code
!= UNKNOWN
)
3242 /* notl op1 (if needed)
3247 For x < 0 (resp. x <= -1) there will be no notl,
3248 so if possible swap the constants to get rid of the
3250 True/false will be -1/0 while code below (store flag
3251 followed by decrement) is 0/-1, so the constants need
3252 to be exchanged once more. */
3254 if (compare_code
== GE
|| !cf
)
3256 code
= reverse_condition (code
);
3262 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, -1);
3266 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, 1);
3268 out
= expand_simple_binop (mode
, PLUS
, copy_rtx (out
),
3270 copy_rtx (out
), 1, OPTAB_DIRECT
);
3273 out
= expand_simple_binop (mode
, AND
, copy_rtx (out
),
3274 gen_int_mode (cf
- ct
, mode
),
3275 copy_rtx (out
), 1, OPTAB_DIRECT
);
3277 out
= expand_simple_binop (mode
, PLUS
, copy_rtx (out
), GEN_INT (ct
),
3278 copy_rtx (out
), 1, OPTAB_DIRECT
);
3279 if (!rtx_equal_p (out
, operands
[0]))
3280 emit_move_insn (operands
[0], copy_rtx (out
));
3286 if (!TARGET_CMOVE
|| (mode
== QImode
&& TARGET_PARTIAL_REG_STALL
))
3288 /* Try a few things more with specific constants and a variable. */
3291 rtx var
, orig_out
, out
, tmp
;
3293 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3296 /* If one of the two operands is an interesting constant, load a
3297 constant with the above and mask it in with a logical operation. */
3299 if (CONST_INT_P (operands
[2]))
3302 if (INTVAL (operands
[2]) == 0 && operands
[3] != constm1_rtx
)
3303 operands
[3] = constm1_rtx
, op
= and_optab
;
3304 else if (INTVAL (operands
[2]) == -1 && operands
[3] != const0_rtx
)
3305 operands
[3] = const0_rtx
, op
= ior_optab
;
3309 else if (CONST_INT_P (operands
[3]))
3312 if (INTVAL (operands
[3]) == 0 && operands
[2] != constm1_rtx
)
3313 operands
[2] = constm1_rtx
, op
= and_optab
;
3314 else if (INTVAL (operands
[3]) == -1 && operands
[3] != const0_rtx
)
3315 operands
[2] = const0_rtx
, op
= ior_optab
;
3322 orig_out
= operands
[0];
3323 tmp
= gen_reg_rtx (mode
);
3326 /* Recurse to get the constant loaded. */
3327 if (!ix86_expand_int_movcc (operands
))
3330 /* Mask in the interesting variable. */
3331 out
= expand_binop (mode
, op
, var
, tmp
, orig_out
, 0,
3333 if (!rtx_equal_p (out
, orig_out
))
3334 emit_move_insn (copy_rtx (orig_out
), copy_rtx (out
));
3340 * For comparison with above,
3350 if (! nonimmediate_operand (operands
[2], mode
))
3351 operands
[2] = force_reg (mode
, operands
[2]);
3352 if (! nonimmediate_operand (operands
[3], mode
))
3353 operands
[3] = force_reg (mode
, operands
[3]);
3355 if (! register_operand (operands
[2], VOIDmode
)
3357 || ! register_operand (operands
[3], VOIDmode
)))
3358 operands
[2] = force_reg (mode
, operands
[2]);
3361 && ! register_operand (operands
[3], VOIDmode
))
3362 operands
[3] = force_reg (mode
, operands
[3]);
3364 emit_insn (compare_seq
);
3365 emit_insn (gen_rtx_SET (operands
[0],
3366 gen_rtx_IF_THEN_ELSE (mode
,
3367 compare_op
, operands
[2],
/* Detect conditional moves that exactly match min/max operational
   semantics.  Note that this is IEEE safe, as long as we don't
   interchange the operands.

   Returns FALSE if this conditional move doesn't match a MIN/MAX,
   and TRUE if the operation is successful and instructions are emitted.  */
3380 ix86_expand_sse_fp_minmax (rtx dest
, enum rtx_code code
, rtx cmp_op0
,
3381 rtx cmp_op1
, rtx if_true
, rtx if_false
)
3389 else if (code
== UNGE
)
3390 std::swap (if_true
, if_false
);
3394 if (rtx_equal_p (cmp_op0
, if_true
) && rtx_equal_p (cmp_op1
, if_false
))
3396 else if (rtx_equal_p (cmp_op1
, if_true
) && rtx_equal_p (cmp_op0
, if_false
))
3401 mode
= GET_MODE (dest
);
3403 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3404 but MODE may be a vector mode and thus not appropriate. */
3405 if (!flag_finite_math_only
|| flag_signed_zeros
)
3407 int u
= is_min
? UNSPEC_IEEE_MIN
: UNSPEC_IEEE_MAX
;
3410 if_true
= force_reg (mode
, if_true
);
3411 v
= gen_rtvec (2, if_true
, if_false
);
3412 tmp
= gen_rtx_UNSPEC (mode
, v
, u
);
3416 code
= is_min
? SMIN
: SMAX
;
3417 if (MEM_P (if_true
) && MEM_P (if_false
))
3418 if_true
= force_reg (mode
, if_true
);
3419 tmp
= gen_rtx_fmt_ee (code
, mode
, if_true
, if_false
);
3422 emit_insn (gen_rtx_SET (dest
, tmp
));
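/* Operand order matters because of the instruction semantics themselves:
   minss/minps compute "a < b ? a : b" and return the second operand when
   the comparison is unordered or when comparing -0.0 with +0.0, so
   "x < y ? x : y" maps directly onto the instruction while the swapped
   form would change NaN and signed-zero behaviour.  The UNSPEC_IEEE_MIN/MAX
   path above keeps the operands in that required order when NaNs or signed
   zeros must be honoured.  */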
/* Expand an SSE comparison.  Return the register with the result.  */

static rtx
ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
		     rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmp_ops_mode = GET_MODE (cmp_op0);

  /* In general case result of comparison can differ from operands' type.  */
  machine_mode cmp_mode;

  /* In AVX512F the result of comparison is an integer mask.  */
  bool maskcmp = false;
  rtx x;
3442 if (GET_MODE_SIZE (cmp_ops_mode
) == 64)
3444 unsigned int nbits
= GET_MODE_NUNITS (cmp_ops_mode
);
3445 cmp_mode
= int_mode_for_size (nbits
, 0).require ();
3449 cmp_mode
= cmp_ops_mode
;
3451 cmp_op0
= force_reg (cmp_ops_mode
, cmp_op0
);
3453 int (*op1_predicate
)(rtx
, machine_mode
)
3454 = VECTOR_MODE_P (cmp_ops_mode
) ? vector_operand
: nonimmediate_operand
;
3456 if (!op1_predicate (cmp_op1
, cmp_ops_mode
))
3457 cmp_op1
= force_reg (cmp_ops_mode
, cmp_op1
);
3460 || (maskcmp
&& cmp_mode
!= mode
)
3461 || (op_true
&& reg_overlap_mentioned_p (dest
, op_true
))
3462 || (op_false
&& reg_overlap_mentioned_p (dest
, op_false
)))
3463 dest
= gen_reg_rtx (maskcmp
? cmp_mode
: mode
);
3465 /* Compare patterns for int modes are unspec in AVX512F only. */
3466 if (maskcmp
&& (code
== GT
|| code
== EQ
))
3468 rtx (*gen
)(rtx
, rtx
, rtx
);
3470 switch (cmp_ops_mode
)
3473 gcc_assert (TARGET_AVX512BW
);
3474 gen
= code
== GT
? gen_avx512bw_gtv64qi3
: gen_avx512bw_eqv64qi3_1
;
3477 gcc_assert (TARGET_AVX512BW
);
3478 gen
= code
== GT
? gen_avx512bw_gtv32hi3
: gen_avx512bw_eqv32hi3_1
;
3481 gen
= code
== GT
? gen_avx512f_gtv16si3
: gen_avx512f_eqv16si3_1
;
3484 gen
= code
== GT
? gen_avx512f_gtv8di3
: gen_avx512f_eqv8di3_1
;
3492 emit_insn (gen (dest
, cmp_op0
, cmp_op1
));
3496 x
= gen_rtx_fmt_ee (code
, cmp_mode
, cmp_op0
, cmp_op1
);
3498 if (cmp_mode
!= mode
&& !maskcmp
)
3500 x
= force_reg (cmp_ops_mode
, x
);
3501 convert_move (dest
, x
, false);
3504 emit_insn (gen_rtx_SET (dest
, x
));
/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
   operations.  This is used for both scalar and vector conditional moves.  */

void
ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmpmode = GET_MODE (cmp);

  /* In AVX512F the result of comparison is an integer mask.  */
  bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
3523 /* If we have an integer mask and FP value then we need
3524 to cast mask to FP mode. */
3525 if (mode
!= cmpmode
&& VECTOR_MODE_P (cmpmode
))
3527 cmp
= force_reg (cmpmode
, cmp
);
3528 cmp
= gen_rtx_SUBREG (mode
, cmp
, 0);
3533 rtx (*gen
) (rtx
, rtx
) = NULL
;
3534 if ((op_true
== CONST0_RTX (mode
)
3535 && vector_all_ones_operand (op_false
, mode
))
3536 || (op_false
== CONST0_RTX (mode
)
3537 && vector_all_ones_operand (op_true
, mode
)))
3541 if (TARGET_AVX512BW
)
3542 gen
= gen_avx512bw_cvtmask2bv64qi
;
3545 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
3546 gen
= gen_avx512vl_cvtmask2bv32qi
;
3549 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
3550 gen
= gen_avx512vl_cvtmask2bv16qi
;
3553 if (TARGET_AVX512BW
)
3554 gen
= gen_avx512bw_cvtmask2wv32hi
;
3557 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
3558 gen
= gen_avx512vl_cvtmask2wv16hi
;
3561 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
3562 gen
= gen_avx512vl_cvtmask2wv8hi
;
3565 if (TARGET_AVX512DQ
)
3566 gen
= gen_avx512f_cvtmask2dv16si
;
3569 if (TARGET_AVX512VL
&& TARGET_AVX512DQ
)
3570 gen
= gen_avx512vl_cvtmask2dv8si
;
3573 if (TARGET_AVX512VL
&& TARGET_AVX512DQ
)
3574 gen
= gen_avx512vl_cvtmask2dv4si
;
3577 if (TARGET_AVX512DQ
)
3578 gen
= gen_avx512f_cvtmask2qv8di
;
3581 if (TARGET_AVX512VL
&& TARGET_AVX512DQ
)
3582 gen
= gen_avx512vl_cvtmask2qv4di
;
3585 if (TARGET_AVX512VL
&& TARGET_AVX512DQ
)
3586 gen
= gen_avx512vl_cvtmask2qv2di
;
3591 if (gen
&& SCALAR_INT_MODE_P (cmpmode
))
3593 cmp
= force_reg (cmpmode
, cmp
);
3594 if (op_true
== CONST0_RTX (mode
))
3596 rtx (*gen_not
) (rtx
, rtx
);
3599 case E_QImode
: gen_not
= gen_knotqi
; break;
3600 case E_HImode
: gen_not
= gen_knothi
; break;
3601 case E_SImode
: gen_not
= gen_knotsi
; break;
3602 case E_DImode
: gen_not
= gen_knotdi
; break;
3603 default: gcc_unreachable ();
3605 rtx n
= gen_reg_rtx (cmpmode
);
3606 emit_insn (gen_not (n
, cmp
));
3609 emit_insn (gen (dest
, cmp
));
3613 else if (vector_all_ones_operand (op_true
, mode
)
3614 && op_false
== CONST0_RTX (mode
))
3616 emit_insn (gen_rtx_SET (dest
, cmp
));
3619 else if (op_false
== CONST0_RTX (mode
))
3621 op_true
= force_reg (mode
, op_true
);
3622 x
= gen_rtx_AND (mode
, cmp
, op_true
);
3623 emit_insn (gen_rtx_SET (dest
, x
));
3626 else if (op_true
== CONST0_RTX (mode
))
3628 op_false
= force_reg (mode
, op_false
);
3629 x
= gen_rtx_NOT (mode
, cmp
);
3630 x
= gen_rtx_AND (mode
, x
, op_false
);
3631 emit_insn (gen_rtx_SET (dest
, x
));
3634 else if (INTEGRAL_MODE_P (mode
) && op_true
== CONSTM1_RTX (mode
))
3636 op_false
= force_reg (mode
, op_false
);
3637 x
= gen_rtx_IOR (mode
, cmp
, op_false
);
3638 emit_insn (gen_rtx_SET (dest
, x
));
3641 else if (TARGET_XOP
)
3643 op_true
= force_reg (mode
, op_true
);
3645 if (!nonimmediate_operand (op_false
, mode
))
3646 op_false
= force_reg (mode
, op_false
);
3648 emit_insn (gen_rtx_SET (dest
, gen_rtx_IF_THEN_ELSE (mode
, cmp
,
3654 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
3657 if (!vector_operand (op_true
, mode
))
3658 op_true
= force_reg (mode
, op_true
);
3660 op_false
= force_reg (mode
, op_false
);
3666 gen
= gen_sse4_1_blendvps
;
3670 gen
= gen_sse4_1_blendvpd
;
3675 gen
= gen_sse4_1_blendvss
;
3676 op_true
= force_reg (mode
, op_true
);
3682 gen
= gen_sse4_1_blendvsd
;
3683 op_true
= force_reg (mode
, op_true
);
3692 gen
= gen_sse4_1_pblendvb
;
3693 if (mode
!= V16QImode
)
3694 d
= gen_reg_rtx (V16QImode
);
3695 op_false
= gen_lowpart (V16QImode
, op_false
);
3696 op_true
= gen_lowpart (V16QImode
, op_true
);
3697 cmp
= gen_lowpart (V16QImode
, cmp
);
3702 gen
= gen_avx_blendvps256
;
3706 gen
= gen_avx_blendvpd256
;
3714 gen
= gen_avx2_pblendvb
;
3715 if (mode
!= V32QImode
)
3716 d
= gen_reg_rtx (V32QImode
);
3717 op_false
= gen_lowpart (V32QImode
, op_false
);
3718 op_true
= gen_lowpart (V32QImode
, op_true
);
3719 cmp
= gen_lowpart (V32QImode
, cmp
);
3724 gen
= gen_avx512bw_blendmv64qi
;
3727 gen
= gen_avx512bw_blendmv32hi
;
3730 gen
= gen_avx512f_blendmv16si
;
3733 gen
= gen_avx512f_blendmv8di
;
3736 gen
= gen_avx512f_blendmv8df
;
3739 gen
= gen_avx512f_blendmv16sf
;
3748 emit_insn (gen (d
, op_false
, op_true
, cmp
));
3750 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), d
));
3754 op_true
= force_reg (mode
, op_true
);
3756 t2
= gen_reg_rtx (mode
);
3758 t3
= gen_reg_rtx (mode
);
3762 x
= gen_rtx_AND (mode
, op_true
, cmp
);
3763 emit_insn (gen_rtx_SET (t2
, x
));
3765 x
= gen_rtx_NOT (mode
, cmp
);
3766 x
= gen_rtx_AND (mode
, x
, op_false
);
3767 emit_insn (gen_rtx_SET (t3
, x
));
3769 x
= gen_rtx_IOR (mode
, t3
, t2
);
3770 emit_insn (gen_rtx_SET (dest
, x
));
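/* The fallback above is the classic three-instruction blend; with a full
   comparison mask it computes

      t2   = cmp  &  op_true		; andps
      t3   = ~cmp &  op_false		; andnps
      dest = t3   |  t2			; orps

   so an all-ones lane in cmp selects op_true and an all-zero lane selects
   op_false.  */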
/* Swap, force into registers, or otherwise massage the two operands
   to an sse comparison with a mask result.  Thus we differ a bit from
   ix86_prepare_fp_compare_args which expects to produce a flags result.

   The DEST operand exists to help determine whether to commute commutative
   operators.  The POP0/POP1 operands are updated in place.  The new
   comparison code is returned, or UNKNOWN if not implementable.  */

static enum rtx_code
ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
				  rtx *pop0, rtx *pop1)
{
3790 /* AVX supports all the needed comparisons. */
3793 /* We have no LTGT as an operator. We could implement it with
3794 NE & ORDERED, but this requires an extra temporary. It's
3795 not clear that it's worth it. */
3802 /* These are supported directly. */
3809 /* AVX has 3 operand comparisons, no need to swap anything. */
3812 /* For commutative operators, try to canonicalize the destination
3813 operand to be first in the comparison - this helps reload to
3814 avoid extra moves. */
3815 if (!dest
|| !rtx_equal_p (dest
, *pop1
))
3823 /* These are not supported directly before AVX, and furthermore
3824 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
3825 comparison operands to transform into something that is
3827 std::swap (*pop0
, *pop1
);
3828 code
= swap_condition (code
);
/* Expand a floating-point conditional move.  Return true if successful.  */

bool
ix86_expand_fp_movcc (rtx operands[])
{
  machine_mode mode = GET_MODE (operands[0]);
  enum rtx_code code = GET_CODE (operands[1]);
  rtx tmp, compare_op;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
    {
      machine_mode cmode;
3853 /* Since we've no cmove for sse registers, don't force bad register
3854 allocation just to gain access to it. Deny movcc when the
3855 comparison mode doesn't match the move mode. */
3856 cmode
= GET_MODE (op0
);
3857 if (cmode
== VOIDmode
)
3858 cmode
= GET_MODE (op1
);
3862 code
= ix86_prepare_sse_fp_compare_args (operands
[0], code
, &op0
, &op1
);
3863 if (code
== UNKNOWN
)
3866 if (ix86_expand_sse_fp_minmax (operands
[0], code
, op0
, op1
,
3867 operands
[2], operands
[3]))
3870 tmp
= ix86_expand_sse_cmp (operands
[0], code
, op0
, op1
,
3871 operands
[2], operands
[3]);
3872 ix86_expand_sse_movcc (operands
[0], tmp
, operands
[2], operands
[3]);
3876 if (GET_MODE (op0
) == TImode
3877 || (GET_MODE (op0
) == DImode
3881 /* The floating point conditional move instructions don't directly
3882 support conditions resulting from a signed integer comparison. */
3884 compare_op
= ix86_expand_compare (code
, op0
, op1
);
3885 if (!fcmov_comparison_operator (compare_op
, VOIDmode
))
3887 tmp
= gen_reg_rtx (QImode
);
3888 ix86_expand_setcc (tmp
, code
, op0
, op1
);
3890 compare_op
= ix86_expand_compare (NE
, tmp
, const0_rtx
);
3893 emit_insn (gen_rtx_SET (operands
[0],
3894 gen_rtx_IF_THEN_ELSE (mode
, compare_op
,
3895 operands
[2], operands
[3])));
3900 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
3903 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code
)
3928 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
3931 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code
)
/* Return immediate value to be used in UNSPEC_PCMP
   for comparison CODE in MODE.  */

static int
ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
{
  if (FLOAT_MODE_P (mode))
    return ix86_fp_cmp_code_to_pcmp_immediate (code);
  return ix86_int_cmp_code_to_pcmp_immediate (code);
}
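/* The value returned here is the usual VPCMP/VPCMPU predicate encoding for
   the integer forms (0 = EQ, 1 = LT, 2 = LE, 4 = NE, 5 = not-LT i.e. GE,
   6 = not-LE i.e. GT); the fp helper maps to the corresponding VCMP
   predicates instead.  */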
/* Expand AVX-512 vector comparison.  */

bool
ix86_expand_mask_vec_cmp (rtx operands[])
{
  machine_mode mask_mode = GET_MODE (operands[0]);
  machine_mode cmp_mode = GET_MODE (operands[2]);
  enum rtx_code code = GET_CODE (operands[1]);
  rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
  int unspec_code;
  rtx unspec;
3997 unspec_code
= UNSPEC_UNSIGNED_PCMP
;
4001 unspec_code
= UNSPEC_PCMP
;
4004 unspec
= gen_rtx_UNSPEC (mask_mode
, gen_rtvec (3, operands
[2],
4007 emit_insn (gen_rtx_SET (operands
[0], unspec
));
4012 /* Expand fp vector comparison. */
4015 ix86_expand_fp_vec_cmp (rtx operands
[])
4017 enum rtx_code code
= GET_CODE (operands
[1]);
4020 code
= ix86_prepare_sse_fp_compare_args (operands
[0], code
,
4021 &operands
[2], &operands
[3]);
4022 if (code
== UNKNOWN
)
4025 switch (GET_CODE (operands
[1]))
4028 temp
= ix86_expand_sse_cmp (operands
[0], ORDERED
, operands
[2],
4029 operands
[3], NULL
, NULL
);
4030 cmp
= ix86_expand_sse_cmp (operands
[0], NE
, operands
[2],
4031 operands
[3], NULL
, NULL
);
4035 temp
= ix86_expand_sse_cmp (operands
[0], UNORDERED
, operands
[2],
4036 operands
[3], NULL
, NULL
);
4037 cmp
= ix86_expand_sse_cmp (operands
[0], EQ
, operands
[2],
4038 operands
[3], NULL
, NULL
);
4044 cmp
= expand_simple_binop (GET_MODE (cmp
), code
, temp
, cmp
, cmp
, 1,
4048 cmp
= ix86_expand_sse_cmp (operands
[0], code
, operands
[2], operands
[3],
4049 operands
[1], operands
[2]);
4051 if (operands
[0] != cmp
)
4052 emit_move_insn (operands
[0], cmp
);
4058 ix86_expand_int_sse_cmp (rtx dest
, enum rtx_code code
, rtx cop0
, rtx cop1
,
4059 rtx op_true
, rtx op_false
, bool *negate
)
4061 machine_mode data_mode
= GET_MODE (dest
);
4062 machine_mode mode
= GET_MODE (cop0
);
4067 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4069 && (mode
== V16QImode
|| mode
== V8HImode
4070 || mode
== V4SImode
|| mode
== V2DImode
))
4074 /* Canonicalize the comparison to EQ, GT, GTU. */
4085 code
= reverse_condition (code
);
4091 code
= reverse_condition (code
);
4097 std::swap (cop0
, cop1
);
4098 code
= swap_condition (code
);
4105 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4106 if (mode
== V2DImode
)
4111 /* SSE4.1 supports EQ. */
4118 /* SSE4.2 supports GT/GTU. */
4128 rtx optrue
= op_true
? op_true
: CONSTM1_RTX (data_mode
);
4129 rtx opfalse
= op_false
? op_false
: CONST0_RTX (data_mode
);
4131 std::swap (optrue
, opfalse
);
4133 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4134 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4135 min (x, y) == x). While we add one instruction (the minimum),
4136 we remove the need for two instructions in the negation, as the
4137 result is done this way.
4138 When using masks, do it for SI/DImode element types, as it is shorter
4139 than the two subtractions. */
4141 && GET_MODE_SIZE (mode
) != 64
4142 && vector_all_ones_operand (opfalse
, data_mode
)
4143 && optrue
== CONST0_RTX (data_mode
))
4145 && GET_MODE_SIZE (GET_MODE_INNER (mode
)) >= 4
4146 /* Don't do it if not using integer masks and we'd end up with
4147 the right values in the registers though. */
4148 && (GET_MODE_SIZE (mode
) == 64
4149 || !vector_all_ones_operand (optrue
, data_mode
)
4150 || opfalse
!= CONST0_RTX (data_mode
))))
4152 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
4157 gen
= (code
== GTU
) ? gen_uminv16si3
: gen_sminv16si3
;
4160 gen
= (code
== GTU
) ? gen_uminv8di3
: gen_sminv8di3
;
4161 cop0
= force_reg (mode
, cop0
);
4162 cop1
= force_reg (mode
, cop1
);
4166 gen
= (code
== GTU
) ? gen_uminv32qi3
: gen_sminv32qi3
;
4170 gen
= (code
== GTU
) ? gen_uminv16hi3
: gen_sminv16hi3
;
4174 gen
= (code
== GTU
) ? gen_uminv8si3
: gen_sminv8si3
;
4177 if (TARGET_AVX512VL
)
4179 gen
= (code
== GTU
) ? gen_uminv4di3
: gen_sminv4di3
;
4180 cop0
= force_reg (mode
, cop0
);
4181 cop1
= force_reg (mode
, cop1
);
4185 if (code
== GTU
&& TARGET_SSE2
)
4186 gen
= gen_uminv16qi3
;
4187 else if (code
== GT
&& TARGET_SSE4_1
)
4188 gen
= gen_sminv16qi3
;
4191 if (code
== GTU
&& TARGET_SSE4_1
)
4192 gen
= gen_uminv8hi3
;
4193 else if (code
== GT
&& TARGET_SSE2
)
4194 gen
= gen_sminv8hi3
;
4198 gen
= (code
== GTU
) ? gen_uminv4si3
: gen_sminv4si3
;
4201 if (TARGET_AVX512VL
)
4203 gen
= (code
== GTU
) ? gen_uminv2di3
: gen_sminv2di3
;
4204 cop0
= force_reg (mode
, cop0
);
4205 cop1
= force_reg (mode
, cop1
);
4214 rtx tem
= gen_reg_rtx (mode
);
4215 if (!vector_operand (cop0
, mode
))
4216 cop0
= force_reg (mode
, cop0
);
4217 if (!vector_operand (cop1
, mode
))
4218 cop1
= force_reg (mode
, cop1
);
4220 emit_insn (gen (tem
, cop0
, cop1
));
4226 /* Unsigned parallel compare is not supported by the hardware.
4227 Play some tricks to turn this into a signed comparison
4231 cop0
= force_reg (mode
, cop0
);
4244 /* Subtract (-(INT MAX) - 1) from both operands to make
4246 mask
= ix86_build_signbit_mask (mode
, true, false);
4247 t1
= gen_reg_rtx (mode
);
4248 emit_insn (gen_sub3_insn (t1
, cop0
, mask
));
4250 t2
= gen_reg_rtx (mode
);
4251 emit_insn (gen_sub3_insn (t2
, cop1
, mask
));
4265 /* Perform a parallel unsigned saturating subtraction. */
4266 x
= gen_reg_rtx (mode
);
4267 emit_insn (gen_rtx_SET
4268 (x
, gen_rtx_US_MINUS (mode
, cop0
, cop1
)));
4270 cop1
= CONST0_RTX (mode
);
4282 std::swap (op_true
, op_false
);
4284 /* Allow the comparison to be done in one mode, but the movcc to
4285 happen in another mode. */
4286 if (data_mode
== mode
)
4288 x
= ix86_expand_sse_cmp (dest
, code
, cop0
, cop1
,
4293 gcc_assert (GET_MODE_SIZE (data_mode
) == GET_MODE_SIZE (mode
));
4294 x
= ix86_expand_sse_cmp (gen_reg_rtx (mode
), code
, cop0
, cop1
,
4296 if (GET_MODE (x
) == mode
)
4297 x
= gen_lowpart (data_mode
, x
);
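/* The unsigned-compare trick used above flips the sign bit of both operands
   (subtracting -(INT_MAX) - 1, i.e. 0x80 for byte elements) so that an
   unsigned order test becomes a signed one; e.g. 250u > 3u becomes
   122 > -125, which pcmpgtb can evaluate directly.  */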
/* Expand integer vector comparison.  */

bool
ix86_expand_int_vec_cmp (rtx operands[])
{
  rtx_code code = GET_CODE (operands[1]);
  bool negate = false;
  rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
				     operands[3], NULL, NULL, &negate);

  if (negate)
    cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
				   CONST0_RTX (GET_MODE (cmp)),
				   NULL, NULL, &negate);

  gcc_assert (!negate);

  if (operands[0] != cmp)
    emit_move_insn (operands[0], cmp);

  return true;
}
/* Expand a floating-point vector conditional move; a vcond operation
   rather than a movcc operation.  */

bool
ix86_expand_fp_vcond (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[3]);
4338 code
= ix86_prepare_sse_fp_compare_args (operands
[0], code
,
4339 &operands
[4], &operands
[5]);
4340 if (code
== UNKNOWN
)
4343 switch (GET_CODE (operands
[3]))
4346 temp
= ix86_expand_sse_cmp (operands
[0], ORDERED
, operands
[4],
4347 operands
[5], operands
[0], operands
[0]);
4348 cmp
= ix86_expand_sse_cmp (operands
[0], NE
, operands
[4],
4349 operands
[5], operands
[1], operands
[2]);
4353 temp
= ix86_expand_sse_cmp (operands
[0], UNORDERED
, operands
[4],
4354 operands
[5], operands
[0], operands
[0]);
4355 cmp
= ix86_expand_sse_cmp (operands
[0], EQ
, operands
[4],
4356 operands
[5], operands
[1], operands
[2]);
4362 cmp
= expand_simple_binop (GET_MODE (cmp
), code
, temp
, cmp
, cmp
, 1,
4364 ix86_expand_sse_movcc (operands
[0], cmp
, operands
[1], operands
[2]);
4368 if (ix86_expand_sse_fp_minmax (operands
[0], code
, operands
[4],
4369 operands
[5], operands
[1], operands
[2]))
4372 cmp
= ix86_expand_sse_cmp (operands
[0], code
, operands
[4], operands
[5],
4373 operands
[1], operands
[2]);
4374 ix86_expand_sse_movcc (operands
[0], cmp
, operands
[1], operands
[2]);
/* Expand a signed/unsigned integral vector conditional move.  */

bool
ix86_expand_int_vcond (rtx operands[])
{
  machine_mode data_mode = GET_MODE (operands[0]);
  machine_mode mode = GET_MODE (operands[4]);
  enum rtx_code code = GET_CODE (operands[3]);
  bool negate = false;
  rtx x, cop0, cop1;

  cop0 = operands[4];
  cop1 = operands[5];
  /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
     and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
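  /* For V4SImode, for example, this turns x < 0 ? -1 : 0 into
     "psrad $31, x" and x < 0 ? 1 : 0 into "psrld $31, x", replicating or
     isolating the sign bit of each element.  */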
4394 if ((code
== LT
|| code
== GE
)
4395 && data_mode
== mode
4396 && cop1
== CONST0_RTX (mode
)
4397 && operands
[1 + (code
== LT
)] == CONST0_RTX (data_mode
)
4398 && GET_MODE_UNIT_SIZE (data_mode
) > 1
4399 && GET_MODE_UNIT_SIZE (data_mode
) <= 8
4400 && (GET_MODE_SIZE (data_mode
) == 16
4401 || (TARGET_AVX2
&& GET_MODE_SIZE (data_mode
) == 32)))
4403 rtx negop
= operands
[2 - (code
== LT
)];
4404 int shift
= GET_MODE_UNIT_BITSIZE (data_mode
) - 1;
4405 if (negop
== CONST1_RTX (data_mode
))
4407 rtx res
= expand_simple_binop (mode
, LSHIFTRT
, cop0
, GEN_INT (shift
),
4408 operands
[0], 1, OPTAB_DIRECT
);
4409 if (res
!= operands
[0])
4410 emit_move_insn (operands
[0], res
);
4413 else if (GET_MODE_INNER (data_mode
) != DImode
4414 && vector_all_ones_operand (negop
, data_mode
))
4416 rtx res
= expand_simple_binop (mode
, ASHIFTRT
, cop0
, GEN_INT (shift
),
4417 operands
[0], 0, OPTAB_DIRECT
);
4418 if (res
!= operands
[0])
4419 emit_move_insn (operands
[0], res
);
4424 if (!nonimmediate_operand (cop1
, mode
))
4425 cop1
= force_reg (mode
, cop1
);
4426 if (!general_operand (operands
[1], data_mode
))
4427 operands
[1] = force_reg (data_mode
, operands
[1]);
4428 if (!general_operand (operands
[2], data_mode
))
4429 operands
[2] = force_reg (data_mode
, operands
[2]);
4431 x
= ix86_expand_int_sse_cmp (operands
[0], code
, cop0
, cop1
,
4432 operands
[1], operands
[2], &negate
);
4437 ix86_expand_sse_movcc (operands
[0], x
, operands
[1+negate
],
4438 operands
[2-negate
]);
bool
ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
			      struct expand_vec_perm_d *d)
{
  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  machine_mode mode = GET_MODE (d ? d->op0 : op0);
  machine_mode maskmode = mode;
  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4455 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
4456 gen
= gen_avx512vl_vpermt2varv8hi3
;
4459 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
4460 gen
= gen_avx512vl_vpermt2varv16hi3
;
4463 if (TARGET_AVX512VBMI
)
4464 gen
= gen_avx512bw_vpermt2varv64qi3
;
4467 if (TARGET_AVX512BW
)
4468 gen
= gen_avx512bw_vpermt2varv32hi3
;
4471 if (TARGET_AVX512VL
)
4472 gen
= gen_avx512vl_vpermt2varv4si3
;
4475 if (TARGET_AVX512VL
)
4476 gen
= gen_avx512vl_vpermt2varv8si3
;
4480 gen
= gen_avx512f_vpermt2varv16si3
;
4483 if (TARGET_AVX512VL
)
4485 gen
= gen_avx512vl_vpermt2varv4sf3
;
4486 maskmode
= V4SImode
;
4490 if (TARGET_AVX512VL
)
4492 gen
= gen_avx512vl_vpermt2varv8sf3
;
4493 maskmode
= V8SImode
;
4499 gen
= gen_avx512f_vpermt2varv16sf3
;
4500 maskmode
= V16SImode
;
4504 if (TARGET_AVX512VL
)
4505 gen
= gen_avx512vl_vpermt2varv2di3
;
4508 if (TARGET_AVX512VL
)
4509 gen
= gen_avx512vl_vpermt2varv4di3
;
4513 gen
= gen_avx512f_vpermt2varv8di3
;
4516 if (TARGET_AVX512VL
)
4518 gen
= gen_avx512vl_vpermt2varv2df3
;
4519 maskmode
= V2DImode
;
4523 if (TARGET_AVX512VL
)
4525 gen
= gen_avx512vl_vpermt2varv4df3
;
4526 maskmode
= V4DImode
;
4532 gen
= gen_avx512f_vpermt2varv8df3
;
4533 maskmode
= V8DImode
;
4543 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4544 expander, so args are either in d, or in op0, op1 etc. */
4551 for (int i
= 0; i
< d
->nelt
; ++i
)
4552 vec
[i
] = GEN_INT (d
->perm
[i
]);
4553 mask
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (d
->nelt
, vec
));
4556 emit_insn (gen (target
, force_reg (maskmode
, mask
), op0
, op1
));
/* Expand a variable vector permutation.  */

void
ix86_expand_vec_perm (rtx operands[])
{
  rtx target = operands[0];
  rtx op0 = operands[1];
  rtx op1 = operands[2];
  rtx mask = operands[3];
  rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
  machine_mode mode = GET_MODE (op0);
  machine_mode maskmode = GET_MODE (mask);
  int w, e, i;
  bool one_operand_shuffle = rtx_equal_p (op0, op1);

  /* Number of elements in the vector.  */
  w = GET_MODE_NUNITS (mode);
  e = GET_MODE_UNIT_SIZE (mode);
  gcc_assert (w <= 64);

  if (TARGET_AVX512F && one_operand_shuffle)
    {
      rtx (*gen) (rtx, rtx, rtx) = NULL;
4586 gen
=gen_avx512f_permvarv16si
;
4589 gen
= gen_avx512f_permvarv16sf
;
4592 gen
= gen_avx512f_permvarv8di
;
4595 gen
= gen_avx512f_permvarv8df
;
4602 emit_insn (gen (target
, op0
, mask
));
4607 if (ix86_expand_vec_perm_vpermt2 (target
, mask
, op0
, op1
, NULL
))
4612 if (mode
== V4DImode
|| mode
== V4DFmode
|| mode
== V16HImode
)
      /* Unfortunately, the VPERMQ and VPERMPD instructions only support
	 a constant shuffle operand.  With a tiny bit of effort we can
	 use VPERMD instead.  A re-interpretation stall for V4DFmode is
	 unfortunate but there's no avoiding it.
	 Similarly for V16HImode we don't have instructions for variable
	 shuffling, while for V32QImode we can use vpshufb; vpshufb;
	 vpermq; vpor after preparing suitable masks.  */
4622 if (mode
== V16HImode
)
4624 maskmode
= mode
= V32QImode
;
4630 maskmode
= mode
= V8SImode
;
4634 t1
= gen_reg_rtx (maskmode
);
4636 /* Replicate the low bits of the V4DImode mask into V8SImode:
4638 t1 = { A A B B C C D D }. */
4639 for (i
= 0; i
< w
/ 2; ++i
)
4640 vec
[i
*2 + 1] = vec
[i
*2] = GEN_INT (i
* 2);
4641 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
4642 vt
= force_reg (maskmode
, vt
);
4643 mask
= gen_lowpart (maskmode
, mask
);
4644 if (maskmode
== V8SImode
)
4645 emit_insn (gen_avx2_permvarv8si (t1
, mask
, vt
));
4647 emit_insn (gen_avx2_pshufbv32qi3 (t1
, mask
, vt
));
4649 /* Multiply the shuffle indicies by two. */
4650 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, t1
, t1
, 1,
4653 /* Add one to the odd shuffle indicies:
4654 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
4655 for (i
= 0; i
< w
/ 2; ++i
)
4657 vec
[i
* 2] = const0_rtx
;
4658 vec
[i
* 2 + 1] = const1_rtx
;
4660 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
4661 vt
= validize_mem (force_const_mem (maskmode
, vt
));
4662 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, vt
, t1
, 1,
4665 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
4666 operands
[3] = mask
= t1
;
4667 target
= gen_reg_rtx (mode
);
4668 op0
= gen_lowpart (mode
, op0
);
4669 op1
= gen_lowpart (mode
, op1
);
4675 /* The VPERMD and VPERMPS instructions already properly ignore
4676 the high bits of the shuffle elements. No need for us to
4677 perform an AND ourselves. */
4678 if (one_operand_shuffle
)
4680 emit_insn (gen_avx2_permvarv8si (target
, op0
, mask
));
4681 if (target
!= operands
[0])
4682 emit_move_insn (operands
[0],
4683 gen_lowpart (GET_MODE (operands
[0]), target
));
4687 t1
= gen_reg_rtx (V8SImode
);
4688 t2
= gen_reg_rtx (V8SImode
);
4689 emit_insn (gen_avx2_permvarv8si (t1
, op0
, mask
));
4690 emit_insn (gen_avx2_permvarv8si (t2
, op1
, mask
));
4696 mask
= gen_lowpart (V8SImode
, mask
);
4697 if (one_operand_shuffle
)
4698 emit_insn (gen_avx2_permvarv8sf (target
, op0
, mask
));
4701 t1
= gen_reg_rtx (V8SFmode
);
4702 t2
= gen_reg_rtx (V8SFmode
);
4703 emit_insn (gen_avx2_permvarv8sf (t1
, op0
, mask
));
4704 emit_insn (gen_avx2_permvarv8sf (t2
, op1
, mask
));
4710 /* By combining the two 128-bit input vectors into one 256-bit
4711 input vector, we can use VPERMD and VPERMPS for the full
4712 two-operand shuffle. */
4713 t1
= gen_reg_rtx (V8SImode
);
4714 t2
= gen_reg_rtx (V8SImode
);
4715 emit_insn (gen_avx_vec_concatv8si (t1
, op0
, op1
));
4716 emit_insn (gen_avx_vec_concatv8si (t2
, mask
, mask
));
4717 emit_insn (gen_avx2_permvarv8si (t1
, t1
, t2
));
4718 emit_insn (gen_avx_vextractf128v8si (target
, t1
, const0_rtx
));
4722 t1
= gen_reg_rtx (V8SFmode
);
4723 t2
= gen_reg_rtx (V8SImode
);
4724 mask
= gen_lowpart (V4SImode
, mask
);
4725 emit_insn (gen_avx_vec_concatv8sf (t1
, op0
, op1
));
4726 emit_insn (gen_avx_vec_concatv8si (t2
, mask
, mask
));
4727 emit_insn (gen_avx2_permvarv8sf (t1
, t1
, t2
));
4728 emit_insn (gen_avx_vextractf128v8sf (target
, t1
, const0_rtx
));
4732 t1
= gen_reg_rtx (V32QImode
);
4733 t2
= gen_reg_rtx (V32QImode
);
4734 t3
= gen_reg_rtx (V32QImode
);
4735 vt2
= GEN_INT (-128);
4736 vt
= gen_const_vec_duplicate (V32QImode
, vt2
);
4737 vt
= force_reg (V32QImode
, vt
);
4738 for (i
= 0; i
< 32; i
++)
4739 vec
[i
] = i
< 16 ? vt2
: const0_rtx
;
4740 vt2
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, vec
));
4741 vt2
= force_reg (V32QImode
, vt2
);
4742 /* From mask create two adjusted masks, which contain the same
4743 bits as mask in the low 7 bits of each vector element.
4744 The first mask will have the most significant bit clear
4745 if it requests element from the same 128-bit lane
4746 and MSB set if it requests element from the other 128-bit lane.
4747 The second mask will have the opposite values of the MSB,
4748 and additionally will have its 128-bit lanes swapped.
4749 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
4750 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
4751 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
4752 stands for other 12 bytes. */
4753 /* The bit whether element is from the same lane or the other
4754 lane is bit 4, so shift it up by 3 to the MSB position. */
4755 t5
= gen_reg_rtx (V4DImode
);
4756 emit_insn (gen_ashlv4di3 (t5
, gen_lowpart (V4DImode
, mask
),
4758 /* Clear MSB bits from the mask just in case it had them set. */
4759 emit_insn (gen_avx2_andnotv32qi3 (t2
, vt
, mask
));
4760 /* After this t1 will have MSB set for elements from other lane. */
4761 emit_insn (gen_xorv32qi3 (t1
, gen_lowpart (V32QImode
, t5
), vt2
));
4762 /* Clear bits other than MSB. */
4763 emit_insn (gen_andv32qi3 (t1
, t1
, vt
));
4764 /* Or in the lower bits from mask into t3. */
4765 emit_insn (gen_iorv32qi3 (t3
, t1
, t2
));
4766 /* And invert MSB bits in t1, so MSB is set for elements from the same
4768 emit_insn (gen_xorv32qi3 (t1
, t1
, vt
));
4769 /* Swap 128-bit lanes in t3. */
4770 t6
= gen_reg_rtx (V4DImode
);
4771 emit_insn (gen_avx2_permv4di_1 (t6
, gen_lowpart (V4DImode
, t3
),
4772 const2_rtx
, GEN_INT (3),
4773 const0_rtx
, const1_rtx
));
4774 /* And or in the lower bits from mask into t1. */
4775 emit_insn (gen_iorv32qi3 (t1
, t1
, t2
));
4776 if (one_operand_shuffle
)
4778 /* Each of these shuffles will put 0s in places where
4779 element from the other 128-bit lane is needed, otherwise
4780 will shuffle in the requested value. */
4781 emit_insn (gen_avx2_pshufbv32qi3 (t3
, op0
,
4782 gen_lowpart (V32QImode
, t6
)));
4783 emit_insn (gen_avx2_pshufbv32qi3 (t1
, op0
, t1
));
4784 /* For t3 the 128-bit lanes are swapped again. */
4785 t7
= gen_reg_rtx (V4DImode
);
4786 emit_insn (gen_avx2_permv4di_1 (t7
, gen_lowpart (V4DImode
, t3
),
4787 const2_rtx
, GEN_INT (3),
4788 const0_rtx
, const1_rtx
));
4789 /* And oring both together leads to the result. */
4790 emit_insn (gen_iorv32qi3 (target
, t1
,
4791 gen_lowpart (V32QImode
, t7
)));
4792 if (target
!= operands
[0])
4793 emit_move_insn (operands
[0],
4794 gen_lowpart (GET_MODE (operands
[0]), target
));
4798 t4
= gen_reg_rtx (V32QImode
);
4799 /* Similarly to the above one_operand_shuffle code,
4800 just for repeated twice for each operand. merge_two:
4801 code will merge the two results together. */
4802 emit_insn (gen_avx2_pshufbv32qi3 (t4
, op0
,
4803 gen_lowpart (V32QImode
, t6
)));
4804 emit_insn (gen_avx2_pshufbv32qi3 (t3
, op1
,
4805 gen_lowpart (V32QImode
, t6
)));
4806 emit_insn (gen_avx2_pshufbv32qi3 (t2
, op0
, t1
));
4807 emit_insn (gen_avx2_pshufbv32qi3 (t1
, op1
, t1
));
4808 t7
= gen_reg_rtx (V4DImode
);
4809 emit_insn (gen_avx2_permv4di_1 (t7
, gen_lowpart (V4DImode
, t4
),
4810 const2_rtx
, GEN_INT (3),
4811 const0_rtx
, const1_rtx
));
4812 t8
= gen_reg_rtx (V4DImode
);
4813 emit_insn (gen_avx2_permv4di_1 (t8
, gen_lowpart (V4DImode
, t3
),
4814 const2_rtx
, GEN_INT (3),
4815 const0_rtx
, const1_rtx
));
4816 emit_insn (gen_iorv32qi3 (t4
, t2
, gen_lowpart (V32QImode
, t7
)));
4817 emit_insn (gen_iorv32qi3 (t3
, t1
, gen_lowpart (V32QImode
, t8
)));
4823 gcc_assert (GET_MODE_SIZE (mode
) <= 16);
4830 /* The XOP VPPERM insn supports three inputs. By ignoring the
4831 one_operand_shuffle special case, we avoid creating another
4832 set of constant vectors in memory. */
4833 one_operand_shuffle
= false;
4835 /* mask = mask & {2*w-1, ...} */
4836 vt
= GEN_INT (2*w
- 1);
4840 /* mask = mask & {w-1, ...} */
4841 vt
= GEN_INT (w
- 1);
4844 vt
= gen_const_vec_duplicate (maskmode
, vt
);
4845 mask
= expand_simple_binop (maskmode
, AND
, mask
, vt
,
4846 NULL_RTX
, 0, OPTAB_DIRECT
);
4848 /* For non-QImode operations, convert the word permutation control
4849 into a byte permutation control. */
4850 if (mode
!= V16QImode
)
4852 mask
= expand_simple_binop (maskmode
, ASHIFT
, mask
,
4853 GEN_INT (exact_log2 (e
)),
4854 NULL_RTX
, 0, OPTAB_DIRECT
);
4856 /* Convert mask to vector of chars. */
4857 mask
= force_reg (V16QImode
, gen_lowpart (V16QImode
, mask
));
4859 /* Replicate each of the input bytes into byte positions:
4860 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
4861 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
4862 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
4863 for (i
= 0; i
< 16; ++i
)
4864 vec
[i
] = GEN_INT (i
/e
* e
);
4865 vt
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, vec
));
4866 vt
= validize_mem (force_const_mem (V16QImode
, vt
));
4868 emit_insn (gen_xop_pperm (mask
, mask
, mask
, vt
));
4870 emit_insn (gen_ssse3_pshufbv16qi3 (mask
, mask
, vt
));
4872 /* Convert it into the byte positions by doing
4873 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
4874 for (i
= 0; i
< 16; ++i
)
4875 vec
[i
] = GEN_INT (i
% e
);
4876 vt
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, vec
));
4877 vt
= validize_mem (force_const_mem (V16QImode
, vt
));
4878 emit_insn (gen_addv16qi3 (mask
, mask
, vt
));
4881 /* The actual shuffle operations all operate on V16QImode. */
4882 op0
= gen_lowpart (V16QImode
, op0
);
4883 op1
= gen_lowpart (V16QImode
, op1
);
4887 if (GET_MODE (target
) != V16QImode
)
4888 target
= gen_reg_rtx (V16QImode
);
4889 emit_insn (gen_xop_pperm (target
, op0
, op1
, mask
));
4890 if (target
!= operands
[0])
4891 emit_move_insn (operands
[0],
4892 gen_lowpart (GET_MODE (operands
[0]), target
));
4894 else if (one_operand_shuffle
)
4896 if (GET_MODE (target
) != V16QImode
)
4897 target
= gen_reg_rtx (V16QImode
);
4898 emit_insn (gen_ssse3_pshufbv16qi3 (target
, op0
, mask
));
4899 if (target
!= operands
[0])
4900 emit_move_insn (operands
[0],
4901 gen_lowpart (GET_MODE (operands
[0]), target
));
4908 /* Shuffle the two input vectors independently. */
4909 t1
= gen_reg_rtx (V16QImode
);
4910 t2
= gen_reg_rtx (V16QImode
);
4911 emit_insn (gen_ssse3_pshufbv16qi3 (t1
, op0
, mask
));
4912 emit_insn (gen_ssse3_pshufbv16qi3 (t2
, op1
, mask
));
4915 /* Then merge them together. The key is whether any given control
4916 element contained a bit set that indicates the second word. */
4919 if (maskmode
== V2DImode
&& !TARGET_SSE4_1
)
4921 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
4922 more shuffle to convert the V2DI input mask into a V4SI
4923 input mask. At which point the masking that expand_int_vcond
4924 will work as desired. */
4925 rtx t3
= gen_reg_rtx (V4SImode
);
4926 emit_insn (gen_sse2_pshufd_1 (t3
, gen_lowpart (V4SImode
, mask
),
4927 const0_rtx
, const0_rtx
,
4928 const2_rtx
, const2_rtx
));
4930 maskmode
= V4SImode
;
4934 vt
= gen_const_vec_duplicate (maskmode
, vt
);
4935 vt
= force_reg (maskmode
, vt
);
4936 mask
= expand_simple_binop (maskmode
, AND
, mask
, vt
,
4937 NULL_RTX
, 0, OPTAB_DIRECT
);
4939 if (GET_MODE (target
) != mode
)
4940 target
= gen_reg_rtx (mode
);
4942 xops
[1] = gen_lowpart (mode
, t2
);
4943 xops
[2] = gen_lowpart (mode
, t1
);
4944 xops
[3] = gen_rtx_EQ (maskmode
, mask
, vt
);
4947 ok
= ix86_expand_int_vcond (xops
);
4949 if (target
!= operands
[0])
4950 emit_move_insn (operands
[0],
4951 gen_lowpart (GET_MODE (operands
[0]), target
));
/* Unpack OP[1] into the next wider integer vector type.  UNSIGNED_P is
   true if we should do zero extension, else sign extension.  HIGH_P is
   true if we want the N/2 high elements, else the low elements.  */

void
ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
{
  machine_mode imode = GET_MODE (src);
4967 rtx (*unpack
)(rtx
, rtx
);
4968 rtx (*extract
)(rtx
, rtx
) = NULL
;
4969 machine_mode halfmode
= BLKmode
;
4975 unpack
= gen_avx512bw_zero_extendv32qiv32hi2
;
4977 unpack
= gen_avx512bw_sign_extendv32qiv32hi2
;
4978 halfmode
= V32QImode
;
4980 = high_p
? gen_vec_extract_hi_v64qi
: gen_vec_extract_lo_v64qi
;
4984 unpack
= gen_avx2_zero_extendv16qiv16hi2
;
4986 unpack
= gen_avx2_sign_extendv16qiv16hi2
;
4987 halfmode
= V16QImode
;
4989 = high_p
? gen_vec_extract_hi_v32qi
: gen_vec_extract_lo_v32qi
;
4993 unpack
= gen_avx512f_zero_extendv16hiv16si2
;
4995 unpack
= gen_avx512f_sign_extendv16hiv16si2
;
4996 halfmode
= V16HImode
;
4998 = high_p
? gen_vec_extract_hi_v32hi
: gen_vec_extract_lo_v32hi
;
5002 unpack
= gen_avx2_zero_extendv8hiv8si2
;
5004 unpack
= gen_avx2_sign_extendv8hiv8si2
;
5005 halfmode
= V8HImode
;
5007 = high_p
? gen_vec_extract_hi_v16hi
: gen_vec_extract_lo_v16hi
;
5011 unpack
= gen_avx512f_zero_extendv8siv8di2
;
5013 unpack
= gen_avx512f_sign_extendv8siv8di2
;
5014 halfmode
= V8SImode
;
5016 = high_p
? gen_vec_extract_hi_v16si
: gen_vec_extract_lo_v16si
;
5020 unpack
= gen_avx2_zero_extendv4siv4di2
;
5022 unpack
= gen_avx2_sign_extendv4siv4di2
;
5023 halfmode
= V4SImode
;
5025 = high_p
? gen_vec_extract_hi_v8si
: gen_vec_extract_lo_v8si
;
5029 unpack
= gen_sse4_1_zero_extendv8qiv8hi2
;
5031 unpack
= gen_sse4_1_sign_extendv8qiv8hi2
;
5035 unpack
= gen_sse4_1_zero_extendv4hiv4si2
;
5037 unpack
= gen_sse4_1_sign_extendv4hiv4si2
;
5041 unpack
= gen_sse4_1_zero_extendv2siv2di2
;
5043 unpack
= gen_sse4_1_sign_extendv2siv2di2
;
5049 if (GET_MODE_SIZE (imode
) >= 32)
5051 tmp
= gen_reg_rtx (halfmode
);
5052 emit_insn (extract (tmp
, src
));
5056 /* Shift higher 8 bytes to lower 8 bytes. */
5057 tmp
= gen_reg_rtx (V1TImode
);
5058 emit_insn (gen_sse2_lshrv1ti3 (tmp
, gen_lowpart (V1TImode
, src
),
5060 tmp
= gen_lowpart (imode
, tmp
);
5065 emit_insn (unpack (dest
, tmp
));
5069 rtx (*unpack
)(rtx
, rtx
, rtx
);
5075 unpack
= gen_vec_interleave_highv16qi
;
5077 unpack
= gen_vec_interleave_lowv16qi
;
5081 unpack
= gen_vec_interleave_highv8hi
;
5083 unpack
= gen_vec_interleave_lowv8hi
;
5087 unpack
= gen_vec_interleave_highv4si
;
5089 unpack
= gen_vec_interleave_lowv4si
;
5096 tmp
= force_reg (imode
, CONST0_RTX (imode
));
5098 tmp
= ix86_expand_sse_cmp (gen_reg_rtx (imode
), GT
, CONST0_RTX (imode
),
5099 src
, pc_rtx
, pc_rtx
);
5101 rtx tmp2
= gen_reg_rtx (imode
);
5102 emit_insn (unpack (tmp2
, src
, tmp
));
5103 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), tmp2
));
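/* Element-wise, the unpack above computes a widening conversion of one half
   of the input vector.  A scalar model of the V8HI -> V4SI case, purely for
   illustration (function and parameter names are invented):

     static void
     unpack_v8hi_to_v4si (const short src[8], int dst[4],
                          int unsigned_p, int high_p)
     {
       const short *half = src + (high_p ? 4 : 0);
       for (int i = 0; i < 4; i++)
         dst[i] = unsigned_p ? (int) (unsigned short) half[i]
                             : (int) half[i];
     }
*/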
/* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
   but works for floating-point parameters and non-offsettable memories.
   For pushes, it returns just stack offsets; the values will be saved
   in the right order.  Maximally three parts are generated.  */
static int
ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
{
  int size;

  if (!TARGET_64BIT)
    size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
  else
    size = (GET_MODE_SIZE (mode) + 4) / 8;

  gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
  gcc_assert (size >= 2 && size <= 4);

  /* Optimize constant pool reference to immediates.  This is used by fp
     moves, that force all constants to memory to allow combining.  */
  if (MEM_P (operand) && MEM_READONLY_P (operand))
    operand = avoid_constant_pool_reference (operand);

  if (MEM_P (operand) && !offsettable_memref_p (operand))
    {
      /* The only non-offsettable memories we handle are pushes.  */
      int ok = push_operand (operand, VOIDmode);
      gcc_assert (ok);

      operand = copy_rtx (operand);
      PUT_MODE (operand, word_mode);
      parts[0] = parts[1] = parts[2] = parts[3] = operand;
      return size;
    }

  if (GET_CODE (operand) == CONST_VECTOR)
    {
      scalar_int_mode imode = int_mode_for_mode (mode).require ();
      /* Caution: if we looked through a constant pool memory above,
         the operand may actually have a different mode now.  That's
         ok, since we want to pun this all the way back to an integer.  */
      operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
      gcc_assert (operand != NULL);
      mode = imode;
    }

  if (!TARGET_64BIT)
    {
      if (mode == DImode)
        split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      else
        {
          int i;

          if (REG_P (operand))
            {
              gcc_assert (reload_completed);
              for (i = 0; i < size; i++)
                parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
            }
          else if (offsettable_memref_p (operand))
            {
              operand = adjust_address (operand, SImode, 0);
              parts[0] = operand;
              for (i = 1; i < size; i++)
                parts[i] = adjust_address (operand, SImode, 4 * i);
            }
          else if (CONST_DOUBLE_P (operand))
            {
              const REAL_VALUE_TYPE *r;
              long l[4];

              r = CONST_DOUBLE_REAL_VALUE (operand);
              switch (mode)
                {
                case E_TFmode:
                  real_to_target (l, r, mode);
                  parts[3] = gen_int_mode (l[3], SImode);
                  parts[2] = gen_int_mode (l[2], SImode);
                  break;
                case E_XFmode:
                  /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
                     long double may not be 80-bit.  */
                  real_to_target (l, r, mode);
                  parts[2] = gen_int_mode (l[2], SImode);
                  break;
                case E_DFmode:
                  REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
                  break;
                default:
                  gcc_unreachable ();
                }
              parts[1] = gen_int_mode (l[1], SImode);
              parts[0] = gen_int_mode (l[0], SImode);
            }
          else
            gcc_unreachable ();
        }
    }
  else
    {
      if (mode == TImode)
        split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      if (mode == XFmode || mode == TFmode)
        {
          machine_mode upper_mode = mode == XFmode ? SImode : DImode;
          if (REG_P (operand))
            {
              gcc_assert (reload_completed);
              parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
              parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
            }
          else if (offsettable_memref_p (operand))
            {
              operand = adjust_address (operand, DImode, 0);
              parts[0] = operand;
              parts[1] = adjust_address (operand, upper_mode, 8);
            }
          else if (CONST_DOUBLE_P (operand))
            {
              long l[4];

              real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);

              /* real_to_target puts 32-bit pieces in each long.  */
              parts[0]
                = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
                                | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
                                   << 32), DImode);

              if (upper_mode == SImode)
                parts[1] = gen_int_mode (l[2], SImode);
              else
                parts[1]
                  = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
                                  | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
                                     << 32), DImode);
            }
          else
            gcc_unreachable ();
        }
    }

  return size;
}
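/* For instance, splitting a DFmode constant yields two SImode immediates
   that are simply the low and high 32-bit halves of its IEEE-754 image.
   A scalar sketch, assuming a little-endian layout (names invented):

     #include <stdint.h>
     #include <string.h>

     static void
     split_double_bits (double d, uint32_t *lo, uint32_t *hi)
     {
       uint64_t bits;
       memcpy (&bits, &d, sizeof bits);   // bit-for-bit image of the double
       *lo = (uint32_t) bits;             // parts[0]
       *hi = (uint32_t) (bits >> 32);     // parts[1]
     }
*/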
/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
   Return false when normal moves are needed; true when all required
   insns have been emitted.  Operands 2-4 contain the input values
   in the correct order; operands 5-7 contain the output values.  */
void
ix86_split_long_move (rtx operands[])
{
  rtx part[2][4];
  int nparts, i, j;
  int push = 0;
  int collisions = 0;
  machine_mode mode = GET_MODE (operands[0]);
  bool collisionparts[4];

  /* The DFmode expanders may ask us to move double.
     For 64bit target this is single move.  By hiding the fact
     here we simplify i386.md splitters.  */
  if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
    {
      /* Optimize constant pool reference to immediates.  This is used by
         fp moves, that force all constants to memory to allow combining.  */
      if (MEM_P (operands[1])
          && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
          && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
        operands[1] = get_pool_constant (XEXP (operands[1], 0));
      if (push_operand (operands[0], VOIDmode))
        {
          operands[0] = copy_rtx (operands[0]);
          PUT_MODE (operands[0], word_mode);
        }
      else
        operands[0] = gen_lowpart (DImode, operands[0]);
      operands[1] = gen_lowpart (DImode, operands[1]);
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  /* The only non-offsettable memory we handle is push.  */
  if (push_operand (operands[0], VOIDmode))
    push = 1;
  else
    gcc_assert (!MEM_P (operands[0])
                || offsettable_memref_p (operands[0]));

  nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
  ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));

  /* When emitting push, take care for source operands on the stack.  */
  if (push && MEM_P (operands[1])
      && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
    {
      rtx src_base = XEXP (part[1][nparts - 1], 0);

      /* Compensate for the stack decrement by 4.  */
      if (!TARGET_64BIT && nparts == 3
          && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
        src_base = plus_constant (Pmode, src_base, 4);

      /* src_base refers to the stack pointer and is
         automatically decreased by emitted push.  */
      for (i = 0; i < nparts; i++)
        part[1][i] = change_address (part[1][i],
                                     GET_MODE (part[1][i]), src_base);
    }

  /* We need to do copy in the right order in case an address register
     of the source overlaps the destination.  */
  if (REG_P (part[0][0]) && MEM_P (part[1][0]))
    {
      for (i = 0; i < nparts; i++)
        {
          collisionparts[i]
            = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
          if (collisionparts[i])
            collisions++;
        }

      /* Collision in the middle part can be handled by reordering.  */
      if (collisions == 1 && nparts == 3 && collisionparts[1])
        {
          std::swap (part[0][1], part[0][2]);
          std::swap (part[1][1], part[1][2]);
        }
      else if (collisions == 1
               && nparts == 4
               && (collisionparts[1] || collisionparts[2]))
        {
          if (collisionparts[1])
            {
              std::swap (part[0][1], part[0][2]);
              std::swap (part[1][1], part[1][2]);
            }
          else
            {
              std::swap (part[0][2], part[0][3]);
              std::swap (part[1][2], part[1][3]);
            }
        }

      /* If there are more collisions, we can't handle it by reordering.
         Do an lea to the last part and use only one colliding move.  */
      else if (collisions > 1)
        {
          rtx base, addr;

          collisions = 1;

          base = part[0][nparts - 1];

          /* Handle the case when the last part isn't valid for lea.
             Happens in 64-bit mode storing the 12-byte XFmode.  */
          if (GET_MODE (base) != Pmode)
            base = gen_rtx_REG (Pmode, REGNO (base));

          addr = XEXP (part[1][0], 0);
          if (TARGET_TLS_DIRECT_SEG_REFS)
            {
              struct ix86_address parts;
              int ok = ix86_decompose_address (addr, &parts);
              gcc_assert (ok);
              /* It is not valid to use %gs: or %fs: in lea.  */
              gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
            }
          emit_insn (gen_rtx_SET (base, addr));
          part[1][0] = replace_equiv_address (part[1][0], base);
          for (i = 1; i < nparts; i++)
            {
              rtx tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
              part[1][i] = replace_equiv_address (part[1][i], tmp);
            }
        }
    }

  if (push)
    {
      if (!TARGET_64BIT)
        {
          if (nparts == 3)
            {
              if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
                emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
              emit_move_insn (part[0][2], part[1][2]);
            }
          else if (nparts == 4)
            {
              emit_move_insn (part[0][3], part[1][3]);
              emit_move_insn (part[0][2], part[1][2]);
            }
        }
      else
        {
          /* In 64bit mode we don't have 32bit push available.  In case this is
             register, it is OK - we will just use larger counterpart.  We also
             retype memory - these come from an attempt to avoid REX prefix on
             moving of second half of TFmode value.  */
          if (GET_MODE (part[1][1]) == SImode)
            {
              switch (GET_CODE (part[1][1]))
                {
                case MEM:
                  part[1][1] = adjust_address (part[1][1], DImode, 0);
                  break;

                case REG:
                  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
                  break;

                default:
                  gcc_unreachable ();
                }

              if (GET_MODE (part[1][0]) == SImode)
                part[1][0] = part[1][1];
            }
        }
      emit_move_insn (part[0][1], part[1][1]);
      emit_move_insn (part[0][0], part[1][0]);
      return;
    }

  /* Choose correct order to not overwrite the source before it is copied.  */
  if ((REG_P (part[0][0])
       && REG_P (part[1][1])
       && (REGNO (part[0][0]) == REGNO (part[1][1])
           || (nparts == 3
               && REGNO (part[0][0]) == REGNO (part[1][2]))
           || (nparts == 4
               && REGNO (part[0][0]) == REGNO (part[1][3]))))
      || (collisions > 0
          && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
    {
      for (i = 0, j = nparts - 1; i < nparts; i++, j--)
        {
          operands[2 + i] = part[0][j];
          operands[6 + i] = part[1][j];
        }
    }
  else
    {
      for (i = 0; i < nparts; i++)
        {
          operands[2 + i] = part[0][i];
          operands[6 + i] = part[1][i];
        }
    }

  /* If optimizing for size, attempt to locally unCSE nonzero constants.  */
  if (optimize_insn_for_size_p ())
    {
      for (j = 0; j < nparts - 1; j++)
        if (CONST_INT_P (operands[6 + j])
            && operands[6 + j] != const0_rtx
            && REG_P (operands[2 + j]))
          for (i = j; i < nparts - 1; i++)
            if (CONST_INT_P (operands[7 + i])
                && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
              operands[7 + i] = operands[2 + j];
    }

  for (i = 0; i < nparts; i++)
    emit_move_insn (operands[2 + i], operands[6 + i]);
}
/* Helper function of ix86_split_ashl used to generate an SImode/DImode
   left shift by a constant, either using a single shift or
   a sequence of add instructions.  */

static void
ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
{
  if (count == 1
      || (count * ix86_cost->add <= ix86_cost->shift_const
          && !optimize_insn_for_size_p ()))
    {
      while (count-- > 0)
        emit_insn (gen_add2_insn (operand, operand));
    }
  else
    {
      rtx (*insn)(rtx, rtx, rtx);

      insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
      emit_insn (insn (operand, operand, GEN_INT (count)));
    }
}
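/* The add sequence is profitable for tiny shift counts because
   "add reg, reg" doubles the value.  A scalar sketch of the idea, assuming
   COUNT is small enough that the adds are cheaper than one shift:

     static unsigned long
     shl_by_adds (unsigned long x, int count)
     {
       while (count-- > 0)
         x += x;                       // same effect as x <<= 1
       return x;
     }
*/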
void
ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashl3)(rtx, rtx, rtx);
  rtx (*gen_shld)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;
  machine_mode half_mode;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
        {
          emit_move_insn (high[0], low[1]);
          emit_move_insn (low[0], const0_rtx);

          if (count > half_width)
            ix86_expand_ashl_const (high[0], count - half_width, mode);
        }
      else
        {
          gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);

          emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
          ix86_expand_ashl_const (low[0], count, mode);
        }
      return;
    }

  split_double_mode (mode, operands, 1, low, high);
  half_mode = mode == DImode ? SImode : DImode;

  gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;

  if (operands[1] == const1_rtx)
    {
      /* Assuming we've chosen a QImode-capable register, then 1 << N
         can be done with two 32/64-bit shifts, no branches, no cmoves.  */
      if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
        {
          rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);

          ix86_expand_clear (low[0]);
          ix86_expand_clear (high[0]);
          emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));

          d = gen_lowpart (QImode, low[0]);
          d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
          s = gen_rtx_EQ (QImode, flags, const0_rtx);
          emit_insn (gen_rtx_SET (d, s));

          d = gen_lowpart (QImode, high[0]);
          d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
          s = gen_rtx_NE (QImode, flags, const0_rtx);
          emit_insn (gen_rtx_SET (d, s));
        }

      /* Otherwise, we can get the same results by manually performing
         a bit extract operation on bit 5/6, and then performing the two
         shifts.  The two methods of getting 0/1 into low/high are exactly
         the same size.  Avoiding the shift in the bit extract case helps
         pentium4 a bit; no one else seems to care much either way.  */
      else
        {
          rtx x;
          rtx (*gen_lshr3)(rtx, rtx, rtx);
          rtx (*gen_and3)(rtx, rtx, rtx);
          rtx (*gen_xor3)(rtx, rtx, rtx);
          HOST_WIDE_INT bits;

          if (mode == DImode)
            {
              gen_lshr3 = gen_lshrsi3;
              gen_and3 = gen_andsi3;
              gen_xor3 = gen_xorsi3;
              bits = 5;
            }
          else
            {
              gen_lshr3 = gen_lshrdi3;
              gen_and3 = gen_anddi3;
              gen_xor3 = gen_xordi3;
              bits = 6;
            }

          if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
            x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
          else
            x = gen_lowpart (half_mode, operands[2]);
          emit_insn (gen_rtx_SET (high[0], x));

          emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
          emit_insn (gen_and3 (high[0], high[0], const1_rtx));
          emit_move_insn (low[0], high[0]);
          emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
        }

      emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
      emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
      return;
    }

  if (operands[1] == constm1_rtx)
    {
      /* For -1 << N, we can avoid the shld instruction, because we
         know that we're shifting 0...31/63 ones into a -1.  */
      emit_move_insn (low[0], constm1_rtx);
      if (optimize_insn_for_size_p ())
        emit_move_insn (high[0], low[0]);
      else
        emit_move_insn (high[0], constm1_rtx);
    }
  else
    {
      gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

      if (!rtx_equal_p (operands[0], operands[1]))
        emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      emit_insn (gen_shld (high[0], low[0], operands[2]));
    }

  emit_insn (gen_ashl3 (low[0], low[0], operands[2]));

  if (TARGET_CMOVE && scratch)
    {
      ix86_expand_clear (scratch);
      emit_insn (gen_x86_shift_adj_1
                 (half_mode, high[0], low[0], operands[2], scratch));
    }
  else
    emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
}
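/* What the shld-based sequence computes is an ordinary double-word left
   shift built from two single-word values.  A scalar model using 32-bit
   halves (illustration only; counts >= the half width correspond to the
   "adjust" paths above):

     #include <stdint.h>

     static void
     dshift_left (uint32_t *lo, uint32_t *hi, unsigned count)
     {
       if (count >= 32)
         {
           *hi = *lo << (count - 32);                    // low word moves up
           *lo = 0;
         }
       else if (count > 0)
         {
           *hi = (*hi << count) | (*lo >> (32 - count)); // this is shld
           *lo <<= count;
         }
     }
*/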
void
ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count == GET_MODE_BITSIZE (mode) - 1)
        {
          emit_move_insn (high[0], high[1]);
          emit_insn (gen_ashr3 (high[0], high[0],
                                GEN_INT (half_width - 1)));
          emit_move_insn (low[0], high[0]);
        }
      else if (count >= half_width)
        {
          emit_move_insn (low[0], high[1]);
          emit_move_insn (high[0], low[0]);
          emit_insn (gen_ashr3 (high[0], high[0],
                                GEN_INT (half_width - 1)));

          if (count > half_width)
            emit_insn (gen_ashr3 (low[0], low[0],
                                  GEN_INT (count - half_width)));
        }
      else
        {
          gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);

          emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
          emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
        }
    }
  else
    {
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
        emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_ashr3 (high[0], high[0], operands[2]));

      if (TARGET_CMOVE && scratch)
        {
          emit_move_insn (scratch, high[0]);
          emit_insn (gen_ashr3 (scratch, scratch,
                                GEN_INT (half_width - 1)));
          emit_insn (gen_x86_shift_adj_1
                     (half_mode, low[0], high[0], operands[2], scratch));
        }
      else
        emit_insn (gen_x86_shift_adj_3
                   (half_mode, low[0], high[0], operands[2]));
    }
}
void
ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_lshr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
        {
          emit_move_insn (low[0], high[1]);
          ix86_expand_clear (high[0]);

          if (count > half_width)
            emit_insn (gen_lshr3 (low[0], low[0],
                                  GEN_INT (count - half_width)));
        }
      else
        {
          gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);

          emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
          emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
        }
    }
  else
    {
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
        emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_lshr3 (high[0], high[0], operands[2]));

      if (TARGET_CMOVE && scratch)
        {
          ix86_expand_clear (scratch);
          emit_insn (gen_x86_shift_adj_1
                     (half_mode, low[0], high[0], operands[2], scratch));
        }
      else
        emit_insn (gen_x86_shift_adj_2
                   (half_mode, low[0], high[0], operands[2]));
    }
}
/* Return mode for the memcpy/memset loop counter.  Prefer SImode over
   DImode for constant loop counts.  */
static machine_mode
counter_mode (rtx count_exp)
{
  if (GET_MODE (count_exp) != VOIDmode)
    return GET_MODE (count_exp);
  if (!CONST_INT_P (count_exp))
    return Pmode;
  if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
    return DImode;
  return SImode;
}
/* When ISSETMEM is FALSE, output a simple loop to move memory from SRCPTR
   to DESTPTR via chunks of MODE unrolled UNROLL times; the overall size is
   COUNT specified in bytes.  When ISSETMEM is TRUE, output the equivalent
   loop to set memory by VALUE (supposed to be in MODE).

   The size is rounded down to a whole number of chunks moved at once.
   SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info.  */
static void
expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
                               rtx destptr, rtx srcptr, rtx value,
                               rtx count, machine_mode mode, int unroll,
                               int expected_size, bool issetmem)
{
  rtx_code_label *out_label, *top_label;
  rtx iter, tmp;
  machine_mode iter_mode = counter_mode (count);
  int piece_size_n = GET_MODE_SIZE (mode) * unroll;
  rtx piece_size = GEN_INT (piece_size_n);
  rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
  rtx size;
  int i;

  top_label = gen_label_rtx ();
  out_label = gen_label_rtx ();
  iter = gen_reg_rtx (iter_mode);

  size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
                              NULL, 1, OPTAB_DIRECT);
  /* Those two should combine.  */
  if (piece_size == const1_rtx)
    {
      emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
                               true, out_label);
      predict_jump (REG_BR_PROB_BASE * 10 / 100);
    }
  emit_move_insn (iter, const0_rtx);

  emit_label (top_label);

  tmp = convert_modes (Pmode, iter_mode, iter, true);

  /* This assert could be relaxed - in this case we'll need to compute
     smallest power of two, containing in PIECE_SIZE_N and pass it to
     offset_address.  */
  gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
  destmem = offset_address (destmem, tmp, piece_size_n);
  destmem = adjust_address (destmem, mode, 0);

  if (!issetmem)
    {
      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
      srcmem = adjust_address (srcmem, mode, 0);

      /* When unrolling for chips that reorder memory reads and writes,
         we can save registers by using single temporary.
         Also using 4 temporaries is overkill in 32bit mode.  */
      if (!TARGET_64BIT && 0)
        {
          for (i = 0; i < unroll; i++)
            {
              if (i)
                {
                  destmem = adjust_address (copy_rtx (destmem), mode,
                                            GET_MODE_SIZE (mode));
                  srcmem = adjust_address (copy_rtx (srcmem), mode,
                                           GET_MODE_SIZE (mode));
                }
              emit_move_insn (destmem, srcmem);
            }
        }
      else
        {
          rtx tmpreg[4];
          gcc_assert (unroll <= 4);
          for (i = 0; i < unroll; i++)
            {
              tmpreg[i] = gen_reg_rtx (mode);
              if (i)
                srcmem = adjust_address (copy_rtx (srcmem), mode,
                                         GET_MODE_SIZE (mode));
              emit_move_insn (tmpreg[i], srcmem);
            }
          for (i = 0; i < unroll; i++)
            {
              if (i)
                destmem = adjust_address (copy_rtx (destmem), mode,
                                          GET_MODE_SIZE (mode));
              emit_move_insn (destmem, tmpreg[i]);
            }
        }
    }
  else
    for (i = 0; i < unroll; i++)
      {
        if (i)
          destmem = adjust_address (copy_rtx (destmem), mode,
                                    GET_MODE_SIZE (mode));
        emit_move_insn (destmem, value);
      }

  tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
                             true, OPTAB_LIB_WIDEN);
  if (tmp != iter)
    emit_move_insn (iter, tmp);

  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
                           true, top_label);
  if (expected_size != -1)
    {
      expected_size /= GET_MODE_SIZE (mode) * unroll;
      if (expected_size == 0)
        predict_jump (0);
      else if (expected_size > REG_BR_PROB_BASE)
        predict_jump (REG_BR_PROB_BASE - 1);
      else
        predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
                      / expected_size);
    }
  else
    predict_jump (REG_BR_PROB_BASE * 80 / 100);
  iter = ix86_zero_extend_to_Pmode (iter);
  tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
                             true, OPTAB_LIB_WIDEN);
  if (tmp != destptr)
    emit_move_insn (destptr, tmp);
  if (!issetmem)
    {
      tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
                                 true, OPTAB_LIB_WIDEN);
      if (tmp != srcptr)
        emit_move_insn (srcptr, tmp);
    }
  emit_label (out_label);
}
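/* The generated loop has roughly this shape.  A byte-level sketch of the
   single-register, UNROLL = 1, MODE = SImode instance; names (dst, src,
   count) are invented for the illustration:

     size_t size = count & ~(size_t) 3;   // round down to whole chunks
     if (size != 0)
       {
         size_t iter = 0;
         do
           {
             *(int *) (dst + iter) = *(const int *) (src + iter);
             iter += 4;
           }
         while (iter < size);
         dst += iter;
         src += iter;
       }
*/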
/* Divide COUNTREG by SCALE.  */
static rtx
scale_counter (rtx countreg, int scale)
{
  rtx sc;

  if (scale == 1)
    return countreg;
  if (CONST_INT_P (countreg))
    return GEN_INT (INTVAL (countreg) / scale);
  gcc_assert (REG_P (countreg));

  sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
                            GEN_INT (exact_log2 (scale)),
                            NULL, 1, OPTAB_DIRECT);
  return sc;
}
/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
   When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
   When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
   For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
   ORIG_VALUE is the original value passed to memset to fill the memory with.
   Other arguments have the same meaning as for the previous function.  */
static void
expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
                              rtx destptr, rtx srcptr, rtx value, rtx orig_value,
                              rtx count, machine_mode mode, bool issetmem)
{
  rtx destexp;
  rtx srcexp;
  rtx countreg;
  HOST_WIDE_INT rounded_count;

  /* If possible, it is shorter to use rep movs.
     TODO: Maybe it is better to move this logic to decide_alg.  */
  if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
      && (!issetmem || orig_value == const0_rtx))
    mode = SImode;

  if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
    destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);

  countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
                                                       GET_MODE_SIZE (mode)));
  if (mode != QImode)
    {
      destexp = gen_rtx_ASHIFT (Pmode, countreg,
                                GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
      destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
    }
  else
    destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
  if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
    {
      rounded_count
        = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
      destmem = shallow_copy_rtx (destmem);
      set_mem_size (destmem, rounded_count);
    }
  else if (MEM_SIZE_KNOWN_P (destmem))
    clear_mem_size (destmem);

  if (issetmem)
    {
      value = force_reg (mode, gen_lowpart (mode, value));
      emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
    }
  else
    {
      if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
        srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
      if (mode != QImode)
        {
          srcexp = gen_rtx_ASHIFT (Pmode, countreg,
                                   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
          srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
        }
      else
        srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
      if (CONST_INT_P (count))
        {
          rounded_count
            = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
          srcmem = shallow_copy_rtx (srcmem);
          set_mem_size (srcmem, rounded_count);
        }
      else if (MEM_SIZE_KNOWN_P (srcmem))
        clear_mem_size (srcmem);
      emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
                              srcexp, destexp));
    }
}
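/* A "rep movs" of N chunks behaves like this scalar loop; the expander only
   has to divide the byte count by the chunk size and describe the final
   pointer values (DESTEXP/SRCEXP) to the RTL.  Sketch of the SImode case,
   with invented names dst, src and nbytes:

     char *d = dst;
     const char *s = src;
     for (size_t i = 0; i < nbytes / 4; i++)   // ecx = nbytes / 4
       {
         *(int *) d = *(const int *) s;        // one movsd step
         d += 4;
         s += 4;
       }
*/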
/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
   DESTMEM.
   SRC is passed by pointer to be updated on return.
   Return value is updated DST.  */
static rtx
emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
             HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem, src = *srcmem, adjust, tempreg;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  piece_size = 1 << floor_log2 (size_to_move);
  while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
         || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
    {
      gcc_assert (piece_size > 1);
      piece_size >>= 1;
    }

  /* Find the corresponding vector mode with the same size as MOVE_MODE.
     MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
  if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
    {
      int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
      if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
          || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
        {
          move_mode = word_mode;
          piece_size = GET_MODE_SIZE (move_mode);
          code = optab_handler (mov_optab, move_mode);
        }
    }
  gcc_assert (code != CODE_FOR_nothing);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
  gcc_assert (size_to_move % piece_size == 0);
  adjust = GEN_INT (piece_size);
  for (i = 0; i < size_to_move; i += piece_size)
    {
      /* We move from memory to memory, so we'll need to do it via
         a temporary register.  */
      tempreg = gen_reg_rtx (move_mode);
      emit_insn (GEN_FCN (code) (tempreg, src));
      emit_insn (GEN_FCN (code) (dst, tempreg));

      emit_move_insn (destptr,
                      gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
      emit_move_insn (srcptr,
                      gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
                                          piece_size);
      src = adjust_automodify_address_nv (src, move_mode, srcptr,
                                          piece_size);
    }

  /* Update DST and SRC rtx.  */
  *srcmem = src;
  return dst;
}
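/* The move width above is the largest power of two not exceeding the number
   of bytes left, halved until the target supports a plain move in that mode.
   The first step is just floor_log2.  A scalar sketch (invented name):

     static int
     initial_piece_size (int size_to_move)    // size_to_move > 0
     {
       int piece = 1;
       while (piece * 2 <= size_to_move)
         piece *= 2;                          // 1 << floor_log2 (size_to_move)
       return piece;
     }
*/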
/* Helper function for the string operations below.  Test VARIABLE for
   whether it is aligned to VALUE bytes.  If so, jump to the label.  */

static rtx_code_label *
ix86_expand_aligntest (rtx variable, int value, bool epilogue)
{
  rtx_code_label *label = gen_label_rtx ();
  rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
  if (GET_MODE (variable) == DImode)
    emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
  else
    emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
  emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
                           1, label);
  if (epilogue)
    predict_jump (REG_BR_PROB_BASE * 50 / 100);
  else
    predict_jump (REG_BR_PROB_BASE * 90 / 100);
  return label;
}
/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
static void
expand_cpymem_epilogue (rtx destmem, rtx srcmem,
                        rtx destptr, rtx srcptr, rtx count, int max_size)
{
  rtx src, dest;

  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
         relaxed, but it'll require a bit more complicated epilogue
         expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
        {
          if (epilogue_size & i)
            destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
        }
      return;
    }
  if (max_size > 8)
    {
      count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
                                   count, 1, OPTAB_DIRECT);
      expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
                                     count, QImode, 1, 4, false);
      return;
    }

  /* When there are stringops, we can cheaply increase dest and src pointers.
     Otherwise we save code size by maintaining offset (zero is readily
     available from preceding rep operation) and using x86 addressing modes.  */
  if (TARGET_SINGLE_STRINGOP)
    {
      if (max_size > 4)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
          src = change_address (srcmem, SImode, srcptr);
          dest = change_address (destmem, SImode, destptr);
          emit_insn (gen_strmov (destptr, dest, srcptr, src));
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 2)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
          src = change_address (srcmem, HImode, srcptr);
          dest = change_address (destmem, HImode, destptr);
          emit_insn (gen_strmov (destptr, dest, srcptr, src));
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 1)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
          src = change_address (srcmem, QImode, srcptr);
          dest = change_address (destmem, QImode, destptr);
          emit_insn (gen_strmov (destptr, dest, srcptr, src));
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
    }
  else
    {
      rtx offset = force_reg (Pmode, const0_rtx);
      rtx tmp;

      if (max_size > 4)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
          src = change_address (srcmem, SImode, srcptr);
          dest = change_address (destmem, SImode, destptr);
          emit_move_insn (dest, src);
          tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
                                     true, OPTAB_LIB_WIDEN);
          if (tmp != offset)
            emit_move_insn (offset, tmp);
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 2)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
          tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
          src = change_address (srcmem, HImode, tmp);
          tmp = gen_rtx_PLUS (Pmode, destptr, offset);
          dest = change_address (destmem, HImode, tmp);
          emit_move_insn (dest, src);
          tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
                                     true, OPTAB_LIB_WIDEN);
          if (tmp != offset)
            emit_move_insn (offset, tmp);
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 1)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
          tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
          src = change_address (srcmem, QImode, tmp);
          tmp = gen_rtx_PLUS (Pmode, destptr, offset);
          dest = change_address (destmem, QImode, tmp);
          emit_move_insn (dest, src);
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
    }
}
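/* With a known byte count the epilogue walks the bits of COUNT % MAX_SIZE
   from the largest power of two down, so at most one move of each size is
   emitted.  A scalar sketch (illustration only, names invented):

     #include <stddef.h>
     #include <string.h>

     static void
     copy_tail (char *dst, const char *src, size_t count, size_t max_size)
     {
       size_t rem = count % max_size;          // max_size is a power of two
       for (size_t piece = max_size; piece >= 1; piece >>= 1)
         if (rem & piece)
           {
             memcpy (dst, src, piece);
             dst += piece;
             src += piece;
           }
     }
*/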
/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
   with value PROMOTED_VAL.
   Return value is updated DST.  */
static rtx
emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
             HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem, adjust;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  move_mode = GET_MODE (promoted_val);
  if (move_mode == VOIDmode)
    move_mode = QImode;
  if (size_to_move < GET_MODE_SIZE (move_mode))
    {
      unsigned int move_bits = size_to_move * BITS_PER_UNIT;
      move_mode = int_mode_for_size (move_bits, 0).require ();
      promoted_val = gen_lowpart (move_mode, promoted_val);
    }
  piece_size = GET_MODE_SIZE (move_mode);
  code = optab_handler (mov_optab, move_mode);
  gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
  gcc_assert (size_to_move % piece_size == 0);
  adjust = GEN_INT (piece_size);
  for (i = 0; i < size_to_move; i += piece_size)
    {
      if (piece_size <= GET_MODE_SIZE (word_mode))
        {
          emit_insn (gen_strset (destptr, dst, promoted_val));
          dst = adjust_automodify_address_nv (dst, move_mode, destptr,
                                              piece_size);
          continue;
        }

      emit_insn (GEN_FCN (code) (dst, promoted_val));

      emit_move_insn (destptr,
                      gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
                                          piece_size);
    }

  /* Update DST rtx.  */
  return dst;
}
/* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
static void
expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
                                 rtx count, int max_size)
{
  count = expand_simple_binop (counter_mode (count), AND, count,
                               GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
  expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
                                 gen_lowpart (QImode, value), count, QImode,
                                 1, max_size / 2, true);
}
/* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
static void
expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
                        rtx count, int max_size)
{
  rtx dest;

  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
         relaxed, but it'll require a bit more complicated epilogue
         expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
        {
          if (epilogue_size & i)
            {
              if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
                destmem = emit_memset (destmem, destptr, vec_value, i);
              else
                destmem = emit_memset (destmem, destptr, value, i);
            }
        }
      return;
    }
  if (max_size > 32)
    {
      expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
      return;
    }
  if (max_size > 16)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
      if (TARGET_64BIT)
        {
          dest = change_address (destmem, DImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
          emit_insn (gen_strset (destptr, dest, value));
        }
      else
        {
          dest = change_address (destmem, SImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
          emit_insn (gen_strset (destptr, dest, value));
        }
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 8)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
      if (TARGET_64BIT)
        {
          dest = change_address (destmem, DImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
        }
      else
        {
          dest = change_address (destmem, SImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
          emit_insn (gen_strset (destptr, dest, value));
        }
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 4)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
      dest = change_address (destmem, SImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 2)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
      dest = change_address (destmem, HImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 1)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
      dest = change_address (destmem, QImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
}
/* Adjust COUNTER by the VALUE.  */
static void
ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
{
  emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
}
/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
   DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
   Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
   ignored.
   Return value is updated DESTMEM.  */

static rtx
expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
                               rtx destptr, rtx srcptr, rtx value,
                               rtx vec_value, rtx count, int align,
                               int desired_alignment, bool issetmem)
{
  int i;
  for (i = 1; i < desired_alignment; i <<= 1)
    {
      if (align <= i)
        {
          rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
          if (issetmem)
            {
              if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
                destmem = emit_memset (destmem, destptr, vec_value, i);
              else
                destmem = emit_memset (destmem, destptr, value, i);
            }
          else
            destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
          ix86_adjust_counter (count, i);
          emit_label (label);
          LABEL_NUSES (label) = 1;
          set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
        }
    }
  return destmem;
}
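/* The prologue tests the destination address one bit at a time, copying
   1, 2, 4, ... bytes until DESIRED_ALIGNMENT is reached.  A scalar sketch
   of the emitted behaviour (names invented for illustration):

     #include <stdint.h>
     #include <string.h>

     static char *
     align_dest (char *dst, const char **src, size_t *count, int desired)
     {
       for (int i = 1; i < desired; i <<= 1)
         if ((uintptr_t) dst & i)
           {
             memcpy (dst, *src, i);
             dst += i;
             *src += i;
             *count -= i;
           }
       return dst;
     }
*/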
/* Test if COUNT&SIZE is nonzero and if so, expand a cpymem
   or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
   and jump to DONE_LABEL.  */
static void
expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
                               rtx destptr, rtx srcptr,
                               rtx value, rtx vec_value,
                               rtx count, int size,
                               rtx done_label, bool issetmem)
{
  rtx_code_label *label = ix86_expand_aligntest (count, size, false);
  machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
  rtx modesize;
  int n;

  /* If we do not have vector value to copy, we must reduce size.  */
  if (issetmem)
    {
      if (!vec_value)
        {
          if (GET_MODE (value) == VOIDmode && size > 8)
            mode = Pmode;
          else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
            mode = GET_MODE (value);
        }
      else
        mode = GET_MODE (vec_value), value = vec_value;
    }
  else
    {
      /* Choose appropriate vector mode.  */
      if (size >= 32)
        mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
      else if (size >= 16)
        mode = TARGET_SSE ? V16QImode : DImode;
      srcmem = change_address (srcmem, mode, srcptr);
    }
  destmem = change_address (destmem, mode, destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  gcc_assert (GET_MODE_SIZE (mode) <= size);
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
        emit_move_insn (destmem, gen_lowpart (mode, value));
      else
        {
          emit_move_insn (destmem, srcmem);
          srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
        }
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }

  destmem = offset_address (destmem, count, 1);
  destmem = offset_address (destmem, GEN_INT (-2 * size),
                            GET_MODE_SIZE (mode));
  if (!issetmem)
    {
      srcmem = offset_address (srcmem, count, 1);
      srcmem = offset_address (srcmem, GEN_INT (-2 * size),
                               GET_MODE_SIZE (mode));
    }
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
        emit_move_insn (destmem, gen_lowpart (mode, value));
      else
        {
          emit_move_insn (destmem, srcmem);
          srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
        }
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }
  emit_jump_insn (gen_jump (done_label));
  emit_barrier ();

  emit_label (label);
  LABEL_NUSES (label) = 1;
}
/* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
   and get ready for the main memcpy loop by copying the initial
   DESIRED_ALIGN-ALIGN bytes and the last SIZE bytes, adjusting
   DESTPTR/SRCPTR/COUNT in a way we can proceed with a loop copying SIZE bytes
   at once.  Do moves in MODE.
   DONE_LABEL is a label after the whole copying sequence.  The label is created
   on demand if *DONE_LABEL is NULL.
   MIN_SIZE is minimal size of block copied.  This value gets adjusted for new
   bounds after the initial copies.

   DESTMEM/SRCMEM are memory expressions pointing to the copied block,
   DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
   we will dispatch to a library call for large blocks.

   In pseudocode we do:

   if (COUNT < SIZE)
     {
       Assume that SIZE is 4.  Bigger sizes are handled analogously.
       if (COUNT & 4)
         {
           copy 4 bytes from SRCPTR to DESTPTR
           copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
           goto done_label
         }
       if (!COUNT)
         goto done_label;
       copy 1 byte from SRCPTR to DESTPTR
       if (COUNT & 2)
         {
           copy 2 bytes from SRCPTR to DESTPTR
           copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
         }
     }
   else
     {
       copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
       copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE

       OLD_DESTPTR = DESTPTR;
       Align DESTPTR up to DESIRED_ALIGN
       SRCPTR += DESTPTR - OLD_DESTPTR
       COUNT -= DESTPTR - OLD_DESTPTR
       if (DYNAMIC_CHECK)
         Round COUNT down to multiple of SIZE
       << optional caller supplied zero size guard is here >>
       << optional caller supplied dynamic check is here >>
       << caller supplied main copy loop is here >>
     }
   done_label:
  */
static void
expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
                                                            rtx *destptr, rtx *srcptr,
                                                            machine_mode mode,
                                                            rtx value, rtx vec_value,
                                                            rtx *count,
                                                            rtx_code_label **done_label,
                                                            int size, int desired_align,
                                                            int align,
                                                            unsigned HOST_WIDE_INT *min_size,
                                                            bool dynamic_check,
                                                            bool issetmem)
{
  rtx_code_label *loop_label = NULL, *label;
  int n;
  rtx modesize;
  int prolog_size = 0;
  rtx mode_value;

  /* Choose proper value to copy.  */
  if (issetmem && VECTOR_MODE_P (mode))
    mode_value = vec_value;
  else
    mode_value = value;
  gcc_assert (GET_MODE_SIZE (mode) <= size);

  /* See if block is big or small, handle small blocks.  */
  if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT) size)
    {
      int size2 = size;
      loop_label = gen_label_rtx ();

      if (!*done_label)
        *done_label = gen_label_rtx ();

      emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
                               1, loop_label);
      size2 >>= 1;

      /* Handle sizes > 3.  */
      for (; size2 > 2; size2 >>= 1)
        expand_small_cpymem_or_setmem (destmem, srcmem,
                                       *destptr, *srcptr,
                                       value, vec_value,
                                       *count,
                                       size2, *done_label, issetmem);
      /* Nothing to copy?  Jump to DONE_LABEL if so.  */
      emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
                               1, *done_label);

      /* Do a byte copy.  */
      destmem = change_address (destmem, QImode, *destptr);
      if (issetmem)
        emit_move_insn (destmem, gen_lowpart (QImode, value));
      else
        {
          srcmem = change_address (srcmem, QImode, *srcptr);
          emit_move_insn (destmem, srcmem);
        }

      /* Handle sizes 2 and 3.  */
      label = ix86_expand_aligntest (*count, 2, false);
      destmem = change_address (destmem, HImode, *destptr);
      destmem = offset_address (destmem, *count, 1);
      destmem = offset_address (destmem, GEN_INT (-2), 2);
      if (issetmem)
        emit_move_insn (destmem, gen_lowpart (HImode, value));
      else
        {
          srcmem = change_address (srcmem, HImode, *srcptr);
          srcmem = offset_address (srcmem, *count, 1);
          srcmem = offset_address (srcmem, GEN_INT (-2), 2);
          emit_move_insn (destmem, srcmem);
        }

      emit_label (label);
      LABEL_NUSES (label) = 1;
      emit_jump_insn (gen_jump (*done_label));
      emit_barrier ();
    }
  else
    gcc_assert (*min_size >= (unsigned HOST_WIDE_INT) size
                || UINTVAL (*count) >= (unsigned HOST_WIDE_INT) size);

  /* Start memcpy for COUNT >= SIZE.  */
  if (loop_label)
    {
      emit_label (loop_label);
      LABEL_NUSES (loop_label) = 1;
    }

  /* Copy first desired_align bytes.  */
  if (!issetmem)
    srcmem = change_address (srcmem, mode, *srcptr);
  destmem = change_address (destmem, mode, *destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  for (n = 0; prolog_size < desired_align - align; n++)
    {
      if (issetmem)
        emit_move_insn (destmem, mode_value);
      else
        {
          emit_move_insn (destmem, srcmem);
          srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
        }
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
      prolog_size += GET_MODE_SIZE (mode);
    }

  /* Copy last SIZE bytes.  */
  destmem = offset_address (destmem, *count, 1);
  destmem = offset_address (destmem,
                            GEN_INT (-size - prolog_size),
                            1);
  if (issetmem)
    emit_move_insn (destmem, mode_value);
  else
    {
      srcmem = offset_address (srcmem, *count, 1);
      srcmem = offset_address (srcmem,
                               GEN_INT (-size - prolog_size),
                               1);
      emit_move_insn (destmem, srcmem);
    }
  for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
    {
      destmem = offset_address (destmem, modesize, 1);
      if (issetmem)
        emit_move_insn (destmem, mode_value);
      else
        {
          srcmem = offset_address (srcmem, modesize, 1);
          emit_move_insn (destmem, srcmem);
        }
    }

  /* Align destination.  */
  if (desired_align > 1 && desired_align > align)
    {
      rtx saveddest = *destptr;

      gcc_assert (desired_align <= size);
      /* Align destptr up, place it to new register.  */
      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
                                      GEN_INT (prolog_size),
                                      NULL_RTX, 1, OPTAB_DIRECT);
      if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
        REG_POINTER (*destptr) = 1;
      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
                                      GEN_INT (-desired_align),
                                      *destptr, 1, OPTAB_DIRECT);
      /* See how many bytes we skipped.  */
      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
                                       *destptr,
                                       saveddest, 1, OPTAB_DIRECT);
      /* Adjust srcptr and count.  */
      if (!issetmem)
        *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
                                       saveddest, *srcptr, 1, OPTAB_DIRECT);
      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
                                    saveddest, *count, 1, OPTAB_DIRECT);
      /* We copied at most size + prolog_size.  */
      if (*min_size > (unsigned HOST_WIDE_INT) (size + prolog_size))
        *min_size
          = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT) size);
      else
        *min_size = 0;

      /* Our loops always round down the block size, but for dispatch to
         library we need precise value.  */
      if (dynamic_check)
        *count = expand_simple_binop (GET_MODE (*count), AND, *count,
                                      GEN_INT (-size), *count, 1, OPTAB_DIRECT);
    }
  else
    {
      gcc_assert (prolog_size == 0);
      /* Decrease count, so we won't end up copying last word twice.  */
      if (!CONST_INT_P (*count))
        *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
                                      constm1_rtx, *count, 1, OPTAB_DIRECT);
      else
        *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
                                      (unsigned HOST_WIDE_INT) size));
      if (*min_size)
        *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT) size);
    }
}
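/* The misaligned prologue/epilogue relies on the fact that for
   COUNT >= SIZE it is safe to copy the first SIZE bytes and the last SIZE
   bytes even when the two ranges overlap; after that, only the middle,
   now aligned, part remains for the main loop.  A memcpy sketch assuming
   SIZE = 16 and invented names dst, src, count:

     memcpy (dst, src, 16);                            // head
     memcpy (dst + count - 16, src + count - 16, 16);  // tail, may overlap head
     // Round DST up to the desired alignment, adjust SRC and COUNT by the
     // number of bytes skipped, and let the main loop copy whole 16-byte
     // blocks in between.
*/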
/* This function is like the previous one, except here we know how many bytes
   need to be copied.  That allows us to update alignment not only of DST, which
   is returned, but also of SRC, which is passed as a pointer for that
   reason.  */
static rtx
expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
                                        rtx srcreg, rtx value, rtx vec_value,
                                        int desired_align, int align_bytes,
                                        bool issetmem)
{
  rtx src = NULL;
  rtx orig_dst = dst;
  rtx orig_src = NULL;
  int piece_size = 1;
  int copied_bytes = 0;

  if (!issetmem)
    {
      gcc_assert (srcp != NULL);
      src = *srcp;
      orig_src = src;
    }

  for (piece_size = 1;
       piece_size <= desired_align && copied_bytes < align_bytes;
       piece_size <<= 1)
    {
      if (align_bytes & piece_size)
        {
          if (issetmem)
            {
              if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
                dst = emit_memset (dst, destreg, vec_value, piece_size);
              else
                dst = emit_memset (dst, destreg, value, piece_size);
            }
          else
            dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
          copied_bytes += piece_size;
        }
    }
  if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
    set_mem_align (dst, desired_align * BITS_PER_UNIT);
  if (MEM_SIZE_KNOWN_P (orig_dst))
    set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);

  if (!issetmem)
    {
      int src_align_bytes = get_mem_align_offset (src, desired_align
                                                       * BITS_PER_UNIT);
      if (src_align_bytes >= 0)
        src_align_bytes = desired_align - src_align_bytes;
      if (src_align_bytes >= 0)
        {
          unsigned int src_align;
          for (src_align = desired_align; src_align >= 2; src_align >>= 1)
            {
              if ((src_align_bytes & (src_align - 1))
                  == (align_bytes & (src_align - 1)))
                break;
            }
          if (src_align > (unsigned int) desired_align)
            src_align = desired_align;
          if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
            set_mem_align (src, src_align * BITS_PER_UNIT);
        }
      if (MEM_SIZE_KNOWN_P (orig_src))
        set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
      *srcp = src;
    }

  return dst;
}
/* Return true if ALG can be used in current context.
   Assume we expand memset if MEMSET is true.  */
static bool
alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
{
  if (alg == no_stringop)
    return false;
  if (alg == vector_loop)
    return TARGET_SSE || TARGET_AVX;
  /* Algorithms using the rep prefix want at least edi and ecx;
     additionally, memset wants eax and memcpy wants esi.  Don't
     consider such algorithms if the user has appropriated those
     registers for their own purposes, or if we have a non-default
     address space, since some string insns cannot override the segment.  */
  if (alg == rep_prefix_1_byte
      || alg == rep_prefix_4_byte
      || alg == rep_prefix_8_byte)
    {
      if (have_as)
        return false;
      if (fixed_regs[CX_REG]
          || fixed_regs[DI_REG]
          || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
        return false;
    }
  return true;
}
/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
static enum stringop_alg
decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
            unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
            bool memset, bool zero_memset, bool have_as,
            int *dynamic_check, bool *noalign, bool recur)
{
  const struct stringop_algs *algs;
  bool optimize_for_speed;
  int max = 0;
  const struct processor_costs *cost;
  int i;
  bool any_alg_usable_p = false;

  *noalign = false;
  *dynamic_check = -1;

  /* Even if the string operation call is cold, we still might spend a lot
     of time processing large blocks.  */
  if (optimize_function_for_size_p (cfun)
      || (optimize_insn_for_size_p ()
          && (max_size < 256
              || (expected_size != -1 && expected_size < 256))))
    optimize_for_speed = false;
  else
    optimize_for_speed = true;

  cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
  if (memset)
    algs = &cost->memset[TARGET_64BIT != 0];
  else
    algs = &cost->memcpy[TARGET_64BIT != 0];

  /* See maximal size for user defined algorithm.  */
  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    {
      enum stringop_alg candidate = algs->size[i].alg;
      bool usable = alg_usable_p (candidate, memset, have_as);
      any_alg_usable_p |= usable;

      if (candidate != libcall && candidate && usable)
        max = algs->size[i].max;
    }

  /* If expected size is not known but max size is small enough
     so inline version is a win, set expected size into
     the range.  */
  if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
      && expected_size == -1)
    expected_size = min_size / 2 + max_size / 2;

  /* If user specified the algorithm, honor it if possible.  */
  if (ix86_stringop_alg != no_stringop
      && alg_usable_p (ix86_stringop_alg, memset, have_as))
    return ix86_stringop_alg;
  /* rep; movq or rep; movl is the smallest variant.  */
  else if (!optimize_for_speed)
    {
      *noalign = true;
      if (!count || (count & 3) || (memset && !zero_memset))
        return alg_usable_p (rep_prefix_1_byte, memset, have_as)
               ? rep_prefix_1_byte : loop_1_byte;
      else
        return alg_usable_p (rep_prefix_4_byte, memset, have_as)
               ? rep_prefix_4_byte : loop;
    }
  /* Very tiny blocks are best handled via the loop, REP is expensive to
     set up.  */
  else if (expected_size != -1 && expected_size < 4)
    return loop_1_byte;
  else if (expected_size != -1)
    {
      enum stringop_alg alg = libcall;
      bool alg_noalign = false;
      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
        {
          /* We get here if the algorithms that were not libcall-based
             were rep-prefix based and we are unable to use rep prefixes
             based on global register usage.  Break out of the loop and
             use the heuristic below.  */
          if (algs->size[i].max == 0)
            break;
          if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
            {
              enum stringop_alg candidate = algs->size[i].alg;

              if (candidate != libcall
                  && alg_usable_p (candidate, memset, have_as))
                {
                  alg = candidate;
                  alg_noalign = algs->size[i].noalign;
                }
              /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
                 last non-libcall inline algorithm.  */
              if (TARGET_INLINE_ALL_STRINGOPS)
                {
                  /* When the current size is best to be copied by a libcall,
                     but we are still forced to inline, run the heuristic below
                     that will pick code for medium sized blocks.  */
                  if (alg != libcall)
                    {
                      *noalign = alg_noalign;
                      return alg;
                    }
                  else if (!any_alg_usable_p)
                    break;
                }
              else if (alg_usable_p (candidate, memset, have_as))
                {
                  *noalign = algs->size[i].noalign;
                  return candidate;
                }
            }
        }
    }
  /* When asked to inline the call anyway, try to pick meaningful choice.
     We look for maximal size of block that is faster to copy by hand and
     take blocks of at most of that size guessing that average size will
     be roughly half of the block.

     If this turns out to be bad, we might simply specify the preferred
     choice in ix86_costs.  */
  if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
      && (algs->unknown_size == libcall
          || !alg_usable_p (algs->unknown_size, memset, have_as)))
    {
      enum stringop_alg alg;
      HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;

      /* If there aren't any usable algorithms or if recursing already,
         then recursing on smaller sizes or same size isn't going to
         find anything.  Just return the simple byte-at-a-time copy loop.  */
      if (!any_alg_usable_p || recur)
        {
          /* Pick something reasonable.  */
          if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
            *dynamic_check = 128;
          return loop_1_byte;
        }
      alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
                        zero_memset, have_as, dynamic_check, noalign, true);
      gcc_assert (*dynamic_check == -1);
      if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
        *dynamic_check = max;
      else
        gcc_assert (alg != libcall);
      return alg;
    }
  return (alg_usable_p (algs->unknown_size, memset, have_as)
          ? algs->unknown_size : libcall);
}
/* Decide on alignment.  We know that the operand is already aligned to ALIGN
   (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
static int
decide_alignment (int align,
                  enum stringop_alg alg,
                  int expected_size,
                  machine_mode move_mode)
{
  int desired_align = 0;

  gcc_assert (alg != no_stringop);
  if (move_mode == VOIDmode)
    return align;

  desired_align = GET_MODE_SIZE (move_mode);
  /* PentiumPro has special logic triggering for 8 byte aligned blocks,
     copying whole cacheline at once.  */
  if (TARGET_PENTIUMPRO
      && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
    desired_align = 8;

  if (desired_align < align)
    desired_align = align;
  if (expected_size != -1 && expected_size < 4)
    desired_align = align;

  return desired_align;
}
/* Helper function for memcpy.  For QImode value 0xXY produce
   0xXYXYXYXY of the width specified by MODE.  This is essentially
   a * 0x01010101, but we can do slightly better than
   synth_mult by unwinding the sequence by hand on CPUs with
   slow multiply.  */
static rtx
promote_duplicated_reg (machine_mode mode, rtx val)
{
  machine_mode valmode = GET_MODE (val);
  rtx tmp;
  int nops = mode == DImode ? 3 : 2;

  gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
  if (val == const0_rtx)
    return copy_to_mode_reg (mode, CONST0_RTX (mode));
  if (CONST_INT_P (val))
    {
      HOST_WIDE_INT v = INTVAL (val) & 255;

      v |= v << 8;
      v |= v << 16;
      if (mode == DImode)
        v |= (v << 16) << 16;
      return copy_to_mode_reg (mode, gen_int_mode (v, mode));
    }
  else
    {
      if (valmode == VOIDmode)
        valmode = QImode;
      if (valmode != QImode)
        val = gen_lowpart (QImode, val);
      if (!TARGET_PARTIAL_REG_STALL)
        nops--;
      if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
          + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
          <= (ix86_cost->shift_const + ix86_cost->add) * nops
             + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
        {
          rtx reg = convert_modes (mode, QImode, val, true);
          tmp = promote_duplicated_reg (mode, const1_rtx);
          return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
                                      OPTAB_DIRECT);
        }
      else
        {
          rtx reg = convert_modes (mode, QImode, val, true);

          if (!TARGET_PARTIAL_REG_STALL)
            {
              if (mode == SImode)
                emit_insn (gen_insvsi_1 (reg, reg));
              else
                emit_insn (gen_insvdi_1 (reg, reg));
            }
          else
            {
              tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
                                         NULL, 1, OPTAB_DIRECT);
              reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
                                         OPTAB_DIRECT);
            }
          tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
                                     NULL, 1, OPTAB_DIRECT);
          reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
          if (mode == DImode)
            {
              tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
                                         NULL, 1, OPTAB_DIRECT);
              reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
                                         OPTAB_DIRECT);
            }
          return reg;
        }
    }
}
/* Duplicate value VAL using promote_duplicated_reg into the maximal size
   that will be needed by the main loop copying SIZE_NEEDED chunks and by
   the prologue getting alignment from ALIGN to DESIRED_ALIGN.  */
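/* For example, when the main loop copies 8-byte chunks, or the prologue must
   reach an 8-byte DESIRED_ALIGN from a smaller ALIGN, the value is (on
   64-bit targets) broadcast all the way to DImode below; a plain 4-byte loop
   with no extra alignment work only needs the SImode broadcast.  */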
7141 promote_duplicated_reg_to_size (rtx val
, int size_needed
, int desired_align
,
7147 && (size_needed
> 4 || (desired_align
> align
&& desired_align
> 4)))
7148 promoted_val
= promote_duplicated_reg (DImode
, val
);
7149 else if (size_needed
> 2 || (desired_align
> align
&& desired_align
> 2))
7150 promoted_val
= promote_duplicated_reg (SImode
, val
);
7151 else if (size_needed
> 1 || (desired_align
> align
&& desired_align
> 1))
7152 promoted_val
= promote_duplicated_reg (HImode
, val
);
7156 return promoted_val
;
7159 /* Copy the address to a Pmode register. This is used for x32 to
7160 truncate DImode TLS address to a SImode register. */
7163 ix86_copy_addr_to_reg (rtx addr
)
7166 if (GET_MODE (addr
) == Pmode
|| GET_MODE (addr
) == VOIDmode
)
7168 reg
= copy_addr_to_reg (addr
);
7169 REG_POINTER (reg
) = 1;
7174 gcc_assert (GET_MODE (addr
) == DImode
&& Pmode
== SImode
);
7175 reg
= copy_to_mode_reg (DImode
, addr
);
7176 REG_POINTER (reg
) = 1;
7177 return gen_rtx_SUBREG (SImode
, reg
, 0);
/* Expand string move (memcpy) or store (memset) operation.  Use i386 string
   operations when profitable.  The code depends upon architecture, block size
   and alignment, but always has one of the following overall structures:

   Aligned move sequence:

     1) Prologue guard: Conditional that jumps up to epilogues for small
	blocks that can be handled by the epilogue alone.  This is faster
	but also needed for correctness, since the prologue assumes the block
	is larger than the desired alignment.

	Optional dynamic check for size and libcall for large
	blocks is emitted here too, with -minline-stringops-dynamically.

     2) Prologue: copy first few bytes in order to get the destination
	aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
	than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
	copied.  We emit either a jump tree on power of two sized
	blocks, or a byte loop.

     3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
	with the specified algorithm.

     4) Epilogue: code copying the tail of the block that is too small to be
	handled by the main body (or up to the size guarded by the prologue
	guard).

   Misaligned move sequence:

     1) Misaligned move prologue/epilogue containing:
	a) Prologue handling small memory blocks and jumping to done_label
	   (skipped if blocks are known to be large enough)
	b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment
	   is needed, by a single possibly misaligned move
	   (skipped if alignment is not needed)
	c) Copy of the last SIZE_NEEDED bytes by possibly misaligned moves

     2) Zero size guard dispatching to done_label, if needed

     3) Dispatch to library call, if needed

     4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
	with the specified algorithm.  */
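/* As a concrete illustration of the sizes involved: e.g. on x86-64 an
   unrolled_loop copy with move_mode == DImode and unroll_factor == 4 moves
   32 bytes per iteration of the main body (SIZE_NEEDED == 32), so the
   epilogue must be prepared to handle up to 31 trailing bytes, and the
   prologue guard sends blocks smaller than one such chunk straight to the
   epilogue.  */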
7224 ix86_expand_set_or_cpymem (rtx dst
, rtx src
, rtx count_exp
, rtx val_exp
,
7225 rtx align_exp
, rtx expected_align_exp
,
7226 rtx expected_size_exp
, rtx min_size_exp
,
7227 rtx max_size_exp
, rtx probable_max_size_exp
,
7232 rtx_code_label
*label
= NULL
;
7234 rtx_code_label
*jump_around_label
= NULL
;
7235 HOST_WIDE_INT align
= 1;
7236 unsigned HOST_WIDE_INT count
= 0;
7237 HOST_WIDE_INT expected_size
= -1;
7238 int size_needed
= 0, epilogue_size_needed
;
7239 int desired_align
= 0, align_bytes
= 0;
7240 enum stringop_alg alg
;
7241 rtx promoted_val
= NULL
;
7242 rtx vec_promoted_val
= NULL
;
7243 bool force_loopy_epilogue
= false;
7245 bool need_zero_guard
= false;
7247 machine_mode move_mode
= VOIDmode
;
7248 machine_mode wider_mode
;
7249 int unroll_factor
= 1;
7250 /* TODO: Once value ranges are available, fill in proper data. */
7251 unsigned HOST_WIDE_INT min_size
= 0;
7252 unsigned HOST_WIDE_INT max_size
= -1;
7253 unsigned HOST_WIDE_INT probable_max_size
= -1;
7254 bool misaligned_prologue_used
= false;
7257 if (CONST_INT_P (align_exp
))
7258 align
= INTVAL (align_exp
);
/* i386 can do misaligned accesses at a reasonably small extra cost.  */
7260 if (CONST_INT_P (expected_align_exp
)
7261 && INTVAL (expected_align_exp
) > align
)
7262 align
= INTVAL (expected_align_exp
);
7263 /* ALIGN is the minimum of destination and source alignment, but we care here
7264 just about destination alignment. */
7266 && MEM_ALIGN (dst
) > (unsigned HOST_WIDE_INT
) align
* BITS_PER_UNIT
)
7267 align
= MEM_ALIGN (dst
) / BITS_PER_UNIT
;
7269 if (CONST_INT_P (count_exp
))
7271 min_size
= max_size
= probable_max_size
= count
= expected_size
7272 = INTVAL (count_exp
);
7273 /* When COUNT is 0, there is nothing to do. */
7280 min_size
= INTVAL (min_size_exp
);
7282 max_size
= INTVAL (max_size_exp
);
7283 if (probable_max_size_exp
)
7284 probable_max_size
= INTVAL (probable_max_size_exp
);
7285 if (CONST_INT_P (expected_size_exp
))
7286 expected_size
= INTVAL (expected_size_exp
);
7289 /* Make sure we don't need to care about overflow later on. */
7290 if (count
> (HOST_WIDE_INT_1U
<< 30))
7293 have_as
= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst
));
7295 have_as
|= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src
));
7297 /* Step 0: Decide on preferred algorithm, desired alignment and
7298 size of chunks to be copied by main loop. */
7299 alg
= decide_alg (count
, expected_size
, min_size
, probable_max_size
,
7301 issetmem
&& val_exp
== const0_rtx
, have_as
,
7302 &dynamic_check
, &noalign
, false);
7305 fprintf (dump_file
, "Selected stringop expansion strategy: %s\n",
7306 stringop_alg_names
[alg
]);
7310 gcc_assert (alg
!= no_stringop
);
/* For now the vector version of memset is generated only for memory zeroing,
   as creating a promoted vector value is very cheap in this case.  */
7314 if (issetmem
&& alg
== vector_loop
&& val_exp
!= const0_rtx
)
7315 alg
= unrolled_loop
;
7318 count_exp
= copy_to_mode_reg (GET_MODE (count_exp
), count_exp
);
7319 destreg
= ix86_copy_addr_to_reg (XEXP (dst
, 0));
7321 srcreg
= ix86_copy_addr_to_reg (XEXP (src
, 0));
7324 move_mode
= word_mode
;
7332 need_zero_guard
= true;
7336 need_zero_guard
= true;
7339 need_zero_guard
= true;
7340 unroll_factor
= (TARGET_64BIT
? 4 : 2);
7343 need_zero_guard
= true;
7345 /* Find the widest supported mode. */
7346 move_mode
= word_mode
;
7347 while (GET_MODE_WIDER_MODE (move_mode
).exists (&wider_mode
)
7348 && optab_handler (mov_optab
, wider_mode
) != CODE_FOR_nothing
)
7349 move_mode
= wider_mode
;
7351 if (TARGET_AVX128_OPTIMAL
&& GET_MODE_BITSIZE (move_mode
) > 128)
7354 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7355 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7356 if (GET_MODE_SIZE (move_mode
) > GET_MODE_SIZE (word_mode
))
7358 int nunits
= GET_MODE_SIZE (move_mode
) / GET_MODE_SIZE (word_mode
);
7359 if (!mode_for_vector (word_mode
, nunits
).exists (&move_mode
)
7360 || optab_handler (mov_optab
, move_mode
) == CODE_FOR_nothing
)
7361 move_mode
= word_mode
;
7363 gcc_assert (optab_handler (mov_optab
, move_mode
) != CODE_FOR_nothing
);
7365 case rep_prefix_8_byte
:
7368 case rep_prefix_4_byte
:
7371 case rep_prefix_1_byte
:
7375 size_needed
= GET_MODE_SIZE (move_mode
) * unroll_factor
;
7376 epilogue_size_needed
= size_needed
;
/* If we are going to emit any library calls conditionally, make sure any
   pending stack adjustment happens before the first conditional branch;
   otherwise the adjustment would be emitted only on the library call path
   and would not happen on the other branches.  */
7382 if (dynamic_check
!= -1)
7383 do_pending_stack_adjust ();
7385 desired_align
= decide_alignment (align
, alg
, expected_size
, move_mode
);
7386 if (!TARGET_ALIGN_STRINGOPS
|| noalign
)
7387 align
= desired_align
;
7389 /* Step 1: Prologue guard. */
7391 /* Alignment code needs count to be in register. */
7392 if (CONST_INT_P (count_exp
) && desired_align
> align
)
7394 if (INTVAL (count_exp
) > desired_align
7395 && INTVAL (count_exp
) > size_needed
)
7398 = get_mem_align_offset (dst
, desired_align
* BITS_PER_UNIT
);
7399 if (align_bytes
<= 0)
7402 align_bytes
= desired_align
- align_bytes
;
7404 if (align_bytes
== 0)
7405 count_exp
= force_reg (counter_mode (count_exp
), count_exp
);
7407 gcc_assert (desired_align
>= 1 && align
>= 1);
/* Misaligned move sequences handle both prologue and epilogue at once.
   Default code generation results in smaller code for large alignments
   and also avoids redundant work when sizes are known precisely.  */
7412 misaligned_prologue_used
7413 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7414 && MAX (desired_align
, epilogue_size_needed
) <= 32
7415 && desired_align
<= epilogue_size_needed
7416 && ((desired_align
> align
&& !align_bytes
)
7417 || (!count
&& epilogue_size_needed
> 1)));
/* Do the cheap promotion to allow better CSE across the
   main loop and epilogue (i.e. one load of the big constant in the
   front of all code).
   For now the misaligned move sequences do not have a fast path
   without broadcasting.  */
7424 if (issetmem
&& ((CONST_INT_P (val_exp
) || misaligned_prologue_used
)))
7426 if (alg
== vector_loop
)
7428 gcc_assert (val_exp
== const0_rtx
);
7429 vec_promoted_val
= promote_duplicated_reg (move_mode
, val_exp
);
7430 promoted_val
= promote_duplicated_reg_to_size (val_exp
,
7431 GET_MODE_SIZE (word_mode
),
7432 desired_align
, align
);
7436 promoted_val
= promote_duplicated_reg_to_size (val_exp
, size_needed
,
7437 desired_align
, align
);
/* Misaligned move sequences handle both prologues and epilogues at once.
   Default code generation results in smaller code for large alignments and
   also avoids redundant work when sizes are known precisely.  */
7443 if (misaligned_prologue_used
)
/* The misaligned move prologue handles small blocks by itself.  */
7446 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
7447 (dst
, src
, &destreg
, &srcreg
,
7448 move_mode
, promoted_val
, vec_promoted_val
,
7451 desired_align
< align
7452 ? MAX (desired_align
, epilogue_size_needed
) : epilogue_size_needed
,
7453 desired_align
, align
, &min_size
, dynamic_check
, issetmem
);
7455 src
= change_address (src
, BLKmode
, srcreg
);
7456 dst
= change_address (dst
, BLKmode
, destreg
);
7457 set_mem_align (dst
, desired_align
* BITS_PER_UNIT
);
7458 epilogue_size_needed
= 0;
7460 && min_size
< (unsigned HOST_WIDE_INT
) size_needed
)
7462 /* It is possible that we copied enough so the main loop will not
7464 gcc_assert (size_needed
> 1);
7465 if (jump_around_label
== NULL_RTX
)
7466 jump_around_label
= gen_label_rtx ();
7467 emit_cmp_and_jump_insns (count_exp
,
7468 GEN_INT (size_needed
),
7469 LTU
, 0, counter_mode (count_exp
), 1, jump_around_label
);
7470 if (expected_size
== -1
7471 || expected_size
< (desired_align
- align
) / 2 + size_needed
)
7472 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
7474 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
7477 /* Ensure that alignment prologue won't copy past end of block. */
7478 else if (size_needed
> 1 || (desired_align
> 1 && desired_align
> align
))
7480 epilogue_size_needed
= MAX (size_needed
- 1, desired_align
- align
);
7481 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
7482 Make sure it is power of 2. */
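/* E.g. a main body copying 32-byte chunks leaves up to 31 trailing bytes,
   which the statement below rounds up to the next power of two, 32.  */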
7483 epilogue_size_needed
= 1 << (floor_log2 (epilogue_size_needed
) + 1);
/* To improve performance of small blocks, we jump around the VAL
   promoting mode.  This means that if the promoted VAL is not constant,
   we might not use it in the epilogue and have to use the byte
   loop variant.  */
7489 if (issetmem
&& epilogue_size_needed
> 2 && !promoted_val
)
7490 force_loopy_epilogue
= true;
7491 if ((count
&& count
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
7492 || max_size
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
7494 /* If main algorithm works on QImode, no epilogue is needed.
7495 For small sizes just don't align anything. */
7496 if (size_needed
== 1)
7497 desired_align
= align
;
7502 && min_size
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
7504 label
= gen_label_rtx ();
7505 emit_cmp_and_jump_insns (count_exp
,
7506 GEN_INT (epilogue_size_needed
),
7507 LTU
, 0, counter_mode (count_exp
), 1, label
);
7508 if (expected_size
== -1 || expected_size
< epilogue_size_needed
)
7509 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
7511 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
/* Emit code to decide at runtime whether a library call or inline code
   should be used.  */
7517 if (dynamic_check
!= -1)
7519 if (!issetmem
&& CONST_INT_P (count_exp
))
7521 if (UINTVAL (count_exp
) >= (unsigned HOST_WIDE_INT
)dynamic_check
)
7523 emit_block_copy_via_libcall (dst
, src
, count_exp
);
7524 count_exp
= const0_rtx
;
7530 rtx_code_label
*hot_label
= gen_label_rtx ();
7531 if (jump_around_label
== NULL_RTX
)
7532 jump_around_label
= gen_label_rtx ();
7533 emit_cmp_and_jump_insns (count_exp
, GEN_INT (dynamic_check
- 1),
7534 LEU
, 0, counter_mode (count_exp
),
7536 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
7538 set_storage_via_libcall (dst
, count_exp
, val_exp
);
7540 emit_block_copy_via_libcall (dst
, src
, count_exp
);
7541 emit_jump (jump_around_label
);
7542 emit_label (hot_label
);
7546 /* Step 2: Alignment prologue. */
7547 /* Do the expensive promotion once we branched off the small blocks. */
7548 if (issetmem
&& !promoted_val
)
7549 promoted_val
= promote_duplicated_reg_to_size (val_exp
, size_needed
,
7550 desired_align
, align
);
7552 if (desired_align
> align
&& !misaligned_prologue_used
)
7554 if (align_bytes
== 0)
/* Except for the first move in the prologue, we no longer know
   the constant offset in aliasing info.  It doesn't seem worth
   the pain to maintain it for the first move, so throw away
   the info early.  */
7560 dst
= change_address (dst
, BLKmode
, destreg
);
7562 src
= change_address (src
, BLKmode
, srcreg
);
7563 dst
= expand_set_or_cpymem_prologue (dst
, src
, destreg
, srcreg
,
7564 promoted_val
, vec_promoted_val
,
7565 count_exp
, align
, desired_align
,
7567 /* At most desired_align - align bytes are copied. */
7568 if (min_size
< (unsigned)(desired_align
- align
))
7571 min_size
-= desired_align
- align
;
7575 /* If we know how many bytes need to be stored before dst is
7576 sufficiently aligned, maintain aliasing info accurately. */
7577 dst
= expand_set_or_cpymem_constant_prologue (dst
, &src
, destreg
,
7585 count_exp
= plus_constant (counter_mode (count_exp
),
7586 count_exp
, -align_bytes
);
7587 count
-= align_bytes
;
7588 min_size
-= align_bytes
;
7589 max_size
-= align_bytes
;
7592 && min_size
< (unsigned HOST_WIDE_INT
) size_needed
7593 && (count
< (unsigned HOST_WIDE_INT
) size_needed
7594 || (align_bytes
== 0
7595 && count
< ((unsigned HOST_WIDE_INT
) size_needed
7596 + desired_align
- align
))))
7598 /* It is possible that we copied enough so the main loop will not
7600 gcc_assert (size_needed
> 1);
7601 if (label
== NULL_RTX
)
7602 label
= gen_label_rtx ();
7603 emit_cmp_and_jump_insns (count_exp
,
7604 GEN_INT (size_needed
),
7605 LTU
, 0, counter_mode (count_exp
), 1, label
);
7606 if (expected_size
== -1
7607 || expected_size
< (desired_align
- align
) / 2 + size_needed
)
7608 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
7610 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
7613 if (label
&& size_needed
== 1)
7616 LABEL_NUSES (label
) = 1;
7618 epilogue_size_needed
= 1;
7620 promoted_val
= val_exp
;
7622 else if (label
== NULL_RTX
&& !misaligned_prologue_used
)
7623 epilogue_size_needed
= size_needed
;
7625 /* Step 3: Main loop. */
7636 expand_set_or_cpymem_via_loop (dst
, src
, destreg
, srcreg
, promoted_val
,
7637 count_exp
, move_mode
, unroll_factor
,
7638 expected_size
, issetmem
);
7641 expand_set_or_cpymem_via_loop (dst
, src
, destreg
, srcreg
,
7642 vec_promoted_val
, count_exp
, move_mode
,
7643 unroll_factor
, expected_size
, issetmem
);
7645 case rep_prefix_8_byte
:
7646 case rep_prefix_4_byte
:
7647 case rep_prefix_1_byte
:
7648 expand_set_or_cpymem_via_rep (dst
, src
, destreg
, srcreg
, promoted_val
,
7649 val_exp
, count_exp
, move_mode
, issetmem
);
7652 /* Adjust properly the offset of src and dest memory for aliasing. */
7653 if (CONST_INT_P (count_exp
))
7656 src
= adjust_automodify_address_nv (src
, BLKmode
, srcreg
,
7657 (count
/ size_needed
) * size_needed
);
7658 dst
= adjust_automodify_address_nv (dst
, BLKmode
, destreg
,
7659 (count
/ size_needed
) * size_needed
);
7664 src
= change_address (src
, BLKmode
, srcreg
);
7665 dst
= change_address (dst
, BLKmode
, destreg
);
7668 /* Step 4: Epilogue to copy the remaining bytes. */
7672 /* When the main loop is done, COUNT_EXP might hold original count,
7673 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
7674 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
7675 bytes. Compensate if needed. */
7677 if (size_needed
< epilogue_size_needed
)
7679 tmp
= expand_simple_binop (counter_mode (count_exp
), AND
, count_exp
,
7680 GEN_INT (size_needed
- 1), count_exp
, 1,
7682 if (tmp
!= count_exp
)
7683 emit_move_insn (count_exp
, tmp
);
7686 LABEL_NUSES (label
) = 1;
7689 if (count_exp
!= const0_rtx
&& epilogue_size_needed
> 1)
7691 if (force_loopy_epilogue
)
7692 expand_setmem_epilogue_via_loop (dst
, destreg
, val_exp
, count_exp
,
7693 epilogue_size_needed
);
7697 expand_setmem_epilogue (dst
, destreg
, promoted_val
,
7698 vec_promoted_val
, count_exp
,
7699 epilogue_size_needed
);
7701 expand_cpymem_epilogue (dst
, src
, destreg
, srcreg
, count_exp
,
7702 epilogue_size_needed
);
7705 if (jump_around_label
)
7706 emit_label (jump_around_label
);
7711 /* Expand the appropriate insns for doing strlen if not just doing
7714 out = result, initialized with the start address
7715 align_rtx = alignment of the address.
scratch = scratch register, initialized with the start address when
      not aligned, otherwise undefined
7720 some address computing at the end. These things are done in i386.md. */
7723 ix86_expand_strlensi_unroll_1 (rtx out
, rtx src
, rtx align_rtx
)
7727 rtx_code_label
*align_2_label
= NULL
;
7728 rtx_code_label
*align_3_label
= NULL
;
7729 rtx_code_label
*align_4_label
= gen_label_rtx ();
7730 rtx_code_label
*end_0_label
= gen_label_rtx ();
7732 rtx tmpreg
= gen_reg_rtx (SImode
);
7733 rtx scratch
= gen_reg_rtx (SImode
);
7737 if (CONST_INT_P (align_rtx
))
7738 align
= INTVAL (align_rtx
);
7740 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
7742 /* Is there a known alignment and is it less than 4? */
7745 rtx scratch1
= gen_reg_rtx (Pmode
);
7746 emit_move_insn (scratch1
, out
);
7747 /* Is there a known alignment and is it not 2? */
7750 align_3_label
= gen_label_rtx (); /* Label when aligned to 3-byte */
7751 align_2_label
= gen_label_rtx (); /* Label when aligned to 2-byte */
7753 /* Leave just the 3 lower bits. */
7754 align_rtx
= expand_binop (Pmode
, and_optab
, scratch1
, GEN_INT (3),
7755 NULL_RTX
, 0, OPTAB_WIDEN
);
7757 emit_cmp_and_jump_insns (align_rtx
, const0_rtx
, EQ
, NULL
,
7758 Pmode
, 1, align_4_label
);
7759 emit_cmp_and_jump_insns (align_rtx
, const2_rtx
, EQ
, NULL
,
7760 Pmode
, 1, align_2_label
);
7761 emit_cmp_and_jump_insns (align_rtx
, const2_rtx
, GTU
, NULL
,
7762 Pmode
, 1, align_3_label
);
/* Since the alignment is 2, we have to check 2 or 0 bytes;
   check if it is aligned to 4 bytes.  */
7769 align_rtx
= expand_binop (Pmode
, and_optab
, scratch1
, const2_rtx
,
7770 NULL_RTX
, 0, OPTAB_WIDEN
);
7772 emit_cmp_and_jump_insns (align_rtx
, const0_rtx
, EQ
, NULL
,
7773 Pmode
, 1, align_4_label
);
7776 mem
= change_address (src
, QImode
, out
);
7778 /* Now compare the bytes. */
/* Compare the first n unaligned bytes on a byte-by-byte basis.  */
7781 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
,
7782 QImode
, 1, end_0_label
);
7784 /* Increment the address. */
7785 emit_insn (gen_add2_insn (out
, const1_rtx
));
7787 /* Not needed with an alignment of 2 */
7790 emit_label (align_2_label
);
7792 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
, QImode
, 1,
7795 emit_insn (gen_add2_insn (out
, const1_rtx
));
7797 emit_label (align_3_label
);
7800 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
, QImode
, 1,
7803 emit_insn (gen_add2_insn (out
, const1_rtx
));
/* Generate a loop to check 4 bytes at a time.  It is not a good idea to
   align this loop; it only makes the program bigger and does not help
   it run faster.  */
7809 emit_label (align_4_label
);
7811 mem
= change_address (src
, SImode
, out
);
7812 emit_move_insn (scratch
, mem
);
7813 emit_insn (gen_add2_insn (out
, GEN_INT (4)));
/* This formula yields a nonzero result iff one of the bytes is zero.
   This saves three branches inside the loop and many cycles.  */
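/* The test computed below is ((x - 0x01010101) & ~x) & 0x80808080.
   E.g. for x = 0x11220033 it gives 0x00008000 (a zero byte is present),
   while for x = 0x41424344 it gives 0 (no zero byte).  */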
7818 emit_insn (gen_addsi3 (tmpreg
, scratch
, GEN_INT (-0x01010101)));
7819 emit_insn (gen_one_cmplsi2 (scratch
, scratch
));
7820 emit_insn (gen_andsi3 (tmpreg
, tmpreg
, scratch
));
7821 emit_insn (gen_andsi3 (tmpreg
, tmpreg
,
7822 gen_int_mode (0x80808080, SImode
)));
7823 emit_cmp_and_jump_insns (tmpreg
, const0_rtx
, EQ
, 0, SImode
, 1,
7828 rtx reg
= gen_reg_rtx (SImode
);
7829 rtx reg2
= gen_reg_rtx (Pmode
);
7830 emit_move_insn (reg
, tmpreg
);
7831 emit_insn (gen_lshrsi3 (reg
, reg
, GEN_INT (16)));
7833 /* If zero is not in the first two bytes, move two bytes forward. */
7834 emit_insn (gen_testsi_ccno_1 (tmpreg
, GEN_INT (0x8080)));
7835 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
7836 tmp
= gen_rtx_EQ (VOIDmode
, tmp
, const0_rtx
);
7837 emit_insn (gen_rtx_SET (tmpreg
,
7838 gen_rtx_IF_THEN_ELSE (SImode
, tmp
,
7841 /* Emit lea manually to avoid clobbering of flags. */
7842 emit_insn (gen_rtx_SET (reg2
, gen_rtx_PLUS (Pmode
, out
, const2_rtx
)));
7844 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
7845 tmp
= gen_rtx_EQ (VOIDmode
, tmp
, const0_rtx
);
7846 emit_insn (gen_rtx_SET (out
,
7847 gen_rtx_IF_THEN_ELSE (Pmode
, tmp
,
7853 rtx_code_label
*end_2_label
= gen_label_rtx ();
7854 /* Is zero in the first two bytes? */
7856 emit_insn (gen_testsi_ccno_1 (tmpreg
, GEN_INT (0x8080)));
7857 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
7858 tmp
= gen_rtx_NE (VOIDmode
, tmp
, const0_rtx
);
7859 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
7860 gen_rtx_LABEL_REF (VOIDmode
, end_2_label
),
7862 tmp
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
7863 JUMP_LABEL (tmp
) = end_2_label
;
7865 /* Not in the first two. Move two bytes forward. */
7866 emit_insn (gen_lshrsi3 (tmpreg
, tmpreg
, GEN_INT (16)));
7867 emit_insn (gen_add2_insn (out
, const2_rtx
));
7869 emit_label (end_2_label
);
7873 /* Avoid branch in fixing the byte. */
7874 tmpreg
= gen_lowpart (QImode
, tmpreg
);
7875 emit_insn (gen_addqi3_cconly_overflow (tmpreg
, tmpreg
));
7876 tmp
= gen_rtx_REG (CCmode
, FLAGS_REG
);
7877 cmp
= gen_rtx_LTU (VOIDmode
, tmp
, const0_rtx
);
7878 emit_insn (gen_sub3_carry (Pmode
, out
, out
, GEN_INT (3), tmp
, cmp
));
7880 emit_label (end_0_label
);
7883 /* Expand strlen. */
7886 ix86_expand_strlen (rtx out
, rtx src
, rtx eoschar
, rtx align
)
7888 if (TARGET_UNROLL_STRLEN
7889 && TARGET_INLINE_ALL_STRINGOPS
7890 && eoschar
== const0_rtx
/* The generic case of the strlen expander is long.  Avoid expanding it
   unless TARGET_INLINE_ALL_STRINGOPS.  */
7895 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
7896 /* Well it seems that some optimizer does not combine a call like
7897 foo(strlen(bar), strlen(bar));
when the move and the subtraction are done here.  It does calculate
7899 the length just once when these instructions are done inside of
7900 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
7901 often used and I use one fewer register for the lifetime of
7902 output_strlen_unroll() this is better. */
7904 emit_move_insn (out
, addr
);
7906 ix86_expand_strlensi_unroll_1 (out
, src
, align
);
7908 /* strlensi_unroll_1 returns the address of the zero at the end of
7909 the string, like memchr(), so compute the length by subtracting
7910 the start address. */
7911 emit_insn (gen_sub2_insn (out
, addr
));
/* For a given symbol (function) construct code to compute the address of its
   PLT entry in the large x86-64 PIC model.  */
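/* The address is formed below as pic_offset_table + symbol@PLTOFF, i.e.
   the 64-bit offset of the symbol's PLT entry from the GOT base register,
   which is the addressing form required by the large PIC code model.  */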
7922 construct_plt_address (rtx symbol
)
7926 gcc_assert (GET_CODE (symbol
) == SYMBOL_REF
);
7927 gcc_assert (ix86_cmodel
== CM_LARGE_PIC
&& !TARGET_PECOFF
);
7928 gcc_assert (Pmode
== DImode
);
7930 tmp
= gen_reg_rtx (Pmode
);
7931 unspec
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, symbol
), UNSPEC_PLTOFF
);
7933 emit_move_insn (tmp
, gen_rtx_CONST (Pmode
, unspec
));
7934 emit_insn (gen_add2_insn (tmp
, pic_offset_table_rtx
));
7938 /* Additional registers that are clobbered by SYSV calls. */
7940 static int const x86_64_ms_sysv_extra_clobbered_registers
7941 [NUM_X86_64_MS_CLOBBERED_REGS
] =
7945 XMM8_REG
, XMM9_REG
, XMM10_REG
, XMM11_REG
,
7946 XMM12_REG
, XMM13_REG
, XMM14_REG
, XMM15_REG
7950 ix86_expand_call (rtx retval
, rtx fnaddr
, rtx callarg1
,
7952 rtx pop
, bool sibcall
)
7955 rtx use
= NULL
, call
;
7956 unsigned int vec_len
= 0;
7959 if (GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
)
7961 fndecl
= SYMBOL_REF_DECL (XEXP (fnaddr
, 0));
7963 && (lookup_attribute ("interrupt",
7964 TYPE_ATTRIBUTES (TREE_TYPE (fndecl
)))))
7965 error ("interrupt service routine cannot be called directly");
7970 if (pop
== const0_rtx
)
7972 gcc_assert (!TARGET_64BIT
|| !pop
);
7974 if (TARGET_MACHO
&& !TARGET_64BIT
)
7977 if (flag_pic
&& GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
)
7978 fnaddr
= machopic_indirect_call_target (fnaddr
);
7983 /* Static functions and indirect calls don't need the pic register. Also,
7984 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
7985 it an indirect call. */
7986 rtx addr
= XEXP (fnaddr
, 0);
7988 && GET_CODE (addr
) == SYMBOL_REF
7989 && !SYMBOL_REF_LOCAL_P (addr
))
7992 && (SYMBOL_REF_DECL (addr
) == NULL_TREE
7993 || !lookup_attribute ("noplt",
7994 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr
)))))
7997 || (ix86_cmodel
== CM_LARGE_PIC
7998 && DEFAULT_ABI
!= MS_ABI
))
8000 use_reg (&use
, gen_rtx_REG (Pmode
,
8001 REAL_PIC_OFFSET_TABLE_REGNUM
));
8002 if (ix86_use_pseudo_pic_reg ())
8003 emit_move_insn (gen_rtx_REG (Pmode
,
8004 REAL_PIC_OFFSET_TABLE_REGNUM
),
8005 pic_offset_table_rtx
);
8008 else if (!TARGET_PECOFF
&& !TARGET_MACHO
)
8012 fnaddr
= gen_rtx_UNSPEC (Pmode
,
8013 gen_rtvec (1, addr
),
8015 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
8019 fnaddr
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, addr
),
8021 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
8022 fnaddr
= gen_rtx_PLUS (Pmode
, pic_offset_table_rtx
,
8025 fnaddr
= gen_const_mem (Pmode
, fnaddr
);
8026 /* Pmode may not be the same as word_mode for x32, which
8027 doesn't support indirect branch via 32-bit memory slot.
8028 Since x32 GOT slot is 64 bit with zero upper 32 bits,
8029 indirect branch via x32 GOT slot is OK. */
8030 if (GET_MODE (fnaddr
) != word_mode
)
8031 fnaddr
= gen_rtx_ZERO_EXTEND (word_mode
, fnaddr
);
8032 fnaddr
= gen_rtx_MEM (QImode
, fnaddr
);
8037 /* Skip setting up RAX register for -mskip-rax-setup when there are no
8038 parameters passed in vector registers. */
8040 && (INTVAL (callarg2
) > 0
8041 || (INTVAL (callarg2
) == 0
8042 && (TARGET_SSE
|| !flag_skip_rax_setup
))))
8044 rtx al
= gen_rtx_REG (QImode
, AX_REG
);
8045 emit_move_insn (al
, callarg2
);
8049 if (ix86_cmodel
== CM_LARGE_PIC
8052 && GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
8053 && !local_symbolic_operand (XEXP (fnaddr
, 0), VOIDmode
))
8054 fnaddr
= gen_rtx_MEM (QImode
, construct_plt_address (XEXP (fnaddr
, 0)));
8055 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
8056 branch via x32 GOT slot is OK. */
8057 else if (!(TARGET_X32
8059 && GET_CODE (XEXP (fnaddr
, 0)) == ZERO_EXTEND
8060 && GOT_memory_operand (XEXP (XEXP (fnaddr
, 0), 0), Pmode
))
8062 ? !sibcall_insn_operand (XEXP (fnaddr
, 0), word_mode
)
8063 : !call_insn_operand (XEXP (fnaddr
, 0), word_mode
)))
8065 fnaddr
= convert_to_mode (word_mode
, XEXP (fnaddr
, 0), 1);
8066 fnaddr
= gen_rtx_MEM (QImode
, copy_to_mode_reg (word_mode
, fnaddr
));
8069 call
= gen_rtx_CALL (VOIDmode
, fnaddr
, callarg1
);
8072 call
= gen_rtx_SET (retval
, call
);
8073 vec
[vec_len
++] = call
;
8077 pop
= gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, pop
);
8078 pop
= gen_rtx_SET (stack_pointer_rtx
, pop
);
8079 vec
[vec_len
++] = pop
;
8082 if (cfun
->machine
->no_caller_saved_registers
8084 || (!TREE_THIS_VOLATILE (fndecl
)
8085 && !lookup_attribute ("no_caller_saved_registers",
8086 TYPE_ATTRIBUTES (TREE_TYPE (fndecl
))))))
8088 static const char ix86_call_used_regs
[] = CALL_USED_REGISTERS
;
8089 bool is_64bit_ms_abi
= (TARGET_64BIT
8090 && ix86_function_abi (fndecl
) == MS_ABI
);
8091 char c_mask
= CALL_USED_REGISTERS_MASK (is_64bit_ms_abi
);
8093 /* If there are no caller-saved registers, add all registers
8094 that are clobbered by the call which returns. */
8095 for (int i
= 0; i
< FIRST_PSEUDO_REGISTER
; i
++)
8097 && (ix86_call_used_regs
[i
] == 1
8098 || (ix86_call_used_regs
[i
] & c_mask
))
8099 && !STACK_REGNO_P (i
)
8100 && !MMX_REGNO_P (i
))
8102 gen_rtx_REG (GET_MODE (regno_reg_rtx
[i
]), i
));
8104 else if (TARGET_64BIT_MS_ABI
8105 && (!callarg2
|| INTVAL (callarg2
) != -2))
8109 for (i
= 0; i
< NUM_X86_64_MS_CLOBBERED_REGS
; i
++)
8111 int regno
= x86_64_ms_sysv_extra_clobbered_registers
[i
];
8112 machine_mode mode
= SSE_REGNO_P (regno
) ? TImode
: DImode
;
8114 clobber_reg (&use
, gen_rtx_REG (mode
, regno
));
8117 /* Set here, but it may get cleared later. */
8118 if (TARGET_CALL_MS2SYSV_XLOGUES
)
8123 /* Don't break hot-patched functions. */
8124 else if (ix86_function_ms_hook_prologue (current_function_decl
))
8127 /* TODO: Cases not yet examined. */
8128 else if (flag_split_stack
)
8129 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8133 gcc_assert (!reload_completed
);
8134 cfun
->machine
->call_ms2sysv
= true;
8140 call
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec_v (vec_len
, vec
));
8141 rtx_insn
*call_insn
= emit_call_insn (call
);
8143 CALL_INSN_FUNCTION_USAGE (call_insn
) = use
;
/* Split a simple return that pops POPC bytes from the stack into an indirect
   branch with a stack adjustment.  */
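/* Roughly, a "ret $N" is rewritten as "pop %ecx; add $N, %esp; jmp *%ecx",
   with the CFA notes below keeping the unwind information consistent.
   This is only used on 32-bit targets, see the assert below.  */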
8152 ix86_split_simple_return_pop_internal (rtx popc
)
8154 struct machine_function
*m
= cfun
->machine
;
8155 rtx ecx
= gen_rtx_REG (SImode
, CX_REG
);
8158 /* There is no "pascal" calling convention in any 64bit ABI. */
8159 gcc_assert (!TARGET_64BIT
);
8161 insn
= emit_insn (gen_pop (ecx
));
8162 m
->fs
.cfa_offset
-= UNITS_PER_WORD
;
8163 m
->fs
.sp_offset
-= UNITS_PER_WORD
;
8165 rtx x
= plus_constant (Pmode
, stack_pointer_rtx
, UNITS_PER_WORD
);
8166 x
= gen_rtx_SET (stack_pointer_rtx
, x
);
8167 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, x
);
8168 add_reg_note (insn
, REG_CFA_REGISTER
, gen_rtx_SET (ecx
, pc_rtx
));
8169 RTX_FRAME_RELATED_P (insn
) = 1;
8171 x
= gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, popc
);
8172 x
= gen_rtx_SET (stack_pointer_rtx
, x
);
8173 insn
= emit_insn (x
);
8174 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, x
);
8175 RTX_FRAME_RELATED_P (insn
) = 1;
/* Now the return address is in ECX.  */
emit_jump_insn (gen_simple_return_indirect_internal (ecx));
8181 /* Errors in the source file can cause expand_expr to return const0_rtx
8182 where we expect a vector. To avoid crashing, use one of the vector
8183 clear instructions. */
safe_vector_operand (rtx x, machine_mode mode)
if (x == const0_rtx)
  x = CONST0_RTX (mode);
8193 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
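/* E.g. the plain two-operand vector builtins dispatched from
   ix86_expand_args_builtin (the V16QI_FTYPE_V16QI_V16QI and related cases
   below) end up here when they are not comparisons; the two call arguments
   become operands 1 and 2 of the named pattern and the result becomes
   operand 0.  */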
8196 ix86_expand_binop_builtin (enum insn_code icode
, tree exp
, rtx target
)
8199 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8200 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8201 rtx op0
= expand_normal (arg0
);
8202 rtx op1
= expand_normal (arg1
);
8203 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
8204 machine_mode mode0
= insn_data
[icode
].operand
[1].mode
;
8205 machine_mode mode1
= insn_data
[icode
].operand
[2].mode
;
8207 if (VECTOR_MODE_P (mode0
))
8208 op0
= safe_vector_operand (op0
, mode0
);
8209 if (VECTOR_MODE_P (mode1
))
8210 op1
= safe_vector_operand (op1
, mode1
);
8212 if (optimize
|| !target
8213 || GET_MODE (target
) != tmode
8214 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
8215 target
= gen_reg_rtx (tmode
);
8217 if (GET_MODE (op1
) == SImode
&& mode1
== TImode
)
8219 rtx x
= gen_reg_rtx (V4SImode
);
8220 emit_insn (gen_sse2_loadd (x
, op1
));
8221 op1
= gen_lowpart (TImode
, x
);
8224 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
8225 op0
= copy_to_mode_reg (mode0
, op0
);
8226 if (!insn_data
[icode
].operand
[2].predicate (op1
, mode1
))
8227 op1
= copy_to_mode_reg (mode1
, op1
);
8229 pat
= GEN_FCN (icode
) (target
, op0
, op1
);
8238 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
8241 ix86_expand_multi_arg_builtin (enum insn_code icode
, tree exp
, rtx target
,
8242 enum ix86_builtin_func_type m_type
,
8243 enum rtx_code sub_code
)
8248 bool comparison_p
= false;
8250 bool last_arg_constant
= false;
8257 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
8261 case MULTI_ARG_4_DF2_DI_I
:
8262 case MULTI_ARG_4_DF2_DI_I1
:
8263 case MULTI_ARG_4_SF2_SI_I
:
8264 case MULTI_ARG_4_SF2_SI_I1
:
8266 last_arg_constant
= true;
8269 case MULTI_ARG_3_SF
:
8270 case MULTI_ARG_3_DF
:
8271 case MULTI_ARG_3_SF2
:
8272 case MULTI_ARG_3_DF2
:
8273 case MULTI_ARG_3_DI
:
8274 case MULTI_ARG_3_SI
:
8275 case MULTI_ARG_3_SI_DI
:
8276 case MULTI_ARG_3_HI
:
8277 case MULTI_ARG_3_HI_SI
:
8278 case MULTI_ARG_3_QI
:
8279 case MULTI_ARG_3_DI2
:
8280 case MULTI_ARG_3_SI2
:
8281 case MULTI_ARG_3_HI2
:
8282 case MULTI_ARG_3_QI2
:
8286 case MULTI_ARG_2_SF
:
8287 case MULTI_ARG_2_DF
:
8288 case MULTI_ARG_2_DI
:
8289 case MULTI_ARG_2_SI
:
8290 case MULTI_ARG_2_HI
:
8291 case MULTI_ARG_2_QI
:
8295 case MULTI_ARG_2_DI_IMM
:
8296 case MULTI_ARG_2_SI_IMM
:
8297 case MULTI_ARG_2_HI_IMM
:
8298 case MULTI_ARG_2_QI_IMM
:
8300 last_arg_constant
= true;
8303 case MULTI_ARG_1_SF
:
8304 case MULTI_ARG_1_DF
:
8305 case MULTI_ARG_1_SF2
:
8306 case MULTI_ARG_1_DF2
:
8307 case MULTI_ARG_1_DI
:
8308 case MULTI_ARG_1_SI
:
8309 case MULTI_ARG_1_HI
:
8310 case MULTI_ARG_1_QI
:
8311 case MULTI_ARG_1_SI_DI
:
8312 case MULTI_ARG_1_HI_DI
:
8313 case MULTI_ARG_1_HI_SI
:
8314 case MULTI_ARG_1_QI_DI
:
8315 case MULTI_ARG_1_QI_SI
:
8316 case MULTI_ARG_1_QI_HI
:
8320 case MULTI_ARG_2_DI_CMP
:
8321 case MULTI_ARG_2_SI_CMP
:
8322 case MULTI_ARG_2_HI_CMP
:
8323 case MULTI_ARG_2_QI_CMP
:
8325 comparison_p
= true;
8328 case MULTI_ARG_2_SF_TF
:
8329 case MULTI_ARG_2_DF_TF
:
8330 case MULTI_ARG_2_DI_TF
:
8331 case MULTI_ARG_2_SI_TF
:
8332 case MULTI_ARG_2_HI_TF
:
8333 case MULTI_ARG_2_QI_TF
:
8342 if (optimize
|| !target
8343 || GET_MODE (target
) != tmode
8344 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
8345 target
= gen_reg_rtx (tmode
);
8346 else if (memory_operand (target
, tmode
))
8349 gcc_assert (nargs
<= 4);
8351 for (i
= 0; i
< nargs
; i
++)
8353 tree arg
= CALL_EXPR_ARG (exp
, i
);
8354 rtx op
= expand_normal (arg
);
8355 int adjust
= (comparison_p
) ? 1 : 0;
8356 machine_mode mode
= insn_data
[icode
].operand
[i
+adjust
+1].mode
;
8358 if (last_arg_constant
&& i
== nargs
- 1)
8360 if (!insn_data
[icode
].operand
[i
+ 1].predicate (op
, mode
))
8362 enum insn_code new_icode
= icode
;
8365 case CODE_FOR_xop_vpermil2v2df3
:
8366 case CODE_FOR_xop_vpermil2v4sf3
:
8367 case CODE_FOR_xop_vpermil2v4df3
:
8368 case CODE_FOR_xop_vpermil2v8sf3
:
8369 error ("the last argument must be a 2-bit immediate");
8370 return gen_reg_rtx (tmode
);
8371 case CODE_FOR_xop_rotlv2di3
:
8372 new_icode
= CODE_FOR_rotlv2di3
;
8374 case CODE_FOR_xop_rotlv4si3
:
8375 new_icode
= CODE_FOR_rotlv4si3
;
8377 case CODE_FOR_xop_rotlv8hi3
:
8378 new_icode
= CODE_FOR_rotlv8hi3
;
8380 case CODE_FOR_xop_rotlv16qi3
:
8381 new_icode
= CODE_FOR_rotlv16qi3
;
8383 if (CONST_INT_P (op
))
8385 int mask
= GET_MODE_UNIT_BITSIZE (tmode
) - 1;
8386 op
= GEN_INT (INTVAL (op
) & mask
);
8388 (insn_data
[icode
].operand
[i
+ 1].predicate (op
, mode
));
8394 && insn_data
[new_icode
].operand
[0].mode
== tmode
8395 && insn_data
[new_icode
].operand
[1].mode
== tmode
8396 && insn_data
[new_icode
].operand
[2].mode
== mode
8397 && insn_data
[new_icode
].operand
[0].predicate
8398 == insn_data
[icode
].operand
[0].predicate
8399 && insn_data
[new_icode
].operand
[1].predicate
8400 == insn_data
[icode
].operand
[1].predicate
);
8413 if (VECTOR_MODE_P (mode
))
8414 op
= safe_vector_operand (op
, mode
);
8416 /* If we aren't optimizing, only allow one memory operand to be
8418 if (memory_operand (op
, mode
))
8421 gcc_assert (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
);
8424 || !insn_data
[icode
].operand
[i
+adjust
+1].predicate (op
, mode
)
8426 op
= force_reg (mode
, op
);
8430 args
[i
].mode
= mode
;
8436 pat
= GEN_FCN (icode
) (target
, args
[0].op
);
8441 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
,
8442 GEN_INT ((int)sub_code
));
8443 else if (! comparison_p
)
8444 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
);
8447 rtx cmp_op
= gen_rtx_fmt_ee (sub_code
, GET_MODE (target
),
8451 pat
= GEN_FCN (icode
) (target
, cmp_op
, args
[0].op
, args
[1].op
);
8456 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
, args
[2].op
);
8460 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
, args
[2].op
, args
[3].op
);
8474 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
8475 insns with vec_merge. */
8478 ix86_expand_unop_vec_merge_builtin (enum insn_code icode
, tree exp
,
8482 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8483 rtx op1
, op0
= expand_normal (arg0
);
8484 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
8485 machine_mode mode0
= insn_data
[icode
].operand
[1].mode
;
8487 if (optimize
|| !target
8488 || GET_MODE (target
) != tmode
8489 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
8490 target
= gen_reg_rtx (tmode
);
8492 if (VECTOR_MODE_P (mode0
))
8493 op0
= safe_vector_operand (op0
, mode0
);
8495 if ((optimize
&& !register_operand (op0
, mode0
))
8496 || !insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
8497 op0
= copy_to_mode_reg (mode0
, op0
);
8500 if (!insn_data
[icode
].operand
[2].predicate (op1
, mode0
))
8501 op1
= copy_to_mode_reg (mode0
, op1
);
8503 pat
= GEN_FCN (icode
) (target
, op0
, op1
);
8510 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
8513 ix86_expand_sse_compare (const struct builtin_description
*d
,
8514 tree exp
, rtx target
, bool swap
)
8517 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8518 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8519 rtx op0
= expand_normal (arg0
);
8520 rtx op1
= expand_normal (arg1
);
8522 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
8523 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
8524 machine_mode mode1
= insn_data
[d
->icode
].operand
[2].mode
;
8525 enum rtx_code comparison
= d
->comparison
;
8527 if (VECTOR_MODE_P (mode0
))
8528 op0
= safe_vector_operand (op0
, mode0
);
8529 if (VECTOR_MODE_P (mode1
))
8530 op1
= safe_vector_operand (op1
, mode1
);
8532 /* Swap operands if we have a comparison that isn't available in
8535 std::swap (op0
, op1
);
8537 if (optimize
|| !target
8538 || GET_MODE (target
) != tmode
8539 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
8540 target
= gen_reg_rtx (tmode
);
8542 if ((optimize
&& !register_operand (op0
, mode0
))
8543 || !insn_data
[d
->icode
].operand
[1].predicate (op0
, mode0
))
8544 op0
= copy_to_mode_reg (mode0
, op0
);
8545 if ((optimize
&& !register_operand (op1
, mode1
))
8546 || !insn_data
[d
->icode
].operand
[2].predicate (op1
, mode1
))
8547 op1
= copy_to_mode_reg (mode1
, op1
);
8549 op2
= gen_rtx_fmt_ee (comparison
, mode0
, op0
, op1
);
8550 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
, op2
);
8557 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
8560 ix86_expand_sse_comi (const struct builtin_description
*d
, tree exp
,
8564 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8565 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8566 rtx op0
= expand_normal (arg0
);
8567 rtx op1
= expand_normal (arg1
);
8568 machine_mode mode0
= insn_data
[d
->icode
].operand
[0].mode
;
8569 machine_mode mode1
= insn_data
[d
->icode
].operand
[1].mode
;
8570 enum rtx_code comparison
= d
->comparison
;
8572 if (VECTOR_MODE_P (mode0
))
8573 op0
= safe_vector_operand (op0
, mode0
);
8574 if (VECTOR_MODE_P (mode1
))
8575 op1
= safe_vector_operand (op1
, mode1
);
8577 /* Swap operands if we have a comparison that isn't available in
8579 if (d
->flag
& BUILTIN_DESC_SWAP_OPERANDS
)
8580 std::swap (op0
, op1
);
8582 target
= gen_reg_rtx (SImode
);
8583 emit_move_insn (target
, const0_rtx
);
8584 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8586 if ((optimize
&& !register_operand (op0
, mode0
))
8587 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8588 op0
= copy_to_mode_reg (mode0
, op0
);
8589 if ((optimize
&& !register_operand (op1
, mode1
))
8590 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
8591 op1
= copy_to_mode_reg (mode1
, op1
);
8593 pat
= GEN_FCN (d
->icode
) (op0
, op1
);
8597 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8598 gen_rtx_fmt_ee (comparison
, QImode
,
8602 return SUBREG_REG (target
);
8605 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
8608 ix86_expand_sse_round (const struct builtin_description
*d
, tree exp
,
8612 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8613 rtx op1
, op0
= expand_normal (arg0
);
8614 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
8615 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
8617 if (optimize
|| target
== 0
8618 || GET_MODE (target
) != tmode
8619 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
8620 target
= gen_reg_rtx (tmode
);
8622 if (VECTOR_MODE_P (mode0
))
8623 op0
= safe_vector_operand (op0
, mode0
);
8625 if ((optimize
&& !register_operand (op0
, mode0
))
8626 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8627 op0
= copy_to_mode_reg (mode0
, op0
);
8629 op1
= GEN_INT (d
->comparison
);
8631 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
);
8639 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description
*d
,
8640 tree exp
, rtx target
)
8643 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8644 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8645 rtx op0
= expand_normal (arg0
);
8646 rtx op1
= expand_normal (arg1
);
8648 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
8649 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
8650 machine_mode mode1
= insn_data
[d
->icode
].operand
[2].mode
;
8652 if (optimize
|| target
== 0
8653 || GET_MODE (target
) != tmode
8654 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
8655 target
= gen_reg_rtx (tmode
);
8657 op0
= safe_vector_operand (op0
, mode0
);
8658 op1
= safe_vector_operand (op1
, mode1
);
8660 if ((optimize
&& !register_operand (op0
, mode0
))
8661 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8662 op0
= copy_to_mode_reg (mode0
, op0
);
8663 if ((optimize
&& !register_operand (op1
, mode1
))
8664 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
8665 op1
= copy_to_mode_reg (mode1
, op1
);
8667 op2
= GEN_INT (d
->comparison
);
8669 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
, op2
);
8676 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
8679 ix86_expand_sse_ptest (const struct builtin_description
*d
, tree exp
,
8683 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8684 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8685 rtx op0
= expand_normal (arg0
);
8686 rtx op1
= expand_normal (arg1
);
8687 machine_mode mode0
= insn_data
[d
->icode
].operand
[0].mode
;
8688 machine_mode mode1
= insn_data
[d
->icode
].operand
[1].mode
;
8689 enum rtx_code comparison
= d
->comparison
;
8691 if (VECTOR_MODE_P (mode0
))
8692 op0
= safe_vector_operand (op0
, mode0
);
8693 if (VECTOR_MODE_P (mode1
))
8694 op1
= safe_vector_operand (op1
, mode1
);
8696 target
= gen_reg_rtx (SImode
);
8697 emit_move_insn (target
, const0_rtx
);
8698 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8700 if ((optimize
&& !register_operand (op0
, mode0
))
8701 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8702 op0
= copy_to_mode_reg (mode0
, op0
);
8703 if ((optimize
&& !register_operand (op1
, mode1
))
8704 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
8705 op1
= copy_to_mode_reg (mode1
, op1
);
8707 pat
= GEN_FCN (d
->icode
) (op0
, op1
);
8711 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8712 gen_rtx_fmt_ee (comparison
, QImode
,
8716 return SUBREG_REG (target
);
8719 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
8722 ix86_expand_sse_pcmpestr (const struct builtin_description
*d
,
8723 tree exp
, rtx target
)
8726 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8727 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8728 tree arg2
= CALL_EXPR_ARG (exp
, 2);
8729 tree arg3
= CALL_EXPR_ARG (exp
, 3);
8730 tree arg4
= CALL_EXPR_ARG (exp
, 4);
8731 rtx scratch0
, scratch1
;
8732 rtx op0
= expand_normal (arg0
);
8733 rtx op1
= expand_normal (arg1
);
8734 rtx op2
= expand_normal (arg2
);
8735 rtx op3
= expand_normal (arg3
);
8736 rtx op4
= expand_normal (arg4
);
8737 machine_mode tmode0
, tmode1
, modev2
, modei3
, modev4
, modei5
, modeimm
;
8739 tmode0
= insn_data
[d
->icode
].operand
[0].mode
;
8740 tmode1
= insn_data
[d
->icode
].operand
[1].mode
;
8741 modev2
= insn_data
[d
->icode
].operand
[2].mode
;
8742 modei3
= insn_data
[d
->icode
].operand
[3].mode
;
8743 modev4
= insn_data
[d
->icode
].operand
[4].mode
;
8744 modei5
= insn_data
[d
->icode
].operand
[5].mode
;
8745 modeimm
= insn_data
[d
->icode
].operand
[6].mode
;
8747 if (VECTOR_MODE_P (modev2
))
8748 op0
= safe_vector_operand (op0
, modev2
);
8749 if (VECTOR_MODE_P (modev4
))
8750 op2
= safe_vector_operand (op2
, modev4
);
8752 if (!insn_data
[d
->icode
].operand
[2].predicate (op0
, modev2
))
8753 op0
= copy_to_mode_reg (modev2
, op0
);
8754 if (!insn_data
[d
->icode
].operand
[3].predicate (op1
, modei3
))
8755 op1
= copy_to_mode_reg (modei3
, op1
);
8756 if ((optimize
&& !register_operand (op2
, modev4
))
8757 || !insn_data
[d
->icode
].operand
[4].predicate (op2
, modev4
))
8758 op2
= copy_to_mode_reg (modev4
, op2
);
8759 if (!insn_data
[d
->icode
].operand
[5].predicate (op3
, modei5
))
8760 op3
= copy_to_mode_reg (modei5
, op3
);
8762 if (!insn_data
[d
->icode
].operand
[6].predicate (op4
, modeimm
))
8764 error ("the fifth argument must be an 8-bit immediate");
8768 if (d
->code
== IX86_BUILTIN_PCMPESTRI128
)
8770 if (optimize
|| !target
8771 || GET_MODE (target
) != tmode0
8772 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode0
))
8773 target
= gen_reg_rtx (tmode0
);
8775 scratch1
= gen_reg_rtx (tmode1
);
8777 pat
= GEN_FCN (d
->icode
) (target
, scratch1
, op0
, op1
, op2
, op3
, op4
);
8779 else if (d
->code
== IX86_BUILTIN_PCMPESTRM128
)
8781 if (optimize
|| !target
8782 || GET_MODE (target
) != tmode1
8783 || !insn_data
[d
->icode
].operand
[1].predicate (target
, tmode1
))
8784 target
= gen_reg_rtx (tmode1
);
8786 scratch0
= gen_reg_rtx (tmode0
);
8788 pat
= GEN_FCN (d
->icode
) (scratch0
, target
, op0
, op1
, op2
, op3
, op4
);
8792 gcc_assert (d
->flag
);
8794 scratch0
= gen_reg_rtx (tmode0
);
8795 scratch1
= gen_reg_rtx (tmode1
);
8797 pat
= GEN_FCN (d
->icode
) (scratch0
, scratch1
, op0
, op1
, op2
, op3
, op4
);
8807 target
= gen_reg_rtx (SImode
);
8808 emit_move_insn (target
, const0_rtx
);
8809 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8812 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8813 gen_rtx_fmt_ee (EQ
, QImode
,
8814 gen_rtx_REG ((machine_mode
) d
->flag
,
8817 return SUBREG_REG (target
);
8824 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
8827 ix86_expand_sse_pcmpistr (const struct builtin_description
*d
,
8828 tree exp
, rtx target
)
8831 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8832 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8833 tree arg2
= CALL_EXPR_ARG (exp
, 2);
8834 rtx scratch0
, scratch1
;
8835 rtx op0
= expand_normal (arg0
);
8836 rtx op1
= expand_normal (arg1
);
8837 rtx op2
= expand_normal (arg2
);
8838 machine_mode tmode0
, tmode1
, modev2
, modev3
, modeimm
;
8840 tmode0
= insn_data
[d
->icode
].operand
[0].mode
;
8841 tmode1
= insn_data
[d
->icode
].operand
[1].mode
;
8842 modev2
= insn_data
[d
->icode
].operand
[2].mode
;
8843 modev3
= insn_data
[d
->icode
].operand
[3].mode
;
8844 modeimm
= insn_data
[d
->icode
].operand
[4].mode
;
8846 if (VECTOR_MODE_P (modev2
))
8847 op0
= safe_vector_operand (op0
, modev2
);
8848 if (VECTOR_MODE_P (modev3
))
8849 op1
= safe_vector_operand (op1
, modev3
);
8851 if (!insn_data
[d
->icode
].operand
[2].predicate (op0
, modev2
))
8852 op0
= copy_to_mode_reg (modev2
, op0
);
8853 if ((optimize
&& !register_operand (op1
, modev3
))
8854 || !insn_data
[d
->icode
].operand
[3].predicate (op1
, modev3
))
8855 op1
= copy_to_mode_reg (modev3
, op1
);
8857 if (!insn_data
[d
->icode
].operand
[4].predicate (op2
, modeimm
))
8859 error ("the third argument must be an 8-bit immediate");
8863 if (d
->code
== IX86_BUILTIN_PCMPISTRI128
)
8865 if (optimize
|| !target
8866 || GET_MODE (target
) != tmode0
8867 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode0
))
8868 target
= gen_reg_rtx (tmode0
);
8870 scratch1
= gen_reg_rtx (tmode1
);
8872 pat
= GEN_FCN (d
->icode
) (target
, scratch1
, op0
, op1
, op2
);
8874 else if (d
->code
== IX86_BUILTIN_PCMPISTRM128
)
8876 if (optimize
|| !target
8877 || GET_MODE (target
) != tmode1
8878 || !insn_data
[d
->icode
].operand
[1].predicate (target
, tmode1
))
8879 target
= gen_reg_rtx (tmode1
);
8881 scratch0
= gen_reg_rtx (tmode0
);
8883 pat
= GEN_FCN (d
->icode
) (scratch0
, target
, op0
, op1
, op2
);
8887 gcc_assert (d
->flag
);
8889 scratch0
= gen_reg_rtx (tmode0
);
8890 scratch1
= gen_reg_rtx (tmode1
);
8892 pat
= GEN_FCN (d
->icode
) (scratch0
, scratch1
, op0
, op1
, op2
);
8902 target
= gen_reg_rtx (SImode
);
8903 emit_move_insn (target
, const0_rtx
);
8904 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8907 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8908 gen_rtx_fmt_ee (EQ
, QImode
,
8909 gen_rtx_REG ((machine_mode
) d
->flag
,
8912 return SUBREG_REG (target
);
8918 /* Fixup modeless constants to fit required mode. */
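/* E.g. a CONST_INT always has VOIDmode; if the expander needs it as, say,
   an SImode operand, it is converted here so that the mode checks that
   follow see the expected mode.  */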
8921 fixup_modeless_constant (rtx x
, machine_mode mode
)
8923 if (GET_MODE (x
) == VOIDmode
)
8924 x
= convert_to_mode (mode
, x
, 1);
8928 /* Subroutine of ix86_expand_builtin to take care of insns with
8929 variable number of operands. */
8932 ix86_expand_args_builtin (const struct builtin_description
*d
,
8933 tree exp
, rtx target
)
8935 rtx pat
, real_target
;
8936 unsigned int i
, nargs
;
8937 unsigned int nargs_constant
= 0;
8938 unsigned int mask_pos
= 0;
8945 bool second_arg_count
= false;
8946 enum insn_code icode
= d
->icode
;
8947 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
8948 machine_mode tmode
= insn_p
->operand
[0].mode
;
8949 machine_mode rmode
= VOIDmode
;
8951 enum rtx_code comparison
= d
->comparison
;
8953 switch ((enum ix86_builtin_func_type
) d
->flag
)
8955 case V2DF_FTYPE_V2DF_ROUND
:
8956 case V4DF_FTYPE_V4DF_ROUND
:
8957 case V8DF_FTYPE_V8DF_ROUND
:
8958 case V4SF_FTYPE_V4SF_ROUND
:
8959 case V8SF_FTYPE_V8SF_ROUND
:
8960 case V16SF_FTYPE_V16SF_ROUND
:
8961 case V4SI_FTYPE_V4SF_ROUND
:
8962 case V8SI_FTYPE_V8SF_ROUND
:
8963 case V16SI_FTYPE_V16SF_ROUND
:
8964 return ix86_expand_sse_round (d
, exp
, target
);
8965 case V4SI_FTYPE_V2DF_V2DF_ROUND
:
8966 case V8SI_FTYPE_V4DF_V4DF_ROUND
:
8967 case V16SI_FTYPE_V8DF_V8DF_ROUND
:
8968 return ix86_expand_sse_round_vec_pack_sfix (d
, exp
, target
);
8969 case INT_FTYPE_V8SF_V8SF_PTEST
:
8970 case INT_FTYPE_V4DI_V4DI_PTEST
:
8971 case INT_FTYPE_V4DF_V4DF_PTEST
:
8972 case INT_FTYPE_V4SF_V4SF_PTEST
:
8973 case INT_FTYPE_V2DI_V2DI_PTEST
:
8974 case INT_FTYPE_V2DF_V2DF_PTEST
:
8975 return ix86_expand_sse_ptest (d
, exp
, target
);
8976 case FLOAT128_FTYPE_FLOAT128
:
8977 case FLOAT_FTYPE_FLOAT
:
8979 case UINT_FTYPE_UINT
:
8980 case UINT16_FTYPE_UINT16
:
8981 case UINT64_FTYPE_INT
:
8982 case UINT64_FTYPE_UINT64
:
8983 case INT64_FTYPE_INT64
:
8984 case INT64_FTYPE_V4SF
:
8985 case INT64_FTYPE_V2DF
:
8986 case INT_FTYPE_V16QI
:
8987 case INT_FTYPE_V8QI
:
8988 case INT_FTYPE_V8SF
:
8989 case INT_FTYPE_V4DF
:
8990 case INT_FTYPE_V4SF
:
8991 case INT_FTYPE_V2DF
:
8992 case INT_FTYPE_V32QI
:
8993 case V16QI_FTYPE_V16QI
:
8994 case V8SI_FTYPE_V8SF
:
8995 case V8SI_FTYPE_V4SI
:
8996 case V8HI_FTYPE_V8HI
:
8997 case V8HI_FTYPE_V16QI
:
8998 case V8QI_FTYPE_V8QI
:
8999 case V8SF_FTYPE_V8SF
:
9000 case V8SF_FTYPE_V8SI
:
9001 case V8SF_FTYPE_V4SF
:
9002 case V8SF_FTYPE_V8HI
:
9003 case V4SI_FTYPE_V4SI
:
9004 case V4SI_FTYPE_V16QI
:
9005 case V4SI_FTYPE_V4SF
:
9006 case V4SI_FTYPE_V8SI
:
9007 case V4SI_FTYPE_V8HI
:
9008 case V4SI_FTYPE_V4DF
:
9009 case V4SI_FTYPE_V2DF
:
9010 case V4HI_FTYPE_V4HI
:
9011 case V4DF_FTYPE_V4DF
:
9012 case V4DF_FTYPE_V4SI
:
9013 case V4DF_FTYPE_V4SF
:
9014 case V4DF_FTYPE_V2DF
:
9015 case V4SF_FTYPE_V4SF
:
9016 case V4SF_FTYPE_V4SI
:
9017 case V4SF_FTYPE_V8SF
:
9018 case V4SF_FTYPE_V4DF
:
9019 case V4SF_FTYPE_V8HI
:
9020 case V4SF_FTYPE_V2DF
:
9021 case V2DI_FTYPE_V2DI
:
9022 case V2DI_FTYPE_V16QI
:
9023 case V2DI_FTYPE_V8HI
:
9024 case V2DI_FTYPE_V4SI
:
9025 case V2DF_FTYPE_V2DF
:
9026 case V2DF_FTYPE_V4SI
:
9027 case V2DF_FTYPE_V4DF
:
9028 case V2DF_FTYPE_V4SF
:
9029 case V2DF_FTYPE_V2SI
:
9030 case V2SI_FTYPE_V2SI
:
9031 case V2SI_FTYPE_V4SF
:
9032 case V2SI_FTYPE_V2SF
:
9033 case V2SI_FTYPE_V2DF
:
9034 case V2SF_FTYPE_V2SF
:
9035 case V2SF_FTYPE_V2SI
:
9036 case V32QI_FTYPE_V32QI
:
9037 case V32QI_FTYPE_V16QI
:
9038 case V16HI_FTYPE_V16HI
:
9039 case V16HI_FTYPE_V8HI
:
9040 case V8SI_FTYPE_V8SI
:
9041 case V16HI_FTYPE_V16QI
:
9042 case V8SI_FTYPE_V16QI
:
9043 case V4DI_FTYPE_V16QI
:
9044 case V8SI_FTYPE_V8HI
:
9045 case V4DI_FTYPE_V8HI
:
9046 case V4DI_FTYPE_V4SI
:
9047 case V4DI_FTYPE_V2DI
:
9054 case UHI_FTYPE_V16QI
:
9055 case USI_FTYPE_V32QI
:
9056 case UDI_FTYPE_V64QI
:
9057 case V16QI_FTYPE_UHI
:
9058 case V32QI_FTYPE_USI
:
9059 case V64QI_FTYPE_UDI
:
9060 case V8HI_FTYPE_UQI
:
9061 case V16HI_FTYPE_UHI
:
9062 case V32HI_FTYPE_USI
:
9063 case V4SI_FTYPE_UQI
:
9064 case V8SI_FTYPE_UQI
:
9065 case V4SI_FTYPE_UHI
:
9066 case V8SI_FTYPE_UHI
:
9067 case UQI_FTYPE_V8HI
:
9068 case UHI_FTYPE_V16HI
:
9069 case USI_FTYPE_V32HI
:
9070 case UQI_FTYPE_V4SI
:
9071 case UQI_FTYPE_V8SI
:
9072 case UHI_FTYPE_V16SI
:
9073 case UQI_FTYPE_V2DI
:
9074 case UQI_FTYPE_V4DI
:
9075 case UQI_FTYPE_V8DI
:
9076 case V16SI_FTYPE_UHI
:
9077 case V2DI_FTYPE_UQI
:
9078 case V4DI_FTYPE_UQI
:
9079 case V16SI_FTYPE_INT
:
9080 case V16SF_FTYPE_V8SF
:
9081 case V16SI_FTYPE_V8SI
:
9082 case V16SF_FTYPE_V4SF
:
9083 case V16SI_FTYPE_V4SI
:
9084 case V16SI_FTYPE_V16SF
:
9085 case V16SI_FTYPE_V16SI
:
9086 case V64QI_FTYPE_V64QI
:
9087 case V32HI_FTYPE_V32HI
:
9088 case V16SF_FTYPE_V16SF
:
9089 case V8DI_FTYPE_UQI
:
9090 case V8DI_FTYPE_V8DI
:
9091 case V8DF_FTYPE_V4DF
:
9092 case V8DF_FTYPE_V2DF
:
9093 case V8DF_FTYPE_V8DF
:
9094 case V4DI_FTYPE_V4DI
:
9095 case V16HI_FTYPE_V16SF
:
9096 case V8HI_FTYPE_V8SF
:
9097 case V8HI_FTYPE_V4SF
:
9100 case V4SF_FTYPE_V4SF_VEC_MERGE
:
9101 case V2DF_FTYPE_V2DF_VEC_MERGE
:
9102 return ix86_expand_unop_vec_merge_builtin (icode
, exp
, target
);
9103 case FLOAT128_FTYPE_FLOAT128_FLOAT128
:
9104 case V16QI_FTYPE_V16QI_V16QI
:
9105 case V16QI_FTYPE_V8HI_V8HI
:
9106 case V16SF_FTYPE_V16SF_V16SF
:
9107 case V8QI_FTYPE_V8QI_V8QI
:
9108 case V8QI_FTYPE_V4HI_V4HI
:
9109 case V8HI_FTYPE_V8HI_V8HI
:
9110 case V8HI_FTYPE_V16QI_V16QI
:
9111 case V8HI_FTYPE_V4SI_V4SI
:
9112 case V8SF_FTYPE_V8SF_V8SF
:
9113 case V8SF_FTYPE_V8SF_V8SI
:
9114 case V8DF_FTYPE_V8DF_V8DF
:
9115 case V4SI_FTYPE_V4SI_V4SI
:
9116 case V4SI_FTYPE_V8HI_V8HI
:
9117 case V4SI_FTYPE_V2DF_V2DF
:
9118 case V4HI_FTYPE_V4HI_V4HI
:
9119 case V4HI_FTYPE_V8QI_V8QI
:
9120 case V4HI_FTYPE_V2SI_V2SI
:
9121 case V4DF_FTYPE_V4DF_V4DF
:
9122 case V4DF_FTYPE_V4DF_V4DI
:
9123 case V4SF_FTYPE_V4SF_V4SF
:
9124 case V4SF_FTYPE_V4SF_V4SI
:
9125 case V4SF_FTYPE_V4SF_V2SI
:
9126 case V4SF_FTYPE_V4SF_V2DF
:
9127 case V4SF_FTYPE_V4SF_UINT
:
9128 case V4SF_FTYPE_V4SF_DI
:
9129 case V4SF_FTYPE_V4SF_SI
:
9130 case V2DI_FTYPE_V2DI_V2DI
:
9131 case V2DI_FTYPE_V16QI_V16QI
:
9132 case V2DI_FTYPE_V4SI_V4SI
:
9133 case V2DI_FTYPE_V2DI_V16QI
:
9134 case V2SI_FTYPE_V2SI_V2SI
:
9135 case V2SI_FTYPE_V4HI_V4HI
:
9136 case V2SI_FTYPE_V2SF_V2SF
:
9137 case V2DF_FTYPE_V2DF_V2DF
:
9138 case V2DF_FTYPE_V2DF_V4SF
:
9139 case V2DF_FTYPE_V2DF_V2DI
:
9140 case V2DF_FTYPE_V2DF_DI
:
9141 case V2DF_FTYPE_V2DF_SI
:
9142 case V2DF_FTYPE_V2DF_UINT
:
9143 case V2SF_FTYPE_V2SF_V2SF
:
9144 case V1DI_FTYPE_V1DI_V1DI
:
9145 case V1DI_FTYPE_V8QI_V8QI
:
9146 case V1DI_FTYPE_V2SI_V2SI
:
9147 case V32QI_FTYPE_V16HI_V16HI
:
9148 case V16HI_FTYPE_V8SI_V8SI
:
9149 case V64QI_FTYPE_V64QI_V64QI
:
9150 case V32QI_FTYPE_V32QI_V32QI
:
9151 case V16HI_FTYPE_V32QI_V32QI
:
9152 case V16HI_FTYPE_V16HI_V16HI
:
9153 case V8SI_FTYPE_V4DF_V4DF
:
9154 case V8SI_FTYPE_V8SI_V8SI
:
9155 case V8SI_FTYPE_V16HI_V16HI
:
9156 case V4DI_FTYPE_V4DI_V4DI
:
9157 case V4DI_FTYPE_V8SI_V8SI
:
9158 case V8DI_FTYPE_V64QI_V64QI
:
      if (comparison == UNKNOWN)
	return ix86_expand_binop_builtin (icode, exp, target);
      nargs = 2;
      break;
    case V4SF_FTYPE_V4SF_V4SF_SWAP:
    case V2DF_FTYPE_V2DF_V2DF_SWAP:
      gcc_assert (comparison != UNKNOWN);
      nargs = 2;
      swap = true;
      break;
9169 case V16HI_FTYPE_V16HI_V8HI_COUNT
:
9170 case V16HI_FTYPE_V16HI_SI_COUNT
:
9171 case V8SI_FTYPE_V8SI_V4SI_COUNT
:
9172 case V8SI_FTYPE_V8SI_SI_COUNT
:
9173 case V4DI_FTYPE_V4DI_V2DI_COUNT
:
9174 case V4DI_FTYPE_V4DI_INT_COUNT
:
9175 case V8HI_FTYPE_V8HI_V8HI_COUNT
:
9176 case V8HI_FTYPE_V8HI_SI_COUNT
:
9177 case V4SI_FTYPE_V4SI_V4SI_COUNT
:
9178 case V4SI_FTYPE_V4SI_SI_COUNT
:
9179 case V4HI_FTYPE_V4HI_V4HI_COUNT
:
9180 case V4HI_FTYPE_V4HI_SI_COUNT
:
9181 case V2DI_FTYPE_V2DI_V2DI_COUNT
:
9182 case V2DI_FTYPE_V2DI_SI_COUNT
:
9183 case V2SI_FTYPE_V2SI_V2SI_COUNT
:
9184 case V2SI_FTYPE_V2SI_SI_COUNT
:
9185 case V1DI_FTYPE_V1DI_V1DI_COUNT
:
9186 case V1DI_FTYPE_V1DI_SI_COUNT
:
9188 second_arg_count
= true;
9190 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT
:
9191 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT
:
9192 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT
:
9193 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT
:
9194 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT
:
9195 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT
:
9196 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT
:
9197 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT
:
9198 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT
:
9199 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT
:
9200 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT
:
9201 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT
:
9202 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT
:
9203 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT
:
9204 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT
:
9205 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT
:
9206 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT
:
9207 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT
:
9209 second_arg_count
= true;
9211 case UINT64_FTYPE_UINT64_UINT64
:
9212 case UINT_FTYPE_UINT_UINT
:
9213 case UINT_FTYPE_UINT_USHORT
:
9214 case UINT_FTYPE_UINT_UCHAR
:
9215 case UINT16_FTYPE_UINT16_INT
:
9216 case UINT8_FTYPE_UINT8_INT
:
9217 case UQI_FTYPE_UQI_UQI
:
9218 case UHI_FTYPE_UHI_UHI
:
9219 case USI_FTYPE_USI_USI
:
9220 case UDI_FTYPE_UDI_UDI
:
9221 case V16SI_FTYPE_V8DF_V8DF
:
9222 case V32HI_FTYPE_V16SF_V16SF
:
9223 case V16HI_FTYPE_V8SF_V8SF
:
9224 case V8HI_FTYPE_V4SF_V4SF
:
9225 case V16HI_FTYPE_V16SF_UHI
:
9226 case V8HI_FTYPE_V8SF_UQI
:
9227 case V8HI_FTYPE_V4SF_UQI
:
9230 case V2DI_FTYPE_V2DI_INT_CONVERT
:
9235 case V4DI_FTYPE_V4DI_INT_CONVERT
:
9240 case V8DI_FTYPE_V8DI_INT_CONVERT
:
9245 case V8HI_FTYPE_V8HI_INT
:
9246 case V8HI_FTYPE_V8SF_INT
:
9247 case V16HI_FTYPE_V16SF_INT
:
9248 case V8HI_FTYPE_V4SF_INT
:
9249 case V8SF_FTYPE_V8SF_INT
:
9250 case V4SF_FTYPE_V16SF_INT
:
9251 case V16SF_FTYPE_V16SF_INT
:
9252 case V4SI_FTYPE_V4SI_INT
:
9253 case V4SI_FTYPE_V8SI_INT
:
9254 case V4HI_FTYPE_V4HI_INT
:
9255 case V4DF_FTYPE_V4DF_INT
:
9256 case V4DF_FTYPE_V8DF_INT
:
9257 case V4SF_FTYPE_V4SF_INT
:
9258 case V4SF_FTYPE_V8SF_INT
:
9259 case V2DI_FTYPE_V2DI_INT
:
9260 case V2DF_FTYPE_V2DF_INT
:
9261 case V2DF_FTYPE_V4DF_INT
:
9262 case V16HI_FTYPE_V16HI_INT
:
9263 case V8SI_FTYPE_V8SI_INT
:
9264 case V16SI_FTYPE_V16SI_INT
:
9265 case V4SI_FTYPE_V16SI_INT
:
9266 case V4DI_FTYPE_V4DI_INT
:
9267 case V2DI_FTYPE_V4DI_INT
:
9268 case V4DI_FTYPE_V8DI_INT
:
9269 case UQI_FTYPE_UQI_UQI_CONST
:
9270 case UHI_FTYPE_UHI_UQI
:
9271 case USI_FTYPE_USI_UQI
:
9272 case UDI_FTYPE_UDI_UQI
:
9276 case V16QI_FTYPE_V16QI_V16QI_V16QI
:
9277 case V8SF_FTYPE_V8SF_V8SF_V8SF
:
9278 case V4DF_FTYPE_V4DF_V4DF_V4DF
:
9279 case V4SF_FTYPE_V4SF_V4SF_V4SF
:
9280 case V2DF_FTYPE_V2DF_V2DF_V2DF
:
9281 case V32QI_FTYPE_V32QI_V32QI_V32QI
:
9282 case UHI_FTYPE_V16SI_V16SI_UHI
:
9283 case UQI_FTYPE_V8DI_V8DI_UQI
:
9284 case V16HI_FTYPE_V16SI_V16HI_UHI
:
9285 case V16QI_FTYPE_V16SI_V16QI_UHI
:
9286 case V16QI_FTYPE_V8DI_V16QI_UQI
:
9287 case V16SF_FTYPE_V16SF_V16SF_UHI
:
9288 case V16SF_FTYPE_V4SF_V16SF_UHI
:
9289 case V16SI_FTYPE_SI_V16SI_UHI
:
9290 case V16SI_FTYPE_V16HI_V16SI_UHI
:
9291 case V16SI_FTYPE_V16QI_V16SI_UHI
:
9292 case V8SF_FTYPE_V4SF_V8SF_UQI
:
9293 case V4DF_FTYPE_V2DF_V4DF_UQI
:
9294 case V8SI_FTYPE_V4SI_V8SI_UQI
:
9295 case V8SI_FTYPE_SI_V8SI_UQI
:
9296 case V4SI_FTYPE_V4SI_V4SI_UQI
:
9297 case V4SI_FTYPE_SI_V4SI_UQI
:
9298 case V4DI_FTYPE_V2DI_V4DI_UQI
:
9299 case V4DI_FTYPE_DI_V4DI_UQI
:
9300 case V2DI_FTYPE_V2DI_V2DI_UQI
:
9301 case V2DI_FTYPE_DI_V2DI_UQI
:
9302 case V64QI_FTYPE_V64QI_V64QI_UDI
:
9303 case V64QI_FTYPE_V16QI_V64QI_UDI
:
9304 case V64QI_FTYPE_QI_V64QI_UDI
:
9305 case V32QI_FTYPE_V32QI_V32QI_USI
:
9306 case V32QI_FTYPE_V16QI_V32QI_USI
:
9307 case V32QI_FTYPE_QI_V32QI_USI
:
9308 case V16QI_FTYPE_V16QI_V16QI_UHI
:
9309 case V16QI_FTYPE_QI_V16QI_UHI
:
9310 case V32HI_FTYPE_V8HI_V32HI_USI
:
9311 case V32HI_FTYPE_HI_V32HI_USI
:
9312 case V16HI_FTYPE_V8HI_V16HI_UHI
:
9313 case V16HI_FTYPE_HI_V16HI_UHI
:
9314 case V8HI_FTYPE_V8HI_V8HI_UQI
:
9315 case V8HI_FTYPE_HI_V8HI_UQI
:
9316 case V8SF_FTYPE_V8HI_V8SF_UQI
:
9317 case V4SF_FTYPE_V8HI_V4SF_UQI
:
9318 case V8SI_FTYPE_V8SF_V8SI_UQI
:
9319 case V4SI_FTYPE_V4SF_V4SI_UQI
:
9320 case V4DI_FTYPE_V4SF_V4DI_UQI
:
9321 case V2DI_FTYPE_V4SF_V2DI_UQI
:
9322 case V4SF_FTYPE_V4DI_V4SF_UQI
:
9323 case V4SF_FTYPE_V2DI_V4SF_UQI
:
9324 case V4DF_FTYPE_V4DI_V4DF_UQI
:
9325 case V2DF_FTYPE_V2DI_V2DF_UQI
:
9326 case V16QI_FTYPE_V8HI_V16QI_UQI
:
9327 case V16QI_FTYPE_V16HI_V16QI_UHI
:
9328 case V16QI_FTYPE_V4SI_V16QI_UQI
:
9329 case V16QI_FTYPE_V8SI_V16QI_UQI
:
9330 case V8HI_FTYPE_V4SI_V8HI_UQI
:
9331 case V8HI_FTYPE_V8SI_V8HI_UQI
:
9332 case V16QI_FTYPE_V2DI_V16QI_UQI
:
9333 case V16QI_FTYPE_V4DI_V16QI_UQI
:
9334 case V8HI_FTYPE_V2DI_V8HI_UQI
:
9335 case V8HI_FTYPE_V4DI_V8HI_UQI
:
9336 case V4SI_FTYPE_V2DI_V4SI_UQI
:
9337 case V4SI_FTYPE_V4DI_V4SI_UQI
:
9338 case V32QI_FTYPE_V32HI_V32QI_USI
:
9339 case UHI_FTYPE_V16QI_V16QI_UHI
:
9340 case USI_FTYPE_V32QI_V32QI_USI
:
9341 case UDI_FTYPE_V64QI_V64QI_UDI
:
9342 case UQI_FTYPE_V8HI_V8HI_UQI
:
9343 case UHI_FTYPE_V16HI_V16HI_UHI
:
9344 case USI_FTYPE_V32HI_V32HI_USI
:
9345 case UQI_FTYPE_V4SI_V4SI_UQI
:
9346 case UQI_FTYPE_V8SI_V8SI_UQI
:
9347 case UQI_FTYPE_V2DI_V2DI_UQI
:
9348 case UQI_FTYPE_V4DI_V4DI_UQI
:
9349 case V4SF_FTYPE_V2DF_V4SF_UQI
:
9350 case V4SF_FTYPE_V4DF_V4SF_UQI
:
9351 case V16SI_FTYPE_V16SI_V16SI_UHI
:
9352 case V16SI_FTYPE_V4SI_V16SI_UHI
:
9353 case V2DI_FTYPE_V4SI_V2DI_UQI
:
9354 case V2DI_FTYPE_V8HI_V2DI_UQI
:
9355 case V2DI_FTYPE_V16QI_V2DI_UQI
:
9356 case V4DI_FTYPE_V4DI_V4DI_UQI
:
9357 case V4DI_FTYPE_V4SI_V4DI_UQI
:
9358 case V4DI_FTYPE_V8HI_V4DI_UQI
:
9359 case V4DI_FTYPE_V16QI_V4DI_UQI
:
9360 case V4DI_FTYPE_V4DF_V4DI_UQI
:
9361 case V2DI_FTYPE_V2DF_V2DI_UQI
:
9362 case V4SI_FTYPE_V4DF_V4SI_UQI
:
9363 case V4SI_FTYPE_V2DF_V4SI_UQI
:
9364 case V4SI_FTYPE_V8HI_V4SI_UQI
:
9365 case V4SI_FTYPE_V16QI_V4SI_UQI
:
9366 case V4DI_FTYPE_V4DI_V4DI_V4DI
:
9367 case V8DF_FTYPE_V2DF_V8DF_UQI
:
9368 case V8DF_FTYPE_V4DF_V8DF_UQI
:
9369 case V8DF_FTYPE_V8DF_V8DF_UQI
:
9370 case V8SF_FTYPE_V8SF_V8SF_UQI
:
9371 case V8SF_FTYPE_V8SI_V8SF_UQI
:
9372 case V4DF_FTYPE_V4DF_V4DF_UQI
:
9373 case V4SF_FTYPE_V4SF_V4SF_UQI
:
9374 case V2DF_FTYPE_V2DF_V2DF_UQI
:
9375 case V2DF_FTYPE_V4SF_V2DF_UQI
:
9376 case V2DF_FTYPE_V4SI_V2DF_UQI
:
9377 case V4SF_FTYPE_V4SI_V4SF_UQI
:
9378 case V4DF_FTYPE_V4SF_V4DF_UQI
:
9379 case V4DF_FTYPE_V4SI_V4DF_UQI
:
9380 case V8SI_FTYPE_V8SI_V8SI_UQI
:
9381 case V8SI_FTYPE_V8HI_V8SI_UQI
:
9382 case V8SI_FTYPE_V16QI_V8SI_UQI
:
9383 case V8DF_FTYPE_V8SI_V8DF_UQI
:
9384 case V8DI_FTYPE_DI_V8DI_UQI
:
9385 case V16SF_FTYPE_V8SF_V16SF_UHI
:
9386 case V16SI_FTYPE_V8SI_V16SI_UHI
:
9387 case V16HI_FTYPE_V16HI_V16HI_UHI
:
9388 case V8HI_FTYPE_V16QI_V8HI_UQI
:
9389 case V16HI_FTYPE_V16QI_V16HI_UHI
:
9390 case V32HI_FTYPE_V32HI_V32HI_USI
:
9391 case V32HI_FTYPE_V32QI_V32HI_USI
:
9392 case V8DI_FTYPE_V16QI_V8DI_UQI
:
9393 case V8DI_FTYPE_V2DI_V8DI_UQI
:
9394 case V8DI_FTYPE_V4DI_V8DI_UQI
:
9395 case V8DI_FTYPE_V8DI_V8DI_UQI
:
9396 case V8DI_FTYPE_V8HI_V8DI_UQI
:
9397 case V8DI_FTYPE_V8SI_V8DI_UQI
:
9398 case V8HI_FTYPE_V8DI_V8HI_UQI
:
9399 case V8SI_FTYPE_V8DI_V8SI_UQI
:
9400 case V4SI_FTYPE_V4SI_V4SI_V4SI
:
9401 case V16SI_FTYPE_V16SI_V16SI_V16SI
:
9402 case V8DI_FTYPE_V8DI_V8DI_V8DI
:
9403 case V32HI_FTYPE_V32HI_V32HI_V32HI
:
9404 case V2DI_FTYPE_V2DI_V2DI_V2DI
:
9405 case V16HI_FTYPE_V16HI_V16HI_V16HI
:
9406 case V8SI_FTYPE_V8SI_V8SI_V8SI
:
9407 case V8HI_FTYPE_V8HI_V8HI_V8HI
:
9408 case V32HI_FTYPE_V16SF_V16SF_USI
:
9409 case V16HI_FTYPE_V8SF_V8SF_UHI
:
9410 case V8HI_FTYPE_V4SF_V4SF_UQI
:
9411 case V16HI_FTYPE_V16SF_V16HI_UHI
:
9412 case V8HI_FTYPE_V8SF_V8HI_UQI
:
9413 case V8HI_FTYPE_V4SF_V8HI_UQI
:
9414 case V16SF_FTYPE_V16SF_V32HI_V32HI
:
9415 case V8SF_FTYPE_V8SF_V16HI_V16HI
:
9416 case V4SF_FTYPE_V4SF_V8HI_V8HI
:
9419 case V32QI_FTYPE_V32QI_V32QI_INT
:
9420 case V16HI_FTYPE_V16HI_V16HI_INT
:
9421 case V16QI_FTYPE_V16QI_V16QI_INT
:
9422 case V4DI_FTYPE_V4DI_V4DI_INT
:
9423 case V8HI_FTYPE_V8HI_V8HI_INT
:
9424 case V8SI_FTYPE_V8SI_V8SI_INT
:
9425 case V8SI_FTYPE_V8SI_V4SI_INT
:
9426 case V8SF_FTYPE_V8SF_V8SF_INT
:
9427 case V8SF_FTYPE_V8SF_V4SF_INT
:
9428 case V4SI_FTYPE_V4SI_V4SI_INT
:
9429 case V4DF_FTYPE_V4DF_V4DF_INT
:
9430 case V16SF_FTYPE_V16SF_V16SF_INT
:
9431 case V16SF_FTYPE_V16SF_V4SF_INT
:
9432 case V16SI_FTYPE_V16SI_V4SI_INT
:
9433 case V4DF_FTYPE_V4DF_V2DF_INT
:
9434 case V4SF_FTYPE_V4SF_V4SF_INT
:
9435 case V2DI_FTYPE_V2DI_V2DI_INT
:
9436 case V4DI_FTYPE_V4DI_V2DI_INT
:
9437 case V2DF_FTYPE_V2DF_V2DF_INT
:
9438 case UQI_FTYPE_V8DI_V8UDI_INT
:
9439 case UQI_FTYPE_V8DF_V8DF_INT
:
9440 case UQI_FTYPE_V2DF_V2DF_INT
:
9441 case UQI_FTYPE_V4SF_V4SF_INT
:
9442 case UHI_FTYPE_V16SI_V16SI_INT
:
9443 case UHI_FTYPE_V16SF_V16SF_INT
:
9444 case V64QI_FTYPE_V64QI_V64QI_INT
:
9445 case V32HI_FTYPE_V32HI_V32HI_INT
:
9446 case V16SI_FTYPE_V16SI_V16SI_INT
:
9447 case V8DI_FTYPE_V8DI_V8DI_INT
:
9451 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT
:
9456 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT
:
9461 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT
:
9466 case V2DI_FTYPE_V2DI_UINT_UINT
:
9470 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT
:
9475 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT
:
9481 case QI_FTYPE_V8DF_INT_UQI
:
9482 case QI_FTYPE_V4DF_INT_UQI
:
9483 case QI_FTYPE_V2DF_INT_UQI
:
9484 case HI_FTYPE_V16SF_INT_UHI
:
9485 case QI_FTYPE_V8SF_INT_UQI
:
9486 case QI_FTYPE_V4SF_INT_UQI
:
9487 case V4SI_FTYPE_V4SI_V4SI_UHI
:
9488 case V8SI_FTYPE_V8SI_V8SI_UHI
:
9493 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT
:
9499 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT
:
9505 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI
:
9506 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI
:
9507 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI
:
9508 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI
:
9509 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI
:
9510 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI
:
9511 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI
:
9512 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI
:
9513 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI
:
9514 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI
:
9515 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI
:
9516 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI
:
9517 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI
:
9518 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI
:
9519 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI
:
9520 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI
:
9521 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI
:
9522 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI
:
9523 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI
:
9524 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI
:
9525 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI
:
9526 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI
:
9527 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI
:
9528 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI
:
9529 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI
:
9530 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI
:
9531 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI
:
9532 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI
:
9533 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI
:
9534 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI
:
9535 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI
:
9536 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI
:
9537 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI
:
9538 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI
:
9539 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI
:
9540 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI
:
9541 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI
:
9542 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI
:
9543 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI
:
9544 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI
:
9545 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI
:
9546 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI
:
9547 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI
:
9548 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI
:
9549 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI
:
9550 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI
:
9551 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI
:
9552 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI
:
9553 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI
:
9554 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI
:
9555 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI
:
9556 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI
:
9557 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI
:
9558 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI
:
9561 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT
:
9562 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT
:
9563 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT
:
9564 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT
:
9565 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT
:
9569 case UQI_FTYPE_V4DI_V4DI_INT_UQI
:
9570 case UQI_FTYPE_V8SI_V8SI_INT_UQI
:
9571 case QI_FTYPE_V4DF_V4DF_INT_UQI
:
9572 case QI_FTYPE_V8SF_V8SF_INT_UQI
:
9573 case UQI_FTYPE_V2DI_V2DI_INT_UQI
:
9574 case UQI_FTYPE_V4SI_V4SI_INT_UQI
:
9575 case UQI_FTYPE_V2DF_V2DF_INT_UQI
:
9576 case UQI_FTYPE_V4SF_V4SF_INT_UQI
:
9577 case UDI_FTYPE_V64QI_V64QI_INT_UDI
:
9578 case USI_FTYPE_V32QI_V32QI_INT_USI
:
9579 case UHI_FTYPE_V16QI_V16QI_INT_UHI
:
9580 case USI_FTYPE_V32HI_V32HI_INT_USI
:
9581 case UHI_FTYPE_V16HI_V16HI_INT_UHI
:
9582 case UQI_FTYPE_V8HI_V8HI_INT_UQI
:
9587 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT
:
9591 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED
:
9592 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG
:
9593 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI
:
9594 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI
:
9595 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI
:
9598 case UQI_FTYPE_V8DI_V8DI_INT_UQI
:
9599 case UHI_FTYPE_V16SI_V16SI_INT_UHI
:
9604 case V8SF_FTYPE_V8SF_INT_V8SF_UQI
:
9605 case V4SF_FTYPE_V4SF_INT_V4SF_UQI
:
9606 case V2DF_FTYPE_V4DF_INT_V2DF_UQI
:
9607 case V2DI_FTYPE_V4DI_INT_V2DI_UQI
:
9608 case V8SF_FTYPE_V16SF_INT_V8SF_UQI
:
9609 case V8SI_FTYPE_V16SI_INT_V8SI_UQI
:
9610 case V2DF_FTYPE_V8DF_INT_V2DF_UQI
:
9611 case V2DI_FTYPE_V8DI_INT_V2DI_UQI
:
9612 case V4SF_FTYPE_V8SF_INT_V4SF_UQI
:
9613 case V4SI_FTYPE_V8SI_INT_V4SI_UQI
:
9614 case V8HI_FTYPE_V8SF_INT_V8HI_UQI
:
9615 case V8HI_FTYPE_V4SF_INT_V8HI_UQI
:
9616 case V32HI_FTYPE_V32HI_INT_V32HI_USI
:
9617 case V16HI_FTYPE_V16HI_INT_V16HI_UHI
:
9618 case V8HI_FTYPE_V8HI_INT_V8HI_UQI
:
9619 case V4DI_FTYPE_V4DI_INT_V4DI_UQI
:
9620 case V2DI_FTYPE_V2DI_INT_V2DI_UQI
:
9621 case V8SI_FTYPE_V8SI_INT_V8SI_UQI
:
9622 case V4SI_FTYPE_V4SI_INT_V4SI_UQI
:
9623 case V4DF_FTYPE_V4DF_INT_V4DF_UQI
:
9624 case V2DF_FTYPE_V2DF_INT_V2DF_UQI
:
9625 case V8DF_FTYPE_V8DF_INT_V8DF_UQI
:
9626 case V16SF_FTYPE_V16SF_INT_V16SF_UHI
:
9627 case V16HI_FTYPE_V16SF_INT_V16HI_UHI
:
9628 case V16SI_FTYPE_V16SI_INT_V16SI_UHI
:
9629 case V4SI_FTYPE_V16SI_INT_V4SI_UQI
:
9630 case V4DI_FTYPE_V8DI_INT_V4DI_UQI
:
9631 case V4DF_FTYPE_V8DF_INT_V4DF_UQI
:
9632 case V4SF_FTYPE_V16SF_INT_V4SF_UQI
:
9633 case V8DI_FTYPE_V8DI_INT_V8DI_UQI
:
9638 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI
:
9639 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI
:
9640 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI
:
9641 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI
:
9642 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI
:
9643 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI
:
9644 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI
:
9645 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI
:
9646 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI
:
9647 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI
:
9648 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI
:
9649 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI
:
9650 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI
:
9651 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI
:
9652 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI
:
9653 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI
:
9654 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI
:
9655 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI
:
9656 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI
:
9657 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI
:
9658 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI
:
9659 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI
:
9660 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI
:
9661 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI
:
9662 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI
:
9663 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI
:
9664 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI
:
9669 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI
:
9670 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI
:
9671 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI
:
9672 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI
:
9673 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI
:
9674 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI
:
9675 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI
:
9676 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI
:
9677 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI
:
9678 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI
:
9683 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI
:
9684 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI
:
9685 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI
:
9686 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT
:
9687 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT
:
9688 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT
:
9689 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT
:
9690 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT
:
9691 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT
:
9692 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT
:
9693 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT
:
9694 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT
:
  gcc_assert (nargs <= ARRAY_SIZE (args));

  if (comparison != UNKNOWN)
    {
      gcc_assert (nargs == 2);
      return ix86_expand_sse_compare (d, exp, target, swap);
    }
9712 if (rmode
== VOIDmode
|| rmode
== tmode
)
9716 || GET_MODE (target
) != tmode
9717 || !insn_p
->operand
[0].predicate (target
, tmode
))
9718 target
= gen_reg_rtx (tmode
);
9719 else if (memory_operand (target
, tmode
))
9721 real_target
= target
;
9725 real_target
= gen_reg_rtx (tmode
);
9726 target
= lowpart_subreg (rmode
, real_target
, tmode
);
  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      machine_mode mode = insn_p->operand[i + 1].mode;
      bool match = insn_p->operand[i + 1].predicate (op, mode);

      if (second_arg_count && i == 1)
	{
	  /* SIMD shift insns take either an 8-bit immediate or
	     register as count.  But builtin functions take int as
	     count.  If count doesn't match, we put it in register.
	     The instructions are using 64-bit count, if op is just
	     32-bit, zero-extend it, as negative shift counts
	     are undefined behavior and zero-extension is more
	     efficient.  */
	  if (!match)
	    {
	      if (SCALAR_INT_MODE_P (GET_MODE (op)))
		op = convert_modes (mode, GET_MODE (op), op, 1);
	      else
		op = lowpart_subreg (mode, op, GET_MODE (op));
	      if (!insn_p->operand[i + 1].predicate (op, mode))
		op = copy_to_reg (op);
	    }
	}
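      /* Worked illustration (hypothetical values): for a call such as
	 _mm_slli_epi32 (v, 3), which is one of the *_SI_COUNT builtins,
	 the count 3 arrives as a 32-bit int; convert_modes (..., 1)
	 zero-extends it to the wider count mode the shift pattern
	 expects, and copy_to_reg forces it into a register when the
	 pattern will not accept it as an immediate.  */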
      else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
	       (!mask_pos && (nargs - i) <= nargs_constant))
9761 case CODE_FOR_avx_vinsertf128v4di
:
9762 case CODE_FOR_avx_vextractf128v4di
:
9763 error ("the last argument must be an 1-bit immediate");
9766 case CODE_FOR_avx512f_cmpv8di3_mask
:
9767 case CODE_FOR_avx512f_cmpv16si3_mask
:
9768 case CODE_FOR_avx512f_ucmpv8di3_mask
:
9769 case CODE_FOR_avx512f_ucmpv16si3_mask
:
9770 case CODE_FOR_avx512vl_cmpv4di3_mask
:
9771 case CODE_FOR_avx512vl_cmpv8si3_mask
:
9772 case CODE_FOR_avx512vl_ucmpv4di3_mask
:
9773 case CODE_FOR_avx512vl_ucmpv8si3_mask
:
9774 case CODE_FOR_avx512vl_cmpv2di3_mask
:
9775 case CODE_FOR_avx512vl_cmpv4si3_mask
:
9776 case CODE_FOR_avx512vl_ucmpv2di3_mask
:
9777 case CODE_FOR_avx512vl_ucmpv4si3_mask
:
9778 error ("the last argument must be a 3-bit immediate");
9781 case CODE_FOR_sse4_1_roundsd
:
9782 case CODE_FOR_sse4_1_roundss
:
9784 case CODE_FOR_sse4_1_roundpd
:
9785 case CODE_FOR_sse4_1_roundps
:
9786 case CODE_FOR_avx_roundpd256
:
9787 case CODE_FOR_avx_roundps256
:
9789 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix
:
9790 case CODE_FOR_sse4_1_roundps_sfix
:
9791 case CODE_FOR_avx_roundpd_vec_pack_sfix256
:
9792 case CODE_FOR_avx_roundps_sfix256
:
9794 case CODE_FOR_sse4_1_blendps
:
9795 case CODE_FOR_avx_blendpd256
:
9796 case CODE_FOR_avx_vpermilv4df
:
9797 case CODE_FOR_avx_vpermilv4df_mask
:
9798 case CODE_FOR_avx512f_getmantv8df_mask
:
9799 case CODE_FOR_avx512f_getmantv16sf_mask
:
9800 case CODE_FOR_avx512vl_getmantv8sf_mask
:
9801 case CODE_FOR_avx512vl_getmantv4df_mask
:
9802 case CODE_FOR_avx512vl_getmantv4sf_mask
:
9803 case CODE_FOR_avx512vl_getmantv2df_mask
:
9804 case CODE_FOR_avx512dq_rangepv8df_mask_round
:
9805 case CODE_FOR_avx512dq_rangepv16sf_mask_round
:
9806 case CODE_FOR_avx512dq_rangepv4df_mask
:
9807 case CODE_FOR_avx512dq_rangepv8sf_mask
:
9808 case CODE_FOR_avx512dq_rangepv2df_mask
:
9809 case CODE_FOR_avx512dq_rangepv4sf_mask
:
9810 case CODE_FOR_avx_shufpd256_mask
:
9811 error ("the last argument must be a 4-bit immediate");
9814 case CODE_FOR_sha1rnds4
:
9815 case CODE_FOR_sse4_1_blendpd
:
9816 case CODE_FOR_avx_vpermilv2df
:
9817 case CODE_FOR_avx_vpermilv2df_mask
:
9818 case CODE_FOR_xop_vpermil2v2df3
:
9819 case CODE_FOR_xop_vpermil2v4sf3
:
9820 case CODE_FOR_xop_vpermil2v4df3
:
9821 case CODE_FOR_xop_vpermil2v8sf3
:
9822 case CODE_FOR_avx512f_vinsertf32x4_mask
:
9823 case CODE_FOR_avx512f_vinserti32x4_mask
:
9824 case CODE_FOR_avx512f_vextractf32x4_mask
:
9825 case CODE_FOR_avx512f_vextracti32x4_mask
:
9826 case CODE_FOR_sse2_shufpd
:
9827 case CODE_FOR_sse2_shufpd_mask
:
9828 case CODE_FOR_avx512dq_shuf_f64x2_mask
:
9829 case CODE_FOR_avx512dq_shuf_i64x2_mask
:
9830 case CODE_FOR_avx512vl_shuf_i32x4_mask
:
9831 case CODE_FOR_avx512vl_shuf_f32x4_mask
:
9832 error ("the last argument must be a 2-bit immediate");
9835 case CODE_FOR_avx_vextractf128v4df
:
9836 case CODE_FOR_avx_vextractf128v8sf
:
9837 case CODE_FOR_avx_vextractf128v8si
:
9838 case CODE_FOR_avx_vinsertf128v4df
:
9839 case CODE_FOR_avx_vinsertf128v8sf
:
9840 case CODE_FOR_avx_vinsertf128v8si
:
9841 case CODE_FOR_avx512f_vinsertf64x4_mask
:
9842 case CODE_FOR_avx512f_vinserti64x4_mask
:
9843 case CODE_FOR_avx512f_vextractf64x4_mask
:
9844 case CODE_FOR_avx512f_vextracti64x4_mask
:
9845 case CODE_FOR_avx512dq_vinsertf32x8_mask
:
9846 case CODE_FOR_avx512dq_vinserti32x8_mask
:
9847 case CODE_FOR_avx512vl_vinsertv4df
:
9848 case CODE_FOR_avx512vl_vinsertv4di
:
9849 case CODE_FOR_avx512vl_vinsertv8sf
:
9850 case CODE_FOR_avx512vl_vinsertv8si
:
9851 error ("the last argument must be a 1-bit immediate");
9854 case CODE_FOR_avx_vmcmpv2df3
:
9855 case CODE_FOR_avx_vmcmpv4sf3
:
9856 case CODE_FOR_avx_cmpv2df3
:
9857 case CODE_FOR_avx_cmpv4sf3
:
9858 case CODE_FOR_avx_cmpv4df3
:
9859 case CODE_FOR_avx_cmpv8sf3
:
9860 case CODE_FOR_avx512f_cmpv8df3_mask
:
9861 case CODE_FOR_avx512f_cmpv16sf3_mask
:
9862 case CODE_FOR_avx512f_vmcmpv2df3_mask
:
9863 case CODE_FOR_avx512f_vmcmpv4sf3_mask
:
9864 error ("the last argument must be a 5-bit immediate");
9868 switch (nargs_constant
)
9871 if ((mask_pos
&& (nargs
- i
- mask_pos
) == nargs_constant
) ||
9872 (!mask_pos
&& (nargs
- i
) == nargs_constant
))
9874 error ("the next to last argument must be an 8-bit immediate");
9879 error ("the last argument must be an 8-bit immediate");
9889 if (VECTOR_MODE_P (mode
))
9890 op
= safe_vector_operand (op
, mode
);
9892 /* If we aren't optimizing, only allow one memory operand to
9894 if (memory_operand (op
, mode
))
9897 op
= fixup_modeless_constant (op
, mode
);
9899 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
9901 if (optimize
|| !match
|| num_memory
> 1)
9902 op
= copy_to_mode_reg (mode
, op
);
9906 op
= copy_to_reg (op
);
9907 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
9912 args
[i
].mode
= mode
;
9918 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
);
9921 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
, args
[1].op
);
9924 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
, args
[1].op
,
9928 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
, args
[1].op
,
9929 args
[2].op
, args
[3].op
);
9932 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
, args
[1].op
,
9933 args
[2].op
, args
[3].op
, args
[4].op
);
9936 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
, args
[1].op
,
9937 args
[2].op
, args
[3].op
, args
[4].op
,
/* Transform a pattern of the following layout:
     (set A
       (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
   into (set A B), i.e. drop the embedded-rounding unspec.  */

static rtx
ix86_erase_embedded_rounding (rtx pat)
{
  if (GET_CODE (pat) == INSN)
    pat = PATTERN (pat);

  gcc_assert (GET_CODE (pat) == SET);
  rtx src = SET_SRC (pat);
  gcc_assert (XVECLEN (src, 0) == 2);
  rtx p0 = XVECEXP (src, 0, 0);
  gcc_assert (GET_CODE (src) == UNSPEC
	      && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
  rtx res = gen_rtx_SET (SET_DEST (pat), p0);
  return res;
}
/* Subroutine of ix86_expand_round_builtin to take care of comi insns
   with embedded rounding.  */

static rtx
ix86_expand_sse_comi_round (const struct builtin_description *d,
			    tree exp, rtx target)
{
  rtx pat, set_dst;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode mode0 = insn_p->operand[0].mode;
  machine_mode mode1 = insn_p->operand[1].mode;
  /* See avxintrin.h for values.  */
  static const enum rtx_code comparisons[32] =
    {
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
    };
  static const bool ordereds[32] =
    {
      true, true, true, false, false, false, false, true,
      false, false, false, true, true, true, true, false,
      true, true, true, false, false, false, false, true,
      false, false, false, true, true, true, true, false
    };
  static const bool non_signalings[32] =
    {
      true, false, false, true, true, false, false, true,
      true, false, false, true, true, false, false, true,
      false, true, true, false, false, true, true, false,
      false, true, true, false, false, true, true, false
    };
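  /* Worked example of the three tables above (illustrative only; the
     encoding follows the _CMP_* constants in avxintrin.h): predicate 0,
     _CMP_EQ_OQ, maps to EQ, ordered and non-signaling, so the quiet
     UCOMI form is selected below; predicate 1, _CMP_LT_OS, maps to LT,
     ordered and signaling, so the signaling COMI form is kept.  */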
  if (!CONST_INT_P (op2))
    {
      error ("the third argument must be a comparison constant");
      return const0_rtx;
    }
  if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
    {
      error ("incorrect comparison mode");
      return const0_rtx;
    }

  if (!insn_p->operand[2].predicate (op3, SImode))
    {
      error ("incorrect rounding operand");
      return const0_rtx;
    }

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  enum rtx_code comparison = comparisons[INTVAL (op2)];
  bool ordered = ordereds[INTVAL (op2)];
  bool non_signaling = non_signalings[INTVAL (op2)];
  rtx const_val = const0_rtx;

  bool check_unordered = false;
  machine_mode mode = CCFPmode;
10046 switch (comparison
)
10051 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
10052 if (!non_signaling
)
10058 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
10068 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
10075 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
10076 if (!non_signaling
)
10083 case LE
: /* -> GE */
10084 case LT
: /* -> GT */
10085 case UNGE
: /* -> UNLE */
10086 case UNGT
: /* -> UNLT */
10087 std::swap (op0
, op1
);
10088 comparison
= swap_condition (comparison
);
10096 /* These are supported by CCFPmode. NB: Use ordered/signaling
10097 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
10098 with NAN operands. */
10099 if (ordered
== non_signaling
)
10100 ordered
= !ordered
;
10103 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10104 _CMP_EQ_OQ/_CMP_EQ_OS. */
10105 check_unordered
= true;
10109 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10110 _CMP_NEQ_UQ/_CMP_NEQ_US. */
10111 gcc_assert (!ordered
);
10112 check_unordered
= true;
10114 const_val
= const1_rtx
;
10117 gcc_unreachable ();
  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const_val);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_p->operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_p->operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  /*
     1. COMI: ordered and signaling.
     2. UCOMI: unordered and non-signaling.
   */
  if (non_signaling)
    icode = (icode == CODE_FOR_sse_comi_round
	     ? CODE_FOR_sse_ucomi_round
	     : CODE_FOR_sse2_ucomi_round);

  pat = GEN_FCN (icode) (op0, op1, op3);
  /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point.  */
  if (INTVAL (op3) == NO_ROUND)
    {
      pat = ix86_erase_embedded_rounding (pat);
      set_dst = SET_DEST (pat);
    }
  else
    {
      gcc_assert (GET_CODE (pat) == SET);
      set_dst = SET_DEST (pat);
    }

  emit_insn (pat);

  rtx_code_label *label = NULL;

  /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
     with NAN operands.  */
  if (check_unordered)
    {
      gcc_assert (comparison == EQ || comparison == NE);

      rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
      label = gen_label_rtx ();
      rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    }

  /* NB: Set CCFPmode and check a different CCmode which is in subset
     of CCFPmode.  */
  if (GET_MODE (set_dst) != mode)
    {
      gcc_assert (mode == CCAmode || mode == CCCmode
		  || mode == CCOmode || mode == CCPmode
		  || mode == CCSmode || mode == CCZmode);
      set_dst = gen_rtx_REG (mode, FLAGS_REG);
    }

  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  set_dst, const0_rtx)));

  if (label)
    emit_label (label);

  return SUBREG_REG (target);
}
10200 ix86_expand_round_builtin (const struct builtin_description
*d
,
10201 tree exp
, rtx target
)
10204 unsigned int i
, nargs
;
10210 enum insn_code icode
= d
->icode
;
10211 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
10212 machine_mode tmode
= insn_p
->operand
[0].mode
;
10213 unsigned int nargs_constant
= 0;
10214 unsigned int redundant_embed_rnd
= 0;
10216 switch ((enum ix86_builtin_func_type
) d
->flag
)
10218 case UINT64_FTYPE_V2DF_INT
:
10219 case UINT64_FTYPE_V4SF_INT
:
10220 case UINT_FTYPE_V2DF_INT
:
10221 case UINT_FTYPE_V4SF_INT
:
10222 case INT64_FTYPE_V2DF_INT
:
10223 case INT64_FTYPE_V4SF_INT
:
10224 case INT_FTYPE_V2DF_INT
:
10225 case INT_FTYPE_V4SF_INT
:
10228 case V4SF_FTYPE_V4SF_UINT_INT
:
10229 case V4SF_FTYPE_V4SF_UINT64_INT
:
10230 case V2DF_FTYPE_V2DF_UINT64_INT
:
10231 case V4SF_FTYPE_V4SF_INT_INT
:
10232 case V4SF_FTYPE_V4SF_INT64_INT
:
10233 case V2DF_FTYPE_V2DF_INT64_INT
:
10234 case V4SF_FTYPE_V4SF_V4SF_INT
:
10235 case V2DF_FTYPE_V2DF_V2DF_INT
:
10236 case V4SF_FTYPE_V4SF_V2DF_INT
:
10237 case V2DF_FTYPE_V2DF_V4SF_INT
:
10240 case V8SF_FTYPE_V8DF_V8SF_QI_INT
:
10241 case V8DF_FTYPE_V8DF_V8DF_QI_INT
:
10242 case V8SI_FTYPE_V8DF_V8SI_QI_INT
:
10243 case V8DI_FTYPE_V8DF_V8DI_QI_INT
:
10244 case V8SF_FTYPE_V8DI_V8SF_QI_INT
:
10245 case V8DF_FTYPE_V8DI_V8DF_QI_INT
:
10246 case V16SF_FTYPE_V16SF_V16SF_HI_INT
:
10247 case V8DI_FTYPE_V8SF_V8DI_QI_INT
:
10248 case V16SF_FTYPE_V16SI_V16SF_HI_INT
:
10249 case V16SI_FTYPE_V16SF_V16SI_HI_INT
:
10250 case V8DF_FTYPE_V8SF_V8DF_QI_INT
:
10251 case V16SF_FTYPE_V16HI_V16SF_HI_INT
:
10252 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT
:
10253 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT
:
10256 case V4SF_FTYPE_V4SF_V4SF_INT_INT
:
10257 case V2DF_FTYPE_V2DF_V2DF_INT_INT
:
10258 nargs_constant
= 2;
10261 case INT_FTYPE_V4SF_V4SF_INT_INT
:
10262 case INT_FTYPE_V2DF_V2DF_INT_INT
:
10263 return ix86_expand_sse_comi_round (d
, exp
, target
);
10264 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT
:
10265 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT
:
10266 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT
:
10267 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT
:
10268 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT
:
10269 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT
:
10270 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT
:
10271 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT
:
10274 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT
:
10275 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT
:
10276 nargs_constant
= 4;
10279 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT
:
10280 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT
:
10281 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT
:
10282 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT
:
10283 nargs_constant
= 3;
10286 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT
:
10287 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT
:
10288 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT
:
10289 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT
:
10290 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT
:
10291 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT
:
10293 nargs_constant
= 4;
10295 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT
:
10296 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT
:
10297 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT
:
10298 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT
:
10300 nargs_constant
= 3;
10303 gcc_unreachable ();
10305 gcc_assert (nargs
<= ARRAY_SIZE (args
));
10309 || GET_MODE (target
) != tmode
10310 || !insn_p
->operand
[0].predicate (target
, tmode
))
10311 target
= gen_reg_rtx (tmode
);
10313 for (i
= 0; i
< nargs
; i
++)
10315 tree arg
= CALL_EXPR_ARG (exp
, i
);
10316 rtx op
= expand_normal (arg
);
10317 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
10318 bool match
= insn_p
->operand
[i
+ 1].predicate (op
, mode
);
10320 if (i
== nargs
- nargs_constant
)
10326 case CODE_FOR_avx512f_getmantv8df_mask_round
:
10327 case CODE_FOR_avx512f_getmantv16sf_mask_round
:
10328 case CODE_FOR_avx512f_vgetmantv2df_round
:
10329 case CODE_FOR_avx512f_vgetmantv2df_mask_round
:
10330 case CODE_FOR_avx512f_vgetmantv4sf_round
:
10331 case CODE_FOR_avx512f_vgetmantv4sf_mask_round
:
10332 error ("the immediate argument must be a 4-bit immediate");
10334 case CODE_FOR_avx512f_cmpv8df3_mask_round
:
10335 case CODE_FOR_avx512f_cmpv16sf3_mask_round
:
10336 case CODE_FOR_avx512f_vmcmpv2df3_mask_round
:
10337 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round
:
10338 error ("the immediate argument must be a 5-bit immediate");
10341 error ("the immediate argument must be an 8-bit immediate");
10346 else if (i
== nargs
-1)
10348 if (!insn_p
->operand
[nargs
].predicate (op
, SImode
))
10350 error ("incorrect rounding operand");
10354 /* If there is no rounding use normal version of the pattern. */
10355 if (INTVAL (op
) == NO_ROUND
)
10356 redundant_embed_rnd
= 1;
10360 if (VECTOR_MODE_P (mode
))
10361 op
= safe_vector_operand (op
, mode
);
10363 op
= fixup_modeless_constant (op
, mode
);
10365 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
10367 if (optimize
|| !match
)
10368 op
= copy_to_mode_reg (mode
, op
);
10372 op
= copy_to_reg (op
);
10373 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
10378 args
[i
].mode
= mode
;
10384 pat
= GEN_FCN (icode
) (target
, args
[0].op
);
10387 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
);
10390 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
,
10394 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
,
10395 args
[2].op
, args
[3].op
);
10398 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
,
10399 args
[2].op
, args
[3].op
, args
[4].op
);
10402 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
,
10403 args
[2].op
, args
[3].op
, args
[4].op
,
10407 gcc_unreachable ();
10413 if (redundant_embed_rnd
)
10414 pat
= ix86_erase_embedded_rounding (pat
);
10420 /* Subroutine of ix86_expand_builtin to take care of special insns
10421 with variable number of operands. */
10424 ix86_expand_special_args_builtin (const struct builtin_description
*d
,
10425 tree exp
, rtx target
)
10429 unsigned int i
, nargs
, arg_adjust
, memory
;
10430 bool aligned_mem
= false;
10436 enum insn_code icode
= d
->icode
;
10437 bool last_arg_constant
= false;
10438 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
10439 machine_mode tmode
= insn_p
->operand
[0].mode
;
10440 enum { load
, store
} klass
;
10442 switch ((enum ix86_builtin_func_type
) d
->flag
)
10444 case VOID_FTYPE_VOID
:
10445 emit_insn (GEN_FCN (icode
) (target
));
10447 case VOID_FTYPE_UINT64
:
10448 case VOID_FTYPE_UNSIGNED
:
10454 case INT_FTYPE_VOID
:
10455 case USHORT_FTYPE_VOID
:
10456 case UINT64_FTYPE_VOID
:
10457 case UINT_FTYPE_VOID
:
10458 case UNSIGNED_FTYPE_VOID
:
10463 case UINT64_FTYPE_PUNSIGNED
:
10464 case V2DI_FTYPE_PV2DI
:
10465 case V4DI_FTYPE_PV4DI
:
10466 case V32QI_FTYPE_PCCHAR
:
10467 case V16QI_FTYPE_PCCHAR
:
10468 case V8SF_FTYPE_PCV4SF
:
10469 case V8SF_FTYPE_PCFLOAT
:
10470 case V4SF_FTYPE_PCFLOAT
:
10471 case V4DF_FTYPE_PCV2DF
:
10472 case V4DF_FTYPE_PCDOUBLE
:
10473 case V2DF_FTYPE_PCDOUBLE
:
10474 case VOID_FTYPE_PVOID
:
10475 case V8DI_FTYPE_PV8DI
:
10481 case CODE_FOR_sse4_1_movntdqa
:
10482 case CODE_FOR_avx2_movntdqa
:
10483 case CODE_FOR_avx512f_movntdqa
:
10484 aligned_mem
= true;
10490 case VOID_FTYPE_PV2SF_V4SF
:
10491 case VOID_FTYPE_PV8DI_V8DI
:
10492 case VOID_FTYPE_PV4DI_V4DI
:
10493 case VOID_FTYPE_PV2DI_V2DI
:
10494 case VOID_FTYPE_PCHAR_V32QI
:
10495 case VOID_FTYPE_PCHAR_V16QI
:
10496 case VOID_FTYPE_PFLOAT_V16SF
:
10497 case VOID_FTYPE_PFLOAT_V8SF
:
10498 case VOID_FTYPE_PFLOAT_V4SF
:
10499 case VOID_FTYPE_PDOUBLE_V8DF
:
10500 case VOID_FTYPE_PDOUBLE_V4DF
:
10501 case VOID_FTYPE_PDOUBLE_V2DF
:
10502 case VOID_FTYPE_PLONGLONG_LONGLONG
:
10503 case VOID_FTYPE_PULONGLONG_ULONGLONG
:
10504 case VOID_FTYPE_PUNSIGNED_UNSIGNED
:
10505 case VOID_FTYPE_PINT_INT
:
10508 /* Reserve memory operand for target. */
10509 memory
= ARRAY_SIZE (args
);
10512 /* These builtins and instructions require the memory
10513 to be properly aligned. */
10514 case CODE_FOR_avx_movntv4di
:
10515 case CODE_FOR_sse2_movntv2di
:
10516 case CODE_FOR_avx_movntv8sf
:
10517 case CODE_FOR_sse_movntv4sf
:
10518 case CODE_FOR_sse4a_vmmovntv4sf
:
10519 case CODE_FOR_avx_movntv4df
:
10520 case CODE_FOR_sse2_movntv2df
:
10521 case CODE_FOR_sse4a_vmmovntv2df
:
10522 case CODE_FOR_sse2_movntidi
:
10523 case CODE_FOR_sse_movntq
:
10524 case CODE_FOR_sse2_movntisi
:
10525 case CODE_FOR_avx512f_movntv16sf
:
10526 case CODE_FOR_avx512f_movntv8df
:
10527 case CODE_FOR_avx512f_movntv8di
:
10528 aligned_mem
= true;
10534 case VOID_FTYPE_PVOID_PCVOID
:
10540 case V4SF_FTYPE_V4SF_PCV2SF
:
10541 case V2DF_FTYPE_V2DF_PCDOUBLE
:
10546 case V8SF_FTYPE_PCV8SF_V8SI
:
10547 case V4DF_FTYPE_PCV4DF_V4DI
:
10548 case V4SF_FTYPE_PCV4SF_V4SI
:
10549 case V2DF_FTYPE_PCV2DF_V2DI
:
10550 case V8SI_FTYPE_PCV8SI_V8SI
:
10551 case V4DI_FTYPE_PCV4DI_V4DI
:
10552 case V4SI_FTYPE_PCV4SI_V4SI
:
10553 case V2DI_FTYPE_PCV2DI_V2DI
:
10554 case VOID_FTYPE_INT_INT64
:
10559 case VOID_FTYPE_PV8DF_V8DF_UQI
:
10560 case VOID_FTYPE_PV4DF_V4DF_UQI
:
10561 case VOID_FTYPE_PV2DF_V2DF_UQI
:
10562 case VOID_FTYPE_PV16SF_V16SF_UHI
:
10563 case VOID_FTYPE_PV8SF_V8SF_UQI
:
10564 case VOID_FTYPE_PV4SF_V4SF_UQI
:
10565 case VOID_FTYPE_PV8DI_V8DI_UQI
:
10566 case VOID_FTYPE_PV4DI_V4DI_UQI
:
10567 case VOID_FTYPE_PV2DI_V2DI_UQI
:
10568 case VOID_FTYPE_PV16SI_V16SI_UHI
:
10569 case VOID_FTYPE_PV8SI_V8SI_UQI
:
10570 case VOID_FTYPE_PV4SI_V4SI_UQI
:
10571 case VOID_FTYPE_PV64QI_V64QI_UDI
:
10572 case VOID_FTYPE_PV32HI_V32HI_USI
:
10573 case VOID_FTYPE_PV32QI_V32QI_USI
:
10574 case VOID_FTYPE_PV16QI_V16QI_UHI
:
10575 case VOID_FTYPE_PV16HI_V16HI_UHI
:
10576 case VOID_FTYPE_PV8HI_V8HI_UQI
:
10579 /* These builtins and instructions require the memory
10580 to be properly aligned. */
10581 case CODE_FOR_avx512f_storev16sf_mask
:
10582 case CODE_FOR_avx512f_storev16si_mask
:
10583 case CODE_FOR_avx512f_storev8df_mask
:
10584 case CODE_FOR_avx512f_storev8di_mask
:
10585 case CODE_FOR_avx512vl_storev8sf_mask
:
10586 case CODE_FOR_avx512vl_storev8si_mask
:
10587 case CODE_FOR_avx512vl_storev4df_mask
:
10588 case CODE_FOR_avx512vl_storev4di_mask
:
10589 case CODE_FOR_avx512vl_storev4sf_mask
:
10590 case CODE_FOR_avx512vl_storev4si_mask
:
10591 case CODE_FOR_avx512vl_storev2df_mask
:
10592 case CODE_FOR_avx512vl_storev2di_mask
:
10593 aligned_mem
= true;
10599 case VOID_FTYPE_PV8SF_V8SI_V8SF
:
10600 case VOID_FTYPE_PV4DF_V4DI_V4DF
:
10601 case VOID_FTYPE_PV4SF_V4SI_V4SF
:
10602 case VOID_FTYPE_PV2DF_V2DI_V2DF
:
10603 case VOID_FTYPE_PV8SI_V8SI_V8SI
:
10604 case VOID_FTYPE_PV4DI_V4DI_V4DI
:
10605 case VOID_FTYPE_PV4SI_V4SI_V4SI
:
10606 case VOID_FTYPE_PV2DI_V2DI_V2DI
:
10607 case VOID_FTYPE_PV8SI_V8DI_UQI
:
10608 case VOID_FTYPE_PV8HI_V8DI_UQI
:
10609 case VOID_FTYPE_PV16HI_V16SI_UHI
:
10610 case VOID_FTYPE_PV16QI_V8DI_UQI
:
10611 case VOID_FTYPE_PV16QI_V16SI_UHI
:
10612 case VOID_FTYPE_PV4SI_V4DI_UQI
:
10613 case VOID_FTYPE_PV4SI_V2DI_UQI
:
10614 case VOID_FTYPE_PV8HI_V4DI_UQI
:
10615 case VOID_FTYPE_PV8HI_V2DI_UQI
:
10616 case VOID_FTYPE_PV8HI_V8SI_UQI
:
10617 case VOID_FTYPE_PV8HI_V4SI_UQI
:
10618 case VOID_FTYPE_PV16QI_V4DI_UQI
:
10619 case VOID_FTYPE_PV16QI_V2DI_UQI
:
10620 case VOID_FTYPE_PV16QI_V8SI_UQI
:
10621 case VOID_FTYPE_PV16QI_V4SI_UQI
:
10622 case VOID_FTYPE_PCHAR_V64QI_UDI
:
10623 case VOID_FTYPE_PCHAR_V32QI_USI
:
10624 case VOID_FTYPE_PCHAR_V16QI_UHI
:
10625 case VOID_FTYPE_PSHORT_V32HI_USI
:
10626 case VOID_FTYPE_PSHORT_V16HI_UHI
:
10627 case VOID_FTYPE_PSHORT_V8HI_UQI
:
10628 case VOID_FTYPE_PINT_V16SI_UHI
:
10629 case VOID_FTYPE_PINT_V8SI_UQI
:
10630 case VOID_FTYPE_PINT_V4SI_UQI
:
10631 case VOID_FTYPE_PINT64_V8DI_UQI
:
10632 case VOID_FTYPE_PINT64_V4DI_UQI
:
10633 case VOID_FTYPE_PINT64_V2DI_UQI
:
10634 case VOID_FTYPE_PDOUBLE_V8DF_UQI
:
10635 case VOID_FTYPE_PDOUBLE_V4DF_UQI
:
10636 case VOID_FTYPE_PDOUBLE_V2DF_UQI
:
10637 case VOID_FTYPE_PFLOAT_V16SF_UHI
:
10638 case VOID_FTYPE_PFLOAT_V8SF_UQI
:
10639 case VOID_FTYPE_PFLOAT_V4SF_UQI
:
10640 case VOID_FTYPE_PV32QI_V32HI_USI
:
10641 case VOID_FTYPE_PV16QI_V16HI_UHI
:
10642 case VOID_FTYPE_PV8QI_V8HI_UQI
:
10645 /* Reserve memory operand for target. */
10646 memory
= ARRAY_SIZE (args
);
10648 case V4SF_FTYPE_PCV4SF_V4SF_UQI
:
10649 case V8SF_FTYPE_PCV8SF_V8SF_UQI
:
10650 case V16SF_FTYPE_PCV16SF_V16SF_UHI
:
10651 case V4SI_FTYPE_PCV4SI_V4SI_UQI
:
10652 case V8SI_FTYPE_PCV8SI_V8SI_UQI
:
10653 case V16SI_FTYPE_PCV16SI_V16SI_UHI
:
10654 case V2DF_FTYPE_PCV2DF_V2DF_UQI
:
10655 case V4DF_FTYPE_PCV4DF_V4DF_UQI
:
10656 case V8DF_FTYPE_PCV8DF_V8DF_UQI
:
10657 case V2DI_FTYPE_PCV2DI_V2DI_UQI
:
10658 case V4DI_FTYPE_PCV4DI_V4DI_UQI
:
10659 case V8DI_FTYPE_PCV8DI_V8DI_UQI
:
10660 case V64QI_FTYPE_PCV64QI_V64QI_UDI
:
10661 case V32HI_FTYPE_PCV32HI_V32HI_USI
:
10662 case V32QI_FTYPE_PCV32QI_V32QI_USI
:
10663 case V16QI_FTYPE_PCV16QI_V16QI_UHI
:
10664 case V16HI_FTYPE_PCV16HI_V16HI_UHI
:
10665 case V8HI_FTYPE_PCV8HI_V8HI_UQI
:
10668 /* These builtins and instructions require the memory
10669 to be properly aligned. */
10670 case CODE_FOR_avx512f_loadv16sf_mask
:
10671 case CODE_FOR_avx512f_loadv16si_mask
:
10672 case CODE_FOR_avx512f_loadv8df_mask
:
10673 case CODE_FOR_avx512f_loadv8di_mask
:
10674 case CODE_FOR_avx512vl_loadv8sf_mask
:
10675 case CODE_FOR_avx512vl_loadv8si_mask
:
10676 case CODE_FOR_avx512vl_loadv4df_mask
:
10677 case CODE_FOR_avx512vl_loadv4di_mask
:
10678 case CODE_FOR_avx512vl_loadv4sf_mask
:
10679 case CODE_FOR_avx512vl_loadv4si_mask
:
10680 case CODE_FOR_avx512vl_loadv2df_mask
:
10681 case CODE_FOR_avx512vl_loadv2di_mask
:
10682 case CODE_FOR_avx512bw_loadv64qi_mask
:
10683 case CODE_FOR_avx512vl_loadv32qi_mask
:
10684 case CODE_FOR_avx512vl_loadv16qi_mask
:
10685 case CODE_FOR_avx512bw_loadv32hi_mask
:
10686 case CODE_FOR_avx512vl_loadv16hi_mask
:
10687 case CODE_FOR_avx512vl_loadv8hi_mask
:
10688 aligned_mem
= true;
10694 case V64QI_FTYPE_PCCHAR_V64QI_UDI
:
10695 case V32QI_FTYPE_PCCHAR_V32QI_USI
:
10696 case V16QI_FTYPE_PCCHAR_V16QI_UHI
:
10697 case V32HI_FTYPE_PCSHORT_V32HI_USI
:
10698 case V16HI_FTYPE_PCSHORT_V16HI_UHI
:
10699 case V8HI_FTYPE_PCSHORT_V8HI_UQI
:
10700 case V16SI_FTYPE_PCINT_V16SI_UHI
:
10701 case V8SI_FTYPE_PCINT_V8SI_UQI
:
10702 case V4SI_FTYPE_PCINT_V4SI_UQI
:
10703 case V8DI_FTYPE_PCINT64_V8DI_UQI
:
10704 case V4DI_FTYPE_PCINT64_V4DI_UQI
:
10705 case V2DI_FTYPE_PCINT64_V2DI_UQI
:
10706 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI
:
10707 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI
:
10708 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI
:
10709 case V16SF_FTYPE_PCFLOAT_V16SF_UHI
:
10710 case V8SF_FTYPE_PCFLOAT_V8SF_UQI
:
10711 case V4SF_FTYPE_PCFLOAT_V4SF_UQI
:
10716 case VOID_FTYPE_UINT_UINT_UINT
:
10717 case VOID_FTYPE_UINT64_UINT_UINT
:
10718 case UCHAR_FTYPE_UINT_UINT_UINT
:
10719 case UCHAR_FTYPE_UINT64_UINT_UINT
:
10722 memory
= ARRAY_SIZE (args
);
10723 last_arg_constant
= true;
10726 gcc_unreachable ();
10729 gcc_assert (nargs
<= ARRAY_SIZE (args
));
10731 if (klass
== store
)
10733 arg
= CALL_EXPR_ARG (exp
, 0);
10734 op
= expand_normal (arg
);
10735 gcc_assert (target
== 0);
10738 op
= ix86_zero_extend_to_Pmode (op
);
10739 target
= gen_rtx_MEM (tmode
, op
);
      /* target at this point has just BITS_PER_UNIT MEM_ALIGN
	 on it.  Try to improve it using get_pointer_alignment,
	 and if the special builtin is one that requires strict
	 mode alignment, also from its GET_MODE_ALIGNMENT.
	 Failure to do so could lead to ix86_legitimate_combined_insn
	 rejecting all changes to such insns.  */
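      /* For instance (illustrative): a masked store reached via
	 _mm512_mask_store_ps is expected to use
	 CODE_FOR_avx512f_storev16sf_mask, which is in the aligned_mem
	 list above, so MEM_ALIGN on the destination is raised to the
	 512-bit GET_MODE_ALIGNMENT here.  */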
10746 unsigned int align
= get_pointer_alignment (arg
);
10747 if (aligned_mem
&& align
< GET_MODE_ALIGNMENT (tmode
))
10748 align
= GET_MODE_ALIGNMENT (tmode
);
10749 if (MEM_ALIGN (target
) < align
)
10750 set_mem_align (target
, align
);
10753 target
= force_reg (tmode
, op
);
10761 || !register_operand (target
, tmode
)
10762 || GET_MODE (target
) != tmode
)
10763 target
= gen_reg_rtx (tmode
);
10766 for (i
= 0; i
< nargs
; i
++)
10768 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
10771 arg
= CALL_EXPR_ARG (exp
, i
+ arg_adjust
);
10772 op
= expand_normal (arg
);
10773 match
= insn_p
->operand
[i
+ 1].predicate (op
, mode
);
10775 if (last_arg_constant
&& (i
+ 1) == nargs
)
10779 if (icode
== CODE_FOR_lwp_lwpvalsi3
10780 || icode
== CODE_FOR_lwp_lwpinssi3
10781 || icode
== CODE_FOR_lwp_lwpvaldi3
10782 || icode
== CODE_FOR_lwp_lwpinsdi3
)
10783 error ("the last argument must be a 32-bit immediate");
10785 error ("the last argument must be an 8-bit immediate");
10793 /* This must be the memory operand. */
10794 op
= ix86_zero_extend_to_Pmode (op
);
10795 op
= gen_rtx_MEM (mode
, op
);
	  /* op at this point has just BITS_PER_UNIT MEM_ALIGN
	     on it.  Try to improve it using get_pointer_alignment,
	     and if the special builtin is one that requires strict
	     mode alignment, also from its GET_MODE_ALIGNMENT.
	     Failure to do so could lead to ix86_legitimate_combined_insn
	     rejecting all changes to such insns.  */
10802 unsigned int align
= get_pointer_alignment (arg
);
10803 if (aligned_mem
&& align
< GET_MODE_ALIGNMENT (mode
))
10804 align
= GET_MODE_ALIGNMENT (mode
);
10805 if (MEM_ALIGN (op
) < align
)
10806 set_mem_align (op
, align
);
10810 /* This must be register. */
10811 if (VECTOR_MODE_P (mode
))
10812 op
= safe_vector_operand (op
, mode
);
10814 op
= fixup_modeless_constant (op
, mode
);
10816 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
10817 op
= copy_to_mode_reg (mode
, op
);
10820 op
= copy_to_reg (op
);
10821 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
10827 args
[i
].mode
= mode
;
10833 pat
= GEN_FCN (icode
) (target
);
10836 pat
= GEN_FCN (icode
) (target
, args
[0].op
);
10839 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
);
10842 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
, args
[2].op
);
10845 gcc_unreachable ();
10851 return klass
== store
? 0 : target
;
/* Return the integer constant in ARG.  Constrain it to be in the range
   of the subparts of VEC_TYPE; issue an error if not.  */

static int
get_element_number (tree vec_type, tree arg)
{
  unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;

  if (!tree_fits_uhwi_p (arg)
      || (elt = tree_to_uhwi (arg), elt > max))
    {
      error ("selector must be an integer constant in the range "
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_init.  We DO have language-level syntax for this, in
   the form of (type){ init-list }.  Except that since we can't place emms
   instructions from inside the compiler, we can't allow the use of MMX
   registers unless the user explicitly asks for it.  So we do *not* define
   vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
   we have builtins invoked by mmintrin.h that gives us license to emit
   these sorts of instructions.  */
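/* Illustration (hypothetical user code): with -mmmx,
     __m64 v = _mm_set_pi16 (3, 2, 1, 0);
   is expected to expand via __builtin_ia32_vec_init_v4hi from mmintrin.h
   and land in ix86_expand_vec_init_builtin below, whereas the
   language-level equivalent would be the initializer
   (__v4hi) { 0, 1, 2, 3 }.  */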
static rtx
ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
{
  machine_mode tmode = TYPE_MODE (type);
  machine_mode inner_mode = GET_MODE_INNER (tmode);
  int i, n_elt = GET_MODE_NUNITS (tmode);
  rtvec v = rtvec_alloc (n_elt);

  gcc_assert (VECTOR_MODE_P (tmode));
  gcc_assert (call_expr_nargs (exp) == n_elt);

  for (i = 0; i < n_elt; ++i)
    {
      rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
      RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
    }

  if (!target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
  return target;
}
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
   had a language-level syntax for referencing vector elements.  */

static rtx
ix86_expand_vec_ext_builtin (tree exp, rtx target)
{
  machine_mode tmode, mode0;
  tree arg0, arg1;
  int elt;
  rtx op0;

  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);

  op0 = expand_normal (arg0);
  elt = get_element_number (TREE_TYPE (arg0), arg1);

  tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  mode0 = TYPE_MODE (TREE_TYPE (arg0));
  gcc_assert (VECTOR_MODE_P (mode0));

  op0 = force_reg (mode0, op0);

  if (optimize || !target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_extract (true, target, op0, elt);

  return target;
}
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
   a language-level syntax for referencing vector elements.  */

static rtx
ix86_expand_vec_set_builtin (tree exp)
{
  machine_mode tmode, mode1;
  tree arg0, arg1, arg2;
  int elt;
  rtx op0, op1, target;

  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);
  arg2 = CALL_EXPR_ARG (exp, 2);

  tmode = TYPE_MODE (TREE_TYPE (arg0));
  mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  gcc_assert (VECTOR_MODE_P (tmode));

  op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
  op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
  elt = get_element_number (TREE_TYPE (arg0), arg2);

  if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
    op1 = convert_modes (mode1, GET_MODE (op1), op1, true);

  op0 = force_reg (tmode, op0);
  op1 = force_reg (mode1, op1);

  /* OP0 is the source of these builtin functions and shouldn't be
     modified.  Create a copy, use it and return it as target.  */
  target = gen_reg_rtx (tmode);
  emit_move_insn (target, op0);
  ix86_expand_vector_set (true, target, op1, elt);

  return target;
}
/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient
   (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.  */

rtx
ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
		     machine_mode mode, int ignore)
{
  size_t i;
  enum insn_code icode, icode2;
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  tree arg0, arg1, arg2, arg3, arg4;
  rtx op0, op1, op2, op3, op4, pat, pat2, insn;
  machine_mode mode0, mode1, mode2, mode3, mode4;
  unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
  /* For CPU builtins that can be folded, fold first and expand the fold.  */
  switch (fcode)
    {
    case IX86_BUILTIN_CPU_INIT:
      {
	/* Make it call __cpu_indicator_init in libgcc.  */
	tree call_expr, fndecl, type;
	type = build_function_type_list (integer_type_node, NULL_TREE);
	fndecl = build_fn_decl ("__cpu_indicator_init", type);
	call_expr = build_call_expr (fndecl, 0);
	return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
      }
    case IX86_BUILTIN_CPU_IS:
    case IX86_BUILTIN_CPU_SUPPORTS:
      {
	tree arg0 = CALL_EXPR_ARG (exp, 0);
	tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
	gcc_assert (fold_expr != NULL_TREE);
	return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
      }
    }
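  /* Illustration (user-level view): a call such as
     __builtin_cpu_supports ("avx2") is folded by fold_builtin_cpu into a
     test of the __cpu_model/__cpu_features2 data maintained by libgcc,
     so only that folded expression needs to be expanded here.  */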
11017 HOST_WIDE_INT isa
= ix86_isa_flags
;
11018 HOST_WIDE_INT isa2
= ix86_isa_flags2
;
11019 HOST_WIDE_INT bisa
= ix86_builtins_isa
[fcode
].isa
;
11020 HOST_WIDE_INT bisa2
= ix86_builtins_isa
[fcode
].isa2
;
11021 /* The general case is we require all the ISAs specified in bisa{,2}
11023 The exceptions are:
11024 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
11025 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
11026 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
11027 where for each this pair it is sufficient if either of the ISAs is
11028 enabled, plus if it is ored with other options also those others. */
  if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
       == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
      && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
    isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
  if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
       == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
      && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
    isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
  if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
       == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
      && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
    isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);

  /* Use SSE/SSE2/SSSE3 to emulate MMX intrinsics in 64-bit mode when
     MMX is disabled.  NB: Since MMX intrinsics are marked with
     SSE/SSE2/SSSE3, enable them without SSE/SSE2/SSSE3 if MMX is
     enabled.  */
  if (TARGET_MMX || TARGET_MMX_WITH_SSE)
    {
      if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX))
           == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX))
          && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX)) != 0)
        isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX);
      if (((bisa & (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX))
           == (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX))
          && (isa & (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX)) != 0)
        isa |= (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX);
      if (((bisa & (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX))
           == (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX))
          && (isa & (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX)) != 0)
        isa |= (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX);
    }
  if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
    {
      bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
      if (TARGET_ABI_X32)
        bisa |= OPTION_MASK_ABI_X32;
      else
        bisa |= OPTION_MASK_ABI_64;
      char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
                                       (enum fpmath_unit) 0, false, add_abi_p);
      if (!opts)
        error ("%qE needs unknown isa option", fndecl);
      else
        {
          gcc_assert (opts != NULL);
          error ("%qE needs isa option %s", fndecl, opts);
          free (opts);
        }
      return expand_call (exp, target, ignore);
    }

  switch (fcode)
    {
    case IX86_BUILTIN_MASKMOVQ:
    case IX86_BUILTIN_MASKMOVDQU:
      icode = (fcode == IX86_BUILTIN_MASKMOVQ
               ? CODE_FOR_mmx_maskmovq
               : CODE_FOR_sse2_maskmovdqu);
      /* Note the arg order is different from the operand order.  */
      arg1 = CALL_EXPR_ARG (exp, 0);
      arg2 = CALL_EXPR_ARG (exp, 1);
      arg0 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      mode0 = insn_data[icode].operand[0].mode;
      mode1 = insn_data[icode].operand[1].mode;
      mode2 = insn_data[icode].operand[2].mode;

      op0 = ix86_zero_extend_to_Pmode (op0);
      op0 = gen_rtx_MEM (mode1, op0);

      if (!insn_data[icode].operand[0].predicate (op0, mode0))
        op0 = copy_to_mode_reg (mode0, op0);
      if (!insn_data[icode].operand[1].predicate (op1, mode1))
        op1 = copy_to_mode_reg (mode1, op1);
      if (!insn_data[icode].operand[2].predicate (op2, mode2))
        op2 = copy_to_mode_reg (mode2, op2);
      pat = GEN_FCN (icode) (op0, op1, op2);
      if (!pat)
        return 0;
      emit_insn (pat);
      return 0;
    case IX86_BUILTIN_LDMXCSR:
      op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
      target = assign_386_stack_local (SImode, SLOT_TEMP);
      emit_move_insn (target, op0);
      emit_insn (gen_sse_ldmxcsr (target));
      return 0;

    case IX86_BUILTIN_STMXCSR:
      target = assign_386_stack_local (SImode, SLOT_TEMP);
      emit_insn (gen_sse_stmxcsr (target));
      return copy_to_mode_reg (SImode, target);
    case IX86_BUILTIN_CLFLUSH:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_sse2_clflush;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
        op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_sse2_clflush (op0));
      return 0;

    case IX86_BUILTIN_CLWB:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_clwb;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
        op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_clwb (op0));
      return 0;

    case IX86_BUILTIN_CLFLUSHOPT:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_clflushopt;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
        op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_clflushopt (op0));
      return 0;
11155 case IX86_BUILTIN_MONITOR
:
11156 case IX86_BUILTIN_MONITORX
:
11157 arg0
= CALL_EXPR_ARG (exp
, 0);
11158 arg1
= CALL_EXPR_ARG (exp
, 1);
11159 arg2
= CALL_EXPR_ARG (exp
, 2);
11160 op0
= expand_normal (arg0
);
11161 op1
= expand_normal (arg1
);
11162 op2
= expand_normal (arg2
);
11164 op0
= ix86_zero_extend_to_Pmode (op0
);
11166 op1
= copy_to_mode_reg (SImode
, op1
);
11168 op2
= copy_to_mode_reg (SImode
, op2
);
11170 emit_insn (fcode
== IX86_BUILTIN_MONITOR
11171 ? gen_sse3_monitor (Pmode
, op0
, op1
, op2
)
11172 : gen_monitorx (Pmode
, op0
, op1
, op2
));
11175 case IX86_BUILTIN_MWAIT
:
11176 arg0
= CALL_EXPR_ARG (exp
, 0);
11177 arg1
= CALL_EXPR_ARG (exp
, 1);
11178 op0
= expand_normal (arg0
);
11179 op1
= expand_normal (arg1
);
11181 op0
= copy_to_mode_reg (SImode
, op0
);
11183 op1
= copy_to_mode_reg (SImode
, op1
);
11184 emit_insn (gen_sse3_mwait (op0
, op1
));
11187 case IX86_BUILTIN_MWAITX
:
11188 arg0
= CALL_EXPR_ARG (exp
, 0);
11189 arg1
= CALL_EXPR_ARG (exp
, 1);
11190 arg2
= CALL_EXPR_ARG (exp
, 2);
11191 op0
= expand_normal (arg0
);
11192 op1
= expand_normal (arg1
);
11193 op2
= expand_normal (arg2
);
11195 op0
= copy_to_mode_reg (SImode
, op0
);
11197 op1
= copy_to_mode_reg (SImode
, op1
);
11199 op2
= copy_to_mode_reg (SImode
, op2
);
11200 emit_insn (gen_mwaitx (op0
, op1
, op2
));
11203 case IX86_BUILTIN_UMONITOR
:
11204 arg0
= CALL_EXPR_ARG (exp
, 0);
11205 op0
= expand_normal (arg0
);
11207 op0
= ix86_zero_extend_to_Pmode (op0
);
11208 emit_insn (gen_umonitor (Pmode
, op0
));
11211 case IX86_BUILTIN_UMWAIT
:
11212 case IX86_BUILTIN_TPAUSE
:
11213 arg0
= CALL_EXPR_ARG (exp
, 0);
11214 arg1
= CALL_EXPR_ARG (exp
, 1);
11215 op0
= expand_normal (arg0
);
11216 op1
= expand_normal (arg1
);
11219 op0
= copy_to_mode_reg (SImode
, op0
);
11221 op1
= force_reg (DImode
, op1
);
11225 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
11226 NULL
, 1, OPTAB_DIRECT
);
11229 case IX86_BUILTIN_UMWAIT
:
11230 icode
= CODE_FOR_umwait_rex64
;
11232 case IX86_BUILTIN_TPAUSE
:
11233 icode
= CODE_FOR_tpause_rex64
;
11236 gcc_unreachable ();
11239 op2
= gen_lowpart (SImode
, op2
);
11240 op1
= gen_lowpart (SImode
, op1
);
11241 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
11247 case IX86_BUILTIN_UMWAIT
:
11248 icode
= CODE_FOR_umwait
;
11250 case IX86_BUILTIN_TPAUSE
:
11251 icode
= CODE_FOR_tpause
;
11254 gcc_unreachable ();
11256 pat
= GEN_FCN (icode
) (op0
, op1
);
11265 || !register_operand (target
, QImode
))
11266 target
= gen_reg_rtx (QImode
);
11268 pat
= gen_rtx_EQ (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
11270 emit_insn (gen_rtx_SET (target
, pat
));
11274 case IX86_BUILTIN_CLZERO
:
11275 arg0
= CALL_EXPR_ARG (exp
, 0);
11276 op0
= expand_normal (arg0
);
11278 op0
= ix86_zero_extend_to_Pmode (op0
);
11279 emit_insn (gen_clzero (Pmode
, op0
));
11282 case IX86_BUILTIN_CLDEMOTE
:
11283 arg0
= CALL_EXPR_ARG (exp
, 0);
11284 op0
= expand_normal (arg0
);
11285 icode
= CODE_FOR_cldemote
;
11286 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
11287 op0
= ix86_zero_extend_to_Pmode (op0
);
11289 emit_insn (gen_cldemote (op0
));
11292 case IX86_BUILTIN_VEC_INIT_V2SI
:
11293 case IX86_BUILTIN_VEC_INIT_V4HI
:
11294 case IX86_BUILTIN_VEC_INIT_V8QI
:
11295 return ix86_expand_vec_init_builtin (TREE_TYPE (exp
), exp
, target
);
11297 case IX86_BUILTIN_VEC_EXT_V2DF
:
11298 case IX86_BUILTIN_VEC_EXT_V2DI
:
11299 case IX86_BUILTIN_VEC_EXT_V4SF
:
11300 case IX86_BUILTIN_VEC_EXT_V4SI
:
11301 case IX86_BUILTIN_VEC_EXT_V8HI
:
11302 case IX86_BUILTIN_VEC_EXT_V2SI
:
11303 case IX86_BUILTIN_VEC_EXT_V4HI
:
11304 case IX86_BUILTIN_VEC_EXT_V16QI
:
11305 return ix86_expand_vec_ext_builtin (exp
, target
);
11307 case IX86_BUILTIN_VEC_SET_V2DI
:
11308 case IX86_BUILTIN_VEC_SET_V4SF
:
11309 case IX86_BUILTIN_VEC_SET_V4SI
:
11310 case IX86_BUILTIN_VEC_SET_V8HI
:
11311 case IX86_BUILTIN_VEC_SET_V4HI
:
11312 case IX86_BUILTIN_VEC_SET_V16QI
:
11313 return ix86_expand_vec_set_builtin (exp
);
11315 case IX86_BUILTIN_NANQ
:
11316 case IX86_BUILTIN_NANSQ
:
11317 return expand_call (exp
, target
, ignore
);
11319 case IX86_BUILTIN_RDPID
:
11321 op0
= gen_reg_rtx (word_mode
);
11325 insn
= gen_rdpid_rex64 (op0
);
11326 op0
= convert_to_mode (SImode
, op0
, 1);
11329 insn
= gen_rdpid (op0
);
11334 || !register_operand (target
, SImode
))
11335 target
= gen_reg_rtx (SImode
);
11337 emit_move_insn (target
, op0
);
11340 case IX86_BUILTIN_2INTERSECTD512
:
11341 case IX86_BUILTIN_2INTERSECTQ512
:
11342 case IX86_BUILTIN_2INTERSECTD256
:
11343 case IX86_BUILTIN_2INTERSECTQ256
:
11344 case IX86_BUILTIN_2INTERSECTD128
:
11345 case IX86_BUILTIN_2INTERSECTQ128
:
11346 arg0
= CALL_EXPR_ARG (exp
, 0);
11347 arg1
= CALL_EXPR_ARG (exp
, 1);
11348 arg2
= CALL_EXPR_ARG (exp
, 2);
11349 arg3
= CALL_EXPR_ARG (exp
, 3);
11350 op0
= expand_normal (arg0
);
11351 op1
= expand_normal (arg1
);
11352 op2
= expand_normal (arg2
);
11353 op3
= expand_normal (arg3
);
11355 if (!address_operand (op0
, VOIDmode
))
11357 op0
= convert_memory_address (Pmode
, op0
);
11358 op0
= copy_addr_to_reg (op0
);
11360 if (!address_operand (op1
, VOIDmode
))
11362 op1
= convert_memory_address (Pmode
, op1
);
11363 op1
= copy_addr_to_reg (op1
);
11368 case IX86_BUILTIN_2INTERSECTD512
:
11370 icode
= CODE_FOR_avx512vp2intersect_2intersectv16si
;
11372 case IX86_BUILTIN_2INTERSECTQ512
:
11374 icode
= CODE_FOR_avx512vp2intersect_2intersectv8di
;
11376 case IX86_BUILTIN_2INTERSECTD256
:
11378 icode
= CODE_FOR_avx512vp2intersect_2intersectv8si
;
11380 case IX86_BUILTIN_2INTERSECTQ256
:
11382 icode
= CODE_FOR_avx512vp2intersect_2intersectv4di
;
11384 case IX86_BUILTIN_2INTERSECTD128
:
11386 icode
= CODE_FOR_avx512vp2intersect_2intersectv4si
;
11388 case IX86_BUILTIN_2INTERSECTQ128
:
11390 icode
= CODE_FOR_avx512vp2intersect_2intersectv2di
;
11393 gcc_unreachable ();
11396 mode2
= insn_data
[icode
].operand
[1].mode
;
11397 mode3
= insn_data
[icode
].operand
[2].mode
;
11398 if (!insn_data
[icode
].operand
[1].predicate (op2
, mode2
))
11399 op2
= copy_to_mode_reg (mode2
, op2
);
11400 if (!insn_data
[icode
].operand
[2].predicate (op3
, mode3
))
11401 op3
= copy_to_mode_reg (mode3
, op3
);
11403 op4
= gen_reg_rtx (mode4
);
11404 emit_insn (GEN_FCN (icode
) (op4
, op2
, op3
));
11405 mode0
= mode4
== P2HImode
? HImode
: QImode
;
11406 emit_move_insn (gen_rtx_MEM (mode0
, op0
),
11407 gen_lowpart (mode0
, op4
));
11408 emit_move_insn (gen_rtx_MEM (mode0
, op1
),
11409 gen_highpart (mode0
, op4
));
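      /* A sketch of what the block above produces (illustrative only): the
         vp2intersect patterns write a pair of mask registers, modelled here
         as a single P2QImode/P2HImode pseudo (op4).  The two user-visible
         mask results of, say,

           _mm512_2intersect_epi32 (a, b, &m1, &m2);

         are then stored through op0 and op1 by taking gen_lowpart and
         gen_highpart of that pair, so m1 receives the low half and m2 the
         high half.  */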
11413 case IX86_BUILTIN_RDPMC
:
11414 case IX86_BUILTIN_RDTSC
:
11415 case IX86_BUILTIN_RDTSCP
:
11416 case IX86_BUILTIN_XGETBV
:
11418 op0
= gen_reg_rtx (DImode
);
11419 op1
= gen_reg_rtx (DImode
);
11421 if (fcode
== IX86_BUILTIN_RDPMC
)
11423 arg0
= CALL_EXPR_ARG (exp
, 0);
11424 op2
= expand_normal (arg0
);
11425 if (!register_operand (op2
, SImode
))
11426 op2
= copy_to_mode_reg (SImode
, op2
);
11428 insn
= (TARGET_64BIT
11429 ? gen_rdpmc_rex64 (op0
, op1
, op2
)
11430 : gen_rdpmc (op0
, op2
));
11433 else if (fcode
== IX86_BUILTIN_XGETBV
)
11435 arg0
= CALL_EXPR_ARG (exp
, 0);
11436 op2
= expand_normal (arg0
);
11437 if (!register_operand (op2
, SImode
))
11438 op2
= copy_to_mode_reg (SImode
, op2
);
11440 insn
= (TARGET_64BIT
11441 ? gen_xgetbv_rex64 (op0
, op1
, op2
)
11442 : gen_xgetbv (op0
, op2
));
11445 else if (fcode
== IX86_BUILTIN_RDTSC
)
11447 insn
= (TARGET_64BIT
11448 ? gen_rdtsc_rex64 (op0
, op1
)
11449 : gen_rdtsc (op0
));
11454 op2
= gen_reg_rtx (SImode
);
11456 insn
= (TARGET_64BIT
11457 ? gen_rdtscp_rex64 (op0
, op1
, op2
)
11458 : gen_rdtscp (op0
, op2
));
11461 arg0
= CALL_EXPR_ARG (exp
, 0);
11462 op4
= expand_normal (arg0
);
11463 if (!address_operand (op4
, VOIDmode
))
11465 op4
= convert_memory_address (Pmode
, op4
);
11466 op4
= copy_addr_to_reg (op4
);
11468 emit_move_insn (gen_rtx_MEM (SImode
, op4
), op2
);
11472 || !register_operand (target
, DImode
))
11473 target
= gen_reg_rtx (DImode
);
11477 op1
= expand_simple_binop (DImode
, ASHIFT
, op1
, GEN_INT (32),
11478 op1
, 1, OPTAB_DIRECT
);
11479 op0
= expand_simple_binop (DImode
, IOR
, op0
, op1
,
11480 op0
, 1, OPTAB_DIRECT
);
11483 emit_move_insn (target
, op0
);
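      /* Illustrative note (an assumption about the generated code, not a
         guarantee): on 64-bit targets the rdtsc/rdpmc/xgetbv patterns return
         the counter split across two DImode pseudos holding the EAX and EDX
         halves; the two expand_simple_binop calls above rebuild the full
         64-bit value roughly as

           result = (op1 << 32) | op0;

         before it is copied into the user-visible target.  */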
11486 case IX86_BUILTIN_ENQCMD
:
11487 case IX86_BUILTIN_ENQCMDS
:
11488 case IX86_BUILTIN_MOVDIR64B
:
11490 arg0
= CALL_EXPR_ARG (exp
, 0);
11491 arg1
= CALL_EXPR_ARG (exp
, 1);
11492 op0
= expand_normal (arg0
);
11493 op1
= expand_normal (arg1
);
11495 op0
= ix86_zero_extend_to_Pmode (op0
);
11496 if (!address_operand (op1
, VOIDmode
))
11498 op1
= convert_memory_address (Pmode
, op1
);
11499 op1
= copy_addr_to_reg (op1
);
11501 op1
= gen_rtx_MEM (XImode
, op1
);
11503 if (fcode
== IX86_BUILTIN_MOVDIR64B
)
11505 emit_insn (gen_movdir64b (Pmode
, op0
, op1
));
11512 target
= gen_reg_rtx (SImode
);
11513 emit_move_insn (target
, const0_rtx
);
11514 target
= gen_rtx_SUBREG (QImode
, target
, 0);
11516 if (fcode
== IX86_BUILTIN_ENQCMD
)
11517 pat
= gen_enqcmd (UNSPECV_ENQCMD
, Pmode
, op0
, op1
);
11519 pat
= gen_enqcmd (UNSPECV_ENQCMDS
, Pmode
, op0
, op1
);
11523 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
11524 gen_rtx_fmt_ee (EQ
, QImode
,
11528 return SUBREG_REG (target
);
11531 case IX86_BUILTIN_FXSAVE
:
11532 case IX86_BUILTIN_FXRSTOR
:
11533 case IX86_BUILTIN_FXSAVE64
:
11534 case IX86_BUILTIN_FXRSTOR64
:
11535 case IX86_BUILTIN_FNSTENV
:
11536 case IX86_BUILTIN_FLDENV
:
11540 case IX86_BUILTIN_FXSAVE
:
11541 icode
= CODE_FOR_fxsave
;
11543 case IX86_BUILTIN_FXRSTOR
:
11544 icode
= CODE_FOR_fxrstor
;
11546 case IX86_BUILTIN_FXSAVE64
:
11547 icode
= CODE_FOR_fxsave64
;
11549 case IX86_BUILTIN_FXRSTOR64
:
11550 icode
= CODE_FOR_fxrstor64
;
11552 case IX86_BUILTIN_FNSTENV
:
11553 icode
= CODE_FOR_fnstenv
;
11555 case IX86_BUILTIN_FLDENV
:
11556 icode
= CODE_FOR_fldenv
;
11559 gcc_unreachable ();
11562 arg0
= CALL_EXPR_ARG (exp
, 0);
11563 op0
= expand_normal (arg0
);
11565 if (!address_operand (op0
, VOIDmode
))
11567 op0
= convert_memory_address (Pmode
, op0
);
11568 op0
= copy_addr_to_reg (op0
);
11570 op0
= gen_rtx_MEM (mode0
, op0
);
11572 pat
= GEN_FCN (icode
) (op0
);
11577 case IX86_BUILTIN_XSETBV
:
11578 arg0
= CALL_EXPR_ARG (exp
, 0);
11579 arg1
= CALL_EXPR_ARG (exp
, 1);
11580 op0
= expand_normal (arg0
);
11581 op1
= expand_normal (arg1
);
11584 op0
= copy_to_mode_reg (SImode
, op0
);
11586 op1
= force_reg (DImode
, op1
);
11590 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
11591 NULL
, 1, OPTAB_DIRECT
);
11593 icode
= CODE_FOR_xsetbv_rex64
;
11595 op2
= gen_lowpart (SImode
, op2
);
11596 op1
= gen_lowpart (SImode
, op1
);
11597 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
11601 icode
= CODE_FOR_xsetbv
;
11603 pat
= GEN_FCN (icode
) (op0
, op1
);
11609 case IX86_BUILTIN_XSAVE
:
11610 case IX86_BUILTIN_XRSTOR
:
11611 case IX86_BUILTIN_XSAVE64
:
11612 case IX86_BUILTIN_XRSTOR64
:
11613 case IX86_BUILTIN_XSAVEOPT
:
11614 case IX86_BUILTIN_XSAVEOPT64
:
11615 case IX86_BUILTIN_XSAVES
:
11616 case IX86_BUILTIN_XRSTORS
:
11617 case IX86_BUILTIN_XSAVES64
:
11618 case IX86_BUILTIN_XRSTORS64
:
11619 case IX86_BUILTIN_XSAVEC
:
11620 case IX86_BUILTIN_XSAVEC64
:
11621 arg0
= CALL_EXPR_ARG (exp
, 0);
11622 arg1
= CALL_EXPR_ARG (exp
, 1);
11623 op0
= expand_normal (arg0
);
11624 op1
= expand_normal (arg1
);
11626 if (!address_operand (op0
, VOIDmode
))
11628 op0
= convert_memory_address (Pmode
, op0
);
11629 op0
= copy_addr_to_reg (op0
);
11631 op0
= gen_rtx_MEM (BLKmode
, op0
);
11633 op1
= force_reg (DImode
, op1
);
11637 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
11638 NULL
, 1, OPTAB_DIRECT
);
11641 case IX86_BUILTIN_XSAVE
:
11642 icode
= CODE_FOR_xsave_rex64
;
11644 case IX86_BUILTIN_XRSTOR
:
11645 icode
= CODE_FOR_xrstor_rex64
;
11647 case IX86_BUILTIN_XSAVE64
:
11648 icode
= CODE_FOR_xsave64
;
11650 case IX86_BUILTIN_XRSTOR64
:
11651 icode
= CODE_FOR_xrstor64
;
11653 case IX86_BUILTIN_XSAVEOPT
:
11654 icode
= CODE_FOR_xsaveopt_rex64
;
11656 case IX86_BUILTIN_XSAVEOPT64
:
11657 icode
= CODE_FOR_xsaveopt64
;
11659 case IX86_BUILTIN_XSAVES
:
11660 icode
= CODE_FOR_xsaves_rex64
;
11662 case IX86_BUILTIN_XRSTORS
:
11663 icode
= CODE_FOR_xrstors_rex64
;
11665 case IX86_BUILTIN_XSAVES64
:
11666 icode
= CODE_FOR_xsaves64
;
11668 case IX86_BUILTIN_XRSTORS64
:
11669 icode
= CODE_FOR_xrstors64
;
11671 case IX86_BUILTIN_XSAVEC
:
11672 icode
= CODE_FOR_xsavec_rex64
;
11674 case IX86_BUILTIN_XSAVEC64
:
11675 icode
= CODE_FOR_xsavec64
;
11678 gcc_unreachable ();
11681 op2
= gen_lowpart (SImode
, op2
);
11682 op1
= gen_lowpart (SImode
, op1
);
11683 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
11689 case IX86_BUILTIN_XSAVE
:
11690 icode
= CODE_FOR_xsave
;
11692 case IX86_BUILTIN_XRSTOR
:
11693 icode
= CODE_FOR_xrstor
;
11695 case IX86_BUILTIN_XSAVEOPT
:
11696 icode
= CODE_FOR_xsaveopt
;
11698 case IX86_BUILTIN_XSAVES
:
11699 icode
= CODE_FOR_xsaves
;
11701 case IX86_BUILTIN_XRSTORS
:
11702 icode
= CODE_FOR_xrstors
;
11704 case IX86_BUILTIN_XSAVEC
:
11705 icode
= CODE_FOR_xsavec
;
11708 gcc_unreachable ();
11710 pat
= GEN_FCN (icode
) (op0
, op1
);
11717 case IX86_BUILTIN_LLWPCB
:
11718 arg0
= CALL_EXPR_ARG (exp
, 0);
11719 op0
= expand_normal (arg0
);
11720 icode
= CODE_FOR_lwp_llwpcb
;
11721 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
11722 op0
= ix86_zero_extend_to_Pmode (op0
);
11723 emit_insn (gen_lwp_llwpcb (op0
));
11726 case IX86_BUILTIN_SLWPCB
:
11727 icode
= CODE_FOR_lwp_slwpcb
;
11729 || !insn_data
[icode
].operand
[0].predicate (target
, Pmode
))
11730 target
= gen_reg_rtx (Pmode
);
11731 emit_insn (gen_lwp_slwpcb (target
));
11734 case IX86_BUILTIN_BEXTRI32
:
11735 case IX86_BUILTIN_BEXTRI64
:
11736 arg0
= CALL_EXPR_ARG (exp
, 0);
11737 arg1
= CALL_EXPR_ARG (exp
, 1);
11738 op0
= expand_normal (arg0
);
11739 op1
= expand_normal (arg1
);
11740 icode
= (fcode
== IX86_BUILTIN_BEXTRI32
11741 ? CODE_FOR_tbm_bextri_si
11742 : CODE_FOR_tbm_bextri_di
);
11743 if (!CONST_INT_P (op1
))
11745 error ("last argument must be an immediate");
11750 unsigned char length
= (INTVAL (op1
) >> 8) & 0xFF;
11751 unsigned char lsb_index
= INTVAL (op1
) & 0xFF;
11752 op1
= GEN_INT (length
);
11753 op2
= GEN_INT (lsb_index
);
11755 mode1
= insn_data
[icode
].operand
[1].mode
;
11756 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode1
))
11757 op0
= copy_to_mode_reg (mode1
, op0
);
11759 mode0
= insn_data
[icode
].operand
[0].mode
;
11761 || !register_operand (target
, mode0
))
11762 target
= gen_reg_rtx (mode0
);
11764 pat
= GEN_FCN (icode
) (target
, op0
, op1
, op2
);
11770 case IX86_BUILTIN_RDRAND16_STEP
:
11771 icode
= CODE_FOR_rdrandhi_1
;
11775 case IX86_BUILTIN_RDRAND32_STEP
:
11776 icode
= CODE_FOR_rdrandsi_1
;
11780 case IX86_BUILTIN_RDRAND64_STEP
:
11781 icode
= CODE_FOR_rdranddi_1
;
11785 arg0
= CALL_EXPR_ARG (exp
, 0);
11786 op1
= expand_normal (arg0
);
11787 if (!address_operand (op1
, VOIDmode
))
11789 op1
= convert_memory_address (Pmode
, op1
);
11790 op1
= copy_addr_to_reg (op1
);
11793 op0
= gen_reg_rtx (mode0
);
11794 emit_insn (GEN_FCN (icode
) (op0
));
11796 emit_move_insn (gen_rtx_MEM (mode0
, op1
), op0
);
11798 op1
= gen_reg_rtx (SImode
);
11799 emit_move_insn (op1
, CONST1_RTX (SImode
));
11801 /* Emit SImode conditional move. */
11802 if (mode0
== HImode
)
11804 if (TARGET_ZERO_EXTEND_WITH_AND
11805 && optimize_function_for_speed_p (cfun
))
11807 op2
= force_reg (SImode
, const0_rtx
);
11809 emit_insn (gen_movstricthi
11810 (gen_lowpart (HImode
, op2
), op0
));
11814 op2
= gen_reg_rtx (SImode
);
11816 emit_insn (gen_zero_extendhisi2 (op2
, op0
));
11819 else if (mode0
== SImode
)
11822 op2
= gen_rtx_SUBREG (SImode
, op0
, 0);
11825 || !register_operand (target
, SImode
))
11826 target
= gen_reg_rtx (SImode
);
11828 pat
= gen_rtx_GEU (VOIDmode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
11830 emit_insn (gen_rtx_SET (target
,
11831 gen_rtx_IF_THEN_ELSE (SImode
, pat
, op2
, op1
)));
11834 case IX86_BUILTIN_RDSEED16_STEP
:
11835 icode
= CODE_FOR_rdseedhi_1
;
11839 case IX86_BUILTIN_RDSEED32_STEP
:
11840 icode
= CODE_FOR_rdseedsi_1
;
11844 case IX86_BUILTIN_RDSEED64_STEP
:
11845 icode
= CODE_FOR_rdseeddi_1
;
11849 arg0
= CALL_EXPR_ARG (exp
, 0);
11850 op1
= expand_normal (arg0
);
11851 if (!address_operand (op1
, VOIDmode
))
11853 op1
= convert_memory_address (Pmode
, op1
);
11854 op1
= copy_addr_to_reg (op1
);
11857 op0
= gen_reg_rtx (mode0
);
11858 emit_insn (GEN_FCN (icode
) (op0
));
11860 emit_move_insn (gen_rtx_MEM (mode0
, op1
), op0
);
11862 op2
= gen_reg_rtx (QImode
);
11864 pat
= gen_rtx_LTU (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
11866 emit_insn (gen_rtx_SET (op2
, pat
));
11869 || !register_operand (target
, SImode
))
11870 target
= gen_reg_rtx (SImode
);
11872 emit_insn (gen_zero_extendqisi2 (target
, op2
));
11875 case IX86_BUILTIN_SBB32
:
11876 icode
= CODE_FOR_subborrowsi
;
11877 icode2
= CODE_FOR_subborrowsi_0
;
11883 case IX86_BUILTIN_SBB64
:
11884 icode
= CODE_FOR_subborrowdi
;
11885 icode2
= CODE_FOR_subborrowdi_0
;
11891 case IX86_BUILTIN_ADDCARRYX32
:
11892 icode
= CODE_FOR_addcarrysi
;
11893 icode2
= CODE_FOR_addcarrysi_0
;
11899 case IX86_BUILTIN_ADDCARRYX64
:
11900 icode
= CODE_FOR_addcarrydi
;
11901 icode2
= CODE_FOR_addcarrydi_0
;
11907 arg0
= CALL_EXPR_ARG (exp
, 0); /* unsigned char c_in. */
11908 arg1
= CALL_EXPR_ARG (exp
, 1); /* unsigned int src1. */
11909 arg2
= CALL_EXPR_ARG (exp
, 2); /* unsigned int src2. */
11910 arg3
= CALL_EXPR_ARG (exp
, 3); /* unsigned int *sum_out. */
11912 op1
= expand_normal (arg0
);
11913 if (!integer_zerop (arg0
))
11914 op1
= copy_to_mode_reg (QImode
, convert_to_mode (QImode
, op1
, 1));
11916 op2
= expand_normal (arg1
);
11917 if (!register_operand (op2
, mode0
))
11918 op2
= copy_to_mode_reg (mode0
, op2
);
11920 op3
= expand_normal (arg2
);
11921 if (!register_operand (op3
, mode0
))
11922 op3
= copy_to_mode_reg (mode0
, op3
);
11924 op4
= expand_normal (arg3
);
11925 if (!address_operand (op4
, VOIDmode
))
11927 op4
= convert_memory_address (Pmode
, op4
);
11928 op4
= copy_addr_to_reg (op4
);
11931 op0
= gen_reg_rtx (mode0
);
11932 if (integer_zerop (arg0
))
11934 /* If arg0 is 0, optimize right away into add or sub
11935 instruction that sets CCCmode flags. */
11936 op1
= gen_rtx_REG (mode2
, FLAGS_REG
);
11937 emit_insn (GEN_FCN (icode2
) (op0
, op2
, op3
));
11941 /* Generate CF from input operand. */
11942 emit_insn (gen_addqi3_cconly_overflow (op1
, constm1_rtx
));
11944 /* Generate instruction that consumes CF. */
11945 op1
= gen_rtx_REG (CCCmode
, FLAGS_REG
);
11946 pat
= gen_rtx_LTU (mode1
, op1
, const0_rtx
);
11947 pat2
= gen_rtx_LTU (mode0
, op1
, const0_rtx
);
11948 emit_insn (GEN_FCN (icode
) (op0
, op2
, op3
, op1
, pat
, pat2
));
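      /* Rough illustration (hypothetical variable names, not from this file)
         of the user-level operation being expanded here:

           unsigned int sum;
           unsigned char c_out = _addcarry_u32 (c_in, src1, src2, &sum);

         When c_in is not a compile-time zero, the addqi3_cconly_overflow insn
         above rematerializes CF from the incoming carry byte, and the GEN_FCN
         pattern then emits the flag-consuming adc/sbb whose CF result is read
         back below.  */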
11951 /* Return current CF value. */
11953 target
= gen_reg_rtx (QImode
);
11955 pat
= gen_rtx_LTU (QImode
, op1
, const0_rtx
);
11956 emit_insn (gen_rtx_SET (target
, pat
));
11958 /* Store the result. */
11959 emit_move_insn (gen_rtx_MEM (mode0
, op4
), op0
);
11963 case IX86_BUILTIN_READ_FLAGS
:
11964 emit_insn (gen_push (gen_rtx_REG (word_mode
, FLAGS_REG
)));
11967 || target
== NULL_RTX
11968 || !nonimmediate_operand (target
, word_mode
)
11969 || GET_MODE (target
) != word_mode
)
11970 target
= gen_reg_rtx (word_mode
);
11972 emit_insn (gen_pop (target
));
11975 case IX86_BUILTIN_WRITE_FLAGS
:
11977 arg0
= CALL_EXPR_ARG (exp
, 0);
11978 op0
= expand_normal (arg0
);
11979 if (!general_no_elim_operand (op0
, word_mode
))
11980 op0
= copy_to_mode_reg (word_mode
, op0
);
11982 emit_insn (gen_push (op0
));
11983 emit_insn (gen_pop (gen_rtx_REG (word_mode
, FLAGS_REG
)));
11986 case IX86_BUILTIN_KTESTC8
:
11987 icode
= CODE_FOR_ktestqi
;
11991 case IX86_BUILTIN_KTESTZ8
:
11992 icode
= CODE_FOR_ktestqi
;
11996 case IX86_BUILTIN_KTESTC16
:
11997 icode
= CODE_FOR_ktesthi
;
12001 case IX86_BUILTIN_KTESTZ16
:
12002 icode
= CODE_FOR_ktesthi
;
12006 case IX86_BUILTIN_KTESTC32
:
12007 icode
= CODE_FOR_ktestsi
;
12011 case IX86_BUILTIN_KTESTZ32
:
12012 icode
= CODE_FOR_ktestsi
;
12016 case IX86_BUILTIN_KTESTC64
:
12017 icode
= CODE_FOR_ktestdi
;
12021 case IX86_BUILTIN_KTESTZ64
:
12022 icode
= CODE_FOR_ktestdi
;
12026 case IX86_BUILTIN_KORTESTC8
:
12027 icode
= CODE_FOR_kortestqi
;
12031 case IX86_BUILTIN_KORTESTZ8
:
12032 icode
= CODE_FOR_kortestqi
;
12036 case IX86_BUILTIN_KORTESTC16
:
12037 icode
= CODE_FOR_kortesthi
;
12041 case IX86_BUILTIN_KORTESTZ16
:
12042 icode
= CODE_FOR_kortesthi
;
12046 case IX86_BUILTIN_KORTESTC32
:
12047 icode
= CODE_FOR_kortestsi
;
12051 case IX86_BUILTIN_KORTESTZ32
:
12052 icode
= CODE_FOR_kortestsi
;
12056 case IX86_BUILTIN_KORTESTC64
:
12057 icode
= CODE_FOR_kortestdi
;
12061 case IX86_BUILTIN_KORTESTZ64
:
12062 icode
= CODE_FOR_kortestdi
;
12066 arg0
= CALL_EXPR_ARG (exp
, 0); /* Mask reg src1. */
12067 arg1
= CALL_EXPR_ARG (exp
, 1); /* Mask reg src2. */
12068 op0
= expand_normal (arg0
);
12069 op1
= expand_normal (arg1
);
12071 mode0
= insn_data
[icode
].operand
[0].mode
;
12072 mode1
= insn_data
[icode
].operand
[1].mode
;
12074 if (GET_MODE (op0
) != VOIDmode
)
12075 op0
= force_reg (GET_MODE (op0
), op0
);
12077 op0
= gen_lowpart (mode0
, op0
);
12079 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
12080 op0
= copy_to_mode_reg (mode0
, op0
);
12082 if (GET_MODE (op1
) != VOIDmode
)
12083 op1
= force_reg (GET_MODE (op1
), op1
);
12085 op1
= gen_lowpart (mode1
, op1
);
12087 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
12088 op1
= copy_to_mode_reg (mode1
, op1
);
12090 target
= gen_reg_rtx (QImode
);
12092 /* Emit kortest. */
12093 emit_insn (GEN_FCN (icode
) (op0
, op1
));
12094 /* And use setcc to return result from flags. */
12095 ix86_expand_setcc (target
, EQ
,
12096 gen_rtx_REG (mode3
, FLAGS_REG
), const0_rtx
);
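      /* Sketch of the overall shape (illustrative, not lifted from this
         file): a call such as

           unsigned char z = _kortestz_mask16_u8 (k1, k2);

         expands to a kortest of the two mask registers followed by the setcc
         above, which copies the relevant flag (ZF or CF, depending on the
         builtin variant) out of the flags register into the QImode target.  */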
12099 case IX86_BUILTIN_GATHERSIV2DF
:
12100 icode
= CODE_FOR_avx2_gathersiv2df
;
12102 case IX86_BUILTIN_GATHERSIV4DF
:
12103 icode
= CODE_FOR_avx2_gathersiv4df
;
12105 case IX86_BUILTIN_GATHERDIV2DF
:
12106 icode
= CODE_FOR_avx2_gatherdiv2df
;
12108 case IX86_BUILTIN_GATHERDIV4DF
:
12109 icode
= CODE_FOR_avx2_gatherdiv4df
;
12111 case IX86_BUILTIN_GATHERSIV4SF
:
12112 icode
= CODE_FOR_avx2_gathersiv4sf
;
12114 case IX86_BUILTIN_GATHERSIV8SF
:
12115 icode
= CODE_FOR_avx2_gathersiv8sf
;
12117 case IX86_BUILTIN_GATHERDIV4SF
:
12118 icode
= CODE_FOR_avx2_gatherdiv4sf
;
12120 case IX86_BUILTIN_GATHERDIV8SF
:
12121 icode
= CODE_FOR_avx2_gatherdiv8sf
;
12123 case IX86_BUILTIN_GATHERSIV2DI
:
12124 icode
= CODE_FOR_avx2_gathersiv2di
;
12126 case IX86_BUILTIN_GATHERSIV4DI
:
12127 icode
= CODE_FOR_avx2_gathersiv4di
;
12129 case IX86_BUILTIN_GATHERDIV2DI
:
12130 icode
= CODE_FOR_avx2_gatherdiv2di
;
12132 case IX86_BUILTIN_GATHERDIV4DI
:
12133 icode
= CODE_FOR_avx2_gatherdiv4di
;
12135 case IX86_BUILTIN_GATHERSIV4SI
:
12136 icode
= CODE_FOR_avx2_gathersiv4si
;
12138 case IX86_BUILTIN_GATHERSIV8SI
:
12139 icode
= CODE_FOR_avx2_gathersiv8si
;
12141 case IX86_BUILTIN_GATHERDIV4SI
:
12142 icode
= CODE_FOR_avx2_gatherdiv4si
;
12144 case IX86_BUILTIN_GATHERDIV8SI
:
12145 icode
= CODE_FOR_avx2_gatherdiv8si
;
12147 case IX86_BUILTIN_GATHERALTSIV4DF
:
12148 icode
= CODE_FOR_avx2_gathersiv4df
;
12150 case IX86_BUILTIN_GATHERALTDIV8SF
:
12151 icode
= CODE_FOR_avx2_gatherdiv8sf
;
12153 case IX86_BUILTIN_GATHERALTSIV4DI
:
12154 icode
= CODE_FOR_avx2_gathersiv4di
;
12156 case IX86_BUILTIN_GATHERALTDIV8SI
:
12157 icode
= CODE_FOR_avx2_gatherdiv8si
;
12159 case IX86_BUILTIN_GATHER3SIV16SF
:
12160 icode
= CODE_FOR_avx512f_gathersiv16sf
;
12162 case IX86_BUILTIN_GATHER3SIV8DF
:
12163 icode
= CODE_FOR_avx512f_gathersiv8df
;
12165 case IX86_BUILTIN_GATHER3DIV16SF
:
12166 icode
= CODE_FOR_avx512f_gatherdiv16sf
;
12168 case IX86_BUILTIN_GATHER3DIV8DF
:
12169 icode
= CODE_FOR_avx512f_gatherdiv8df
;
12171 case IX86_BUILTIN_GATHER3SIV16SI
:
12172 icode
= CODE_FOR_avx512f_gathersiv16si
;
12174 case IX86_BUILTIN_GATHER3SIV8DI
:
12175 icode
= CODE_FOR_avx512f_gathersiv8di
;
12177 case IX86_BUILTIN_GATHER3DIV16SI
:
12178 icode
= CODE_FOR_avx512f_gatherdiv16si
;
12180 case IX86_BUILTIN_GATHER3DIV8DI
:
12181 icode
= CODE_FOR_avx512f_gatherdiv8di
;
12183 case IX86_BUILTIN_GATHER3ALTSIV8DF
:
12184 icode
= CODE_FOR_avx512f_gathersiv8df
;
12186 case IX86_BUILTIN_GATHER3ALTDIV16SF
:
12187 icode
= CODE_FOR_avx512f_gatherdiv16sf
;
12189 case IX86_BUILTIN_GATHER3ALTSIV8DI
:
12190 icode
= CODE_FOR_avx512f_gathersiv8di
;
12192 case IX86_BUILTIN_GATHER3ALTDIV16SI
:
12193 icode
= CODE_FOR_avx512f_gatherdiv16si
;
12195 case IX86_BUILTIN_GATHER3SIV2DF
:
12196 icode
= CODE_FOR_avx512vl_gathersiv2df
;
12198 case IX86_BUILTIN_GATHER3SIV4DF
:
12199 icode
= CODE_FOR_avx512vl_gathersiv4df
;
12201 case IX86_BUILTIN_GATHER3DIV2DF
:
12202 icode
= CODE_FOR_avx512vl_gatherdiv2df
;
12204 case IX86_BUILTIN_GATHER3DIV4DF
:
12205 icode
= CODE_FOR_avx512vl_gatherdiv4df
;
12207 case IX86_BUILTIN_GATHER3SIV4SF
:
12208 icode
= CODE_FOR_avx512vl_gathersiv4sf
;
12210 case IX86_BUILTIN_GATHER3SIV8SF
:
12211 icode
= CODE_FOR_avx512vl_gathersiv8sf
;
12213 case IX86_BUILTIN_GATHER3DIV4SF
:
12214 icode
= CODE_FOR_avx512vl_gatherdiv4sf
;
12216 case IX86_BUILTIN_GATHER3DIV8SF
:
12217 icode
= CODE_FOR_avx512vl_gatherdiv8sf
;
12219 case IX86_BUILTIN_GATHER3SIV2DI
:
12220 icode
= CODE_FOR_avx512vl_gathersiv2di
;
12222 case IX86_BUILTIN_GATHER3SIV4DI
:
12223 icode
= CODE_FOR_avx512vl_gathersiv4di
;
12225 case IX86_BUILTIN_GATHER3DIV2DI
:
12226 icode
= CODE_FOR_avx512vl_gatherdiv2di
;
12228 case IX86_BUILTIN_GATHER3DIV4DI
:
12229 icode
= CODE_FOR_avx512vl_gatherdiv4di
;
12231 case IX86_BUILTIN_GATHER3SIV4SI
:
12232 icode
= CODE_FOR_avx512vl_gathersiv4si
;
12234 case IX86_BUILTIN_GATHER3SIV8SI
:
12235 icode
= CODE_FOR_avx512vl_gathersiv8si
;
12237 case IX86_BUILTIN_GATHER3DIV4SI
:
12238 icode
= CODE_FOR_avx512vl_gatherdiv4si
;
12240 case IX86_BUILTIN_GATHER3DIV8SI
:
12241 icode
= CODE_FOR_avx512vl_gatherdiv8si
;
12243 case IX86_BUILTIN_GATHER3ALTSIV4DF
:
12244 icode
= CODE_FOR_avx512vl_gathersiv4df
;
12246 case IX86_BUILTIN_GATHER3ALTDIV8SF
:
12247 icode
= CODE_FOR_avx512vl_gatherdiv8sf
;
12249 case IX86_BUILTIN_GATHER3ALTSIV4DI
:
12250 icode
= CODE_FOR_avx512vl_gathersiv4di
;
12252 case IX86_BUILTIN_GATHER3ALTDIV8SI
:
12253 icode
= CODE_FOR_avx512vl_gatherdiv8si
;
12255 case IX86_BUILTIN_SCATTERSIV16SF
:
12256 icode
= CODE_FOR_avx512f_scattersiv16sf
;
12258 case IX86_BUILTIN_SCATTERSIV8DF
:
12259 icode
= CODE_FOR_avx512f_scattersiv8df
;
12261 case IX86_BUILTIN_SCATTERDIV16SF
:
12262 icode
= CODE_FOR_avx512f_scatterdiv16sf
;
12264 case IX86_BUILTIN_SCATTERDIV8DF
:
12265 icode
= CODE_FOR_avx512f_scatterdiv8df
;
12267 case IX86_BUILTIN_SCATTERSIV16SI
:
12268 icode
= CODE_FOR_avx512f_scattersiv16si
;
12270 case IX86_BUILTIN_SCATTERSIV8DI
:
12271 icode
= CODE_FOR_avx512f_scattersiv8di
;
12273 case IX86_BUILTIN_SCATTERDIV16SI
:
12274 icode
= CODE_FOR_avx512f_scatterdiv16si
;
12276 case IX86_BUILTIN_SCATTERDIV8DI
:
12277 icode
= CODE_FOR_avx512f_scatterdiv8di
;
12279 case IX86_BUILTIN_SCATTERSIV8SF
:
12280 icode
= CODE_FOR_avx512vl_scattersiv8sf
;
12282 case IX86_BUILTIN_SCATTERSIV4SF
:
12283 icode
= CODE_FOR_avx512vl_scattersiv4sf
;
12285 case IX86_BUILTIN_SCATTERSIV4DF
:
12286 icode
= CODE_FOR_avx512vl_scattersiv4df
;
12288 case IX86_BUILTIN_SCATTERSIV2DF
:
12289 icode
= CODE_FOR_avx512vl_scattersiv2df
;
12291 case IX86_BUILTIN_SCATTERDIV8SF
:
12292 icode
= CODE_FOR_avx512vl_scatterdiv8sf
;
12294 case IX86_BUILTIN_SCATTERDIV4SF
:
12295 icode
= CODE_FOR_avx512vl_scatterdiv4sf
;
12297 case IX86_BUILTIN_SCATTERDIV4DF
:
12298 icode
= CODE_FOR_avx512vl_scatterdiv4df
;
12300 case IX86_BUILTIN_SCATTERDIV2DF
:
12301 icode
= CODE_FOR_avx512vl_scatterdiv2df
;
12303 case IX86_BUILTIN_SCATTERSIV8SI
:
12304 icode
= CODE_FOR_avx512vl_scattersiv8si
;
12306 case IX86_BUILTIN_SCATTERSIV4SI
:
12307 icode
= CODE_FOR_avx512vl_scattersiv4si
;
12309 case IX86_BUILTIN_SCATTERSIV4DI
:
12310 icode
= CODE_FOR_avx512vl_scattersiv4di
;
12312 case IX86_BUILTIN_SCATTERSIV2DI
:
12313 icode
= CODE_FOR_avx512vl_scattersiv2di
;
12315 case IX86_BUILTIN_SCATTERDIV8SI
:
12316 icode
= CODE_FOR_avx512vl_scatterdiv8si
;
12318 case IX86_BUILTIN_SCATTERDIV4SI
:
12319 icode
= CODE_FOR_avx512vl_scatterdiv4si
;
12321 case IX86_BUILTIN_SCATTERDIV4DI
:
12322 icode
= CODE_FOR_avx512vl_scatterdiv4di
;
12324 case IX86_BUILTIN_SCATTERDIV2DI
:
12325 icode
= CODE_FOR_avx512vl_scatterdiv2di
;
12327 case IX86_BUILTIN_GATHERPFDPD
:
12328 icode
= CODE_FOR_avx512pf_gatherpfv8sidf
;
12329 goto vec_prefetch_gen
;
12330 case IX86_BUILTIN_SCATTERALTSIV8DF
:
12331 icode
= CODE_FOR_avx512f_scattersiv8df
;
12333 case IX86_BUILTIN_SCATTERALTDIV16SF
:
12334 icode
= CODE_FOR_avx512f_scatterdiv16sf
;
12336 case IX86_BUILTIN_SCATTERALTSIV8DI
:
12337 icode
= CODE_FOR_avx512f_scattersiv8di
;
12339 case IX86_BUILTIN_SCATTERALTDIV16SI
:
12340 icode
= CODE_FOR_avx512f_scatterdiv16si
;
12342 case IX86_BUILTIN_SCATTERALTSIV4DF
:
12343 icode
= CODE_FOR_avx512vl_scattersiv4df
;
12345 case IX86_BUILTIN_SCATTERALTDIV8SF
:
12346 icode
= CODE_FOR_avx512vl_scatterdiv8sf
;
12348 case IX86_BUILTIN_SCATTERALTSIV4DI
:
12349 icode
= CODE_FOR_avx512vl_scattersiv4di
;
12351 case IX86_BUILTIN_SCATTERALTDIV8SI
:
12352 icode
= CODE_FOR_avx512vl_scatterdiv8si
;
12354 case IX86_BUILTIN_SCATTERALTSIV2DF
:
12355 icode
= CODE_FOR_avx512vl_scattersiv2df
;
12357 case IX86_BUILTIN_SCATTERALTDIV4SF
:
12358 icode
= CODE_FOR_avx512vl_scatterdiv4sf
;
12360 case IX86_BUILTIN_SCATTERALTSIV2DI
:
12361 icode
= CODE_FOR_avx512vl_scattersiv2di
;
12363 case IX86_BUILTIN_SCATTERALTDIV4SI
:
12364 icode
= CODE_FOR_avx512vl_scatterdiv4si
;
12366 case IX86_BUILTIN_GATHERPFDPS
:
12367 icode
= CODE_FOR_avx512pf_gatherpfv16sisf
;
12368 goto vec_prefetch_gen
;
12369 case IX86_BUILTIN_GATHERPFQPD
:
12370 icode
= CODE_FOR_avx512pf_gatherpfv8didf
;
12371 goto vec_prefetch_gen
;
12372 case IX86_BUILTIN_GATHERPFQPS
:
12373 icode
= CODE_FOR_avx512pf_gatherpfv8disf
;
12374 goto vec_prefetch_gen
;
12375 case IX86_BUILTIN_SCATTERPFDPD
:
12376 icode
= CODE_FOR_avx512pf_scatterpfv8sidf
;
12377 goto vec_prefetch_gen
;
12378 case IX86_BUILTIN_SCATTERPFDPS
:
12379 icode
= CODE_FOR_avx512pf_scatterpfv16sisf
;
12380 goto vec_prefetch_gen
;
12381 case IX86_BUILTIN_SCATTERPFQPD
:
12382 icode
= CODE_FOR_avx512pf_scatterpfv8didf
;
12383 goto vec_prefetch_gen
;
12384 case IX86_BUILTIN_SCATTERPFQPS
:
12385 icode
= CODE_FOR_avx512pf_scatterpfv8disf
;
12386 goto vec_prefetch_gen
;
12390 rtx (*gen
) (rtx
, rtx
);
12392 arg0
= CALL_EXPR_ARG (exp
, 0);
12393 arg1
= CALL_EXPR_ARG (exp
, 1);
12394 arg2
= CALL_EXPR_ARG (exp
, 2);
12395 arg3
= CALL_EXPR_ARG (exp
, 3);
12396 arg4
= CALL_EXPR_ARG (exp
, 4);
12397 op0
= expand_normal (arg0
);
12398 op1
= expand_normal (arg1
);
12399 op2
= expand_normal (arg2
);
12400 op3
= expand_normal (arg3
);
12401 op4
= expand_normal (arg4
);
12402 /* Note the arg order is different from the operand order. */
12403 mode0
= insn_data
[icode
].operand
[1].mode
;
12404 mode2
= insn_data
[icode
].operand
[3].mode
;
12405 mode3
= insn_data
[icode
].operand
[4].mode
;
12406 mode4
= insn_data
[icode
].operand
[5].mode
;
12408 if (target
== NULL_RTX
12409 || GET_MODE (target
) != insn_data
[icode
].operand
[0].mode
12410 || !insn_data
[icode
].operand
[0].predicate (target
,
12411 GET_MODE (target
)))
12412 subtarget
= gen_reg_rtx (insn_data
[icode
].operand
[0].mode
);
12414 subtarget
= target
;
12418 case IX86_BUILTIN_GATHER3ALTSIV8DF
:
12419 case IX86_BUILTIN_GATHER3ALTSIV8DI
:
12420 half
= gen_reg_rtx (V8SImode
);
12421 if (!nonimmediate_operand (op2
, V16SImode
))
12422 op2
= copy_to_mode_reg (V16SImode
, op2
);
12423 emit_insn (gen_vec_extract_lo_v16si (half
, op2
));
12426 case IX86_BUILTIN_GATHER3ALTSIV4DF
:
12427 case IX86_BUILTIN_GATHER3ALTSIV4DI
:
12428 case IX86_BUILTIN_GATHERALTSIV4DF
:
12429 case IX86_BUILTIN_GATHERALTSIV4DI
:
12430 half
= gen_reg_rtx (V4SImode
);
12431 if (!nonimmediate_operand (op2
, V8SImode
))
12432 op2
= copy_to_mode_reg (V8SImode
, op2
);
12433 emit_insn (gen_vec_extract_lo_v8si (half
, op2
));
12436 case IX86_BUILTIN_GATHER3ALTDIV16SF
:
12437 case IX86_BUILTIN_GATHER3ALTDIV16SI
:
12438 half
= gen_reg_rtx (mode0
);
12439 if (mode0
== V8SFmode
)
12440 gen
= gen_vec_extract_lo_v16sf
;
12442 gen
= gen_vec_extract_lo_v16si
;
12443 if (!nonimmediate_operand (op0
, GET_MODE (op0
)))
12444 op0
= copy_to_mode_reg (GET_MODE (op0
), op0
);
12445 emit_insn (gen (half
, op0
));
12447 op3
= lowpart_subreg (QImode
, op3
, HImode
);
12449 case IX86_BUILTIN_GATHER3ALTDIV8SF
:
12450 case IX86_BUILTIN_GATHER3ALTDIV8SI
:
12451 case IX86_BUILTIN_GATHERALTDIV8SF
:
12452 case IX86_BUILTIN_GATHERALTDIV8SI
:
12453 half
= gen_reg_rtx (mode0
);
12454 if (mode0
== V4SFmode
)
12455 gen
= gen_vec_extract_lo_v8sf
;
12457 gen
= gen_vec_extract_lo_v8si
;
12458 if (!nonimmediate_operand (op0
, GET_MODE (op0
)))
12459 op0
= copy_to_mode_reg (GET_MODE (op0
), op0
);
12460 emit_insn (gen (half
, op0
));
12462 if (VECTOR_MODE_P (GET_MODE (op3
)))
12464 half
= gen_reg_rtx (mode0
);
12465 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
12466 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
12467 emit_insn (gen (half
, op3
));
      /* Force memory operand only with base register here.  But we
         don't want to do it on memory operand for other builtin
         functions.  */
      op1 = ix86_zero_extend_to_Pmode (op1);
12480 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
12481 op0
= copy_to_mode_reg (mode0
, op0
);
12482 if (!insn_data
[icode
].operand
[2].predicate (op1
, Pmode
))
12483 op1
= copy_to_mode_reg (Pmode
, op1
);
12484 if (!insn_data
[icode
].operand
[3].predicate (op2
, mode2
))
12485 op2
= copy_to_mode_reg (mode2
, op2
);
12487 op3
= fixup_modeless_constant (op3
, mode3
);
12489 if (GET_MODE (op3
) == mode3
|| GET_MODE (op3
) == VOIDmode
)
12491 if (!insn_data
[icode
].operand
[4].predicate (op3
, mode3
))
12492 op3
= copy_to_mode_reg (mode3
, op3
);
12496 op3
= copy_to_reg (op3
);
12497 op3
= lowpart_subreg (mode3
, op3
, GET_MODE (op3
));
12499 if (!insn_data
[icode
].operand
[5].predicate (op4
, mode4
))
12501 error ("the last argument must be scale 1, 2, 4, 8");
      /* Optimize.  If mask is known to have all high bits set,
         replace op0 with pc_rtx to signal that the instruction
         overwrites the whole destination and doesn't use its
         previous contents.  */
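      /* Worked example (illustrative; whether the mask survives as a
         VECTOR_CST depends on earlier folding): for a masked gather written
         as

           __m256i allon = _mm256_set1_epi32 (-1);
           __m256i r = _mm256_mask_i32gather_epi32 (src, base, idx, allon, 4);

         the all-ones mask constant is detected by the checks below and op0,
         the merge input, is replaced by pc_rtx, telling the gather pattern
         that every destination element will be written.  */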
12511 if (TREE_CODE (arg3
) == INTEGER_CST
)
12513 if (integer_all_onesp (arg3
))
12516 else if (TREE_CODE (arg3
) == VECTOR_CST
)
12518 unsigned int negative
= 0;
12519 for (i
= 0; i
< VECTOR_CST_NELTS (arg3
); ++i
)
12521 tree cst
= VECTOR_CST_ELT (arg3
, i
);
12522 if (TREE_CODE (cst
) == INTEGER_CST
12523 && tree_int_cst_sign_bit (cst
))
12525 else if (TREE_CODE (cst
) == REAL_CST
12526 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst
)))
12529 if (negative
== TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3
)))
12532 else if (TREE_CODE (arg3
) == SSA_NAME
12533 && TREE_CODE (TREE_TYPE (arg3
)) == VECTOR_TYPE
)
              /* Recognize also when mask is like:
                 __v2df src = _mm_setzero_pd ();
                 __v2df mask = _mm_cmpeq_pd (src, src);
                 or
                 __v8sf src = _mm256_setzero_ps ();
                 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
                 as that is a cheaper way to load all ones into
                 a register than having to load a constant from
                 memory.  */
              gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
12545 if (is_gimple_call (def_stmt
))
12547 tree fndecl
= gimple_call_fndecl (def_stmt
);
12549 && fndecl_built_in_p (fndecl
, BUILT_IN_MD
))
12550 switch (DECL_MD_FUNCTION_CODE (fndecl
))
12552 case IX86_BUILTIN_CMPPD
:
12553 case IX86_BUILTIN_CMPPS
:
12554 case IX86_BUILTIN_CMPPD256
:
12555 case IX86_BUILTIN_CMPPS256
:
12556 if (!integer_zerop (gimple_call_arg (def_stmt
, 2)))
12559 case IX86_BUILTIN_CMPEQPD
:
12560 case IX86_BUILTIN_CMPEQPS
:
12561 if (initializer_zerop (gimple_call_arg (def_stmt
, 0))
12562 && initializer_zerop (gimple_call_arg (def_stmt
,
12573 pat
= GEN_FCN (icode
) (subtarget
, op0
, op1
, op2
, op3
, op4
);
12580 case IX86_BUILTIN_GATHER3DIV16SF
:
12581 if (target
== NULL_RTX
)
12582 target
= gen_reg_rtx (V8SFmode
);
12583 emit_insn (gen_vec_extract_lo_v16sf (target
, subtarget
));
12585 case IX86_BUILTIN_GATHER3DIV16SI
:
12586 if (target
== NULL_RTX
)
12587 target
= gen_reg_rtx (V8SImode
);
12588 emit_insn (gen_vec_extract_lo_v16si (target
, subtarget
));
12590 case IX86_BUILTIN_GATHER3DIV8SF
:
12591 case IX86_BUILTIN_GATHERDIV8SF
:
12592 if (target
== NULL_RTX
)
12593 target
= gen_reg_rtx (V4SFmode
);
12594 emit_insn (gen_vec_extract_lo_v8sf (target
, subtarget
));
12596 case IX86_BUILTIN_GATHER3DIV8SI
:
12597 case IX86_BUILTIN_GATHERDIV8SI
:
12598 if (target
== NULL_RTX
)
12599 target
= gen_reg_rtx (V4SImode
);
12600 emit_insn (gen_vec_extract_lo_v8si (target
, subtarget
));
12603 target
= subtarget
;
12609 arg0
= CALL_EXPR_ARG (exp
, 0);
12610 arg1
= CALL_EXPR_ARG (exp
, 1);
12611 arg2
= CALL_EXPR_ARG (exp
, 2);
12612 arg3
= CALL_EXPR_ARG (exp
, 3);
12613 arg4
= CALL_EXPR_ARG (exp
, 4);
12614 op0
= expand_normal (arg0
);
12615 op1
= expand_normal (arg1
);
12616 op2
= expand_normal (arg2
);
12617 op3
= expand_normal (arg3
);
12618 op4
= expand_normal (arg4
);
12619 mode1
= insn_data
[icode
].operand
[1].mode
;
12620 mode2
= insn_data
[icode
].operand
[2].mode
;
12621 mode3
= insn_data
[icode
].operand
[3].mode
;
12622 mode4
= insn_data
[icode
].operand
[4].mode
;
      /* Scatter instruction stores operand op3 to memory with
         indices from op2 and scale from op4 under writemask op1.
         If index operand op2 has more elements than source operand
         op3, one needs to use only its low half.  And vice versa.  */
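      /* Concrete example (for illustration only): for the internal
         IX86_BUILTIN_SCATTERALTSIV8DF variant the source holds eight DFmode
         elements while the index operand arrives as a full V16SI vector, so
         the case below extracts the low V8SI half of op2 before emitting the
         scatter pattern.  */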
12630 case IX86_BUILTIN_SCATTERALTSIV8DF
:
12631 case IX86_BUILTIN_SCATTERALTSIV8DI
:
12632 half
= gen_reg_rtx (V8SImode
);
12633 if (!nonimmediate_operand (op2
, V16SImode
))
12634 op2
= copy_to_mode_reg (V16SImode
, op2
);
12635 emit_insn (gen_vec_extract_lo_v16si (half
, op2
));
12638 case IX86_BUILTIN_SCATTERALTDIV16SF
:
12639 case IX86_BUILTIN_SCATTERALTDIV16SI
:
12640 half
= gen_reg_rtx (mode3
);
12641 if (mode3
== V8SFmode
)
12642 gen
= gen_vec_extract_lo_v16sf
;
12644 gen
= gen_vec_extract_lo_v16si
;
12645 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
12646 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
12647 emit_insn (gen (half
, op3
));
12650 case IX86_BUILTIN_SCATTERALTSIV4DF
:
12651 case IX86_BUILTIN_SCATTERALTSIV4DI
:
12652 half
= gen_reg_rtx (V4SImode
);
12653 if (!nonimmediate_operand (op2
, V8SImode
))
12654 op2
= copy_to_mode_reg (V8SImode
, op2
);
12655 emit_insn (gen_vec_extract_lo_v8si (half
, op2
));
12658 case IX86_BUILTIN_SCATTERALTDIV8SF
:
12659 case IX86_BUILTIN_SCATTERALTDIV8SI
:
12660 half
= gen_reg_rtx (mode3
);
12661 if (mode3
== V4SFmode
)
12662 gen
= gen_vec_extract_lo_v8sf
;
12664 gen
= gen_vec_extract_lo_v8si
;
12665 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
12666 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
12667 emit_insn (gen (half
, op3
));
12670 case IX86_BUILTIN_SCATTERALTSIV2DF
:
12671 case IX86_BUILTIN_SCATTERALTSIV2DI
:
12672 if (!nonimmediate_operand (op2
, V4SImode
))
12673 op2
= copy_to_mode_reg (V4SImode
, op2
);
12675 case IX86_BUILTIN_SCATTERALTDIV4SF
:
12676 case IX86_BUILTIN_SCATTERALTDIV4SI
:
12677 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
12678 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
      /* Force memory operand only with base register here.  But we
         don't want to do it on memory operand for other builtin
         functions.  */
      op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
12689 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
12690 op0
= copy_to_mode_reg (Pmode
, op0
);
12692 op1
= fixup_modeless_constant (op1
, mode1
);
12694 if (GET_MODE (op1
) == mode1
|| GET_MODE (op1
) == VOIDmode
)
12696 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
12697 op1
= copy_to_mode_reg (mode1
, op1
);
12701 op1
= copy_to_reg (op1
);
12702 op1
= lowpart_subreg (mode1
, op1
, GET_MODE (op1
));
12705 if (!insn_data
[icode
].operand
[2].predicate (op2
, mode2
))
12706 op2
= copy_to_mode_reg (mode2
, op2
);
12708 if (!insn_data
[icode
].operand
[3].predicate (op3
, mode3
))
12709 op3
= copy_to_mode_reg (mode3
, op3
);
12711 if (!insn_data
[icode
].operand
[4].predicate (op4
, mode4
))
12713 error ("the last argument must be scale 1, 2, 4, 8");
12717 pat
= GEN_FCN (icode
) (op0
, op1
, op2
, op3
, op4
);
12725 arg0
= CALL_EXPR_ARG (exp
, 0);
12726 arg1
= CALL_EXPR_ARG (exp
, 1);
12727 arg2
= CALL_EXPR_ARG (exp
, 2);
12728 arg3
= CALL_EXPR_ARG (exp
, 3);
12729 arg4
= CALL_EXPR_ARG (exp
, 4);
12730 op0
= expand_normal (arg0
);
12731 op1
= expand_normal (arg1
);
12732 op2
= expand_normal (arg2
);
12733 op3
= expand_normal (arg3
);
12734 op4
= expand_normal (arg4
);
12735 mode0
= insn_data
[icode
].operand
[0].mode
;
12736 mode1
= insn_data
[icode
].operand
[1].mode
;
12737 mode3
= insn_data
[icode
].operand
[3].mode
;
12738 mode4
= insn_data
[icode
].operand
[4].mode
;
12740 op0
= fixup_modeless_constant (op0
, mode0
);
12742 if (GET_MODE (op0
) == mode0
|| GET_MODE (op0
) == VOIDmode
)
12744 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
12745 op0
= copy_to_mode_reg (mode0
, op0
);
12749 op0
= copy_to_reg (op0
);
12750 op0
= lowpart_subreg (mode0
, op0
, GET_MODE (op0
));
12753 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
12754 op1
= copy_to_mode_reg (mode1
, op1
);
      /* Force memory operand only with base register here.  But we
         don't want to do it on memory operand for other builtin
         functions.  */
      op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
12761 if (!insn_data
[icode
].operand
[2].predicate (op2
, Pmode
))
12762 op2
= copy_to_mode_reg (Pmode
, op2
);
12764 if (!insn_data
[icode
].operand
[3].predicate (op3
, mode3
))
          error ("the fourth argument must be scale 1, 2, 4, 8");
12770 if (!insn_data
[icode
].operand
[4].predicate (op4
, mode4
))
12772 error ("incorrect hint operand");
12776 pat
= GEN_FCN (icode
) (op0
, op1
, op2
, op3
, op4
);
12784 case IX86_BUILTIN_XABORT
:
12785 icode
= CODE_FOR_xabort
;
12786 arg0
= CALL_EXPR_ARG (exp
, 0);
12787 op0
= expand_normal (arg0
);
12788 mode0
= insn_data
[icode
].operand
[0].mode
;
12789 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
12791 error ("the argument to %<xabort%> intrinsic must "
12792 "be an 8-bit immediate");
12795 emit_insn (gen_xabort (op0
));
12798 case IX86_BUILTIN_RSTORSSP
:
12799 case IX86_BUILTIN_CLRSSBSY
:
12800 arg0
= CALL_EXPR_ARG (exp
, 0);
12801 op0
= expand_normal (arg0
);
12802 icode
= (fcode
== IX86_BUILTIN_RSTORSSP
12803 ? CODE_FOR_rstorssp
12804 : CODE_FOR_clrssbsy
);
12805 if (!address_operand (op0
, VOIDmode
))
12807 op1
= convert_memory_address (Pmode
, op0
);
12808 op0
= copy_addr_to_reg (op1
);
12810 emit_insn (GEN_FCN (icode
) (gen_rtx_MEM (Pmode
, op0
)));
12813 case IX86_BUILTIN_WRSSD
:
12814 case IX86_BUILTIN_WRSSQ
:
12815 case IX86_BUILTIN_WRUSSD
:
12816 case IX86_BUILTIN_WRUSSQ
:
12817 arg0
= CALL_EXPR_ARG (exp
, 0);
12818 op0
= expand_normal (arg0
);
12819 arg1
= CALL_EXPR_ARG (exp
, 1);
12820 op1
= expand_normal (arg1
);
12823 case IX86_BUILTIN_WRSSD
:
12824 icode
= CODE_FOR_wrsssi
;
12827 case IX86_BUILTIN_WRSSQ
:
12828 icode
= CODE_FOR_wrssdi
;
12831 case IX86_BUILTIN_WRUSSD
:
12832 icode
= CODE_FOR_wrusssi
;
12835 case IX86_BUILTIN_WRUSSQ
:
12836 icode
= CODE_FOR_wrussdi
;
12840 op0
= force_reg (mode
, op0
);
12841 if (!address_operand (op1
, VOIDmode
))
12843 op2
= convert_memory_address (Pmode
, op1
);
12844 op1
= copy_addr_to_reg (op2
);
12846 emit_insn (GEN_FCN (icode
) (op0
, gen_rtx_MEM (mode
, op1
)));
12853 if (fcode
>= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
12854 && fcode
<= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST
)
12856 i
= fcode
- IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
;
12857 return ix86_expand_special_args_builtin (bdesc_special_args
+ i
, exp
,
12861 if (fcode
>= IX86_BUILTIN__BDESC_ARGS_FIRST
12862 && fcode
<= IX86_BUILTIN__BDESC_ARGS_LAST
)
12864 i
= fcode
- IX86_BUILTIN__BDESC_ARGS_FIRST
;
12865 rtx (*fcn
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
12866 rtx (*fcn_mask
) (rtx
, rtx
, rtx
, rtx
, rtx
);
12867 rtx (*fcn_maskz
) (rtx
, rtx
, rtx
, rtx
, rtx
, rtx
);
12869 machine_mode mode
, wide_mode
, nar_mode
;
12871 nar_mode
= V4SFmode
;
12873 wide_mode
= V64SFmode
;
12874 fcn_mask
= gen_avx5124fmaddps_4fmaddps_mask
;
12875 fcn_maskz
= gen_avx5124fmaddps_4fmaddps_maskz
;
12879 case IX86_BUILTIN_4FMAPS
:
12880 fcn
= gen_avx5124fmaddps_4fmaddps
;
12884 case IX86_BUILTIN_4DPWSSD
:
12885 nar_mode
= V4SImode
;
12887 wide_mode
= V64SImode
;
12888 fcn
= gen_avx5124vnniw_vp4dpwssd
;
12892 case IX86_BUILTIN_4DPWSSDS
:
12893 nar_mode
= V4SImode
;
12895 wide_mode
= V64SImode
;
12896 fcn
= gen_avx5124vnniw_vp4dpwssds
;
12900 case IX86_BUILTIN_4FNMAPS
:
12901 fcn
= gen_avx5124fmaddps_4fnmaddps
;
12905 case IX86_BUILTIN_4FNMAPS_MASK
:
12906 fcn_mask
= gen_avx5124fmaddps_4fnmaddps_mask
;
12907 fcn_maskz
= gen_avx5124fmaddps_4fnmaddps_maskz
;
12910 case IX86_BUILTIN_4DPWSSD_MASK
:
12911 nar_mode
= V4SImode
;
12913 wide_mode
= V64SImode
;
12914 fcn_mask
= gen_avx5124vnniw_vp4dpwssd_mask
;
12915 fcn_maskz
= gen_avx5124vnniw_vp4dpwssd_maskz
;
12918 case IX86_BUILTIN_4DPWSSDS_MASK
:
12919 nar_mode
= V4SImode
;
12921 wide_mode
= V64SImode
;
12922 fcn_mask
= gen_avx5124vnniw_vp4dpwssds_mask
;
12923 fcn_maskz
= gen_avx5124vnniw_vp4dpwssds_maskz
;
12926 case IX86_BUILTIN_4FMAPS_MASK
:
12936 wide_reg
= gen_reg_rtx (wide_mode
);
12937 for (i
= 0; i
< 4; i
++)
12939 args
[i
] = CALL_EXPR_ARG (exp
, i
);
12940 ops
[i
] = expand_normal (args
[i
]);
12942 emit_move_insn (gen_rtx_SUBREG (mode
, wide_reg
, i
* 64),
12946 accum
= expand_normal (CALL_EXPR_ARG (exp
, 4));
12947 accum
= force_reg (mode
, accum
);
12949 addr
= expand_normal (CALL_EXPR_ARG (exp
, 5));
12950 addr
= force_reg (Pmode
, addr
);
12952 mem
= gen_rtx_MEM (nar_mode
, addr
);
12954 target
= gen_reg_rtx (mode
);
12956 emit_move_insn (target
, accum
);
12959 emit_insn (fcn (target
, accum
, wide_reg
, mem
));
12963 merge
= expand_normal (CALL_EXPR_ARG (exp
, 6));
12965 mask
= expand_normal (CALL_EXPR_ARG (exp
, 7));
12967 if (CONST_INT_P (mask
))
12968 mask
= fixup_modeless_constant (mask
, HImode
);
12970 mask
= force_reg (HImode
, mask
);
12972 if (GET_MODE (mask
) != HImode
)
12973 mask
= gen_rtx_SUBREG (HImode
, mask
, 0);
12975 /* If merge is 0 then we're about to emit z-masked variant. */
12976 if (const0_operand (merge
, mode
))
12977 emit_insn (fcn_maskz (target
, accum
, wide_reg
, mem
, merge
, mask
));
12978 /* If merge is the same as accum then emit merge-masked variant. */
12979 else if (CALL_EXPR_ARG (exp
, 6) == CALL_EXPR_ARG (exp
, 4))
12981 merge
= force_reg (mode
, merge
);
12982 emit_insn (fcn_mask (target
, wide_reg
, mem
, merge
, mask
));
12984 /* Merge with something unknown might happen if we z-mask w/ -O0. */
12987 target
= gen_reg_rtx (mode
);
12988 emit_move_insn (target
, merge
);
12989 emit_insn (fcn_mask (target
, wide_reg
, mem
, target
, mask
));
12995 case IX86_BUILTIN_4FNMASS
:
12996 fcn
= gen_avx5124fmaddps_4fnmaddss
;
13000 case IX86_BUILTIN_4FMASS
:
13001 fcn
= gen_avx5124fmaddps_4fmaddss
;
13005 case IX86_BUILTIN_4FNMASS_MASK
:
13006 fcn_mask
= gen_avx5124fmaddps_4fnmaddss_mask
;
13007 fcn_maskz
= gen_avx5124fmaddps_4fnmaddss_maskz
;
13010 case IX86_BUILTIN_4FMASS_MASK
:
13019 fcn_mask
= gen_avx5124fmaddps_4fmaddss_mask
;
13020 fcn_maskz
= gen_avx5124fmaddps_4fmaddss_maskz
;
13024 wide_reg
= gen_reg_rtx (V64SFmode
);
13025 for (i
= 0; i
< 4; i
++)
13028 args
[i
] = CALL_EXPR_ARG (exp
, i
);
13029 ops
[i
] = expand_normal (args
[i
]);
13031 tmp
= gen_reg_rtx (SFmode
);
13032 emit_move_insn (tmp
, gen_rtx_SUBREG (SFmode
, ops
[i
], 0));
13034 emit_move_insn (gen_rtx_SUBREG (V16SFmode
, wide_reg
, i
* 64),
13035 gen_rtx_SUBREG (V16SFmode
, tmp
, 0));
13038 accum
= expand_normal (CALL_EXPR_ARG (exp
, 4));
13039 accum
= force_reg (V4SFmode
, accum
);
13041 addr
= expand_normal (CALL_EXPR_ARG (exp
, 5));
13042 addr
= force_reg (Pmode
, addr
);
13044 mem
= gen_rtx_MEM (V4SFmode
, addr
);
13046 target
= gen_reg_rtx (V4SFmode
);
13048 emit_move_insn (target
, accum
);
13051 emit_insn (fcn (target
, accum
, wide_reg
, mem
));
13055 merge
= expand_normal (CALL_EXPR_ARG (exp
, 6));
13057 mask
= expand_normal (CALL_EXPR_ARG (exp
, 7));
13059 if (CONST_INT_P (mask
))
13060 mask
= fixup_modeless_constant (mask
, QImode
);
13062 mask
= force_reg (QImode
, mask
);
13064 if (GET_MODE (mask
) != QImode
)
13065 mask
= gen_rtx_SUBREG (QImode
, mask
, 0);
13067 /* If merge is 0 then we're about to emit z-masked variant. */
13068 if (const0_operand (merge
, mode
))
13069 emit_insn (fcn_maskz (target
, accum
, wide_reg
, mem
, merge
, mask
));
13070 /* If merge is the same as accum then emit merge-masked
13072 else if (CALL_EXPR_ARG (exp
, 6) == CALL_EXPR_ARG (exp
, 4))
13074 merge
= force_reg (mode
, merge
);
13075 emit_insn (fcn_mask (target
, wide_reg
, mem
, merge
, mask
));
13077 /* Merge with something unknown might happen if we z-mask
13081 target
= gen_reg_rtx (mode
);
13082 emit_move_insn (target
, merge
);
13083 emit_insn (fcn_mask (target
, wide_reg
, mem
, target
, mask
));
13088 case IX86_BUILTIN_RDPID
:
13089 return ix86_expand_special_args_builtin (bdesc_args
+ i
, exp
,
13091 case IX86_BUILTIN_FABSQ
:
13092 case IX86_BUILTIN_COPYSIGNQ
:
13094 /* Emit a normal call if SSE isn't available. */
13095 return expand_call (exp
, target
, ignore
);
13098 return ix86_expand_args_builtin (bdesc_args
+ i
, exp
, target
);
13102 if (fcode
>= IX86_BUILTIN__BDESC_COMI_FIRST
13103 && fcode
<= IX86_BUILTIN__BDESC_COMI_LAST
)
13105 i
= fcode
- IX86_BUILTIN__BDESC_COMI_FIRST
;
13106 return ix86_expand_sse_comi (bdesc_comi
+ i
, exp
, target
);
13109 if (fcode
>= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
13110 && fcode
<= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST
)
13112 i
= fcode
- IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
;
13113 return ix86_expand_round_builtin (bdesc_round_args
+ i
, exp
, target
);
13116 if (fcode
>= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
13117 && fcode
<= IX86_BUILTIN__BDESC_PCMPESTR_LAST
)
13119 i
= fcode
- IX86_BUILTIN__BDESC_PCMPESTR_FIRST
;
13120 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr
+ i
, exp
, target
);
13123 if (fcode
>= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
13124 && fcode
<= IX86_BUILTIN__BDESC_PCMPISTR_LAST
)
13126 i
= fcode
- IX86_BUILTIN__BDESC_PCMPISTR_FIRST
;
13127 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr
+ i
, exp
, target
);
13130 if (fcode
>= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
13131 && fcode
<= IX86_BUILTIN__BDESC_MULTI_ARG_LAST
)
13133 i
= fcode
- IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
;
13134 const struct builtin_description
*d
= bdesc_multi_arg
+ i
;
13135 return ix86_expand_multi_arg_builtin (d
->icode
, exp
, target
,
13136 (enum ix86_builtin_func_type
)
13137 d
->flag
, d
->comparison
);
13140 if (fcode
>= IX86_BUILTIN__BDESC_CET_FIRST
13141 && fcode
<= IX86_BUILTIN__BDESC_CET_LAST
)
13143 i
= fcode
- IX86_BUILTIN__BDESC_CET_FIRST
;
13144 return ix86_expand_special_args_builtin (bdesc_cet
+ i
, exp
,
13148 if (fcode
>= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
13149 && fcode
<= IX86_BUILTIN__BDESC_CET_NORMAL_LAST
)
13151 i
= fcode
- IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
;
13152 return ix86_expand_special_args_builtin (bdesc_cet_rdssp
+ i
, exp
,
13156 gcc_unreachable ();
/* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
   fill target with val via vec_duplicate.  */

static bool
ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
{
  bool ok;
  rtx_insn *insn;
  rtx dup;

  /* First attempt to recognize VAL as-is.  */
  dup = gen_vec_duplicate (mode, val);
  insn = emit_insn (gen_rtx_SET (target, dup));
  if (recog_memoized (insn) < 0)
    {
      rtx_insn *seq;
      machine_mode innermode = GET_MODE_INNER (mode);
      rtx reg;

      /* If that fails, force VAL into a register.  */
      start_sequence ();
      reg = force_reg (innermode, val);
      if (GET_MODE (reg) != innermode)
        reg = gen_lowpart (innermode, reg);
      SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
      seq = get_insns ();
      end_sequence ();
      if (seq)
        emit_insn_before (seq, insn);

      ok = recog_memoized (insn) >= 0;
      gcc_assert (ok);
    }
  return true;
}

/* Get a vector mode of the same size as the original but with elements
   twice as wide.  This is only guaranteed to apply to integral vectors.  */

static machine_mode
get_mode_wider_vector (machine_mode o)
{
  /* ??? Rely on the ordering that genmodes.c gives to vectors.  */
  machine_mode n = GET_MODE_WIDER_MODE (o).require ();
  gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
  gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
  return n;
}
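/* For instance (illustrative): get_mode_wider_vector (V16QImode) yields
   V8HImode and get_mode_wider_vector (V8HImode) yields V4SImode; each result
   keeps the overall 128-bit size while halving the element count, which is
   what the byte/word broadcast paths below rely on.  */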
static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);

/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   with all elements equal to VAR.  Return true if successful.  */

static bool
ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
                                   rtx target, rtx val)
{
13241 return ix86_vector_duplicate_value (mode
, target
, val
);
13246 if (TARGET_SSE
|| TARGET_3DNOW_A
)
13250 val
= gen_lowpart (SImode
, val
);
13251 x
= gen_rtx_TRUNCATE (HImode
, val
);
13252 x
= gen_rtx_VEC_DUPLICATE (mode
, x
);
13253 emit_insn (gen_rtx_SET (target
, x
));
13265 return ix86_vector_duplicate_value (mode
, target
, val
);
13269 struct expand_vec_perm_d dperm
;
13273 memset (&dperm
, 0, sizeof (dperm
));
13274 dperm
.target
= target
;
13275 dperm
.vmode
= mode
;
13276 dperm
.nelt
= GET_MODE_NUNITS (mode
);
13277 dperm
.op0
= dperm
.op1
= gen_reg_rtx (mode
);
13278 dperm
.one_operand_p
= true;
13280 /* Extend to SImode using a paradoxical SUBREG. */
13281 tmp1
= gen_reg_rtx (SImode
);
13282 emit_move_insn (tmp1
, gen_lowpart (SImode
, val
));
13284 /* Insert the SImode value as low element of a V4SImode vector. */
13285 tmp2
= gen_reg_rtx (V4SImode
);
13286 emit_insn (gen_vec_setv4si_0 (tmp2
, CONST0_RTX (V4SImode
), tmp1
));
13287 emit_move_insn (dperm
.op0
, gen_lowpart (mode
, tmp2
));
13289 ok
= (expand_vec_perm_1 (&dperm
)
13290 || expand_vec_perm_broadcast_1 (&dperm
));
13298 return ix86_vector_duplicate_value (mode
, target
, val
);
13305 /* Replicate the value once into the next wider mode and recurse. */
13307 machine_mode smode
, wsmode
, wvmode
;
13310 smode
= GET_MODE_INNER (mode
);
13311 wvmode
= get_mode_wider_vector (mode
);
13312 wsmode
= GET_MODE_INNER (wvmode
);
13314 val
= convert_modes (wsmode
, smode
, val
, true);
13315 x
= expand_simple_binop (wsmode
, ASHIFT
, val
,
13316 GEN_INT (GET_MODE_BITSIZE (smode
)),
13317 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
13318 val
= expand_simple_binop (wsmode
, IOR
, val
, x
, x
, 1, OPTAB_LIB_WIDEN
);
13320 x
= gen_reg_rtx (wvmode
);
13321 ok
= ix86_expand_vector_init_duplicate (mmx_ok
, wvmode
, x
, val
);
13323 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), x
));
13330 return ix86_vector_duplicate_value (mode
, target
, val
);
13333 machine_mode hvmode
= (mode
== V16HImode
? V8HImode
: V16QImode
);
13334 rtx x
= gen_reg_rtx (hvmode
);
13336 ok
= ix86_expand_vector_init_duplicate (false, hvmode
, x
, val
);
13339 x
= gen_rtx_VEC_CONCAT (mode
, x
, x
);
13340 emit_insn (gen_rtx_SET (target
, x
));
13346 if (TARGET_AVX512BW
)
13347 return ix86_vector_duplicate_value (mode
, target
, val
);
13350 machine_mode hvmode
= (mode
== V32HImode
? V16HImode
: V32QImode
);
13351 rtx x
= gen_reg_rtx (hvmode
);
13353 ok
= ix86_expand_vector_init_duplicate (false, hvmode
, x
, val
);
13356 x
= gen_rtx_VEC_CONCAT (mode
, x
, x
);
13357 emit_insn (gen_rtx_SET (target
, x
));
13366 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13367 whose ONE_VAR element is VAR, and other elements are zero. Return true
13371 ix86_expand_vector_init_one_nonzero (bool mmx_ok
, machine_mode mode
,
13372 rtx target
, rtx var
, int one_var
)
13374 machine_mode vsimode
;
13377 bool use_vector_set
= false;
13378 rtx (*gen_vec_set_0
) (rtx
, rtx
, rtx
) = NULL
;
13383 /* For SSE4.1, we normally use vector set. But if the second
13384 element is zero and inter-unit moves are OK, we use movq
13386 use_vector_set
= (TARGET_64BIT
&& TARGET_SSE4_1
13387 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
13393 use_vector_set
= TARGET_SSE4_1
;
13396 use_vector_set
= TARGET_SSE2
;
13399 use_vector_set
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
13402 use_vector_set
= TARGET_SSE
|| TARGET_3DNOW_A
;
13406 use_vector_set
= TARGET_AVX
;
13409 use_vector_set
= TARGET_AVX
;
13410 gen_vec_set_0
= gen_vec_setv8si_0
;
13413 use_vector_set
= TARGET_AVX
;
13414 gen_vec_set_0
= gen_vec_setv8sf_0
;
13417 use_vector_set
= TARGET_AVX
;
13418 gen_vec_set_0
= gen_vec_setv4df_0
;
13421 /* Use ix86_expand_vector_set in 64bit mode only. */
13422 use_vector_set
= TARGET_AVX
&& TARGET_64BIT
;
13423 gen_vec_set_0
= gen_vec_setv4di_0
;
13426 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
13427 gen_vec_set_0
= gen_vec_setv16si_0
;
13430 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
13431 gen_vec_set_0
= gen_vec_setv16sf_0
;
13434 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
13435 gen_vec_set_0
= gen_vec_setv8df_0
;
13438 /* Use ix86_expand_vector_set in 64bit mode only. */
13439 use_vector_set
= TARGET_AVX512F
&& TARGET_64BIT
&& one_var
== 0;
13440 gen_vec_set_0
= gen_vec_setv8di_0
;
13446 if (use_vector_set
)
13448 if (gen_vec_set_0
&& one_var
== 0)
13450 var
= force_reg (GET_MODE_INNER (mode
), var
);
13451 emit_insn (gen_vec_set_0 (target
, CONST0_RTX (mode
), var
));
13454 emit_insn (gen_rtx_SET (target
, CONST0_RTX (mode
)));
13455 var
= force_reg (GET_MODE_INNER (mode
), var
);
13456 ix86_expand_vector_set (mmx_ok
, target
, var
, one_var
);
13472 var
= force_reg (GET_MODE_INNER (mode
), var
);
13473 x
= gen_rtx_VEC_CONCAT (mode
, var
, CONST0_RTX (GET_MODE_INNER (mode
)));
13474 emit_insn (gen_rtx_SET (target
, x
));
13479 if (!REG_P (target
) || REGNO (target
) < FIRST_PSEUDO_REGISTER
)
13480 new_target
= gen_reg_rtx (mode
);
13482 new_target
= target
;
13483 var
= force_reg (GET_MODE_INNER (mode
), var
);
13484 x
= gen_rtx_VEC_DUPLICATE (mode
, var
);
13485 x
= gen_rtx_VEC_MERGE (mode
, x
, CONST0_RTX (mode
), const1_rtx
);
13486 emit_insn (gen_rtx_SET (new_target
, x
));
13489 /* We need to shuffle the value to the correct position, so
13490 create a new pseudo to store the intermediate result. */
13492 /* With SSE2, we can use the integer shuffle insns. */
13493 if (mode
!= V4SFmode
&& TARGET_SSE2
)
13495 emit_insn (gen_sse2_pshufd_1 (new_target
, new_target
,
13497 GEN_INT (one_var
== 1 ? 0 : 1),
13498 GEN_INT (one_var
== 2 ? 0 : 1),
13499 GEN_INT (one_var
== 3 ? 0 : 1)));
13500 if (target
!= new_target
)
13501 emit_move_insn (target
, new_target
);
13505 /* Otherwise convert the intermediate result to V4SFmode and
13506 use the SSE1 shuffle instructions. */
13507 if (mode
!= V4SFmode
)
13509 tmp
= gen_reg_rtx (V4SFmode
);
13510 emit_move_insn (tmp
, gen_lowpart (V4SFmode
, new_target
));
13515 emit_insn (gen_sse_shufps_v4sf (tmp
, tmp
, tmp
,
13517 GEN_INT (one_var
== 1 ? 0 : 1),
13518 GEN_INT (one_var
== 2 ? 0+4 : 1+4),
13519 GEN_INT (one_var
== 3 ? 0+4 : 1+4)));
13521 if (mode
!= V4SFmode
)
13522 emit_move_insn (target
, gen_lowpart (V4SImode
, tmp
));
13523 else if (tmp
!= target
)
13524 emit_move_insn (target
, tmp
);
13526 else if (target
!= new_target
)
13527 emit_move_insn (target
, new_target
);
13532 vsimode
= V4SImode
;
13538 vsimode
= V2SImode
;
13544 /* Zero extend the variable element to SImode and recurse. */
13545 var
= convert_modes (SImode
, GET_MODE_INNER (mode
), var
, true);
13547 x
= gen_reg_rtx (vsimode
);
13548 if (!ix86_expand_vector_init_one_nonzero (mmx_ok
, vsimode
, x
,
13550 gcc_unreachable ();
13552 emit_move_insn (target
, gen_lowpart (mode
, x
));
13560 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13561 consisting of the values in VALS. It is known that all elements
13562 except ONE_VAR are constants. Return true if successful. */
13565 ix86_expand_vector_init_one_var (bool mmx_ok
, machine_mode mode
,
13566 rtx target
, rtx vals
, int one_var
)
13568 rtx var
= XVECEXP (vals
, 0, one_var
);
13569 machine_mode wmode
;
13572 const_vec
= copy_rtx (vals
);
13573 XVECEXP (const_vec
, 0, one_var
) = CONST0_RTX (GET_MODE_INNER (mode
));
13574 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (const_vec
, 0));
13582 /* For the two element vectors, it's just as easy to use
13583 the general case. */
13587 /* Use ix86_expand_vector_set in 64bit mode only. */
13608 if (TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
)
13613 /* There's no way to set one QImode entry easily. Combine
13614 the variable value with its adjacent constant value, and
13615 promote to an HImode set. */
13616 x
= XVECEXP (vals
, 0, one_var
^ 1);
13619 var
= convert_modes (HImode
, QImode
, var
, true);
13620 var
= expand_simple_binop (HImode
, ASHIFT
, var
, GEN_INT (8),
13621 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
13622 x
= GEN_INT (INTVAL (x
) & 0xff);
13626 var
= convert_modes (HImode
, QImode
, var
, true);
13627 x
= gen_int_mode (UINTVAL (x
) << 8, HImode
);
13629 if (x
!= const0_rtx
)
13630 var
= expand_simple_binop (HImode
, IOR
, var
, x
, var
,
13631 1, OPTAB_LIB_WIDEN
);
13633 x
= gen_reg_rtx (wmode
);
13634 emit_move_insn (x
, gen_lowpart (wmode
, const_vec
));
13635 ix86_expand_vector_set (mmx_ok
, x
, var
, one_var
>> 1);
13637 emit_move_insn (target
, gen_lowpart (mode
, x
));
13644 emit_move_insn (target
, const_vec
);
13645 ix86_expand_vector_set (mmx_ok
, target
, var
, one_var
);
13649 /* A subroutine of ix86_expand_vector_init_general. Use vector
13650 concatenate to handle the most general case: all values variable,
13651 and none identical. */
13654 ix86_expand_vector_init_concat (machine_mode mode
,
13655 rtx target
, rtx
*ops
, int n
)
13657 machine_mode cmode
, hmode
= VOIDmode
, gmode
= VOIDmode
;
13658 rtx first
[16], second
[8], third
[4];
13710 gcc_unreachable ();
13713 if (!register_operand (ops
[1], cmode
))
13714 ops
[1] = force_reg (cmode
, ops
[1]);
13715 if (!register_operand (ops
[0], cmode
))
13716 ops
[0] = force_reg (cmode
, ops
[0]);
13717 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, ops
[0],
13737 gcc_unreachable ();
13761 gcc_unreachable ();
13779 gcc_unreachable ();
13784 /* FIXME: We process inputs backward to help RA. PR 36222. */
13787 for (; i
> 0; i
-= 2, j
--)
13789 first
[j
] = gen_reg_rtx (cmode
);
13790 v
= gen_rtvec (2, ops
[i
- 1], ops
[i
]);
13791 ix86_expand_vector_init (false, first
[j
],
13792 gen_rtx_PARALLEL (cmode
, v
));
13798 gcc_assert (hmode
!= VOIDmode
);
13799 gcc_assert (gmode
!= VOIDmode
);
13800 for (i
= j
= 0; i
< n
; i
+= 2, j
++)
13802 second
[j
] = gen_reg_rtx (hmode
);
13803 ix86_expand_vector_init_concat (hmode
, second
[j
],
13807 for (i
= j
= 0; i
< n
; i
+= 2, j
++)
13809 third
[j
] = gen_reg_rtx (gmode
);
13810 ix86_expand_vector_init_concat (gmode
, third
[j
],
13814 ix86_expand_vector_init_concat (mode
, target
, third
, n
);
13818 gcc_assert (hmode
!= VOIDmode
);
13819 for (i
= j
= 0; i
< n
; i
+= 2, j
++)
13821 second
[j
] = gen_reg_rtx (hmode
);
13822 ix86_expand_vector_init_concat (hmode
, second
[j
],
13826 ix86_expand_vector_init_concat (mode
, target
, second
, n
);
13829 ix86_expand_vector_init_concat (mode
, target
, first
, n
);
13833 gcc_unreachable ();
13837 /* A subroutine of ix86_expand_vector_init_general. Use vector
13838 interleave to handle the most general case: all values variable,
13839 and none identical. */
13842 ix86_expand_vector_init_interleave (machine_mode mode
,
13843 rtx target
, rtx
*ops
, int n
)
13845 machine_mode first_imode
, second_imode
, third_imode
, inner_mode
;
13848 rtx (*gen_load_even
) (rtx
, rtx
, rtx
);
13849 rtx (*gen_interleave_first_low
) (rtx
, rtx
, rtx
);
13850 rtx (*gen_interleave_second_low
) (rtx
, rtx
, rtx
);
13855 gen_load_even
= gen_vec_setv8hi
;
13856 gen_interleave_first_low
= gen_vec_interleave_lowv4si
;
13857 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
13858 inner_mode
= HImode
;
13859 first_imode
= V4SImode
;
13860 second_imode
= V2DImode
;
13861 third_imode
= VOIDmode
;
13864 gen_load_even
= gen_vec_setv16qi
;
13865 gen_interleave_first_low
= gen_vec_interleave_lowv8hi
;
13866 gen_interleave_second_low
= gen_vec_interleave_lowv4si
;
13867 inner_mode
= QImode
;
13868 first_imode
= V8HImode
;
13869 second_imode
= V4SImode
;
13870 third_imode
= V2DImode
;
13873 gcc_unreachable ();
13876 for (i
= 0; i
< n
; i
++)
      /* Extend the odd element to SImode using a paradoxical SUBREG.  */
13879 op0
= gen_reg_rtx (SImode
);
13880 emit_move_insn (op0
, gen_lowpart (SImode
, ops
[i
+ i
]));
13882 /* Insert the SImode value as low element of V4SImode vector. */
13883 op1
= gen_reg_rtx (V4SImode
);
13884 op0
= gen_rtx_VEC_MERGE (V4SImode
,
13885 gen_rtx_VEC_DUPLICATE (V4SImode
,
13887 CONST0_RTX (V4SImode
),
13889 emit_insn (gen_rtx_SET (op1
, op0
));
      /* Cast the V4SImode vector back to a vector in the original mode.  */
13892 op0
= gen_reg_rtx (mode
);
13893 emit_move_insn (op0
, gen_lowpart (mode
, op1
));
13895 /* Load even elements into the second position. */
13896 emit_insn (gen_load_even (op0
,
13897 force_reg (inner_mode
,
13901 /* Cast vector to FIRST_IMODE vector. */
13902 ops
[i
] = gen_reg_rtx (first_imode
);
13903 emit_move_insn (ops
[i
], gen_lowpart (first_imode
, op0
));
13906 /* Interleave low FIRST_IMODE vectors. */
13907 for (i
= j
= 0; i
< n
; i
+= 2, j
++)
13909 op0
= gen_reg_rtx (first_imode
);
13910 emit_insn (gen_interleave_first_low (op0
, ops
[i
], ops
[i
+ 1]));
13912 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
13913 ops
[j
] = gen_reg_rtx (second_imode
);
13914 emit_move_insn (ops
[j
], gen_lowpart (second_imode
, op0
));
13917 /* Interleave low SECOND_IMODE vectors. */
13918 switch (second_imode
)
13921 for (i
= j
= 0; i
< n
/ 2; i
+= 2, j
++)
13923 op0
= gen_reg_rtx (second_imode
);
13924 emit_insn (gen_interleave_second_low (op0
, ops
[i
],
13927 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
13929 ops
[j
] = gen_reg_rtx (third_imode
);
13930 emit_move_insn (ops
[j
], gen_lowpart (third_imode
, op0
));
13932 second_imode
= V2DImode
;
13933 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
13937 op0
= gen_reg_rtx (second_imode
);
13938 emit_insn (gen_interleave_second_low (op0
, ops
[0],
13941 /* Cast the SECOND_IMODE vector back to a vector on original
13943 emit_insn (gen_rtx_SET (target
, gen_lowpart (mode
, op0
)));
13947 gcc_unreachable ();
13951 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
13952 all values variable, and none identical. */
13955 ix86_expand_vector_init_general (bool mmx_ok
, machine_mode mode
,
13956 rtx target
, rtx vals
)
13958 rtx ops
[64], op0
, op1
, op2
, op3
, op4
, op5
;
13959 machine_mode half_mode
= VOIDmode
;
13960 machine_mode quarter_mode
= VOIDmode
;
13967 if (!mmx_ok
&& !TARGET_SSE
)
13983 n
= GET_MODE_NUNITS (mode
);
13984 for (i
= 0; i
< n
; i
++)
13985 ops
[i
] = XVECEXP (vals
, 0, i
);
13986 ix86_expand_vector_init_concat (mode
, target
, ops
, n
);
13990 for (i
= 0; i
< 2; i
++)
13991 ops
[i
] = gen_lowpart (V2DImode
, XVECEXP (vals
, 0, i
));
13992 op0
= gen_reg_rtx (V4DImode
);
13993 ix86_expand_vector_init_concat (V4DImode
, op0
, ops
, 2);
13994 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), op0
));
13998 for (i
= 0; i
< 4; i
++)
13999 ops
[i
] = gen_lowpart (V2DImode
, XVECEXP (vals
, 0, i
));
14000 ops
[4] = gen_reg_rtx (V4DImode
);
14001 ix86_expand_vector_init_concat (V4DImode
, ops
[4], ops
, 2);
14002 ops
[5] = gen_reg_rtx (V4DImode
);
14003 ix86_expand_vector_init_concat (V4DImode
, ops
[5], ops
+ 2, 2);
14004 op0
= gen_reg_rtx (V8DImode
);
14005 ix86_expand_vector_init_concat (V8DImode
, op0
, ops
+ 4, 2);
14006 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), op0
));
14010 half_mode
= V16QImode
;
14014 half_mode
= V8HImode
;
14018 n
= GET_MODE_NUNITS (mode
);
14019 for (i
= 0; i
< n
; i
++)
14020 ops
[i
] = XVECEXP (vals
, 0, i
);
14021 op0
= gen_reg_rtx (half_mode
);
14022 op1
= gen_reg_rtx (half_mode
);
14023 ix86_expand_vector_init_interleave (half_mode
, op0
, ops
,
14025 ix86_expand_vector_init_interleave (half_mode
, op1
,
14026 &ops
[n
>> 1], n
>> 2);
14027 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, op0
, op1
)));
14031 quarter_mode
= V16QImode
;
14032 half_mode
= V32QImode
;
14036 quarter_mode
= V8HImode
;
14037 half_mode
= V16HImode
;
14041 n
= GET_MODE_NUNITS (mode
);
14042 for (i
= 0; i
< n
; i
++)
14043 ops
[i
] = XVECEXP (vals
, 0, i
);
14044 op0
= gen_reg_rtx (quarter_mode
);
14045 op1
= gen_reg_rtx (quarter_mode
);
14046 op2
= gen_reg_rtx (quarter_mode
);
14047 op3
= gen_reg_rtx (quarter_mode
);
14048 op4
= gen_reg_rtx (half_mode
);
14049 op5
= gen_reg_rtx (half_mode
);
14050 ix86_expand_vector_init_interleave (quarter_mode
, op0
, ops
,
14052 ix86_expand_vector_init_interleave (quarter_mode
, op1
,
14053 &ops
[n
>> 2], n
>> 3);
14054 ix86_expand_vector_init_interleave (quarter_mode
, op2
,
14055 &ops
[n
>> 1], n
>> 3);
14056 ix86_expand_vector_init_interleave (quarter_mode
, op3
,
14057 &ops
[(n
>> 1) | (n
>> 2)], n
>> 3);
14058 emit_insn (gen_rtx_SET (op4
, gen_rtx_VEC_CONCAT (half_mode
, op0
, op1
)));
14059 emit_insn (gen_rtx_SET (op5
, gen_rtx_VEC_CONCAT (half_mode
, op2
, op3
)));
14060 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, op4
, op5
)));
14064 if (!TARGET_SSE4_1
)
14072 /* Don't use ix86_expand_vector_init_interleave if we can't
14073 move from GPR to SSE register directly. */
14074 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
)
14077 n
= GET_MODE_NUNITS (mode
);
14078 for (i
= 0; i
< n
; i
++)
14079 ops
[i
] = XVECEXP (vals
, 0, i
);
14080 ix86_expand_vector_init_interleave (mode
, target
, ops
, n
>> 1);
14088 gcc_unreachable ();
14092 int i
, j
, n_elts
, n_words
, n_elt_per_word
;
14093 machine_mode inner_mode
;
14094 rtx words
[4], shift
;
14096 inner_mode
= GET_MODE_INNER (mode
);
14097 n_elts
= GET_MODE_NUNITS (mode
);
14098 n_words
= GET_MODE_SIZE (mode
) / UNITS_PER_WORD
;
14099 n_elt_per_word
= n_elts
/ n_words
;
14100 shift
= GEN_INT (GET_MODE_BITSIZE (inner_mode
));
14102 for (i
= 0; i
< n_words
; ++i
)
14104 rtx word
= NULL_RTX
;
14106 for (j
= 0; j
< n_elt_per_word
; ++j
)
14108 rtx elt
= XVECEXP (vals
, 0, (i
+1)*n_elt_per_word
- j
- 1);
14109 elt
= convert_modes (word_mode
, inner_mode
, elt
, true);
14115 word
= expand_simple_binop (word_mode
, ASHIFT
, word
, shift
,
14116 word
, 1, OPTAB_LIB_WIDEN
);
14117 word
= expand_simple_binop (word_mode
, IOR
, word
, elt
,
14118 word
, 1, OPTAB_LIB_WIDEN
);
14126 emit_move_insn (target
, gen_lowpart (mode
, words
[0]));
14127 else if (n_words
== 2)
14129 rtx tmp
= gen_reg_rtx (mode
);
14130 emit_clobber (tmp
);
14131 emit_move_insn (gen_lowpart (word_mode
, tmp
), words
[0]);
14132 emit_move_insn (gen_highpart (word_mode
, tmp
), words
[1]);
14133 emit_move_insn (target
, tmp
);
14135 else if (n_words
== 4)
14137 rtx tmp
= gen_reg_rtx (V4SImode
);
14138 gcc_assert (word_mode
== SImode
);
14139 vals
= gen_rtx_PARALLEL (V4SImode
, gen_rtvec_v (4, words
));
14140 ix86_expand_vector_init_general (false, V4SImode
, tmp
, vals
);
14141 emit_move_insn (target
, gen_lowpart (mode
, tmp
));
14144 gcc_unreachable ();
14148 /* Initialize vector TARGET via VALS. Suppress the use of MMX
14149 instructions unless MMX_OK is true. */
14152 ix86_expand_vector_init (bool mmx_ok
, rtx target
, rtx vals
)
14154 machine_mode mode
= GET_MODE (target
);
14155 machine_mode inner_mode
= GET_MODE_INNER (mode
);
14156 int n_elts
= GET_MODE_NUNITS (mode
);
14157 int n_var
= 0, one_var
= -1;
14158 bool all_same
= true, all_const_zero
= true;
14162 /* Handle first initialization from vector elts. */
14163 if (n_elts
!= XVECLEN (vals
, 0))
14165 rtx subtarget
= target
;
14166 x
= XVECEXP (vals
, 0, 0);
14167 gcc_assert (GET_MODE_INNER (GET_MODE (x
)) == inner_mode
);
14168 if (GET_MODE_NUNITS (GET_MODE (x
)) * 2 == n_elts
)
14170 rtx ops
[2] = { XVECEXP (vals
, 0, 0), XVECEXP (vals
, 0, 1) };
14171 if (inner_mode
== QImode
|| inner_mode
== HImode
)
14173 unsigned int n_bits
= n_elts
* GET_MODE_SIZE (inner_mode
);
14174 mode
= mode_for_vector (SImode
, n_bits
/ 4).require ();
14175 inner_mode
= mode_for_vector (SImode
, n_bits
/ 8).require ();
14176 ops
[0] = gen_lowpart (inner_mode
, ops
[0]);
14177 ops
[1] = gen_lowpart (inner_mode
, ops
[1]);
14178 subtarget
= gen_reg_rtx (mode
);
14180 ix86_expand_vector_init_concat (mode
, subtarget
, ops
, 2);
14181 if (subtarget
!= target
)
14182 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), subtarget
));
14185 gcc_unreachable ();
14188 for (i
= 0; i
< n_elts
; ++i
)
14190 x
= XVECEXP (vals
, 0, i
);
14191 if (!(CONST_SCALAR_INT_P (x
)
14192 || CONST_DOUBLE_P (x
)
14193 || CONST_FIXED_P (x
)))
14194 n_var
++, one_var
= i
;
14195 else if (x
!= CONST0_RTX (inner_mode
))
14196 all_const_zero
= false;
14197 if (i
> 0 && !rtx_equal_p (x
, XVECEXP (vals
, 0, 0)))
14201 /* Constants are best loaded from the constant pool. */
14204 emit_move_insn (target
, gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0)));
14208 /* If all values are identical, broadcast the value. */
14210 && ix86_expand_vector_init_duplicate (mmx_ok
, mode
, target
,
14211 XVECEXP (vals
, 0, 0)))
14214 /* Values where only one field is non-constant are best loaded from
14215 the pool and overwritten via move later. */
14219 && ix86_expand_vector_init_one_nonzero (mmx_ok
, mode
, target
,
14220 XVECEXP (vals
, 0, one_var
),
14224 if (ix86_expand_vector_init_one_var (mmx_ok
, mode
, target
, vals
, one_var
))
14228 ix86_expand_vector_init_general (mmx_ok
, mode
, target
, vals
);
14232 ix86_expand_vector_set (bool mmx_ok
, rtx target
, rtx val
, int elt
)
14234 machine_mode mode
= GET_MODE (target
);
14235 machine_mode inner_mode
= GET_MODE_INNER (mode
);
14236 machine_mode half_mode
;
14237 bool use_vec_merge
= false;
14239 static rtx (*gen_extract
[6][2]) (rtx
, rtx
)
14241 { gen_vec_extract_lo_v32qi
, gen_vec_extract_hi_v32qi
},
14242 { gen_vec_extract_lo_v16hi
, gen_vec_extract_hi_v16hi
},
14243 { gen_vec_extract_lo_v8si
, gen_vec_extract_hi_v8si
},
14244 { gen_vec_extract_lo_v4di
, gen_vec_extract_hi_v4di
},
14245 { gen_vec_extract_lo_v8sf
, gen_vec_extract_hi_v8sf
},
14246 { gen_vec_extract_lo_v4df
, gen_vec_extract_hi_v4df
}
14248 static rtx (*gen_insert
[6][2]) (rtx
, rtx
, rtx
)
14250 { gen_vec_set_lo_v32qi
, gen_vec_set_hi_v32qi
},
14251 { gen_vec_set_lo_v16hi
, gen_vec_set_hi_v16hi
},
14252 { gen_vec_set_lo_v8si
, gen_vec_set_hi_v8si
},
14253 { gen_vec_set_lo_v4di
, gen_vec_set_hi_v4di
},
14254 { gen_vec_set_lo_v8sf
, gen_vec_set_hi_v8sf
},
14255 { gen_vec_set_lo_v4df
, gen_vec_set_hi_v4df
}
14258 machine_mode mmode
= VOIDmode
;
14259 rtx (*gen_blendm
) (rtx
, rtx
, rtx
, rtx
);
14264 use_vec_merge
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
14272 tmp
= gen_reg_rtx (GET_MODE_INNER (mode
));
14273 ix86_expand_vector_extract (true, tmp
, target
, 1 - elt
);
14275 tmp
= gen_rtx_VEC_CONCAT (mode
, val
, tmp
);
14277 tmp
= gen_rtx_VEC_CONCAT (mode
, tmp
, val
);
14278 emit_insn (gen_rtx_SET (target
, tmp
));
14284 use_vec_merge
= TARGET_SSE4_1
&& TARGET_64BIT
;
14288 tmp
= gen_reg_rtx (GET_MODE_INNER (mode
));
14289 ix86_expand_vector_extract (false, tmp
, target
, 1 - elt
);
14291 tmp
= gen_rtx_VEC_CONCAT (mode
, val
, tmp
);
14293 tmp
= gen_rtx_VEC_CONCAT (mode
, tmp
, val
);
14294 emit_insn (gen_rtx_SET (target
, tmp
));
14298 /* NB: For ELT == 0, use standard scalar operation patterns which
14299 preserve the rest of the vector for combiner:
14302 (vec_duplicate:V2DF (reg:DF))
14312 /* For the two element vectors, we implement a VEC_CONCAT with
14313 the extraction of the other element. */
14315 tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, GEN_INT (1 - elt
)));
14316 tmp
= gen_rtx_VEC_SELECT (inner_mode
, target
, tmp
);
14319 op0
= val
, op1
= tmp
;
14321 op0
= tmp
, op1
= val
;
14323 tmp
= gen_rtx_VEC_CONCAT (mode
, op0
, op1
);
14324 emit_insn (gen_rtx_SET (target
, tmp
));
14329 use_vec_merge
= TARGET_SSE4_1
;
14336 use_vec_merge
= true;
14340 /* tmp = target = A B C D */
14341 tmp
= copy_to_reg (target
);
14342 /* target = A A B B */
14343 emit_insn (gen_vec_interleave_lowv4sf (target
, target
, target
));
14344 /* target = X A B B */
14345 ix86_expand_vector_set (false, target
, val
, 0);
14346 /* target = A X C D */
14347 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
14348 const1_rtx
, const0_rtx
,
14349 GEN_INT (2+4), GEN_INT (3+4)));
14353 /* tmp = target = A B C D */
14354 tmp
= copy_to_reg (target
);
14355 /* tmp = X B C D */
14356 ix86_expand_vector_set (false, tmp
, val
, 0);
14357 /* target = A B X D */
14358 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
14359 const0_rtx
, const1_rtx
,
14360 GEN_INT (0+4), GEN_INT (3+4)));
14364 /* tmp = target = A B C D */
14365 tmp
= copy_to_reg (target
);
14366 /* tmp = X B C D */
14367 ix86_expand_vector_set (false, tmp
, val
, 0);
14368 /* target = A B X D */
14369 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
14370 const0_rtx
, const1_rtx
,
14371 GEN_INT (2+4), GEN_INT (0+4)));
14375 gcc_unreachable ();
14380 use_vec_merge
= TARGET_SSE4_1
;
14384 /* Element 0 handled by vec_merge below. */
14387 use_vec_merge
= true;
14393 /* With SSE2, use integer shuffles to swap element 0 and ELT,
14394 store into element 0, then shuffle them back. */
14398 order
[0] = GEN_INT (elt
);
14399 order
[1] = const1_rtx
;
14400 order
[2] = const2_rtx
;
14401 order
[3] = GEN_INT (3);
14402 order
[elt
] = const0_rtx
;
14404 emit_insn (gen_sse2_pshufd_1 (target
, target
, order
[0],
14405 order
[1], order
[2], order
[3]));
14407 ix86_expand_vector_set (false, target
, val
, 0);
14409 emit_insn (gen_sse2_pshufd_1 (target
, target
, order
[0],
14410 order
[1], order
[2], order
[3]));
14414 /* For SSE1, we have to reuse the V4SF code. */
14415 rtx t
= gen_reg_rtx (V4SFmode
);
14416 emit_move_insn (t
, gen_lowpart (V4SFmode
, target
));
14417 ix86_expand_vector_set (false, t
, gen_lowpart (SFmode
, val
), elt
);
14418 emit_move_insn (target
, gen_lowpart (mode
, t
));
14423 use_vec_merge
= TARGET_SSE2
;
14426 use_vec_merge
= mmx_ok
&& (TARGET_SSE
|| TARGET_3DNOW_A
);
14430 use_vec_merge
= TARGET_SSE4_1
;
14434 use_vec_merge
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
14438 half_mode
= V16QImode
;
14444 half_mode
= V8HImode
;
14450 half_mode
= V4SImode
;
14456 half_mode
= V2DImode
;
14462 half_mode
= V4SFmode
;
14468 half_mode
= V2DFmode
;
14474 /* Compute offset. */
14478 gcc_assert (i
<= 1);
14480 /* Extract the half. */
14481 tmp
= gen_reg_rtx (half_mode
);
14482 emit_insn (gen_extract
[j
][i
] (tmp
, target
));
14484 /* Put val in tmp at elt. */
14485 ix86_expand_vector_set (false, tmp
, val
, elt
);
14488 emit_insn (gen_insert
[j
][i
] (target
, target
, tmp
));
14492 if (TARGET_AVX512F
)
14495 gen_blendm
= gen_avx512f_blendmv8df
;
14500 if (TARGET_AVX512F
)
14503 gen_blendm
= gen_avx512f_blendmv8di
;
14508 if (TARGET_AVX512F
)
14511 gen_blendm
= gen_avx512f_blendmv16sf
;
14516 if (TARGET_AVX512F
)
14519 gen_blendm
= gen_avx512f_blendmv16si
;
14524 if (TARGET_AVX512BW
)
14527 gen_blendm
= gen_avx512bw_blendmv32hi
;
14529 else if (TARGET_AVX512F
)
14531 half_mode
= E_V8HImode
;
14538 if (TARGET_AVX512BW
)
14541 gen_blendm
= gen_avx512bw_blendmv64qi
;
14543 else if (TARGET_AVX512F
)
14545 half_mode
= E_V16QImode
;
14552 /* Compute offset. */
14556 gcc_assert (i
<= 3);
14559 /* Extract the quarter. */
14560 tmp
= gen_reg_rtx (V4SImode
);
14561 rtx tmp2
= gen_lowpart (V16SImode
, target
);
14562 rtx mask
= gen_reg_rtx (QImode
);
14564 emit_move_insn (mask
, constm1_rtx
);
14565 emit_insn (gen_avx512f_vextracti32x4_mask (tmp
, tmp2
, GEN_INT (i
),
14568 tmp2
= gen_reg_rtx (half_mode
);
14569 emit_move_insn (tmp2
, gen_lowpart (half_mode
, tmp
));
14572 /* Put val in tmp at elt. */
14573 ix86_expand_vector_set (false, tmp
, val
, elt
);
14576 tmp2
= gen_reg_rtx (V16SImode
);
14577 rtx tmp3
= gen_lowpart (V16SImode
, target
);
14578 mask
= gen_reg_rtx (HImode
);
14579 emit_move_insn (mask
, constm1_rtx
);
14580 tmp
= gen_lowpart (V4SImode
, tmp
);
14581 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2
, tmp3
, tmp
, GEN_INT (i
),
14583 emit_move_insn (target
, gen_lowpart (mode
, tmp2
));
14591 if (mmode
!= VOIDmode
)
14593 tmp
= gen_reg_rtx (mode
);
14594 emit_insn (gen_rtx_SET (tmp
, gen_rtx_VEC_DUPLICATE (mode
, val
)));
14595 /* The avx512*_blendm<mode> expanders have different operand order
14596 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
14597 elements where the mask is set and second input operand otherwise,
14598 in {sse,avx}*_*blend* the first input operand is used for elements
14599 where the mask is clear and second input operand otherwise. */
14600 emit_insn (gen_blendm (target
, target
, tmp
,
14602 gen_int_mode (HOST_WIDE_INT_1U
<< elt
,
14605 else if (use_vec_merge
)
14608 tmp
= gen_rtx_VEC_DUPLICATE (mode
, val
);
14609 tmp
= gen_rtx_VEC_MERGE (mode
, tmp
, target
,
14610 GEN_INT (HOST_WIDE_INT_1U
<< elt
));
14611 emit_insn (gen_rtx_SET (target
, tmp
));
14615 rtx mem
= assign_stack_temp (mode
, GET_MODE_SIZE (mode
));
14617 emit_move_insn (mem
, target
);
14619 tmp
= adjust_address (mem
, inner_mode
, elt
* GET_MODE_SIZE (inner_mode
));
14620 emit_move_insn (tmp
, val
);
14622 emit_move_insn (target
, mem
);
14627 ix86_expand_vector_extract (bool mmx_ok
, rtx target
, rtx vec
, int elt
)
14629 machine_mode mode
= GET_MODE (vec
);
14630 machine_mode inner_mode
= GET_MODE_INNER (mode
);
14631 bool use_vec_extr
= false;
14637 use_vec_extr
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
14651 use_vec_extr
= true;
14655 use_vec_extr
= TARGET_SSE4_1
;
14667 tmp
= gen_reg_rtx (mode
);
14668 emit_insn (gen_sse_shufps_v4sf (tmp
, vec
, vec
,
14669 GEN_INT (elt
), GEN_INT (elt
),
14670 GEN_INT (elt
+4), GEN_INT (elt
+4)));
14674 tmp
= gen_reg_rtx (mode
);
14675 emit_insn (gen_vec_interleave_highv4sf (tmp
, vec
, vec
));
14679 gcc_unreachable ();
14682 use_vec_extr
= true;
14687 use_vec_extr
= TARGET_SSE4_1
;
14701 tmp
= gen_reg_rtx (mode
);
14702 emit_insn (gen_sse2_pshufd_1 (tmp
, vec
,
14703 GEN_INT (elt
), GEN_INT (elt
),
14704 GEN_INT (elt
), GEN_INT (elt
)));
14708 tmp
= gen_reg_rtx (mode
);
14709 emit_insn (gen_vec_interleave_highv4si (tmp
, vec
, vec
));
14713 gcc_unreachable ();
14716 use_vec_extr
= true;
14721 /* For SSE1, we have to reuse the V4SF code. */
14722 ix86_expand_vector_extract (false, gen_lowpart (SFmode
, target
),
14723 gen_lowpart (V4SFmode
, vec
), elt
);
14729 use_vec_extr
= TARGET_SSE2
;
14732 use_vec_extr
= mmx_ok
&& (TARGET_SSE
|| TARGET_3DNOW_A
);
14736 use_vec_extr
= TARGET_SSE4_1
;
14740 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC
))
14742 tmp
= gen_reg_rtx (SImode
);
14743 ix86_expand_vector_extract (false, tmp
, gen_lowpart (V4SImode
, vec
),
14745 emit_insn (gen_rtx_SET (target
, gen_lowpart (QImode
, tmp
)));
14753 tmp
= gen_reg_rtx (V4SFmode
);
14755 emit_insn (gen_vec_extract_lo_v8sf (tmp
, vec
));
14757 emit_insn (gen_vec_extract_hi_v8sf (tmp
, vec
));
14758 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
14766 tmp
= gen_reg_rtx (V2DFmode
);
14768 emit_insn (gen_vec_extract_lo_v4df (tmp
, vec
));
14770 emit_insn (gen_vec_extract_hi_v4df (tmp
, vec
));
14771 ix86_expand_vector_extract (false, target
, tmp
, elt
& 1);
14779 tmp
= gen_reg_rtx (V16QImode
);
14781 emit_insn (gen_vec_extract_lo_v32qi (tmp
, vec
));
14783 emit_insn (gen_vec_extract_hi_v32qi (tmp
, vec
));
14784 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
14792 tmp
= gen_reg_rtx (V8HImode
);
14794 emit_insn (gen_vec_extract_lo_v16hi (tmp
, vec
));
14796 emit_insn (gen_vec_extract_hi_v16hi (tmp
, vec
));
14797 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
14805 tmp
= gen_reg_rtx (V4SImode
);
14807 emit_insn (gen_vec_extract_lo_v8si (tmp
, vec
));
14809 emit_insn (gen_vec_extract_hi_v8si (tmp
, vec
));
14810 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
14818 tmp
= gen_reg_rtx (V2DImode
);
14820 emit_insn (gen_vec_extract_lo_v4di (tmp
, vec
));
14822 emit_insn (gen_vec_extract_hi_v4di (tmp
, vec
));
14823 ix86_expand_vector_extract (false, target
, tmp
, elt
& 1);
14829 if (TARGET_AVX512BW
)
14831 tmp
= gen_reg_rtx (V16HImode
);
14833 emit_insn (gen_vec_extract_lo_v32hi (tmp
, vec
));
14835 emit_insn (gen_vec_extract_hi_v32hi (tmp
, vec
));
14836 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
14842 if (TARGET_AVX512BW
)
14844 tmp
= gen_reg_rtx (V32QImode
);
14846 emit_insn (gen_vec_extract_lo_v64qi (tmp
, vec
));
14848 emit_insn (gen_vec_extract_hi_v64qi (tmp
, vec
));
14849 ix86_expand_vector_extract (false, target
, tmp
, elt
& 31);
14855 tmp
= gen_reg_rtx (V8SFmode
);
14857 emit_insn (gen_vec_extract_lo_v16sf (tmp
, vec
));
14859 emit_insn (gen_vec_extract_hi_v16sf (tmp
, vec
));
14860 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
14864 tmp
= gen_reg_rtx (V4DFmode
);
14866 emit_insn (gen_vec_extract_lo_v8df (tmp
, vec
));
14868 emit_insn (gen_vec_extract_hi_v8df (tmp
, vec
));
14869 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
14873 tmp
= gen_reg_rtx (V8SImode
);
14875 emit_insn (gen_vec_extract_lo_v16si (tmp
, vec
));
14877 emit_insn (gen_vec_extract_hi_v16si (tmp
, vec
));
14878 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
14882 tmp
= gen_reg_rtx (V4DImode
);
14884 emit_insn (gen_vec_extract_lo_v8di (tmp
, vec
));
14886 emit_insn (gen_vec_extract_hi_v8di (tmp
, vec
));
14887 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
14891 use_vec_extr
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
14892 /* ??? Could extract the appropriate HImode element and shift. */
14901 tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, GEN_INT (elt
)));
14902 tmp
= gen_rtx_VEC_SELECT (inner_mode
, vec
, tmp
);
14904 /* Let the rtl optimizers know about the zero extension performed. */
14905 if (inner_mode
== QImode
|| inner_mode
== HImode
)
14907 tmp
= gen_rtx_ZERO_EXTEND (SImode
, tmp
);
14908 target
= gen_lowpart (SImode
, target
);
14911 emit_insn (gen_rtx_SET (target
, tmp
));
14915 rtx mem
= assign_stack_temp (mode
, GET_MODE_SIZE (mode
));
14917 emit_move_insn (mem
, vec
);
14919 tmp
= adjust_address (mem
, inner_mode
, elt
*GET_MODE_SIZE (inner_mode
));
14920 emit_move_insn (target
, tmp
);
14924 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
14925 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
14926 The upper bits of DEST are undefined, though they shouldn't cause
14927 exceptions (some bits from src or all zeros are ok). */
14930 emit_reduc_half (rtx dest
, rtx src
, int i
)
14933 switch (GET_MODE (src
))
14937 tem
= gen_sse_movhlps (dest
, src
, src
);
14939 tem
= gen_sse_shufps_v4sf (dest
, src
, src
, const1_rtx
, const1_rtx
,
14940 GEN_INT (1 + 4), GEN_INT (1 + 4));
14943 tem
= gen_vec_interleave_highv2df (dest
, src
, src
);
14949 d
= gen_reg_rtx (V1TImode
);
14950 tem
= gen_sse2_lshrv1ti3 (d
, gen_lowpart (V1TImode
, src
),
14955 tem
= gen_avx_vperm2f128v8sf3 (dest
, src
, src
, const1_rtx
);
14957 tem
= gen_avx_shufps256 (dest
, src
, src
,
14958 GEN_INT (i
== 128 ? 2 + (3 << 2) : 1));
14962 tem
= gen_avx_vperm2f128v4df3 (dest
, src
, src
, const1_rtx
);
14964 tem
= gen_avx_shufpd256 (dest
, src
, src
, const1_rtx
);
14972 if (GET_MODE (dest
) != V4DImode
)
14973 d
= gen_reg_rtx (V4DImode
);
14974 tem
= gen_avx2_permv2ti (d
, gen_lowpart (V4DImode
, src
),
14975 gen_lowpart (V4DImode
, src
),
14980 d
= gen_reg_rtx (V2TImode
);
14981 tem
= gen_avx2_lshrv2ti3 (d
, gen_lowpart (V2TImode
, src
),
14992 tem
= gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode
, dest
),
14993 gen_lowpart (V16SImode
, src
),
14994 gen_lowpart (V16SImode
, src
),
14995 GEN_INT (0x4 + (i
== 512 ? 4 : 0)),
14996 GEN_INT (0x5 + (i
== 512 ? 4 : 0)),
14997 GEN_INT (0x6 + (i
== 512 ? 4 : 0)),
14998 GEN_INT (0x7 + (i
== 512 ? 4 : 0)),
14999 GEN_INT (0xC), GEN_INT (0xD),
15000 GEN_INT (0xE), GEN_INT (0xF),
15001 GEN_INT (0x10), GEN_INT (0x11),
15002 GEN_INT (0x12), GEN_INT (0x13),
15003 GEN_INT (0x14), GEN_INT (0x15),
15004 GEN_INT (0x16), GEN_INT (0x17));
15006 tem
= gen_avx512f_pshufd_1 (gen_lowpart (V16SImode
, dest
),
15007 gen_lowpart (V16SImode
, src
),
15008 GEN_INT (i
== 128 ? 0x2 : 0x1),
15012 GEN_INT (i
== 128 ? 0x6 : 0x5),
15016 GEN_INT (i
== 128 ? 0xA : 0x9),
15020 GEN_INT (i
== 128 ? 0xE : 0xD),
15026 gcc_unreachable ();
15030 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), d
));
/* Expand a vector reduction.  FN is the binary pattern to reduce;
   DEST is the destination; IN is the input vector.  */
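/* For example, a V4SFmode reduction takes two halving steps: the upper
   64 bits are shuffled down and combined with the lower 64 bits, then
   the two remaining elements are combined; the scalar result ends up in
   the low element of DEST.  */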
void
ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
{
  rtx half, dst, vec = in;
  machine_mode mode = GET_MODE (in);
  int i;

  /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
  if (TARGET_SSE4_1
      && mode == V8HImode
      && fn == gen_uminv8hi3)
    {
      emit_insn (gen_sse4_1_phminposuw (dest, in));
      return;
    }

  for (i = GET_MODE_BITSIZE (mode);
       i > GET_MODE_UNIT_BITSIZE (mode);
       i >>= 1)
    {
      half = gen_reg_rtx (mode);
      emit_reduc_half (half, vec, i);
      if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
	dst = dest;
      else
	dst = gen_reg_rtx (mode);
      emit_insn (fn (dst, half, vec));
      vec = dst;
    }
}
/* Output code to perform a conditional jump to LABEL, if C2 flag in
   FP status register is set.  */

void
ix86_emit_fp_unordered_jump (rtx label)
{
  rtx reg = gen_reg_rtx (HImode);
  rtx_insn *insn;
  rtx temp;

  emit_insn (gen_x86_fnstsw_1 (reg));

  if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
    {
      emit_insn (gen_x86_sahf_1 (reg));

      temp = gen_rtx_REG (CCmode, FLAGS_REG);
      temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
    }
  else
    {
      emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));

      temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
    }

  temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
			       gen_rtx_LABEL_REF (VOIDmode, label),
			       pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  JUMP_LABEL (insn) = label;
}
/* Output code to perform a sinh XFmode calculation.  */
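/* The expansion below uses the identity

     sinh(x) = sign(x) * (exp(|x|) - exp(-|x|)) / 2

   computed via t = expm1(|x|), for which exp(|x|) - exp(-|x|)
   = t + t / (t + 1.0).  Going through expm1 avoids the cancellation
   that exp(x) - exp(-x) would suffer for small |x|.  */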
void ix86_emit_i387_sinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (|op1|) */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 1.0) + e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_divxf3 (e2, e1, e2));
  emit_insn (gen_addxf3 (e2, e2, e1));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
/* Output code to perform a cosh XFmode calculation.  */
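/* cosh(x) = (exp(x) + exp(-x)) / 2 = (exp(x) + 1.0 / exp(x)) / 2;
   no sign handling is needed because cosh is an even function.  */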
void ix86_emit_i387_cosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1;

  /* e1 = exp (op1) */
  emit_insn (gen_expxf2 (e1, op1));

  /* e2 = e1 + 1.0 / e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_divxf3 (e2, cst1, e1));
  emit_insn (gen_addxf3 (e2, e1, e2));

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
/* Output code to perform a tanh XFmode calculation.  */
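/* The expansion below uses the identity

     tanh(x) = -sign(x) * t / (t + 2.0),  t = expm1 (-|2 * x|)

   which follows from tanh(x) = (exp(2x) - 1) / (exp(2x) + 1); feeding
   expm1 the negated magnitude keeps the argument non-positive and the
   result accurate for large |x|.  */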
void ix86_emit_i387_tanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst2, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (-|2 * op1|) */
  emit_insn (gen_addxf3 (e2, op1, op1));
  emit_insn (gen_absxf2 (e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 2.0) */
  cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst2));
  emit_insn (gen_divxf3 (e2, e1, e2));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
/* Output code to perform an asinh XFmode calculation.  */
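/* Uses asinh(x) = sign(x) * log1p (|x| + x*x / (1.0 + sqrt (x*x + 1.0))),
   a rearrangement of log (x + sqrt (x*x + 1.0)) that remains accurate
   through log1p for small |x|.  */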
void ix86_emit_i387_asinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
  emit_insn (gen_mulxf3 (e1, op1, op1));
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));
  emit_insn (gen_addxf3 (e2, e2, cst1));

  emit_insn (gen_divxf3 (e1, e1, e2));

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = e1 + |op1| */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_addxf3 (e1, e1, e2));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
/* Output code to perform an acosh XFmode calculation.  */
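/* Uses acosh(x) = log (x + sqrt (x - 1.0) * sqrt (x + 1.0)); computing
   sqrt (x - 1.0) * sqrt (x + 1.0) instead of sqrt (x*x - 1.0) avoids
   forming x*x, which could overflow for large x.  */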
void ix86_emit_i387_acosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));

  /* e2 = sqrt (op1 + 1.0) */
  emit_insn (gen_addxf3 (e2, op1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));

  /* e1 = sqrt (op1 - 1.0) */
  emit_insn (gen_subxf3 (e1, op1, cst1));
  emit_insn (gen_sqrtxf2 (e1, e1));

  /* e1 = e1 * e2 */
  emit_insn (gen_mulxf3 (e1, e1, e2));

  /* e1 = e1 + op1 */
  emit_insn (gen_addxf3 (e1, e1, op1));

  /* op0 = log (e1) */
  emit_insn (gen_logxf2 (op0, e1));
}
/* Output code to perform an atanh XFmode calculation.  */
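/* Uses atanh(x) = -sign(x) * 0.5 * log1p (-2*|x| / (|x| + 1.0)), which
   follows from atanh(x) = 0.5 * log ((1 + x) / (1 - x)).  */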
void ix86_emit_i387_atanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  emit_insn (gen_absxf2 (e2, op1));

  /* e1 = -(e2 + e2) / (e2 + 1.0) */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e1, e2, cst1));
  emit_insn (gen_addxf3 (e2, e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_divxf3 (e1, e2, e1));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
/* Output code to perform a log1p XFmode calculation.  */
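/* The x87 fyl2xp1 instruction is only specified for |arg| < 1 - sqrt(2)/2,
   so the cutoff constant below is 1 - sqrt(2)/2 ~= 0.2928932...; for larger
   magnitudes the code falls back to fyl2x on 1.0 + op1.  */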
void ix86_emit_i387_log1p (rtx op0, rtx op1)
{
  rtx_code_label *label1 = gen_label_rtx ();
  rtx_code_label *label2 = gen_label_rtx ();

  rtx tmp = gen_reg_rtx (XFmode);
  rtx res = gen_reg_rtx (XFmode);
  rtx cst, cstln2, cst1;
  rtx_insn *insn;

  cst = const_double_from_real_value
    (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
  cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */

  emit_insn (gen_absxf2 (tmp, op1));

  cst = force_reg (XFmode, cst);
  ix86_expand_branch (GE, tmp, cst, label1);
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  insn = get_last_insn ();
  JUMP_LABEL (insn) = label1;

  emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
  emit_jump (label2);

  emit_label (label1);
  LABEL_NUSES (label1) = 1;

  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
  emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));

  emit_label (label2);
  LABEL_NUSES (label2) = 1;

  emit_move_insn (op0, res);
}
15387 /* Emit code for round calculation. */
15388 void ix86_emit_i387_round (rtx op0
, rtx op1
)
15390 machine_mode inmode
= GET_MODE (op1
);
15391 machine_mode outmode
= GET_MODE (op0
);
15392 rtx e1
= gen_reg_rtx (XFmode
);
15393 rtx e2
= gen_reg_rtx (XFmode
);
15394 rtx scratch
= gen_reg_rtx (HImode
);
15395 rtx flags
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
15396 rtx half
= const_double_from_real_value (dconsthalf
, XFmode
);
15397 rtx res
= gen_reg_rtx (outmode
);
15398 rtx_code_label
*jump_label
= gen_label_rtx ();
15399 rtx (*floor_insn
) (rtx
, rtx
);
15400 rtx (*neg_insn
) (rtx
, rtx
);
15408 tmp
= gen_reg_rtx (XFmode
);
15410 emit_insn (gen_rtx_SET (tmp
, gen_rtx_FLOAT_EXTEND (XFmode
, op1
)));
15416 gcc_unreachable ();
15422 floor_insn
= gen_frndintxf2_floor
;
15423 neg_insn
= gen_negsf2
;
15426 floor_insn
= gen_frndintxf2_floor
;
15427 neg_insn
= gen_negdf2
;
15430 floor_insn
= gen_frndintxf2_floor
;
15431 neg_insn
= gen_negxf2
;
15434 floor_insn
= gen_lfloorxfhi2
;
15435 neg_insn
= gen_neghi2
;
15438 floor_insn
= gen_lfloorxfsi2
;
15439 neg_insn
= gen_negsi2
;
15442 floor_insn
= gen_lfloorxfdi2
;
15443 neg_insn
= gen_negdi2
;
15446 gcc_unreachable ();
15449 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
15451 /* scratch = fxam(op1) */
15452 emit_insn (gen_fxamxf2_i387 (scratch
, op1
));
15454 /* e1 = fabs(op1) */
15455 emit_insn (gen_absxf2 (e1
, op1
));
15457 /* e2 = e1 + 0.5 */
15458 half
= force_reg (XFmode
, half
);
15459 emit_insn (gen_rtx_SET (e2
, gen_rtx_PLUS (XFmode
, e1
, half
)));
15461 /* res = floor(e2) */
15467 tmp
= gen_reg_rtx (XFmode
);
15469 emit_insn (floor_insn (tmp
, e2
));
15470 emit_insn (gen_rtx_SET (res
,
15471 gen_rtx_UNSPEC (outmode
, gen_rtvec (1, tmp
),
15472 UNSPEC_TRUNC_NOOP
)));
15476 emit_insn (floor_insn (res
, e2
));
15479 /* flags = signbit(a) */
15480 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
15482 /* if (flags) then res = -res */
15483 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
15484 gen_rtx_EQ (VOIDmode
, flags
, const0_rtx
),
15485 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
15487 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
15488 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
15489 JUMP_LABEL (insn
) = jump_label
;
15491 emit_insn (neg_insn (res
, res
));
15493 emit_label (jump_label
);
15494 LABEL_NUSES (jump_label
) = 1;
15496 emit_move_insn (op0
, res
);
/* Output code to perform a Newton-Raphson approximation of a single precision
   floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
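/* This is one Newton-Raphson step for the reciprocal: an estimate x0 of
   1/b is refined to x1 = x0 * (2.0 - b * x0) = 2*x0 - b*x0*x0, roughly
   doubling the number of correct bits of the hardware rcp estimate, and
   a/b is then approximated as a * x1.  */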
void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
{
  rtx x0, x1, e0, e1;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  x1 = gen_reg_rtx (mode);

  /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */

  b = force_reg (mode, b);

  /* x0 = rcp(b) estimate */
  if (mode == V16SFmode || mode == V8DFmode)
    {
      if (TARGET_AVX512ER)
	{
	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						      UNSPEC_RCP28)));
	  /* res = a * x0 */
	  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
	  return;
	}
      else
	emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						    UNSPEC_RCP14)));
    }
  else
    emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						UNSPEC_RCP)));

  /* e0 = x0 * b */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));

  /* e0 = x0 * e0 */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));

  /* e1 = x0 + x0 */
  emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));

  /* x1 = e1 - e0 */
  emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));

  /* res = a * x1 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
}
/* Output code to perform a Newton-Raphson approximation of a
   single precision floating point [reciprocal] square root.  */
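/* This is one Newton-Raphson step for the reciprocal square root: an
   estimate x0 of 1/sqrt(a) is refined to

     x1 = -0.5 * x0 * (a * x0 * x0 - 3.0)

   and sqrt(a) is then obtained as a * x1.  RECIP selects whether the
   reciprocal square root or the square root itself is produced.  */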
void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
{
  rtx x0, e0, e1, e2, e3, mthree, mhalf;
  REAL_VALUE_TYPE r;
  int unspec;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  e2 = gen_reg_rtx (mode);
  e3 = gen_reg_rtx (mode);

  if (TARGET_AVX512ER && mode == V16SFmode)
    {
      if (recip)
	/* res = rsqrt28(a) estimate */
	emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
						     UNSPEC_RSQRT28)));
      else
	{
	  /* x0 = rsqrt28(a) estimate */
	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
						      UNSPEC_RSQRT28)));
	  /* res = rcp28(x0) estimate */
	  emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
						       UNSPEC_RCP28)));
	}
      return;
    }

  real_from_integer (&r, VOIDmode, -3, SIGNED);
  mthree = const_double_from_real_value (r, SFmode);

  real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
  mhalf = const_double_from_real_value (r, SFmode);
  unspec = UNSPEC_RSQRT;

  if (VECTOR_MODE_P (mode))
    {
      mthree = ix86_build_const_vector (mode, true, mthree);
      mhalf = ix86_build_const_vector (mode, true, mhalf);
      /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
      if (GET_MODE_SIZE (mode) == 64)
	unspec = UNSPEC_RSQRT14;
    }

  /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
     rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */

  a = force_reg (mode, a);

  /* x0 = rsqrt(a) estimate */
  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
					      unspec)));

  /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0).  */
  if (!recip)
    {
      rtx zero = force_reg (mode, CONST0_RTX (mode));
      rtx mask;

      /* Handle masked compare.  */
      if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
	{
	  mask = gen_reg_rtx (HImode);
	  /* Imm value 0x4 corresponds to not-equal comparison.  */
	  emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
	  emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
	}
      else
	{
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
	  emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
	}
    }

  /* e0 = x0 * a */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
  /* e1 = e0 * x0 */
  emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));

  /* e2 = e1 - 3. */
  mthree = force_reg (mode, mthree);
  emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));

  mhalf = force_reg (mode, mhalf);
  if (recip)
    /* e3 = -.5 * x0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
  else
    /* e3 = -.5 * e0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
  /* ret = e2 * e3 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
}
/* Expand fabs (OP0) and return a new rtx that holds the result.  The
   mask for masking out the sign-bit is stored in *SMASK, if that is
   non-null.  */
static rtx
ix86_expand_sse_fabs (rtx op0, rtx *smask)
{
  machine_mode vmode, mode = GET_MODE (op0);
  rtx xa, mask;

  xa = gen_reg_rtx (mode);
  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    vmode = mode;

  mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
  if (!VECTOR_MODE_P (mode))
    {
      /* We need to generate a scalar mode mask in this case.  */
      rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
      tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
      mask = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (mask, tmp));
    }
  emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));

  if (smask)
    *smask = mask;

  return xa;
}
15684 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
15685 swapping the operands if SWAP_OPERANDS is true. The expanded
15686 code is a forward jump to a newly created label in case the
15687 comparison is true. The generated label rtx is returned. */
15688 static rtx_code_label
*
15689 ix86_expand_sse_compare_and_jump (enum rtx_code code
, rtx op0
, rtx op1
,
15690 bool swap_operands
)
15692 bool unordered_compare
= ix86_unordered_fp_compare (code
);
15693 rtx_code_label
*label
;
15697 std::swap (op0
, op1
);
15699 label
= gen_label_rtx ();
15700 tmp
= gen_rtx_COMPARE (CCFPmode
, op0
, op1
);
15701 if (unordered_compare
)
15702 tmp
= gen_rtx_UNSPEC (CCFPmode
, gen_rtvec (1, tmp
), UNSPEC_NOTRAP
);
15703 reg
= gen_rtx_REG (CCFPmode
, FLAGS_REG
);
15704 emit_insn (gen_rtx_SET (reg
, tmp
));
15705 tmp
= gen_rtx_fmt_ee (code
, VOIDmode
, reg
, const0_rtx
);
15706 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
15707 gen_rtx_LABEL_REF (VOIDmode
, label
), pc_rtx
);
15708 tmp
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
15709 JUMP_LABEL (tmp
) = label
;
15714 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
15715 using comparison code CODE. Operands are swapped for the comparison if
15716 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
15718 ix86_expand_sse_compare_mask (enum rtx_code code
, rtx op0
, rtx op1
,
15719 bool swap_operands
)
15721 rtx (*insn
)(rtx
, rtx
, rtx
, rtx
);
15722 machine_mode mode
= GET_MODE (op0
);
15723 rtx mask
= gen_reg_rtx (mode
);
15726 std::swap (op0
, op1
);
15728 insn
= mode
== DFmode
? gen_setcc_df_sse
: gen_setcc_sf_sse
;
15730 emit_insn (insn (mask
, op0
, op1
,
15731 gen_rtx_fmt_ee (code
, mode
, op0
, op1
)));
15735 /* Expand copysign from SIGN to the positive value ABS_VALUE
15736 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
15740 ix86_sse_copysign_to_positive (rtx result
, rtx abs_value
, rtx sign
, rtx mask
)
15742 machine_mode mode
= GET_MODE (sign
);
15743 rtx sgn
= gen_reg_rtx (mode
);
15744 if (mask
== NULL_RTX
)
15746 machine_mode vmode
;
15748 if (mode
== SFmode
)
15750 else if (mode
== DFmode
)
15755 mask
= ix86_build_signbit_mask (vmode
, VECTOR_MODE_P (mode
), false);
15756 if (!VECTOR_MODE_P (mode
))
15758 /* We need to generate a scalar mode mask in this case. */
15759 rtx tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, const0_rtx
));
15760 tmp
= gen_rtx_VEC_SELECT (mode
, mask
, tmp
);
15761 mask
= gen_reg_rtx (mode
);
15762 emit_insn (gen_rtx_SET (mask
, tmp
));
15766 mask
= gen_rtx_NOT (mode
, mask
);
15767 emit_insn (gen_rtx_SET (sgn
, gen_rtx_AND (mode
, mask
, sign
)));
15768 emit_insn (gen_rtx_SET (result
, gen_rtx_IOR (mode
, abs_value
, sgn
)));
/* Expand SSE sequence for computing lround from OP1 storing
   into OP0.  */
void
ix86_expand_lround (rtx op0, rtx op1)
{
  /* C code for the stuff we're doing below:
       tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
       op0 = (long)tmp;
   */
  machine_mode mode = GET_MODE (op1);
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx adj;

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* adj = copysign (0.5, op1) */
  adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
  ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);

  /* adj = op1 + adj */
  adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);

  /* op0 = (imode)adj */
  expand_fix (op0, adj, 0);
}
/* Expand SSE2 sequence for computing lfloor or lceil from OP1 storing
   into OP0.  */

void
ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
{
  /* C code for the stuff we're doing below (for do_floor):
       xi = (long)op1;
       xi -= (double)xi > op1 ? 1 : 0;
       return xi;
   */
  machine_mode fmode = GET_MODE (op1);
  machine_mode imode = GET_MODE (op0);
  rtx ireg, freg, tmp;
  rtx_code_label *label;

  /* reg = (long)op1 */
  ireg = gen_reg_rtx (imode);
  expand_fix (ireg, op1, 0);

  /* freg = (double)reg */
  freg = gen_reg_rtx (fmode);
  expand_float (freg, ireg, 0);

  /* ireg = (freg > op1) ? ireg - 1 : ireg */
  label = ix86_expand_sse_compare_and_jump (UNLE,
                                            freg, op1, !do_floor);
  tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
                             ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (ireg, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (op0, ireg);
}

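/* Editorial sketch (not part of GCC): plain C outline of the sequence above
   for the do_floor case; "ref_lfloor" is a hypothetical name.

     static long
     ref_lfloor (double x)
     {
       long xi = (long) x;        -- truncate toward zero
       if ((double) xi > x)       -- true only for negative non-integers
         xi -= 1;
       return xi;
     }

   For lceil the adjustment is mirrored: add 1 when (double) xi < x.  The
   UNLE compare-and-jump above is the branch-around form of the same test,
   with the operands swapped when !do_floor.  */
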
/* Generate and return a rtx of mode MODE for 2**n where n is the number
   of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */

static rtx
ix86_gen_TWO52 (machine_mode mode)
{
  REAL_VALUE_TYPE TWO52r;
  rtx TWO52;

  real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
  TWO52 = const_double_from_real_value (TWO52r, mode);
  TWO52 = force_reg (mode, TWO52);

  return TWO52;
}

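/* Editorial note: for DFmode this constant is 2**52 = 4503599627370496.0, for
   SFmode 2**23 = 8388608.0, i.e. the smallest magnitude at which the mantissa
   can no longer hold a fractional part.  Adding and then subtracting it from a
   smaller non-negative value therefore rounds that value to an integer in the
   current (normally nearest-even) rounding mode, which is the core trick of
   the expanders below.  */
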
/* Expand rint rounding OPERAND1 and storing the result in OPERAND0.  */

void
ix86_expand_rint (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
       xa = fabs (operand1);
       if (!isless (xa, 2**52))
         return operand1;
       two52 = 2**52;
       if (flag_rounding_math)
         {
           two52 = copysign (two52, operand1);
           xa = operand1;
         }
       xa = xa + two52 - two52;
       return copysign (xa, operand1);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, xa, TWO52, two52, mask;
  rtx_code_label *label;

  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  TWO52 = ix86_gen_TWO52 (mode);
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  two52 = TWO52;
  if (flag_rounding_math)
    {
      two52 = gen_reg_rtx (mode);
      ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
      xa = res;
    }

  xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);

  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}

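/* Editorial sketch (not part of GCC): scalar equivalent of the rint expansion
   above for the common case without -frounding-math (with that flag the added
   constant instead inherits the sign of the input so the addition rounds as
   the dynamic mode dictates).  "ref_rint" is a hypothetical name.

     #include <math.h>

     static double
     ref_rint (double x)
     {
       const double two52 = 0x1p52;
       double xa = fabs (x);
       if (!(xa < two52))
         return x;                 -- already integral, or NaN
       xa = xa + two52 - two52;    -- round to integer in the current mode
       return copysign (xa, x);    -- restore the sign; keeps -0.0 correct
     }
*/
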
/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
   into OPERAND0.  Sequence that works without relying on DImode truncation
   via cvttsd2siq that is only available on 64bit targets.  */

void
ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
        double xa = fabs (x), x2;
        if (!isless (xa, TWO52))
          return x;
        xa = xa + TWO52 - TWO52;
        x2 = copysign (xa, x);
     Compensate.  Floor:
        if (x2 > x)
          x2 -= 1;
     Compensate.  Ceil:
        if (x2 < x)
          x2 += 1;
        if (HONOR_SIGNED_ZEROS (mode))
          x2 = copysign (x2, x);
        return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = xa + TWO52 - TWO52; */
  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* xa = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (xa, xa, res, mask);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
                             xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (!do_floor && HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}

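/* Editorial sketch (not part of GCC): the floor case of the sequence above in
   plain C, usable even when a 64-bit fix-trunc instruction is unavailable.
   "ref_floor_df32" is a hypothetical name.

     #include <math.h>

     static double
     ref_floor_df32 (double x)
     {
       const double two52 = 0x1p52;
       double xa = fabs (x);
       if (!(xa < two52))
         return x;                                   -- already integral
       double x2 = copysign (xa + two52 - two52, x); -- nearest integer
       if (x2 > x)                                   -- nearest may be too big
         x2 -= 1.0;
       return x2;
     }

   The ceil case mirrors this: add 1.0 when the rounded value is too small,
   and restore the sign afterwards when signed zeros are honored.  */
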
/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
   into OPERAND0.  */

void
ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
        double xa = fabs (x), x2;
        if (!isless (xa, TWO52))
          return x;
        x2 = (double)(long)x;
     Compensate.  Floor:
        if (x2 > x)
          x2 -= 1;
     Compensate.  Ceil:
        if (x2 < x)
          x2 += 1;
        if (HONOR_SIGNED_ZEROS (mode))
          return copysign (x2, x);
        return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x */
  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
                             xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (res, tmp);

  if (HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}

/* Expand SSE sequence for computing round from OPERAND1 storing
   into OPERAND0.  Sequence that works without relying on DImode truncation
   via cvttsd2siq that is only available on 64bit targets.  */

void
ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
{
  /* C code for the stuff we expand below.
        double xa = fabs (x), xa2, x2;
        if (!isless (xa, TWO52))
          return x;
     Using the absolute value and copying back sign makes
     -0.0 -> -0.0 correct.
        xa2 = xa + TWO52 - TWO52;
     Compensate.
        dxa = xa2 - xa;
        if (dxa <= -0.5)
          xa2 += 1;
        else if (dxa > 0.5)
          xa2 -= 1;
        x2 = copysign (xa2, x);
        return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa2 = xa + TWO52 - TWO52; */
  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);

  /* dxa = xa2 - xa; */
  dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);

  /* generate 0.5, 1.0 and -0.5 */
  half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
  one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
  mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
                               0, OPTAB_DIRECT);

  /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = copysign (xa2, operand1) */
  ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}

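/* Editorial sketch (not part of GCC): plain C outline of the 32-bit-safe
   round expansion above.  dxa measures how far the TWO52 trick moved the
   value (it lies in [-0.5, 0.5]), so halfway cases that were rounded to even
   can be pushed back in the away-from-zero direction that round() requires.
   "ref_round_df32" is a hypothetical name.

     #include <math.h>

     static double
     ref_round_df32 (double x)
     {
       const double two52 = 0x1p52;
       double xa = fabs (x);
       if (!(xa < two52))
         return x;
       double xa2 = xa + two52 - two52;  -- nearest integer of |x|
       double dxa = xa2 - xa;            -- rounding error
       if (dxa > 0.5)
         xa2 -= 1.0;                     -- rounded up by more than half: undo
       else if (dxa <= -0.5)
         xa2 += 1.0;                     -- rounded down by half or more: bump
       return copysign (xa2, x);
     }
*/
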
/* Expand SSE sequence for computing trunc from OPERAND1 storing
   into OPERAND0.  */

void
ix86_expand_trunc (rtx operand0, rtx operand1)
{
  /* C code for SSE variant we expand below.
        double xa = fabs (x), x2;
        if (!isless (xa, TWO52))
          return x;
        x2 = (double)(long)x;
        if (HONOR_SIGNED_ZEROS (mode))
          return copysign (x2, x);
        return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* x = (double)(long)x */
  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
  expand_fix (xi, res, 0);
  expand_float (res, xi, 0);

  if (HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}

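/* Editorial note (illustrative, not GCC code): the trunc expansion above
   simply relies on the fix/float round trip truncating toward zero,

     x2 = (double) (long long) x;   -- valid only for |x| < 2**52

   which is why inputs with |x| >= TWO52 (already integral, or NaN/Inf)
   branch around the conversion entirely.  */
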
/* Expand SSE sequence for computing trunc from OPERAND1 storing
   into OPERAND0.  Sequence that works without relying on DImode truncation
   via cvttsd2siq that is only available on 64bit targets.  */

void
ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
{
  machine_mode mode = GET_MODE (operand0);
  rtx xa, mask, TWO52, one, res, smask, tmp;
  rtx_code_label *label;

  /* C code for SSE variant we expand below.
        double xa = fabs (x), x2;
        if (!isless (xa, TWO52))
          return x;
        xa2 = xa + TWO52 - TWO52;
     Compensate:
        if (xa2 > xa)
          xa2 -= 1.0;
        x2 = copysign (xa2, x);
        return x2;
   */

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &smask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* res = xa + TWO52 - TWO52; */
  tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
  emit_move_insn (res, tmp);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
  mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
  emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
  tmp = expand_simple_binop (mode, MINUS,
                             res, mask, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (res, tmp);

  /* res = copysign (res, operand1) */
  ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}

/* Expand SSE sequence for computing round from OPERAND1 storing
   into OPERAND0.  */

void
ix86_expand_round (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
        double xa = fabs (x);
        if (!isless (xa, TWO52))
          return x;
        xa = (double)(long)(xa + nextafter (0.5, 0.0));
        return copysign (xa, x);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, TWO52, xa, xi, half, mask;
  rtx_code_label *label;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  TWO52 = ix86_gen_TWO52 (mode);
  xa = ix86_expand_sse_fabs (res, &mask);
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* xa = xa + 0.5 */
  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
  xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);

  /* xa = (double)(int64_t)xa */
  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
  expand_fix (xi, xa, 0);
  expand_float (xa, xi, 0);

  /* res = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}

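/* Editorial note (illustrative, not GCC code): this is the same
   nextafter (0.5, 0.0) adjustment used in ix86_expand_lround above, applied
   to |x| and followed by a truncating fix/float round trip, so for inputs
   below TWO52 the result is effectively

     copysign ((double) (long) (fabs (x) + pred_half), x);

   where pred_half is the representable value just below 0.5.  */
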
/* Expand SSE sequence for computing round
   from OP1 storing into OP0 using sse4 round insn.  */

void
ix86_expand_round_sse4 (rtx op0, rtx op1)
{
  machine_mode mode = GET_MODE (op0);
  rtx e1, e2, res, half;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx (*gen_copysign) (rtx, rtx, rtx);
  rtx (*gen_round) (rtx, rtx, rtx);

  switch (mode)
    {
    case E_SFmode:
      gen_copysign = gen_copysignsf3;
      gen_round = gen_sse4_1_roundsf2;
      break;
    case E_DFmode:
      gen_copysign = gen_copysigndf3;
      gen_round = gen_sse4_1_rounddf2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round (a) = trunc (a + copysign (0.5, a)) */

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
  half = const_double_from_real_value (pred_half, mode);

  /* e1 = copysign (0.5, op1) */
  e1 = gen_reg_rtx (mode);
  emit_insn (gen_copysign (e1, half, op1));

  /* e2 = op1 + e1 */
  e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = trunc (e2) */
  res = gen_reg_rtx (mode);
  emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));

  emit_move_insn (op0, res);
}

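/* Editorial note: with SSE4.1 the compensation dance of the expanders above
   collapses into a single roundss/roundsd with the ROUND_TRUNC immediate;
   rounding half away from zero is obtained by first adding
   copysign (nextafter (0.5, 0.0), op1) and then truncating, exactly as the
   comment above states.  */
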
/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
   insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
   insn every time.  */

static GTY(()) rtx_insn *vselect_insn;

/* Initialize vselect_insn.  */

static void
init_vselect_insn (void)
{
  unsigned i;
  rtx x;

  x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
  for (i = 0; i < MAX_VECT_LEN; ++i)
    XVECEXP (x, 0, i) = const0_rtx;
  x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
                                                        const0_rtx), x);
  x = gen_rtx_SET (const0_rtx, x);
  start_sequence ();
  vselect_insn = emit_insn (x);
  end_sequence ();
}

/* Construct (set target (vec_select op0 (parallel perm))) and
   return true if that's a valid instruction in the active ISA.  */

static bool
expand_vselect (rtx target, rtx op0, const unsigned char *perm,
                unsigned nelt, bool testing_p)
{
  unsigned int i;
  rtx x, save_vconcat;
  int icode;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
  PUT_NUM_ELEM (XVEC (x, 0), nelt);
  for (i = 0; i < nelt; ++i)
    XVECEXP (x, 0, i) = GEN_INT (perm[i]);
  save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
  PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
  SET_DEST (PATTERN (vselect_insn)) = target;
  icode = recog_memoized (vselect_insn);

  if (icode >= 0 && !testing_p)
    emit_insn (copy_rtx (PATTERN (vselect_insn)));

  SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
  INSN_CODE (vselect_insn) = -1;

  return icode >= 0;
}

/* Similar, but generate a vec_concat from op0 and op1 as well.  */

static bool
expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
                        const unsigned char *perm, unsigned nelt,
                        bool testing_p)
{
  machine_mode v2mode;
  rtx x;
  bool ok;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
    return false;
  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  PUT_MODE (x, v2mode);
  XEXP (x, 0) = op0;
  XEXP (x, 1) = op1;
  ok = expand_vselect (target, x, perm, nelt, testing_p);
  XEXP (x, 0) = const0_rtx;
  XEXP (x, 1) = const0_rtx;
  return ok;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   using movss or movsd.  */

static bool
expand_vec_perm_movs (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  rtx x;

  if (d->one_operand_p)
    return false;

  if (!(TARGET_SSE && vmode == V4SFmode)
      && !(TARGET_SSE2 && vmode == V2DFmode))
    return false;

  /* Only the first element is changed.  */
  if (d->perm[0] != nelt && d->perm[0] != 0)
    return false;
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != i + nelt - d->perm[0])
      return false;

  if (d->testing_p)
    return true;

  if (d->perm[0] == nelt)
    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
  else
    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));

  emit_insn (gen_rtx_SET (d->target, x));

  return true;
}

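/* Editorial example: for V4SFmode the only permutations accepted here are
   { 4, 1, 2, 3 } (movss merging element 0 of op1 into op0) and { 0, 5, 6, 7 }
   (the same merge viewed from the swapped side), i.e. exactly what the
   movss/movsd "replace the low element" instructions do in one insn.  */
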
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
16427 expand_vec_perm_blend (struct expand_vec_perm_d
*d
)
16429 machine_mode mmode
, vmode
= d
->vmode
;
16430 unsigned i
, nelt
= d
->nelt
;
16431 unsigned HOST_WIDE_INT mask
;
16432 rtx target
, op0
, op1
, maskop
, x
;
16433 rtx rperm
[32], vperm
;
16435 if (d
->one_operand_p
)
16437 if (TARGET_AVX512F
&& GET_MODE_SIZE (vmode
) == 64
16438 && (TARGET_AVX512BW
16439 || GET_MODE_UNIT_SIZE (vmode
) >= 4))
16441 else if (TARGET_AVX2
&& GET_MODE_SIZE (vmode
) == 32)
16443 else if (TARGET_AVX
&& (vmode
== V4DFmode
|| vmode
== V8SFmode
))
16445 else if (TARGET_SSE4_1
&& GET_MODE_SIZE (vmode
) == 16)
16450 /* This is a blend, not a permute. Elements must stay in their
16451 respective lanes. */
16452 for (i
= 0; i
< nelt
; ++i
)
16454 unsigned e
= d
->perm
[i
];
16455 if (!(e
== i
|| e
== i
+ nelt
))
16462 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
16463 decision should be extracted elsewhere, so that we only try that
16464 sequence once all budget==3 options have been tried. */
16465 target
= d
->target
;
16484 for (i
= 0; i
< nelt
; ++i
)
16485 mask
|= ((unsigned HOST_WIDE_INT
) (d
->perm
[i
] >= nelt
)) << i
;
16489 for (i
= 0; i
< 2; ++i
)
16490 mask
|= (d
->perm
[i
] >= 2 ? 15 : 0) << (i
* 4);
16495 for (i
= 0; i
< 4; ++i
)
16496 mask
|= (d
->perm
[i
] >= 4 ? 3 : 0) << (i
* 2);
16501 /* See if bytes move in pairs so we can use pblendw with
16502 an immediate argument, rather than pblendvb with a vector
16504 for (i
= 0; i
< 16; i
+= 2)
16505 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
16508 for (i
= 0; i
< nelt
; ++i
)
16509 rperm
[i
] = (d
->perm
[i
] < nelt
? const0_rtx
: constm1_rtx
);
16512 vperm
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
16513 vperm
= force_reg (vmode
, vperm
);
16515 if (GET_MODE_SIZE (vmode
) == 16)
16516 emit_insn (gen_sse4_1_pblendvb (target
, op0
, op1
, vperm
));
16518 emit_insn (gen_avx2_pblendvb (target
, op0
, op1
, vperm
));
16519 if (target
!= d
->target
)
16520 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
16524 for (i
= 0; i
< 8; ++i
)
16525 mask
|= (d
->perm
[i
* 2] >= 16) << i
;
16530 target
= gen_reg_rtx (vmode
);
16531 op0
= gen_lowpart (vmode
, op0
);
16532 op1
= gen_lowpart (vmode
, op1
);
16536 /* See if bytes move in pairs. If not, vpblendvb must be used. */
16537 for (i
= 0; i
< 32; i
+= 2)
16538 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
16540 /* See if bytes move in quadruplets. If yes, vpblendd
16541 with immediate can be used. */
16542 for (i
= 0; i
< 32; i
+= 4)
16543 if (d
->perm
[i
] + 2 != d
->perm
[i
+ 2])
16547 /* See if bytes move the same in both lanes. If yes,
16548 vpblendw with immediate can be used. */
16549 for (i
= 0; i
< 16; i
+= 2)
16550 if (d
->perm
[i
] + 16 != d
->perm
[i
+ 16])
16553 /* Use vpblendw. */
16554 for (i
= 0; i
< 16; ++i
)
16555 mask
|= (d
->perm
[i
* 2] >= 32) << i
;
16560 /* Use vpblendd. */
16561 for (i
= 0; i
< 8; ++i
)
16562 mask
|= (d
->perm
[i
* 4] >= 32) << i
;
16567 /* See if words move in pairs. If yes, vpblendd can be used. */
16568 for (i
= 0; i
< 16; i
+= 2)
16569 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
16573 /* See if words move the same in both lanes. If not,
16574 vpblendvb must be used. */
16575 for (i
= 0; i
< 8; i
++)
16576 if (d
->perm
[i
] + 8 != d
->perm
[i
+ 8])
16578 /* Use vpblendvb. */
16579 for (i
= 0; i
< 32; ++i
)
16580 rperm
[i
] = (d
->perm
[i
/ 2] < 16 ? const0_rtx
: constm1_rtx
);
16584 target
= gen_reg_rtx (vmode
);
16585 op0
= gen_lowpart (vmode
, op0
);
16586 op1
= gen_lowpart (vmode
, op1
);
16587 goto finish_pblendvb
;
16590 /* Use vpblendw. */
16591 for (i
= 0; i
< 16; ++i
)
16592 mask
|= (d
->perm
[i
] >= 16) << i
;
16596 /* Use vpblendd. */
16597 for (i
= 0; i
< 8; ++i
)
16598 mask
|= (d
->perm
[i
* 2] >= 16) << i
;
16603 /* Use vpblendd. */
16604 for (i
= 0; i
< 4; ++i
)
16605 mask
|= (d
->perm
[i
] >= 4 ? 3 : 0) << (i
* 2);
16610 gcc_unreachable ();
16633 if (mmode
!= VOIDmode
)
16634 maskop
= force_reg (mmode
, gen_int_mode (mask
, mmode
));
16636 maskop
= GEN_INT (mask
);
16638 /* This matches five different patterns with the different modes. */
16639 x
= gen_rtx_VEC_MERGE (vmode
, op1
, op0
, maskop
);
16640 x
= gen_rtx_SET (target
, x
);
16642 if (target
!= d
->target
)
16643 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of the variable form of vpermilps.

   Note that we will have already failed the immediate input vpermilps,
   which requires that the high and low part shuffle be identical; the
   variable form doesn't require that.  */

static bool
expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
{
  rtx rperm[8], vperm;
  unsigned i;

  if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
    return false;

  /* We can only permute within the 128-bit lane.  */
  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];
      if (i < 4 ? e >= 4 : e < 4)
        return false;
    }

  if (d->testing_p)
    return true;

  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];

      /* Within each 128-bit lane, the elements of op0 are numbered
         from 0 and the elements of op1 are numbered from 4.  */
      if (e >= 8 + 4)
        e -= (8 + 4);
      else if (e >= 4)
        e -= 4;

      rperm[i] = GEN_INT (e);
    }

  vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
  vperm = force_reg (V8SImode, vperm);
  emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));

  return true;
}

/* Return true if permutation D can be performed as VMODE permutation
   instead.  */

static bool
valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
{
  unsigned int i, j, chunk;

  if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
      || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
      || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
    return false;

  if (GET_MODE_NUNITS (vmode) >= d->nelt)
    return true;

  chunk = d->nelt / GET_MODE_NUNITS (vmode);
  for (i = 0; i < d->nelt; i += chunk)
    if (d->perm[i] & (chunk - 1))
      return false;
    else
      for (j = 1; j < chunk; ++j)
        if (d->perm[i] + j != d->perm[i + j])
          return false;

  return true;
}

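/* Editorial example: the V16QImode permutation { 2, 3, 0, 1, 6, 7, 4, 5, ... }
   moves whole 16-bit chunks, so it is also expressible as the V8HImode
   permutation { 1, 0, 3, 2, ... }.  This helper is what lets the pshufb-based
   paths below fall back to wider-element shuffles such as vpermq or vpermd
   when the byte permutation is really a word or dword permutation.  */
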
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.  */
16728 expand_vec_perm_pshufb (struct expand_vec_perm_d
*d
)
16730 unsigned i
, nelt
, eltsz
, mask
;
16731 unsigned char perm
[64];
16732 machine_mode vmode
= V16QImode
;
16733 rtx rperm
[64], vperm
, target
, op0
, op1
;
16737 if (!d
->one_operand_p
)
16739 if (!TARGET_XOP
|| GET_MODE_SIZE (d
->vmode
) != 16)
16742 && valid_perm_using_mode_p (V2TImode
, d
))
16747 /* Use vperm2i128 insn. The pattern uses
16748 V4DImode instead of V2TImode. */
16749 target
= d
->target
;
16750 if (d
->vmode
!= V4DImode
)
16751 target
= gen_reg_rtx (V4DImode
);
16752 op0
= gen_lowpart (V4DImode
, d
->op0
);
16753 op1
= gen_lowpart (V4DImode
, d
->op1
);
16755 = GEN_INT ((d
->perm
[0] / (nelt
/ 2))
16756 | ((d
->perm
[nelt
/ 2] / (nelt
/ 2)) * 16));
16757 emit_insn (gen_avx2_permv2ti (target
, op0
, op1
, rperm
[0]));
16758 if (target
!= d
->target
)
16759 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
16767 if (GET_MODE_SIZE (d
->vmode
) == 16)
16772 else if (GET_MODE_SIZE (d
->vmode
) == 32)
16777 /* V4DImode should be already handled through
16778 expand_vselect by vpermq instruction. */
16779 gcc_assert (d
->vmode
!= V4DImode
);
16782 if (d
->vmode
== V8SImode
16783 || d
->vmode
== V16HImode
16784 || d
->vmode
== V32QImode
)
16786 /* First see if vpermq can be used for
16787 V8SImode/V16HImode/V32QImode. */
16788 if (valid_perm_using_mode_p (V4DImode
, d
))
16790 for (i
= 0; i
< 4; i
++)
16791 perm
[i
] = (d
->perm
[i
* nelt
/ 4] * 4 / nelt
) & 3;
16794 target
= gen_reg_rtx (V4DImode
);
16795 if (expand_vselect (target
, gen_lowpart (V4DImode
, d
->op0
),
16798 emit_move_insn (d
->target
,
16799 gen_lowpart (d
->vmode
, target
));
16805 /* Next see if vpermd can be used. */
16806 if (valid_perm_using_mode_p (V8SImode
, d
))
16809 /* Or if vpermps can be used. */
16810 else if (d
->vmode
== V8SFmode
)
16813 if (vmode
== V32QImode
)
16815 /* vpshufb only works intra lanes, it is not
16816 possible to shuffle bytes in between the lanes. */
16817 for (i
= 0; i
< nelt
; ++i
)
16818 if ((d
->perm
[i
] ^ i
) & (nelt
/ 2))
16822 else if (GET_MODE_SIZE (d
->vmode
) == 64)
16824 if (!TARGET_AVX512BW
)
16827 /* If vpermq didn't work, vpshufb won't work either. */
16828 if (d
->vmode
== V8DFmode
|| d
->vmode
== V8DImode
)
16832 if (d
->vmode
== V16SImode
16833 || d
->vmode
== V32HImode
16834 || d
->vmode
== V64QImode
)
16836 /* First see if vpermq can be used for
16837 V16SImode/V32HImode/V64QImode. */
16838 if (valid_perm_using_mode_p (V8DImode
, d
))
16840 for (i
= 0; i
< 8; i
++)
16841 perm
[i
] = (d
->perm
[i
* nelt
/ 8] * 8 / nelt
) & 7;
16844 target
= gen_reg_rtx (V8DImode
);
16845 if (expand_vselect (target
, gen_lowpart (V8DImode
, d
->op0
),
16848 emit_move_insn (d
->target
,
16849 gen_lowpart (d
->vmode
, target
));
16855 /* Next see if vpermd can be used. */
16856 if (valid_perm_using_mode_p (V16SImode
, d
))
16859 /* Or if vpermps can be used. */
16860 else if (d
->vmode
== V16SFmode
)
16862 if (vmode
== V64QImode
)
16864 /* vpshufb only works intra lanes, it is not
16865 possible to shuffle bytes in between the lanes. */
16866 for (i
= 0; i
< nelt
; ++i
)
16867 if ((d
->perm
[i
] ^ i
) & (nelt
/ 4))
16878 if (vmode
== V8SImode
)
16879 for (i
= 0; i
< 8; ++i
)
16880 rperm
[i
] = GEN_INT ((d
->perm
[i
* nelt
/ 8] * 8 / nelt
) & 7);
16881 else if (vmode
== V16SImode
)
16882 for (i
= 0; i
< 16; ++i
)
16883 rperm
[i
] = GEN_INT ((d
->perm
[i
* nelt
/ 16] * 16 / nelt
) & 15);
16886 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
16887 if (!d
->one_operand_p
)
16888 mask
= 2 * nelt
- 1;
16889 else if (vmode
== V16QImode
)
16891 else if (vmode
== V64QImode
)
16892 mask
= nelt
/ 4 - 1;
16894 mask
= nelt
/ 2 - 1;
16896 for (i
= 0; i
< nelt
; ++i
)
16898 unsigned j
, e
= d
->perm
[i
] & mask
;
16899 for (j
= 0; j
< eltsz
; ++j
)
16900 rperm
[i
* eltsz
+ j
] = GEN_INT (e
* eltsz
+ j
);
16904 vperm
= gen_rtx_CONST_VECTOR (vmode
,
16905 gen_rtvec_v (GET_MODE_NUNITS (vmode
), rperm
));
16906 vperm
= force_reg (vmode
, vperm
);
16908 target
= d
->target
;
16909 if (d
->vmode
!= vmode
)
16910 target
= gen_reg_rtx (vmode
);
16911 op0
= gen_lowpart (vmode
, d
->op0
);
16912 if (d
->one_operand_p
)
16914 if (vmode
== V16QImode
)
16915 emit_insn (gen_ssse3_pshufbv16qi3 (target
, op0
, vperm
));
16916 else if (vmode
== V32QImode
)
16917 emit_insn (gen_avx2_pshufbv32qi3 (target
, op0
, vperm
));
16918 else if (vmode
== V64QImode
)
16919 emit_insn (gen_avx512bw_pshufbv64qi3 (target
, op0
, vperm
));
16920 else if (vmode
== V8SFmode
)
16921 emit_insn (gen_avx2_permvarv8sf (target
, op0
, vperm
));
16922 else if (vmode
== V8SImode
)
16923 emit_insn (gen_avx2_permvarv8si (target
, op0
, vperm
));
16924 else if (vmode
== V16SFmode
)
16925 emit_insn (gen_avx512f_permvarv16sf (target
, op0
, vperm
));
16926 else if (vmode
== V16SImode
)
16927 emit_insn (gen_avx512f_permvarv16si (target
, op0
, vperm
));
16929 gcc_unreachable ();
16933 op1
= gen_lowpart (vmode
, d
->op1
);
16934 emit_insn (gen_xop_pperm (target
, op0
, op1
, vperm
));
16936 if (target
!= d
->target
)
16937 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
/* For V*[QHS]Imode permutations, check if the same permutation
   can't be performed in a 2x, 4x or 8x wider inner mode.  */
16946 canonicalize_vector_int_perm (const struct expand_vec_perm_d
*d
,
16947 struct expand_vec_perm_d
*nd
)
16950 machine_mode mode
= VOIDmode
;
16954 case E_V16QImode
: mode
= V8HImode
; break;
16955 case E_V32QImode
: mode
= V16HImode
; break;
16956 case E_V64QImode
: mode
= V32HImode
; break;
16957 case E_V8HImode
: mode
= V4SImode
; break;
16958 case E_V16HImode
: mode
= V8SImode
; break;
16959 case E_V32HImode
: mode
= V16SImode
; break;
16960 case E_V4SImode
: mode
= V2DImode
; break;
16961 case E_V8SImode
: mode
= V4DImode
; break;
16962 case E_V16SImode
: mode
= V8DImode
; break;
16963 default: return false;
16965 for (i
= 0; i
< d
->nelt
; i
+= 2)
16966 if ((d
->perm
[i
] & 1) || d
->perm
[i
+ 1] != d
->perm
[i
] + 1)
16969 nd
->nelt
= d
->nelt
/ 2;
16970 for (i
= 0; i
< nd
->nelt
; i
++)
16971 nd
->perm
[i
] = d
->perm
[2 * i
] / 2;
16972 if (GET_MODE_INNER (mode
) != DImode
)
16973 canonicalize_vector_int_perm (nd
, nd
);
16976 nd
->one_operand_p
= d
->one_operand_p
;
16977 nd
->testing_p
= d
->testing_p
;
16978 if (d
->op0
== d
->op1
)
16979 nd
->op0
= nd
->op1
= gen_lowpart (nd
->vmode
, d
->op0
);
16982 nd
->op0
= gen_lowpart (nd
->vmode
, d
->op0
);
16983 nd
->op1
= gen_lowpart (nd
->vmode
, d
->op1
);
16986 nd
->target
= gen_raw_REG (nd
->vmode
, LAST_VIRTUAL_REGISTER
+ 1);
16988 nd
->target
= gen_reg_rtx (nd
->vmode
);
/* Try to expand one-operand permutation with constant mask.  */
16996 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d
*d
)
16998 machine_mode mode
= GET_MODE (d
->op0
);
16999 machine_mode maskmode
= mode
;
17000 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
17001 rtx target
, op0
, mask
;
17004 if (!rtx_equal_p (d
->op0
, d
->op1
))
17007 if (!TARGET_AVX512F
)
17013 gen
= gen_avx512f_permvarv16si
;
17016 gen
= gen_avx512f_permvarv16sf
;
17017 maskmode
= V16SImode
;
17020 gen
= gen_avx512f_permvarv8di
;
17023 gen
= gen_avx512f_permvarv8df
;
17024 maskmode
= V8DImode
;
17030 target
= d
->target
;
17032 for (int i
= 0; i
< d
->nelt
; ++i
)
17033 vec
[i
] = GEN_INT (d
->perm
[i
]);
17034 mask
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (d
->nelt
, vec
));
17035 emit_insn (gen (target
, op0
, force_reg (maskmode
, mask
)));
static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to instantiate D
   in a single instruction.  */
17045 expand_vec_perm_1 (struct expand_vec_perm_d
*d
)
17047 unsigned i
, nelt
= d
->nelt
;
17048 struct expand_vec_perm_d nd
;
17050 /* Check plain VEC_SELECT first, because AVX has instructions that could
17051 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
17052 input where SEL+CONCAT may not. */
17053 if (d
->one_operand_p
)
17055 int mask
= nelt
- 1;
17056 bool identity_perm
= true;
17057 bool broadcast_perm
= true;
17059 for (i
= 0; i
< nelt
; i
++)
17061 nd
.perm
[i
] = d
->perm
[i
] & mask
;
17062 if (nd
.perm
[i
] != i
)
17063 identity_perm
= false;
17065 broadcast_perm
= false;
17071 emit_move_insn (d
->target
, d
->op0
);
17074 else if (broadcast_perm
&& TARGET_AVX2
)
17076 /* Use vpbroadcast{b,w,d}. */
17077 rtx (*gen
) (rtx
, rtx
) = NULL
;
17081 if (TARGET_AVX512BW
)
17082 gen
= gen_avx512bw_vec_dupv64qi_1
;
17085 gen
= gen_avx2_pbroadcastv32qi_1
;
17088 if (TARGET_AVX512BW
)
17089 gen
= gen_avx512bw_vec_dupv32hi_1
;
17092 gen
= gen_avx2_pbroadcastv16hi_1
;
17095 if (TARGET_AVX512F
)
17096 gen
= gen_avx512f_vec_dupv16si_1
;
17099 gen
= gen_avx2_pbroadcastv8si_1
;
17102 gen
= gen_avx2_pbroadcastv16qi
;
17105 gen
= gen_avx2_pbroadcastv8hi
;
17108 if (TARGET_AVX512F
)
17109 gen
= gen_avx512f_vec_dupv16sf_1
;
17112 gen
= gen_avx2_vec_dupv8sf_1
;
17115 if (TARGET_AVX512F
)
17116 gen
= gen_avx512f_vec_dupv8df_1
;
17119 if (TARGET_AVX512F
)
17120 gen
= gen_avx512f_vec_dupv8di_1
;
17122 /* For other modes prefer other shuffles this function creates. */
17128 emit_insn (gen (d
->target
, d
->op0
));
17133 if (expand_vselect (d
->target
, d
->op0
, nd
.perm
, nelt
, d
->testing_p
))
17136 /* There are plenty of patterns in sse.md that are written for
17137 SEL+CONCAT and are not replicated for a single op. Perhaps
17138 that should be changed, to avoid the nastiness here. */
17140 /* Recognize interleave style patterns, which means incrementing
17141 every other permutation operand. */
17142 for (i
= 0; i
< nelt
; i
+= 2)
17144 nd
.perm
[i
] = d
->perm
[i
] & mask
;
17145 nd
.perm
[i
+ 1] = (d
->perm
[i
+ 1] & mask
) + nelt
;
17147 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op0
, nd
.perm
, nelt
,
17151 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
17154 for (i
= 0; i
< nelt
; i
+= 4)
17156 nd
.perm
[i
+ 0] = d
->perm
[i
+ 0] & mask
;
17157 nd
.perm
[i
+ 1] = d
->perm
[i
+ 1] & mask
;
17158 nd
.perm
[i
+ 2] = (d
->perm
[i
+ 2] & mask
) + nelt
;
17159 nd
.perm
[i
+ 3] = (d
->perm
[i
+ 3] & mask
) + nelt
;
17162 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op0
, nd
.perm
, nelt
,
17168 /* Try movss/movsd instructions. */
17169 if (expand_vec_perm_movs (d
))
17172 /* Finally, try the fully general two operand permute. */
17173 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op1
, d
->perm
, nelt
,
17177 /* Recognize interleave style patterns with reversed operands. */
17178 if (!d
->one_operand_p
)
17180 for (i
= 0; i
< nelt
; ++i
)
17182 unsigned e
= d
->perm
[i
];
17190 if (expand_vselect_vconcat (d
->target
, d
->op1
, d
->op0
, nd
.perm
, nelt
,
17195 /* Try the SSE4.1 blend variable merge instructions. */
17196 if (expand_vec_perm_blend (d
))
17199 /* Try one of the AVX vpermil variable permutations. */
17200 if (expand_vec_perm_vpermil (d
))
17203 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
17204 vpshufb, vpermd, vpermps or vpermq variable permutation. */
17205 if (expand_vec_perm_pshufb (d
))
17208 /* Try the AVX2 vpalignr instruction. */
17209 if (expand_vec_perm_palignr (d
, true))
17212 /* Try the AVX512F vperm{s,d} instructions. */
17213 if (ix86_expand_vec_one_operand_perm_avx512 (d
))
17216 /* Try the AVX512F vpermt2/vpermi2 instructions. */
17217 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX
, NULL_RTX
, NULL_RTX
, NULL_RTX
, d
))
17220 /* See if we can get the same permutation in different vector integer
17222 if (canonicalize_vector_int_perm (d
, &nd
) && expand_vec_perm_1 (&nd
))
17225 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, nd
.target
));
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of a pair of pshuflw + pshufhw instructions.  */
17235 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d
*d
)
17237 unsigned char perm2
[MAX_VECT_LEN
];
17241 if (d
->vmode
!= V8HImode
|| !d
->one_operand_p
)
17244 /* The two permutations only operate in 64-bit lanes. */
17245 for (i
= 0; i
< 4; ++i
)
17246 if (d
->perm
[i
] >= 4)
17248 for (i
= 4; i
< 8; ++i
)
17249 if (d
->perm
[i
] < 4)
17255 /* Emit the pshuflw. */
17256 memcpy (perm2
, d
->perm
, 4);
17257 for (i
= 4; i
< 8; ++i
)
17259 ok
= expand_vselect (d
->target
, d
->op0
, perm2
, 8, d
->testing_p
);
17262 /* Emit the pshufhw. */
17263 memcpy (perm2
+ 4, d
->perm
+ 4, 4);
17264 for (i
= 0; i
< 4; ++i
)
17266 ok
= expand_vselect (d
->target
, d
->target
, perm2
, 8, d
->testing_p
);
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSSE3 palignr instruction.  This succeeds
   when all of the elements in PERM fit within one vector and we merely
   need to shift them down so that a single vector permutation has a
   chance to succeed.  If SINGLE_INSN_ONLY_P, succeed if only
   the vpalignr instruction itself can perform the requested permutation.  */
17280 expand_vec_perm_palignr (struct expand_vec_perm_d
*d
, bool single_insn_only_p
)
17282 unsigned i
, nelt
= d
->nelt
;
17283 unsigned min
, max
, minswap
, maxswap
;
17284 bool in_order
, ok
, swap
= false;
17286 struct expand_vec_perm_d dcopy
;
17288 /* Even with AVX, palignr only operates on 128-bit vectors,
17289 in AVX2 palignr operates on both 128-bit lanes. */
17290 if ((!TARGET_SSSE3
|| GET_MODE_SIZE (d
->vmode
) != 16)
17291 && (!TARGET_AVX2
|| GET_MODE_SIZE (d
->vmode
) != 32))
17296 minswap
= 2 * nelt
;
17298 for (i
= 0; i
< nelt
; ++i
)
17300 unsigned e
= d
->perm
[i
];
17301 unsigned eswap
= d
->perm
[i
] ^ nelt
;
17302 if (GET_MODE_SIZE (d
->vmode
) == 32)
17304 e
= (e
& ((nelt
/ 2) - 1)) | ((e
& nelt
) >> 1);
17305 eswap
= e
^ (nelt
/ 2);
17311 if (eswap
< minswap
)
17313 if (eswap
> maxswap
)
17317 || max
- min
>= (GET_MODE_SIZE (d
->vmode
) == 32 ? nelt
/ 2 : nelt
))
17319 if (d
->one_operand_p
17321 || maxswap
- minswap
>= (GET_MODE_SIZE (d
->vmode
) == 32
17322 ? nelt
/ 2 : nelt
))
17329 /* Given that we have SSSE3, we know we'll be able to implement the
17330 single operand permutation after the palignr with pshufb for
17331 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
17333 if (d
->testing_p
&& GET_MODE_SIZE (d
->vmode
) == 16 && !single_insn_only_p
)
17339 dcopy
.op0
= d
->op1
;
17340 dcopy
.op1
= d
->op0
;
17341 for (i
= 0; i
< nelt
; ++i
)
17342 dcopy
.perm
[i
] ^= nelt
;
17346 for (i
= 0; i
< nelt
; ++i
)
17348 unsigned e
= dcopy
.perm
[i
];
17349 if (GET_MODE_SIZE (d
->vmode
) == 32
17351 && (e
& (nelt
/ 2 - 1)) < min
)
17352 e
= e
- min
- (nelt
/ 2);
17359 dcopy
.one_operand_p
= true;
17361 if (single_insn_only_p
&& !in_order
)
17364 /* For AVX2, test whether we can permute the result in one instruction. */
17369 dcopy
.op1
= dcopy
.op0
;
17370 return expand_vec_perm_1 (&dcopy
);
17373 shift
= GEN_INT (min
* GET_MODE_UNIT_BITSIZE (d
->vmode
));
17374 if (GET_MODE_SIZE (d
->vmode
) == 16)
17376 target
= gen_reg_rtx (TImode
);
17377 emit_insn (gen_ssse3_palignrti (target
, gen_lowpart (TImode
, dcopy
.op1
),
17378 gen_lowpart (TImode
, dcopy
.op0
), shift
));
17382 target
= gen_reg_rtx (V2TImode
);
17383 emit_insn (gen_avx2_palignrv2ti (target
,
17384 gen_lowpart (V2TImode
, dcopy
.op1
),
17385 gen_lowpart (V2TImode
, dcopy
.op0
),
17389 dcopy
.op0
= dcopy
.op1
= gen_lowpart (d
->vmode
, target
);
17391 /* Test for the degenerate case where the alignment by itself
17392 produces the desired permutation. */
17395 emit_move_insn (d
->target
, dcopy
.op0
);
17399 ok
= expand_vec_perm_1 (&dcopy
);
17400 gcc_assert (ok
|| GET_MODE_SIZE (d
->vmode
) == 32);
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSE4_1 pblendv instruction.  Potentially
   reduces permutation from 2 pshufb and or to 1 pshufb and pblendv.  */
17410 expand_vec_perm_pblendv (struct expand_vec_perm_d
*d
)
17412 unsigned i
, which
, nelt
= d
->nelt
;
17413 struct expand_vec_perm_d dcopy
, dcopy1
;
17414 machine_mode vmode
= d
->vmode
;
17417 /* Use the same checks as in expand_vec_perm_blend. */
17418 if (d
->one_operand_p
)
17420 if (TARGET_AVX2
&& GET_MODE_SIZE (vmode
) == 32)
17422 else if (TARGET_AVX
&& (vmode
== V4DFmode
|| vmode
== V8SFmode
))
17424 else if (TARGET_SSE4_1
&& GET_MODE_SIZE (vmode
) == 16)
17429 /* Figure out where permutation elements stay not in their
17430 respective lanes. */
17431 for (i
= 0, which
= 0; i
< nelt
; ++i
)
17433 unsigned e
= d
->perm
[i
];
17435 which
|= (e
< nelt
? 1 : 2);
17437 /* We can pblend the part where elements stay not in their
17438 respective lanes only when these elements are all in one
17439 half of a permutation.
17440 {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective
17441 lanes, but both 8 and 9 >= 8
17442 {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
17443 respective lanes and 8 >= 8, but 2 not. */
17444 if (which
!= 1 && which
!= 2)
17446 if (d
->testing_p
&& GET_MODE_SIZE (vmode
) == 16)
17449 /* First we apply one operand permutation to the part where
17450 elements stay not in their respective lanes. */
17453 dcopy
.op0
= dcopy
.op1
= d
->op1
;
17455 dcopy
.op0
= dcopy
.op1
= d
->op0
;
17457 dcopy
.target
= gen_reg_rtx (vmode
);
17458 dcopy
.one_operand_p
= true;
17460 for (i
= 0; i
< nelt
; ++i
)
17461 dcopy
.perm
[i
] = d
->perm
[i
] & (nelt
- 1);
17463 ok
= expand_vec_perm_1 (&dcopy
);
17464 if (GET_MODE_SIZE (vmode
) != 16 && !ok
)
17471 /* Next we put permuted elements into their positions. */
17474 dcopy1
.op1
= dcopy
.target
;
17476 dcopy1
.op0
= dcopy
.target
;
17478 for (i
= 0; i
< nelt
; ++i
)
17479 dcopy1
.perm
[i
] = ((d
->perm
[i
] >= nelt
) ? (nelt
+ i
) : i
);
17481 ok
= expand_vec_perm_blend (&dcopy1
);
static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a two vector permutation into a single vector permutation by using
   an interleave operation to merge the vectors.  */
17494 expand_vec_perm_interleave2 (struct expand_vec_perm_d
*d
)
17496 struct expand_vec_perm_d dremap
, dfinal
;
17497 unsigned i
, nelt
= d
->nelt
, nelt2
= nelt
/ 2;
17498 unsigned HOST_WIDE_INT contents
;
17499 unsigned char remap
[2 * MAX_VECT_LEN
];
17501 bool ok
, same_halves
= false;
17503 if (GET_MODE_SIZE (d
->vmode
) == 16)
17505 if (d
->one_operand_p
)
17508 else if (GET_MODE_SIZE (d
->vmode
) == 32)
17512 /* For 32-byte modes allow even d->one_operand_p.
17513 The lack of cross-lane shuffling in some instructions
17514 might prevent a single insn shuffle. */
17516 dfinal
.testing_p
= true;
17517 /* If expand_vec_perm_interleave3 can expand this into
17518 a 3 insn sequence, give up and let it be expanded as
17519 3 insn sequence. While that is one insn longer,
17520 it doesn't need a memory operand and in the common
17521 case that both interleave low and high permutations
17522 with the same operands are adjacent needs 4 insns
17523 for both after CSE. */
17524 if (expand_vec_perm_interleave3 (&dfinal
))
17530 /* Examine from whence the elements come. */
17532 for (i
= 0; i
< nelt
; ++i
)
17533 contents
|= HOST_WIDE_INT_1U
<< d
->perm
[i
];
17535 memset (remap
, 0xff, sizeof (remap
));
17538 if (GET_MODE_SIZE (d
->vmode
) == 16)
17540 unsigned HOST_WIDE_INT h1
, h2
, h3
, h4
;
17542 /* Split the two input vectors into 4 halves. */
17543 h1
= (HOST_WIDE_INT_1U
<< nelt2
) - 1;
17548 /* If the elements from the low halves use interleave low, and similarly
17549 for interleave high. If the elements are from mis-matched halves, we
17550 can use shufps for V4SF/V4SI or do a DImode shuffle. */
17551 if ((contents
& (h1
| h3
)) == contents
)
17554 for (i
= 0; i
< nelt2
; ++i
)
17557 remap
[i
+ nelt
] = i
* 2 + 1;
17558 dremap
.perm
[i
* 2] = i
;
17559 dremap
.perm
[i
* 2 + 1] = i
+ nelt
;
17561 if (!TARGET_SSE2
&& d
->vmode
== V4SImode
)
17562 dremap
.vmode
= V4SFmode
;
17564 else if ((contents
& (h2
| h4
)) == contents
)
17567 for (i
= 0; i
< nelt2
; ++i
)
17569 remap
[i
+ nelt2
] = i
* 2;
17570 remap
[i
+ nelt
+ nelt2
] = i
* 2 + 1;
17571 dremap
.perm
[i
* 2] = i
+ nelt2
;
17572 dremap
.perm
[i
* 2 + 1] = i
+ nelt
+ nelt2
;
17574 if (!TARGET_SSE2
&& d
->vmode
== V4SImode
)
17575 dremap
.vmode
= V4SFmode
;
17577 else if ((contents
& (h1
| h4
)) == contents
)
17580 for (i
= 0; i
< nelt2
; ++i
)
17583 remap
[i
+ nelt
+ nelt2
] = i
+ nelt2
;
17584 dremap
.perm
[i
] = i
;
17585 dremap
.perm
[i
+ nelt2
] = i
+ nelt
+ nelt2
;
17590 dremap
.vmode
= V2DImode
;
17592 dremap
.perm
[0] = 0;
17593 dremap
.perm
[1] = 3;
17596 else if ((contents
& (h2
| h3
)) == contents
)
17599 for (i
= 0; i
< nelt2
; ++i
)
17601 remap
[i
+ nelt2
] = i
;
17602 remap
[i
+ nelt
] = i
+ nelt2
;
17603 dremap
.perm
[i
] = i
+ nelt2
;
17604 dremap
.perm
[i
+ nelt2
] = i
+ nelt
;
17609 dremap
.vmode
= V2DImode
;
17611 dremap
.perm
[0] = 1;
17612 dremap
.perm
[1] = 2;
17620 unsigned int nelt4
= nelt
/ 4, nzcnt
= 0;
17621 unsigned HOST_WIDE_INT q
[8];
17622 unsigned int nonzero_halves
[4];
17624 /* Split the two input vectors into 8 quarters. */
17625 q
[0] = (HOST_WIDE_INT_1U
<< nelt4
) - 1;
17626 for (i
= 1; i
< 8; ++i
)
17627 q
[i
] = q
[0] << (nelt4
* i
);
17628 for (i
= 0; i
< 4; ++i
)
17629 if (((q
[2 * i
] | q
[2 * i
+ 1]) & contents
) != 0)
17631 nonzero_halves
[nzcnt
] = i
;
17637 gcc_assert (d
->one_operand_p
);
17638 nonzero_halves
[1] = nonzero_halves
[0];
17639 same_halves
= true;
17641 else if (d
->one_operand_p
)
17643 gcc_assert (nonzero_halves
[0] == 0);
17644 gcc_assert (nonzero_halves
[1] == 1);
17649 if (d
->perm
[0] / nelt2
== nonzero_halves
[1])
17651 /* Attempt to increase the likelihood that dfinal
17652 shuffle will be intra-lane. */
17653 std::swap (nonzero_halves
[0], nonzero_halves
[1]);
17656 /* vperm2f128 or vperm2i128. */
17657 for (i
= 0; i
< nelt2
; ++i
)
17659 remap
[i
+ nonzero_halves
[1] * nelt2
] = i
+ nelt2
;
17660 remap
[i
+ nonzero_halves
[0] * nelt2
] = i
;
17661 dremap
.perm
[i
+ nelt2
] = i
+ nonzero_halves
[1] * nelt2
;
17662 dremap
.perm
[i
] = i
+ nonzero_halves
[0] * nelt2
;
17665 if (d
->vmode
!= V8SFmode
17666 && d
->vmode
!= V4DFmode
17667 && d
->vmode
!= V8SImode
)
17669 dremap
.vmode
= V8SImode
;
17671 for (i
= 0; i
< 4; ++i
)
17673 dremap
.perm
[i
] = i
+ nonzero_halves
[0] * 4;
17674 dremap
.perm
[i
+ 4] = i
+ nonzero_halves
[1] * 4;
17678 else if (d
->one_operand_p
)
17680 else if (TARGET_AVX2
17681 && (contents
& (q
[0] | q
[2] | q
[4] | q
[6])) == contents
)
17684 for (i
= 0; i
< nelt4
; ++i
)
17687 remap
[i
+ nelt
] = i
* 2 + 1;
17688 remap
[i
+ nelt2
] = i
* 2 + nelt2
;
17689 remap
[i
+ nelt
+ nelt2
] = i
* 2 + nelt2
+ 1;
17690 dremap
.perm
[i
* 2] = i
;
17691 dremap
.perm
[i
* 2 + 1] = i
+ nelt
;
17692 dremap
.perm
[i
* 2 + nelt2
] = i
+ nelt2
;
17693 dremap
.perm
[i
* 2 + nelt2
+ 1] = i
+ nelt
+ nelt2
;
17696 else if (TARGET_AVX2
17697 && (contents
& (q
[1] | q
[3] | q
[5] | q
[7])) == contents
)
17700 for (i
= 0; i
< nelt4
; ++i
)
17702 remap
[i
+ nelt4
] = i
* 2;
17703 remap
[i
+ nelt
+ nelt4
] = i
* 2 + 1;
17704 remap
[i
+ nelt2
+ nelt4
] = i
* 2 + nelt2
;
17705 remap
[i
+ nelt
+ nelt2
+ nelt4
] = i
* 2 + nelt2
+ 1;
17706 dremap
.perm
[i
* 2] = i
+ nelt4
;
17707 dremap
.perm
[i
* 2 + 1] = i
+ nelt
+ nelt4
;
17708 dremap
.perm
[i
* 2 + nelt2
] = i
+ nelt2
+ nelt4
;
17709 dremap
.perm
[i
* 2 + nelt2
+ 1] = i
+ nelt
+ nelt2
+ nelt4
;
17716 /* Use the remapping array set up above to move the elements from their
17717 swizzled locations into their final destinations. */
17719 for (i
= 0; i
< nelt
; ++i
)
17721 unsigned e
= remap
[d
->perm
[i
]];
17722 gcc_assert (e
< nelt
);
17723 /* If same_halves is true, both halves of the remapped vector are the
17724 same. Avoid cross-lane accesses if possible. */
17725 if (same_halves
&& i
>= nelt2
)
17727 gcc_assert (e
< nelt2
);
17728 dfinal
.perm
[i
] = e
+ nelt2
;
17731 dfinal
.perm
[i
] = e
;
17735 dremap
.target
= gen_reg_rtx (dremap
.vmode
);
17736 dfinal
.op0
= gen_lowpart (dfinal
.vmode
, dremap
.target
);
17738 dfinal
.op1
= dfinal
.op0
;
17739 dfinal
.one_operand_p
= true;
17741 /* Test if the final remap can be done with a single insn. For V4SFmode or
17742 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
17744 ok
= expand_vec_perm_1 (&dfinal
);
17745 seq
= get_insns ();
17754 if (dremap
.vmode
!= dfinal
.vmode
)
17756 dremap
.op0
= gen_lowpart (dremap
.vmode
, dremap
.op0
);
17757 dremap
.op1
= gen_lowpart (dremap
.vmode
, dremap
.op1
);
17760 ok
= expand_vec_perm_1 (&dremap
);
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a single vector cross-lane permutation into vpermq followed
   by any of the single insn permutations.  */
17772 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d
*d
)
17774 struct expand_vec_perm_d dremap
, dfinal
;
17775 unsigned i
, j
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, nelt4
= nelt
/ 4;
17776 unsigned contents
[2];
17780 && (d
->vmode
== V32QImode
|| d
->vmode
== V16HImode
)
17781 && d
->one_operand_p
))
17786 for (i
= 0; i
< nelt2
; ++i
)
17788 contents
[0] |= 1u << (d
->perm
[i
] / nelt4
);
17789 contents
[1] |= 1u << (d
->perm
[i
+ nelt2
] / nelt4
);
17792 for (i
= 0; i
< 2; ++i
)
17794 unsigned int cnt
= 0;
17795 for (j
= 0; j
< 4; ++j
)
17796 if ((contents
[i
] & (1u << j
)) != 0 && ++cnt
> 2)
17804 dremap
.vmode
= V4DImode
;
17806 dremap
.target
= gen_reg_rtx (V4DImode
);
17807 dremap
.op0
= gen_lowpart (V4DImode
, d
->op0
);
17808 dremap
.op1
= dremap
.op0
;
17809 dremap
.one_operand_p
= true;
17810 for (i
= 0; i
< 2; ++i
)
17812 unsigned int cnt
= 0;
17813 for (j
= 0; j
< 4; ++j
)
17814 if ((contents
[i
] & (1u << j
)) != 0)
17815 dremap
.perm
[2 * i
+ cnt
++] = j
;
17816 for (; cnt
< 2; ++cnt
)
17817 dremap
.perm
[2 * i
+ cnt
] = 0;
17821 dfinal
.op0
= gen_lowpart (dfinal
.vmode
, dremap
.target
);
17822 dfinal
.op1
= dfinal
.op0
;
17823 dfinal
.one_operand_p
= true;
17824 for (i
= 0, j
= 0; i
< nelt
; ++i
)
17828 dfinal
.perm
[i
] = (d
->perm
[i
] & (nelt4
- 1)) | (j
? nelt2
: 0);
17829 if ((d
->perm
[i
] / nelt4
) == dremap
.perm
[j
])
17831 else if ((d
->perm
[i
] / nelt4
) == dremap
.perm
[j
+ 1])
17832 dfinal
.perm
[i
] |= nelt4
;
17834 gcc_unreachable ();
17837 ok
= expand_vec_perm_1 (&dremap
);
17840 ok
= expand_vec_perm_1 (&dfinal
);
static bool canonicalize_perm (struct expand_vec_perm_d *d);
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to expand
   a vector permutation using two instructions, vperm2f128 resp.
   vperm2i128 followed by any single in-lane permutation.  */
17853 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d
*d
)
17855 struct expand_vec_perm_d dfirst
, dsecond
;
17856 unsigned i
, j
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, perm
;
17860 || GET_MODE_SIZE (d
->vmode
) != 32
17861 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
&& !TARGET_AVX2
))
17865 dsecond
.one_operand_p
= false;
17866 dsecond
.testing_p
= true;
17868 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
17869 immediate. For perm < 16 the second permutation uses
17870 d->op0 as first operand, for perm >= 16 it uses d->op1
17871 as first operand. The second operand is the result of
17873 for (perm
= 0; perm
< 32; perm
++)
17875 /* Ignore permutations which do not move anything cross-lane. */
17878 /* The second shuffle for e.g. V4DFmode has
17879 0123 and ABCD operands.
17880 Ignore AB23, as 23 is already in the second lane
17881 of the first operand. */
17882 if ((perm
& 0xc) == (1 << 2)) continue;
17883 /* And 01CD, as 01 is in the first lane of the first
17885 if ((perm
& 3) == 0) continue;
17886 /* And 4567, as then the vperm2[fi]128 doesn't change
17887 anything on the original 4567 second operand. */
17888 if ((perm
& 0xf) == ((3 << 2) | 2)) continue;
17892 /* The second shuffle for e.g. V4DFmode has
17893 4567 and ABCD operands.
17894 Ignore AB67, as 67 is already in the second lane
17895 of the first operand. */
17896 if ((perm
& 0xc) == (3 << 2)) continue;
17897 /* And 45CD, as 45 is in the first lane of the first
17899 if ((perm
& 3) == 2) continue;
17900 /* And 0123, as then the vperm2[fi]128 doesn't change
17901 anything on the original 0123 first operand. */
17902 if ((perm
& 0xf) == (1 << 2)) continue;
17905 for (i
= 0; i
< nelt
; i
++)
17907 j
= d
->perm
[i
] / nelt2
;
17908 if (j
== ((perm
>> (2 * (i
>= nelt2
))) & 3))
17909 dsecond
.perm
[i
] = nelt
+ (i
& nelt2
) + (d
->perm
[i
] & (nelt2
- 1));
17910 else if (j
== (unsigned) (i
>= nelt2
) + 2 * (perm
>= 16))
17911 dsecond
.perm
[i
] = d
->perm
[i
] & (nelt
- 1);
17919 ok
= expand_vec_perm_1 (&dsecond
);
17930 /* Found a usable second shuffle. dfirst will be
17931 vperm2f128 on d->op0 and d->op1. */
17932 dsecond
.testing_p
= false;
17934 dfirst
.target
= gen_reg_rtx (d
->vmode
);
17935 for (i
= 0; i
< nelt
; i
++)
17936 dfirst
.perm
[i
] = (i
& (nelt2
- 1))
17937 + ((perm
>> (2 * (i
>= nelt2
))) & 3) * nelt2
;
17939 canonicalize_perm (&dfirst
);
17940 ok
= expand_vec_perm_1 (&dfirst
);
17943 /* And dsecond is some single insn shuffle, taking
17944 d->op0 and result of vperm2f128 (if perm < 16) or
17945 d->op1 and result of vperm2f128 (otherwise). */
17947 dsecond
.op0
= dsecond
.op1
;
17948 dsecond
.op1
= dfirst
.target
;
17950 ok
= expand_vec_perm_1 (&dsecond
);
17956 /* For one operand, the only useful vperm2f128 permutation is 0x01
17958 if (d
->one_operand_p
)
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a two vector permutation using 2 intra-lane interleave insns
   and cross-lane shuffle for 32-byte vectors.  */
17970 expand_vec_perm_interleave3 (struct expand_vec_perm_d
*d
)
17973 rtx (*gen
) (rtx
, rtx
, rtx
);
17975 if (d
->one_operand_p
)
17977 if (TARGET_AVX2
&& GET_MODE_SIZE (d
->vmode
) == 32)
17979 else if (TARGET_AVX
&& (d
->vmode
== V8SFmode
|| d
->vmode
== V4DFmode
))
17985 if (d
->perm
[0] != 0 && d
->perm
[0] != nelt
/ 2)
17987 for (i
= 0; i
< nelt
; i
+= 2)
17988 if (d
->perm
[i
] != d
->perm
[0] + i
/ 2
17989 || d
->perm
[i
+ 1] != d
->perm
[0] + i
/ 2 + nelt
)
17999 gen
= gen_vec_interleave_highv32qi
;
18001 gen
= gen_vec_interleave_lowv32qi
;
18005 gen
= gen_vec_interleave_highv16hi
;
18007 gen
= gen_vec_interleave_lowv16hi
;
18011 gen
= gen_vec_interleave_highv8si
;
18013 gen
= gen_vec_interleave_lowv8si
;
18017 gen
= gen_vec_interleave_highv4di
;
18019 gen
= gen_vec_interleave_lowv4di
;
18023 gen
= gen_vec_interleave_highv8sf
;
18025 gen
= gen_vec_interleave_lowv8sf
;
18029 gen
= gen_vec_interleave_highv4df
;
18031 gen
= gen_vec_interleave_lowv4df
;
18034 gcc_unreachable ();
18037 emit_insn (gen (d
->target
, d
->op0
, d
->op1
));
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a single vector permutation using a single intra-lane vector
   permutation, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */
18047 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d
*d
)
18049 struct expand_vec_perm_d dfirst
, dsecond
;
18050 unsigned i
, j
, msk
, nelt
= d
->nelt
, nelt2
= nelt
/ 2;
18053 rtx (*blend
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
18057 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
)
18058 || !d
->one_operand_p
)
18062 for (i
= 0; i
< nelt
; i
++)
18063 dfirst
.perm
[i
] = 0xff;
18064 for (i
= 0, msk
= 0; i
< nelt
; i
++)
18066 j
= (d
->perm
[i
] & nelt2
) ? i
| nelt2
: i
& ~nelt2
;
18067 if (dfirst
.perm
[j
] != 0xff && dfirst
.perm
[j
] != d
->perm
[i
])
18069 dfirst
.perm
[j
] = d
->perm
[i
];
18073 for (i
= 0; i
< nelt
; i
++)
18074 if (dfirst
.perm
[i
] == 0xff)
18075 dfirst
.perm
[i
] = i
;
18078 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
18081 ok
= expand_vec_perm_1 (&dfirst
);
18082 seq
= get_insns ();
18094 dsecond
.op0
= dfirst
.target
;
18095 dsecond
.op1
= dfirst
.target
;
18096 dsecond
.one_operand_p
= true;
18097 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
18098 for (i
= 0; i
< nelt
; i
++)
18099 dsecond
.perm
[i
] = i
^ nelt2
;
18101 ok
= expand_vec_perm_1 (&dsecond
);
18104 blend
= d
->vmode
== V8SFmode
? gen_avx_blendps256
: gen_avx_blendpd256
;
18105 emit_insn (blend (d
->target
, dfirst
.target
, dsecond
.target
, GEN_INT (msk
)));
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
   permutation using two vperm2f128, followed by a vshufpd insn blending
   the two vectors together.  */
18114 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d
*d
)
18116 struct expand_vec_perm_d dfirst
, dsecond
, dthird
;
18119 if (!TARGET_AVX
|| (d
->vmode
!= V4DFmode
))
18129 dfirst
.perm
[0] = (d
->perm
[0] & ~1);
18130 dfirst
.perm
[1] = (d
->perm
[0] & ~1) + 1;
18131 dfirst
.perm
[2] = (d
->perm
[2] & ~1);
18132 dfirst
.perm
[3] = (d
->perm
[2] & ~1) + 1;
18133 dsecond
.perm
[0] = (d
->perm
[1] & ~1);
18134 dsecond
.perm
[1] = (d
->perm
[1] & ~1) + 1;
18135 dsecond
.perm
[2] = (d
->perm
[3] & ~1);
18136 dsecond
.perm
[3] = (d
->perm
[3] & ~1) + 1;
18137 dthird
.perm
[0] = (d
->perm
[0] % 2);
18138 dthird
.perm
[1] = (d
->perm
[1] % 2) + 4;
18139 dthird
.perm
[2] = (d
->perm
[2] % 2) + 2;
18140 dthird
.perm
[3] = (d
->perm
[3] % 2) + 6;
18142 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
18143 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
18144 dthird
.op0
= dfirst
.target
;
18145 dthird
.op1
= dsecond
.target
;
18146 dthird
.one_operand_p
= false;
18148 canonicalize_perm (&dfirst
);
18149 canonicalize_perm (&dsecond
);
18151 ok
= expand_vec_perm_1 (&dfirst
)
18152 && expand_vec_perm_1 (&dsecond
)
18153 && expand_vec_perm_1 (&dthird
);
static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a two vector permutation using two intra-lane vector
   permutations, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */
18168 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d
*d
)
18170 struct expand_vec_perm_d dfirst
, dsecond
, dthird
;
18171 unsigned i
, j
, msk
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, which1
= 0, which2
= 0;
18172 rtx_insn
*seq1
, *seq2
;
18174 rtx (*blend
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
18178 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
)
18179 || d
->one_operand_p
)
18184 for (i
= 0; i
< nelt
; i
++)
18186 dfirst
.perm
[i
] = 0xff;
18187 dsecond
.perm
[i
] = 0xff;
18189 for (i
= 0, msk
= 0; i
< nelt
; i
++)
18191 j
= (d
->perm
[i
] & nelt2
) ? i
| nelt2
: i
& ~nelt2
;
18194 dfirst
.perm
[j
] = d
->perm
[i
];
18195 which1
|= (d
->perm
[i
] < nelt
? 1 : 2);
18199 dsecond
.perm
[j
] = d
->perm
[i
];
18200 which2
|= (d
->perm
[i
] < nelt
? 1 : 2);
18204 if (msk
== 0 || msk
== (1U << nelt
) - 1)
18209 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
18210 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
18213 for (i
= 0; i
< nelt
; i
++)
18215 if (dfirst
.perm
[i
] == 0xff)
18216 dfirst
.perm
[i
] = (which1
== 2 ? i
+ nelt
: i
);
18217 if (dsecond
.perm
[i
] == 0xff)
18218 dsecond
.perm
[i
] = (which2
== 2 ? i
+ nelt
: i
);
18220 canonicalize_perm (&dfirst
);
18222 ok
= ix86_expand_vec_perm_const_1 (&dfirst
);
18223 seq1
= get_insns ();
18229 canonicalize_perm (&dsecond
);
18231 ok
= ix86_expand_vec_perm_const_1 (&dsecond
);
18232 seq2
= get_insns ();
18245 dthird
.op0
= dsecond
.target
;
18246 dthird
.op1
= dsecond
.target
;
18247 dthird
.one_operand_p
= true;
18248 dthird
.target
= gen_reg_rtx (dthird
.vmode
);
18249 for (i
= 0; i
< nelt
; i
++)
18250 dthird
.perm
[i
] = i
^ nelt2
;
18252 ok
= expand_vec_perm_1 (&dthird
);
18255 blend
= d
->vmode
== V8SFmode
? gen_avx_blendps256
: gen_avx_blendpd256
;
18256 emit_insn (blend (d
->target
, dfirst
.target
, dthird
.target
, GEN_INT (msk
)));
/* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
   permutation with two pshufb insns and an ior.  We should have already
   failed all two instruction sequences.  */

static bool
expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
{
  rtx rperm[2][16], vperm, l, h, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
    return false;
  gcc_assert (!d->one_operand_p);

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the given vector it is shuffled into the proper lane.  If the required
     element is in the other vector, force a zero into the lane by setting
     bit 7 in the permutation mask.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i];
      unsigned which = (e >= nelt);
      if (which)
	e -= nelt;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
	  rperm[1-which][i*eltsz + j] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
  vperm = force_reg (V16QImode, vperm);

  l = gen_reg_rtx (V16QImode);
  op = gen_lowpart (V16QImode, d->op0);
  emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
  vperm = force_reg (V16QImode, vperm);

  h = gen_reg_rtx (V16QImode);
  op = gen_lowpart (V16QImode, d->op1);
  emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));

  op = d->target;
  if (d->vmode != V16QImode)
    op = gen_reg_rtx (V16QImode);
  emit_insn (gen_iorv16qi3 (op, l, h));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
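/* Illustrative sketch, not part of the compiler: a scalar model of the
   two-mask pshufb + por trick used by expand_vec_perm_pshufb2 above,
   assuming 16 byte elements.  The function and parameter names below are
   hypothetical and exist only to demonstrate the mask construction.  */
#if 0
static void
model_pshufb2 (const unsigned char op0[16], const unsigned char op1[16],
	       const unsigned char sel[16] /* values 0..31 */,
	       unsigned char out[16])
{
  signed char mask0[16], mask1[16];
  int i;

  for (i = 0; i < 16; i++)
    {
      /* A byte with bit 7 set (-128) makes pshufb write zero into that
	 lane, so each mask keeps only the elements owned by its operand.  */
      mask0[i] = sel[i] < 16 ? (signed char) sel[i] : -128;
      mask1[i] = sel[i] >= 16 ? (signed char) (sel[i] - 16) : -128;
    }
  for (i = 0; i < 16; i++)
    {
      unsigned char l = mask0[i] < 0 ? 0 : op0[mask0[i]];
      unsigned char h = mask1[i] < 0 ? 0 : op1[mask1[i]];
      out[i] = l | h;	/* the final por merges the two shuffles */
    }
}
#endif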
/* Implement arbitrary permutation of one V32QImode or V16HImode operand
   with two vpshufb insns, vpermq and vpor.  We should have already failed
   all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, hp, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || !d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if the element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  /* Swap the 128-bit lanes of h into hp.  */
  hp = gen_reg_rtx (V4DImode);
  op = gen_lowpart (V4DImode, h);
  emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
				  const1_rtx));

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V32QImode or V16HImode operands
   with two vpshufb insns, vpor and vpermq.  We should have already
   failed all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, ior, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  for (i = 0; i < d->nelt; ++i)
    if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
      return false;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  In the first permutation mask
     the first quarter will contain indexes for the first half
     of the op0, the second quarter will contain bit 7 set, third quarter
     will contain indexes for the second half of the op0 and the
     last quarter bit 7 set.  In the second permutation mask
     the first quarter will contain bit 7 set, the second quarter
     indexes for the first half of the op1, the third quarter bit 7 set
     and last quarter indexes for the second half of the op1.
     I.e. the first mask e.g. for V32QImode extract even will be:
     0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
     (all values masked with 0xf except for -128) and second mask
     for extract even will be
     -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = d->perm[i] >= nelt;
      unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op1);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  ior = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (ior, l, h));

  /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
  op = gen_reg_rtx (V4DImode);
  ior = gen_lowpart (V4DImode, ior);
  emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
				  const1_rtx, GEN_INT (3)));
  emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
   with two "and" and "pack" or two "shift" and "pack" insns.  We should
   have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
{
  rtx op, dop0, dop1, t;
  unsigned i, odd, c, s, nelt = d->nelt;
  bool end_perm = false;
  machine_mode half_mode;
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_pack) (rtx, rtx, rtx);
  rtx (*gen_shift) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;

  switch (d->vmode)
    {
    case E_V8HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V4SImode;
      gen_and = gen_andv4si3;
      gen_pack = gen_sse4_1_packusdw;
      gen_shift = gen_lshrv4si3;
      break;
    case E_V16QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V8HImode;
      gen_and = gen_andv8hi3;
      gen_pack = gen_sse2_packuswb;
      gen_shift = gen_lshrv8hi3;
      break;
    case E_V16HImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V8SImode;
      gen_and = gen_andv8si3;
      gen_pack = gen_avx2_packusdw;
      gen_shift = gen_lshrv8si3;
      end_perm = true;
      break;
    case E_V32QImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xff;
      s = 8;
      half_mode = V16HImode;
      gen_and = gen_andv16hi3;
      gen_pack = gen_avx2_packuswb;
      gen_shift = gen_lshrv16hi3;
      end_perm = true;
      break;
    default:
      /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
	 general shuffles.  */
      return false;
    }

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  dop0 = gen_reg_rtx (half_mode);
  dop1 = gen_reg_rtx (half_mode);
  if (odd == 0)
    {
      t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
      t = force_reg (half_mode, t);
      emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
      emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
    }
  else
    {
      emit_insn (gen_shift (dop0,
			    gen_lowpart (half_mode, d->op0),
			    GEN_INT (s)));
      emit_insn (gen_shift (dop1,
			    gen_lowpart (half_mode, d->op1),
			    GEN_INT (s)));
    }
  /* In AVX2 for 256 bit case we need to permute pack result.  */
  if (TARGET_AVX2 && end_perm)
    {
      op = gen_reg_rtx (d->vmode);
      t = gen_reg_rtx (V4DImode);
      emit_insn (gen_pack (op, dop0, dop1));
      emit_insn (gen_avx2_permv4di_1 (t,
				      gen_lowpart (V4DImode, op),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));
      emit_move_insn (d->target, gen_lowpart (d->vmode, t));
    }
  else
    emit_insn (gen_pack (d->target, dop0, dop1));

  return true;
}
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V64QI operands
   with two "shifts", two "truncs" and one "concat" insn for "odd"
   and two "truncs" and one "concat" insn for "even".
   We should have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
{
  rtx t1, t2, t3, t4;
  unsigned i, odd, nelt = d->nelt;

  if (!TARGET_AVX512BW
      || d->one_operand_p
      || d->vmode != V64QImode)
    return false;

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (odd)
    {
      t1 = gen_reg_rtx (V32HImode);
      t2 = gen_reg_rtx (V32HImode);
      emit_insn (gen_lshrv32hi3 (t1,
				 gen_lowpart (V32HImode, d->op0),
				 GEN_INT (8)));
      emit_insn (gen_lshrv32hi3 (t2,
				 gen_lowpart (V32HImode, d->op1),
				 GEN_INT (8)));
    }
  else
    {
      t1 = gen_lowpart (V32HImode, d->op0);
      t2 = gen_lowpart (V32HImode, d->op1);
    }

  t3 = gen_reg_rtx (V32QImode);
  t4 = gen_reg_rtx (V32QImode);
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
  emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));

  return true;
}
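/* Illustrative sketch, not part of the compiler: the selector shape that
   the extract-even/extract-odd helpers above and below recognize.  The
   function name is hypothetical.  For NELT result elements taken from the
   2*NELT-element concatenation of op0 and op1, extract-even uses
   perm[i] = 2*i and extract-odd uses perm[i] = 2*i + 1.  */
#if 0
static void
model_even_odd_selector (unsigned nelt, unsigned odd, unsigned char perm[])
{
  unsigned i;
  for (i = 0; i < nelt; i++)
    perm[i] = 2 * i + odd;	/* e.g. nelt = 8, odd = 1: 1,3,5,...,15 */
}
#endif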
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement extract-even
   and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
{
  rtx t1, t2, t3, t4, t5;

  switch (d->vmode)
    {
    case E_V4DFmode:
      t1 = gen_reg_rtx (V4DFmode);
      t2 = gen_reg_rtx (V4DFmode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an unpck[lh]pd will produce the result required.  */
      if (odd)
	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
      else
	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SFmode:
      {
	int mask = odd ? 0xdd : 0x88;

	t1 = gen_reg_rtx (V8SFmode);
	t2 = gen_reg_rtx (V8SFmode);
	t3 = gen_reg_rtx (V8SFmode);

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
				      GEN_INT (mask)));

	/* Shuffle the lanes around to produce:
	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
					    GEN_INT (0x3)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));

	/* Shuffle the lanes around to produce:
	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
					    GEN_INT (0x20)));
      }
      break;

    case E_V2DFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V4SImode:
      /* These are always directly implementable by expand_vec_perm_1.  */
      gcc_unreachable ();

    case E_V8HImode:
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave. */
	  t1 = gen_reg_rtx (V8HImode);
	  t2 = gen_reg_rtx (V8HImode);
	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
	  if (odd)
	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
	  else
	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
	  emit_insn (t3);
	}
      break;

    case E_V16QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V16HImode:
    case E_V32QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V64QImode:
      return expand_vec_perm_even_odd_trunc (d);

    case E_V4DImode:
      if (!TARGET_AVX2)
	{
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V4DFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V4DFmode);
	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V4DImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now a vpunpck[lh]qdq will produce the result required.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
      else
	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SImode:
      if (!TARGET_AVX2)
	{
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V8SFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V8SFmode);
	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V8SImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      t1 = gen_reg_rtx (V8SImode);
      t2 = gen_reg_rtx (V8SImode);
      t3 = gen_reg_rtx (V4DImode);
      t4 = gen_reg_rtx (V4DImode);
      t5 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into
	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
      emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x31)));

      /* Swap the 2nd and 3rd position in each lane into
	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
      emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
      emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));

      /* Now a vpunpck[lh]qdq will produce
	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
					   gen_lowpart (V4DImode, t2));
      else
	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
					  gen_lowpart (V4DImode, t2));
      emit_insn (t3);
      emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
      break;

    default:
      gcc_unreachable ();
    }

  return true;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   extract-even and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
{
  unsigned i, odd, nelt = d->nelt;

  odd = d->perm[0];
  if (odd != 0 && odd != 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  return expand_vec_perm_even_odd_1 (d, odd);
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement broadcast
   permutations.  We assume that expand_vec_perm_1 has already failed.  */

static bool
expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
{
  unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
  machine_mode vmode = d->vmode;
  unsigned char perm2[4];
  rtx op0 = d->op0, dest;
  bool ok;

  switch (vmode)
    {
    case E_V4DFmode:
    case E_V8SFmode:
      /* These are special-cased in sse.md so that we can optionally
	 use the vbroadcast instruction.  They expand to two insns
	 if the input happens to be in a register.  */
      gcc_unreachable ();

    case E_V2DFmode:
    case E_V2DImode:
    case E_V4SFmode:
    case E_V4SImode:
      /* These are always implementable using standard shuffle patterns.  */
      gcc_unreachable ();

    case E_V8HImode:
    case E_V16QImode:
      /* These can be implemented via interleave.  We save one insn by
	 stopping once we have promoted to V4SImode and then use pshufd.  */
      do
	{
	  rtx (*gen) (rtx, rtx, rtx)
	    = vmode == V16QImode ? gen_vec_interleave_lowv16qi
				 : gen_vec_interleave_lowv8hi;

	  if (elt >= nelt2)
	    {
	      gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
				       : gen_vec_interleave_highv8hi;
	      elt -= nelt2;
	    }
	  nelt2 /= 2;

	  dest = gen_reg_rtx (vmode);
	  emit_insn (gen (dest, op0, op0));
	  vmode = get_mode_wider_vector (vmode);
	  op0 = gen_lowpart (vmode, dest);
	}
      while (vmode != V4SImode);

      memset (perm2, elt, 4);
      dest = gen_reg_rtx (V4SImode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);
      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V64QImode:
    case E_V32QImode:
    case E_V16HImode:
    case E_V8SImode:
    case E_V4DImode:
      /* For AVX2 broadcasts of the first element vpbroadcast* or
	 vpermq should be used by expand_vec_perm_1.  */
      gcc_assert (!TARGET_AVX2 || d->perm[0]);
      return false;

    default:
      gcc_unreachable ();
    }
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   broadcast permutations.  */

static bool
expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
{
  unsigned i, elt, nelt = d->nelt;

  if (!d->one_operand_p)
    return false;

  elt = d->perm[0];
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != elt)
      return false;

  return expand_vec_perm_broadcast_1 (d);
}
/* Implement arbitrary permutations of two V64QImode operands
   with 2 vperm[it]2w, 2 vpshufb and one vpor instruction.  */

static bool
expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
{
  if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
    return false;

  struct expand_vec_perm_d ds[2];
  rtx rperm[128], vperm, target0, target1;
  unsigned int i, nelt;
  machine_mode vmode;

  nelt = d->nelt;
  vmode = V64QImode;

  for (i = 0; i < 2; i++)
    {
      ds[i] = *d;
      ds[i].vmode = V32HImode;
      ds[i].nelt = 32;
      ds[i].target = gen_reg_rtx (V32HImode);
      ds[i].op0 = gen_lowpart (V32HImode, d->op0);
      ds[i].op1 = gen_lowpart (V32HImode, d->op1);
    }

  /* Prepare permutations such that the first one takes care of
     putting the even bytes into the right positions or one higher
     positions (ds[0]) and the second one takes care of
     putting the odd bytes into the right positions or one below
     (ds[1]).  */

  for (i = 0; i < nelt; i++)
    {
      ds[i & 1].perm[i / 2] = d->perm[i] / 2;
      if (i & 1)
	{
	  rperm[i] = constm1_rtx;
	  rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	}
      else
	{
	  rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	  rperm[i + 64] = constm1_rtx;
	}
    }

  bool ok = expand_vec_perm_1 (&ds[0]);
  gcc_assert (ok);
  ds[0].target = gen_lowpart (V64QImode, ds[0].target);

  ok = expand_vec_perm_1 (&ds[1]);
  gcc_assert (ok);
  ds[1].target = gen_lowpart (V64QImode, ds[1].target);

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
  vperm = force_reg (vmode, vperm);
  target0 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
  vperm = force_reg (vmode, vperm);
  target1 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));

  emit_insn (gen_iorv64qi3 (d->target, target0, target1));
  return true;
}
/* Implement arbitrary permutation of two V32QImode or V16HImode operands
   with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
   all the shorter instruction sequences.  */

static bool
expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
{
  rtx rperm[4][32], vperm, l[2], h[2], op, m128;
  unsigned int i, nelt, eltsz;
  bool used[4];

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate 4 permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if the element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < 32; ++i)
    {
      rperm[0][i] = m128;
      rperm[1][i] = m128;
      rperm[2][i] = m128;
      rperm[3][i] = m128;
    }
  used[0] = false;
  used[1] = false;
  used[2] = false;
  used[3] = false;
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
      unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);

      for (j = 0; j < eltsz; ++j)
	rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
      used[which] = true;
    }

  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i + 1])
	{
	  h[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode,
				    gen_rtvec_v (32, rperm[2 * i + 1]));
      vperm = force_reg (V32QImode, vperm);
      h[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
    }

  /* Swap the 128-bit lanes of h[X].  */
  for (i = 0; i < 2; ++i)
    {
      if (h[i] == NULL_RTX)
	continue;
      op = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
				      const2_rtx, GEN_INT (3), const0_rtx,
				      const1_rtx));
      h[i] = gen_lowpart (V32QImode, op);
    }

  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i])
	{
	  l[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
      vperm = force_reg (V32QImode, vperm);
      l[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
    }

  for (i = 0; i < 2; ++i)
    {
      if (h[i] && l[i])
	{
	  op = gen_reg_rtx (V32QImode);
	  emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
	  l[i] = op;
	}
      else if (h[i])
	l[i] = h[i];
    }

  gcc_assert (l[0] && l[1]);
  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
   taken care of, perform the expansion in D and return true on success.  */

static bool
ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* Try a single instruction expansion.  */
  if (expand_vec_perm_1 (d))
    return true;

  /* Try sequences of two instructions.  */

  if (expand_vec_perm_pshuflw_pshufhw (d))
    return true;

  if (expand_vec_perm_palignr (d, false))
    return true;

  if (expand_vec_perm_interleave2 (d))
    return true;

  if (expand_vec_perm_broadcast (d))
    return true;

  if (expand_vec_perm_vpermq_perm_1 (d))
    return true;

  if (expand_vec_perm_vperm2f128 (d))
    return true;

  if (expand_vec_perm_pblendv (d))
    return true;

  /* Try sequences of three instructions.  */

  if (expand_vec_perm_even_odd_pack (d))
    return true;

  if (expand_vec_perm_2vperm2f128_vshuf (d))
    return true;

  if (expand_vec_perm_pshufb2 (d))
    return true;

  if (expand_vec_perm_interleave3 (d))
    return true;

  if (expand_vec_perm_vperm2f128_vblend (d))
    return true;

  /* Try sequences of four instructions.  */

  if (expand_vec_perm_even_odd_trunc (d))
    return true;

  if (expand_vec_perm_vpshufb2_vpermq (d))
    return true;

  if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
    return true;

  if (expand_vec_perm_vpermt2_vpshub2 (d))
    return true;

  /* ??? Look for narrow permutations whose element orderings would
     allow the promotion to a wider mode.  */

  /* ??? Look for sequences of interleave or a wider permute that place
     the data into the correct lanes for a half-vector shuffle like
     pshuf[lh]w or vpermilps.  */

  /* ??? Look for sequences of interleave that produce the desired results.
     The combinatorics of punpck[lh] get pretty ugly... */

  if (expand_vec_perm_even_odd (d))
    return true;

  /* Even longer sequences.  */
  if (expand_vec_perm_vpshufb4_vpermq2 (d))
    return true;

  /* See if we can get the same permutation in a different vector integer
     mode.  */
  struct expand_vec_perm_d nd;
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
  if (expand_vec_perm2_vperm2f128_vblend (d))
    return true;

  return false;
}
/* If a permutation only uses one operand, make it clear.  Returns true
   if the permutation references both operands.  */

static bool
canonicalize_perm (struct expand_vec_perm_d *d)
{
  int i, which, nelt = d->nelt;

  for (i = which = 0; i < nelt; ++i)
    which |= (d->perm[i] < nelt ? 1 : 2);

  d->one_operand_p = true;
  switch (which)
    {
    default:
      gcc_unreachable ();

    case 3:
      if (!rtx_equal_p (d->op0, d->op1))
	{
	  d->one_operand_p = false;
	  break;
	}
      /* The elements of PERM do not suggest that only the first operand
	 is used, but both operands are identical.  Allow easier matching
	 of the permutation by folding the permutation into the single
	 input vector.  */
      /* FALLTHRU */

    case 2:
      for (i = 0; i < nelt; ++i)
	d->perm[i] &= nelt - 1;
      d->op0 = d->op1;
      break;

    case 1:
      d->op1 = d->op0;
      break;
    }

  return (which == 3);
}
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */

static bool
ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
			       rtx op1, const vec_perm_indices &sel)
{
  struct expand_vec_perm_d d;
  unsigned char perm[MAX_VECT_LEN];
  unsigned int i, nelt, which;
  bool two_args;

  d.target = target;
  d.op0 = op0;
  d.op1 = op1;

  d.vmode = vmode;
  gcc_assert (VECTOR_MODE_P (d.vmode));
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.testing_p = !target;

  gcc_assert (sel.length () == nelt);
  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));

  /* Given sufficient ISA support we can just return true here
     for selected vector modes.  */
  switch (d.vmode)
    {
    case E_V16SFmode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V8DFmode:
      if (!TARGET_AVX512F)
	return false;
      /* All implementable with a single vperm[it]2 insn.  */
      if (d.testing_p)
	return true;
      break;
    case E_V32HImode:
      if (!TARGET_AVX512BW)
	return false;
      if (d.testing_p)
	/* All implementable with a single vperm[it]2 insn.  */
	return true;
      break;
    case E_V64QImode:
      if (!TARGET_AVX512BW)
	return false;
      if (d.testing_p)
	/* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
	return true;
      break;
    case E_V8SImode:
    case E_V8SFmode:
    case E_V4DFmode:
    case E_V4DImode:
      if (d.testing_p && TARGET_AVX512VL)
	/* All implementable with a single vperm[it]2 insn.  */
	return true;
      break;
    case E_V16HImode:
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case E_V32QImode:
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      /* Fall through.  */
    case E_V8HImode:
    case E_V16QImode:
    case E_V4SImode:
    case E_V4SFmode:
      /* All implementable with a single vpperm insn.  */
      if (d.testing_p && TARGET_XOP)
	return true;
      /* All implementable with 2 pshufb + 1 ior.  */
      if (d.testing_p && TARGET_SSSE3)
	return true;
      break;
    case E_V2DImode:
    case E_V2DFmode:
      /* All implementable with shufpd or unpck[lh]pd.  */
      if (d.testing_p)
	return true;
      break;
    default:
      return false;
    }

  /* Extract the values from the vector CST into the permutation
     array in D.  */
  for (i = which = 0; i < nelt; ++i)
    {
      unsigned char e = sel[i];
      gcc_assert (e < 2 * nelt);
      d.perm[i] = e;
      perm[i] = e;
      which |= (e < nelt ? 1 : 2);
    }

  if (d.testing_p)
    {
      /* For all elements from second vector, fold the elements to first.  */
      if (which == 2)
	for (i = 0; i < nelt; ++i)
	  d.perm[i] -= nelt;

      /* Check whether the mask can be applied to the vector type.  */
      d.one_operand_p = (which != 3);

      /* Implementable with shufps or pshufd.  */
      if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
	return true;

      /* Otherwise we have to go through the motions and see if we can
	 figure out how to generate the requested permutation.  */
      d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
      d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
      if (!d.one_operand_p)
	d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);

      start_sequence ();
      bool ret = ix86_expand_vec_perm_const_1 (&d);
      end_sequence ();

      return ret;
    }

  two_args = canonicalize_perm (&d);

  if (ix86_expand_vec_perm_const_1 (&d))
    return true;

  /* If the selector says both arguments are needed, but the operands are the
     same, the above tried to expand with one_operand_p and flattened selector.
     If that didn't work, retry without one_operand_p; we succeeded with that
     during testing.  */
  if (two_args && d.one_operand_p)
    {
      d.one_operand_p = false;
      memcpy (d.perm, perm, sizeof (perm));
      return ix86_expand_vec_perm_const_1 (&d);
    }

  return false;
}
void
ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  for (i = 0; i < nelt; ++i)
    d.perm[i] = i * 2 + odd;

  /* We'll either be able to implement the permutation directly...  */
  if (expand_vec_perm_1 (&d))
    return;

  /* ... or we use the special-case patterns.  */
  expand_vec_perm_even_odd_1 (&d, odd);
}
void
ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt, base;
  bool ok;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  base = high_p ? nelt / 2 : 0;
  for (i = 0; i < nelt / 2; ++i)
    {
      d.perm[i * 2] = i + base;
      d.perm[i * 2 + 1] = i + base + nelt;
    }

  /* Note that for AVX this isn't one instruction.  */
  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);
}
/* Expand a vector operation CODE for a V*QImode in terms of the
   same operation on V*HImode.  */

void
ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  machine_mode himode;
  rtx (*gen_il) (rtx, rtx, rtx);
  rtx (*gen_ih) (rtx, rtx, rtx);
  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
  struct expand_vec_perm_d d;
  bool ok, full_interleave;
  bool uns_p = false;
  int i;

  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      gen_il = gen_vec_interleave_lowv16qi;
      gen_ih = gen_vec_interleave_highv16qi;
      break;
    case E_V32QImode:
      himode = V16HImode;
      gen_il = gen_avx2_interleave_lowv32qi;
      gen_ih = gen_avx2_interleave_highv32qi;
      break;
    case E_V64QImode:
      himode = V32HImode;
      gen_il = gen_avx512bw_interleave_lowv64qi;
      gen_ih = gen_avx512bw_interleave_highv64qi;
      break;
    default:
      gcc_unreachable ();
    }

  op2_l = op2_h = op2;
  switch (code)
    {
    case MULT:
      /* Unpack data such that we've got a source byte in each low byte of
	 each word.  We don't care what goes into the high byte of each word.
	 Rather than trying to get zero in there, most convenient is to let
	 it be a copy of the low byte.  */
      op2_l = gen_reg_rtx (qimode);
      op2_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op2_l, op2, op2));
      emit_insn (gen_ih (op2_h, op2, op2));

      op1_l = gen_reg_rtx (qimode);
      op1_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op1_l, op1, op1));
      emit_insn (gen_ih (op1_h, op1, op1));
      full_interleave = qimode == V16QImode;
      break;

    case ASHIFT:
    case LSHIFTRT:
      uns_p = true;
      /* FALLTHRU */
    case ASHIFTRT:
      op1_l = gen_reg_rtx (himode);
      op1_h = gen_reg_rtx (himode);
      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
      full_interleave = true;
      break;
    default:
      gcc_unreachable ();
    }

  /* Perform the operation.  */
  res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
			       1, OPTAB_DIRECT);
  res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
			       1, OPTAB_DIRECT);
  gcc_assert (res_l && res_h);

  /* Merge the data back into the right place.  */
  d.target = dest;
  d.op0 = gen_lowpart (qimode, res_l);
  d.op1 = gen_lowpart (qimode, res_h);
  d.vmode = qimode;
  d.nelt = GET_MODE_NUNITS (qimode);
  d.one_operand_p = false;
  d.testing_p = false;

  if (full_interleave)
    {
      /* For SSE2, we used a full interleave, so the desired
	 results are in the even elements.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = i * 2;
    }
  else
    {
      /* For AVX, the interleave used above was not cross-lane.  So the
	 extraction is evens but with the second and third quarter swapped.
	 Happily, that is even one insn shorter than even extraction.
	 For AVX512BW we have 4 lanes.  We extract evens from within a lane,
	 always first from the first and then from the second source operand,
	 the index bits above the low 4 bits remain the same.
	 Thus, for d.nelt == 32 we want permutation
	 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
	 and for d.nelt == 64 we want permutation
	 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
	 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
    }

  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_fmt_ee (code, qimode, op1, op2));
}
/* Helper function of ix86_expand_mul_widen_evenodd.  Return true
   if op is CONST_VECTOR with all odd elements equal to their
   preceding element.  */

static bool
const_vector_equal_evenodd_p (rtx op)
{
  machine_mode mode = GET_MODE (op);
  int i, nunits = GET_MODE_NUNITS (mode);
  if (GET_CODE (op) != CONST_VECTOR
      || nunits != CONST_VECTOR_NUNITS (op))
    return false;
  for (i = 0; i < nunits; i += 2)
    if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
      return false;
  return true;
}
void
ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
			       bool uns_p, bool odd_p)
{
  machine_mode mode = GET_MODE (op1);
  machine_mode wmode = GET_MODE (dest);
  rtx x;
  rtx orig_op1 = op1, orig_op2 = op2;

  if (!nonimmediate_operand (op1, mode))
    op1 = force_reg (mode, op1);
  if (!nonimmediate_operand (op2, mode))
    op2 = force_reg (mode, op2);

  /* We only play even/odd games with vectors of SImode.  */
  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);

  /* If we're looking for the odd results, shift those members down to
     the even slots.  For some cpus this is faster than a PSHUFD.  */
  if (odd_p)
    {
      /* For XOP use vpmacsdqh, but only for smult, as it is only
	 signed.  */
      if (TARGET_XOP && mode == V4SImode && !uns_p)
	{
	  x = force_reg (wmode, CONST0_RTX (wmode));
	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
	  return;
	}

      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
      if (!const_vector_equal_evenodd_p (orig_op1))
	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
			    x, NULL, 1, OPTAB_DIRECT);
      if (!const_vector_equal_evenodd_p (orig_op2))
	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
			    x, NULL, 1, OPTAB_DIRECT);
      op1 = gen_lowpart (mode, op1);
      op2 = gen_lowpart (mode, op2);
    }

  if (mode == V16SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
    }
  else if (mode == V8SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
    }
  else if (uns_p)
    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
  else if (TARGET_SSE4_1)
    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
  else
    {
      rtx s1, s2, t0, t1, t2;

      /* The easiest way to implement this without PMULDQ is to go through
	 the motions as if we are performing a full 64-bit multiply.  With
	 the exception that we need to do less shuffling of the elements.  */

      /* Compute the sign-extension, aka highparts, of the two operands.  */
      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op1, pc_rtx, pc_rtx);
      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op2, pc_rtx, pc_rtx);

      /* Multiply LO(A) * HI(B), and vice-versa.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));

      /* Multiply LO(A) * LO(B).  */
      t0 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));

      /* Combine and shift the highparts into place.  */
      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
			 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
      return;
    }
  emit_insn (x);
}
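/* Illustrative sketch, not part of the compiler: the scalar identity behind
   the PMULDQ-less signed widening multiply above.  The "highparts" are the
   per-element sign masks, and everything is taken mod 2^64.  The function
   name is hypothetical.  */
#if 0
static long long
model_signed_widen_mul (int a, int b)
{
  unsigned long long au = (unsigned int) a, bu = (unsigned int) b;
  unsigned long long s1 = a < 0 ? 0xffffffffULL : 0;	/* GT (0, a) mask */
  unsigned long long s2 = b < 0 ? 0xffffffffULL : 0;	/* GT (0, b) mask */
  unsigned long long lo = au * bu;			/* LO(A) * LO(B) */
  unsigned long long hi = (s1 * bu + s2 * au) << 32;	/* highpart fixup */
  return (long long) (lo + hi);		/* equals (long long) a * b */
}
#endif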
void
ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
			    bool uns_p, bool high_p)
{
  machine_mode wmode = GET_MODE (dest);
  machine_mode mode = GET_MODE (op1);
  rtx t1, t2, t3, t4, mask;

  switch (mode)
    {
    case E_V4SImode:
      t1 = gen_reg_rtx (mode);
      t2 = gen_reg_rtx (mode);
      if (TARGET_XOP && !uns_p)
	{
	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
	     shuffle the elements once so that all elements are in the right
	     place for immediate use: { A C B D }.  */
	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	}
      else
	{
	  /* Put the elements into place for the multiply.  */
	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
	  high_p = false;
	}
      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
      break;

    case E_V8SImode:
      /* Shuffle the elements between the lanes.  After this we
	 have { A B E F | C D G H } for each operand.  */
      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));
      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));

      /* Shuffle the elements within the lanes.  After this we
	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
      t3 = gen_reg_rtx (V8SImode);
      t4 = gen_reg_rtx (V8SImode);
      mask = GEN_INT (high_p
		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));

      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
      break;

    case E_V8HImode:
    case E_V16HImode:
      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
			 uns_p, OPTAB_DIRECT);
      t2 = expand_binop (mode,
			 uns_p ? umul_highpart_optab : smul_highpart_optab,
			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
      gcc_assert (t1 && t2);

      t3 = gen_reg_rtx (mode);
      ix86_expand_vec_interleave (t3, t1, t2, high_p);
      emit_move_insn (dest, gen_lowpart (wmode, t3));
      break;

    case E_V16QImode:
    case E_V32QImode:
    case E_V32HImode:
    case E_V64QImode:
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);

      emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
      break;

    default:
      gcc_unreachable ();
    }
}
void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
  rtx res_1, res_2, res_3, res_4;

  res_1 = gen_reg_rtx (V4SImode);
  res_2 = gen_reg_rtx (V4SImode);
  res_3 = gen_reg_rtx (V2DImode);
  res_4 = gen_reg_rtx (V2DImode);
  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);

  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  Then we can merge the parts
     back together with an interleave.

     Note that two other sequences were tried:
     (1) Use interleaves at the start instead of psrldq, which allows
     us to use a single shufps to merge things back at the end.
     (2) Use shufps here to combine the two vectors, then pshufd to
     put the elements in the correct order.
     In both cases the cost of the reformatting stall was too high
     and the overall sequence slower.  */

  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));

  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
}
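/* Illustrative sketch, not part of the compiler: each 32-bit lane of the
   SSE2 V4SI multiply above is just the low half of the corresponding
   64-bit widening product; even and odd lanes are computed separately and
   interleaved back together.  The function name is hypothetical.  */
#if 0
static void
model_mulv4si (const unsigned int a[4], const unsigned int b[4],
	       unsigned int out[4])
{
  unsigned long long even[2] = { (unsigned long long) a[0] * b[0],
				 (unsigned long long) a[2] * b[2] };
  unsigned long long odd[2]  = { (unsigned long long) a[1] * b[1],
				 (unsigned long long) a[3] * b[3] };

  /* pshufd moves the low halves down, punpckldq interleaves them.  */
  out[0] = (unsigned int) even[0];
  out[1] = (unsigned int) odd[0];
  out[2] = (unsigned int) even[1];
  out[3] = (unsigned int) odd[1];
}
#endif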
void
ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
{
  machine_mode mode = GET_MODE (op0);
  rtx t1, t2, t3, t4, t5, t6;

  if (TARGET_AVX512DQ && mode == V8DImode)
    emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
    emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
    emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
  else if (TARGET_XOP && mode == V2DImode)
    {
      /* op1: A,B,C,D, op2: E,F,G,H */
      op1 = gen_lowpart (V4SImode, op1);
      op2 = gen_lowpart (V4SImode, op2);

      t1 = gen_reg_rtx (V4SImode);
      t2 = gen_reg_rtx (V4SImode);
      t3 = gen_reg_rtx (V2DImode);
      t4 = gen_reg_rtx (V2DImode);

      /* t1: B,A,D,C */
      emit_insn (gen_sse2_pshufd_1 (t1, op1,
				    GEN_INT (1), GEN_INT (0),
				    GEN_INT (3), GEN_INT (2)));

      /* t2: (B*E),(A*F),(D*G),(C*H) */
      emit_insn (gen_mulv4si3 (t2, t1, op2));

      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
      emit_insn (gen_xop_phadddq (t3, t2));

      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));

      /* Multiply lower parts and add all.  */
      t5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_widen_umult_even_v4si (t5,
						gen_lowpart (V4SImode, op1),
						gen_lowpart (V4SImode, op2)));
      force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
    }
  else
    {
      machine_mode nmode;
      rtx (*umul) (rtx, rtx, rtx);

      if (mode == V2DImode)
	{
	  umul = gen_vec_widen_umult_even_v4si;
	  nmode = V4SImode;
	}
      else if (mode == V4DImode)
	{
	  umul = gen_vec_widen_umult_even_v8si;
	  nmode = V8SImode;
	}
      else if (mode == V8DImode)
	{
	  umul = gen_vec_widen_umult_even_v16si;
	  nmode = V16SImode;
	}
      else
	gcc_unreachable ();

      /* Multiply low parts.  */
      t1 = gen_reg_rtx (mode);
      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));

      /* Shift input vectors right 32 bits so we can multiply high parts.  */
      t6 = GEN_INT (32);
      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);

      /* Multiply high parts by low parts.  */
      t4 = gen_reg_rtx (mode);
      t5 = gen_reg_rtx (mode);
      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));

      /* Combine and shift the highparts back.  */
      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
    }

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_MULT (mode, op1, op2));
}
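/* Illustrative sketch, not part of the compiler: the 32x32->64 decomposition
   used above for DImode lanes when there is no native 64-bit vector multiply:
   a*b = LO(a)*LO(b) + ((LO(a)*HI(b) + HI(a)*LO(b)) << 32)  (mod 2^64).
   The function name is hypothetical.  */
#if 0
static unsigned long long
model_mul64 (unsigned long long a, unsigned long long b)
{
  unsigned long long alo = a & 0xffffffffULL, ahi = a >> 32;
  unsigned long long blo = b & 0xffffffffULL, bhi = b >> 32;

  /* The HI(a)*HI(b) term is shifted out of the 64-bit result entirely.  */
  return alo * blo + ((alo * bhi + ahi * blo) << 32);
}
#endif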
/* Return 1 if control transfer instruction INSN
   should be encoded with notrack prefix.  */

bool
ix86_notrack_prefixed_insn_p (rtx insn)
{
  if (!insn || !((flag_cf_protection & CF_BRANCH)))
    return false;

  if (CALL_P (insn))
    {
      rtx call = get_call_rtx_from (insn);
      gcc_assert (call != NULL_RTX);
      rtx addr = XEXP (call, 0);

      /* Do not emit 'notrack' if it's not an indirect call.  */
      if (MEM_P (addr)
	  && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
	return false;
      else
	return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
    }

  if (JUMP_P (insn) && !flag_cet_switch)
    {
      rtx target = JUMP_LABEL (insn);
      if (target == NULL_RTX || ANY_RETURN_P (target))
	return false;

      /* Check the jump is a switch table.  */
      rtx_insn *label = as_a<rtx_insn *> (target);
      rtx_insn *table = next_insn (label);
      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
	return false;
      else
	return true;
    }
  return false;
}
/* Calculate integer abs() using only SSE2 instructions.  */

void
ix86_expand_sse2_abs (rtx target, rtx input)
{
  machine_mode mode = GET_MODE (target);
  rtx tmp0, tmp1, x;

  switch (mode)
    {
    case E_V2DImode:
    case E_V4DImode:
      /* For 64-bit signed integer X, with SSE4.2 use
	 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
	 Otherwise handle it similarly to V4SImode, except use 64 as W instead
	 of 32 and use logical instead of arithmetic right shift (which is
	 unimplemented) and subtract.  */
      if (TARGET_SSE4_2)
	{
	  tmp0 = gen_reg_rtx (mode);
	  tmp1 = gen_reg_rtx (mode);
	  emit_move_insn (tmp1, CONST0_RTX (mode));
	  if (mode == E_V2DImode)
	    emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
	  else
	    emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
	}
      else
	{
	  tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
				      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
					       - 1), NULL, 0, OPTAB_DIRECT);
	  tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
	}

      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V4SImode:
      /* For 32-bit signed integer X, the best way to calculate the absolute
	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
      tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
				  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
				  NULL, 0, OPTAB_DIRECT);
      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V8HImode:
      /* For 16-bit signed integer X, the best way to calculate the absolute
	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (mode, SMAX, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V16QImode:
      /* For 8-bit signed integer X, the best way to calculate the absolute
	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
	 as SSE2 provides the PMINUB insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    default:
      gcc_unreachable ();
    }

  if (x != target)
    emit_move_insn (target, x);
}
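/* Illustrative sketch, not part of the compiler: the scalar identities the
   SSE2 abs expansion above relies on for the different element widths.  The
   function names are hypothetical; the 32-bit variant assumes arithmetic
   right shift of a negative value, as the vector instruction provides.  */
#if 0
static int
model_abs32 (int x)
{
  int s = x >> 31;		/* arithmetic shift: 0 or -1 */
  return (x ^ s) - s;		/* abs (x), modulo INT_MIN wraparound */
}

static short
model_abs16 (short x)
{
  short n = -x;
  return x > n ? x : n;		/* max (x, -x), cf. PMAXSW */
}

static unsigned char
model_abs8 (signed char x)
{
  unsigned char a = (unsigned char) x, b = (unsigned char) -x;
  return a < b ? a : b;		/* min ((uchar) x, (uchar) -x), cf. PMINUB */
}
#endif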
/* Expand an extract from a vector register through pextr insn.
   Return true if successful.  */

bool
ix86_expand_pextr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[1];

  unsigned int size = INTVAL (operands[2]);
  unsigned int pos = INTVAL (operands[3]);

  if (SUBREG_P (dst))
    {
      /* Reject non-lowpart subregs.  */
      if (SUBREG_BYTE (dst) > 0)
	return false;
      dst = SUBREG_REG (dst);
    }

  if (SUBREG_P (src))
    {
      pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
      src = SUBREG_REG (src);
    }

  switch (GET_MODE (src))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
    case E_TImode:
      {
	machine_mode srcmode, dstmode;
	rtx d, pat;

	if (!int_mode_for_size (size, 0).exists (&dstmode))
	  return false;

	switch (dstmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V16QImode;
	    break;

	  case E_HImode:
	    if (!TARGET_SSE2)
	      return false;
	    srcmode = V8HImode;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V4SImode;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V2DImode;
	    break;

	  default:
	    return false;
	  }

	/* Reject extractions from misaligned positions.  */
	if (pos & (size - 1))
	  return false;

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	/* Construct insn pattern.  */
	pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
	pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);

	/* Let the rtl optimizers know about the zero extension performed.  */
	if (dstmode == QImode || dstmode == HImode)
	  {
	    pat = gen_rtx_ZERO_EXTEND (SImode, pat);
	    d = gen_lowpart (SImode, d);
	  }

	emit_insn (gen_rtx_SET (d, pat));

	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}
/* Expand an insert into a vector register through pinsr insn.
   Return true if successful.  */

bool
ix86_expand_pinsr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[3];

  unsigned int size = INTVAL (operands[1]);
  unsigned int pos = INTVAL (operands[2]);

  if (SUBREG_P (dst))
    {
      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
      dst = SUBREG_REG (dst);
    }

  switch (GET_MODE (dst))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
    case E_TImode:
      {
	machine_mode srcmode, dstmode;
	rtx (*pinsr)(rtx, rtx, rtx, rtx);
	rtx d;

	if (!int_mode_for_size (size, 0).exists (&srcmode))
	  return false;

	switch (srcmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V16QImode;
	    pinsr = gen_sse4_1_pinsrb;
	    break;

	  case E_HImode:
	    if (!TARGET_SSE2)
	      return false;
	    dstmode = V8HImode;
	    pinsr = gen_sse2_pinsrw;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V4SImode;
	    pinsr = gen_sse4_1_pinsrd;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V2DImode;
	    pinsr = gen_sse4_1_pinsrq;
	    break;

	  default:
	    return false;
	  }

	/* Reject insertions to misaligned positions.  */
	if (pos & (size - 1))
	  return false;

	if (SUBREG_P (src))
	  {
	    unsigned int srcpos = SUBREG_BYTE (src);

	    if (srcpos > 0)
	      {
		rtx extr_ops[4];

		extr_ops[0] = gen_reg_rtx (srcmode);
		extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
		extr_ops[2] = GEN_INT (size);
		extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);

		if (!ix86_expand_pextr (extr_ops))
		  return false;

		src = extr_ops[0];
	      }
	    else
	      src = gen_lowpart (srcmode, SUBREG_REG (src));
	  }

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
			  gen_lowpart (srcmode, src),
			  GEN_INT (1 << (pos / size))));

	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}
/* All CPUs prefer to avoid cross-lane operations so perform reductions
   upper against lower halves up to SSE reg size.  */

machine_mode
ix86_split_reduction (machine_mode mode)
{
  /* Reduce lowpart against highpart until we reach SSE reg width to
     avoid cross-lane operations.  */
  switch (mode)
    {
    case E_V8DImode:
    case E_V4DImode:
      return V2DImode;
    case E_V16SImode:
    case E_V8SImode:
      return V4SImode;
    case E_V32HImode:
    case E_V16HImode:
      return V8HImode;
    case E_V64QImode:
    case E_V32QImode:
      return V16QImode;
    case E_V16SFmode:
    case E_V8SFmode:
      return V4SFmode;
    case E_V8DFmode:
    case E_V4DFmode:
      return V2DFmode;
    default:
      return mode;
    }
}
/* Generate call to __divmoddi4.  */

void
ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
			    rtx op0, rtx op1,
			    rtx *quot_p, rtx *rem_p)
{
  rtx rem = assign_386_stack_local (mode, SLOT_TEMP);

  rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
				      mode, op0, mode, op1, mode,
				      XEXP (rem, 0), Pmode);
  *quot_p = quot;
  *rem_p = rem;
}

#include "gt-i386-expand.h"