/* Copyright (C) 1988-2020 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "insn-attr.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "tree-pass.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "tree-iterator.h"
#include "case-cfn-macros.h"
#include "fold-const-call.h"
#include "tree-ssanames.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "symbol-summary.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "dwarf2out.h"
#include "i386-options.h"
#include "i386-builtins.h"
#include "i386-expand.h"
/* Split one or more double-mode RTL references into pairs of half-mode
   references.  The RTL can be REG, offsettable MEM, integer constant, or
   CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
   split and "num" is its length.  lo_half and hi_half are output arrays
   that parallel "operands".  */
split_double_mode (machine_mode mode, rtx operands[],
                   int num, rtx lo_half[], rtx hi_half[])

  machine_mode half_mode;
  rtx mem_op = NULL_RTX;

  byte = GET_MODE_SIZE (half_mode);

      rtx op = operands[num];

      /* simplify_subreg refuses to split volatile memory addresses,
         but we still have to handle it.  */
      if (mem_op && rtx_equal_p (op, mem_op))
        {
          lo_half[num] = lo_half[mem_num];
          hi_half[num] = hi_half[mem_num];
        }

          lo_half[num] = adjust_address (op, half_mode, 0);
          hi_half[num] = adjust_address (op, half_mode, byte);

          lo_half[num] = simplify_gen_subreg (half_mode, op,
                                              GET_MODE (op) == VOIDmode
                                              ? mode : GET_MODE (op), 0);
          hi_half[num] = simplify_gen_subreg (half_mode, op,
                                              GET_MODE (op) == VOIDmode
                                              ? mode : GET_MODE (op), byte);
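/* Illustrative use (a hedged sketch, not code from this file): a 32-bit
   DImode move splitter would typically invoke the helper along these
   lines, with "operands" holding the destination and source:

     rtx lo[2], hi[2];
     split_double_mode (DImode, operands, 2, lo, hi);
     emit_move_insn (lo[0], lo[1]);
     emit_move_insn (hi[0], hi[1]);  */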
/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
   for a given REG.  */

ix86_expand_clear (rtx dest)

  /* We play register width games, which are only valid after reload.  */
  gcc_assert (reload_completed);

  /* Avoid HImode and its attendant prefix byte.  */
  if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
    dest = gen_rtx_REG (SImode, REGNO (dest));
  tmp = gen_rtx_SET (dest, const0_rtx);

  if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())

      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
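/* Rough intent (an assumed illustration, not emitted verbatim here):
   clearing EAX can be done either as "movl $0, %eax" (5 bytes, leaves the
   flags alone) or as "xorl %eax, %eax" (2 bytes, clobbers the flags),
   which is why the xor form above carries an explicit FLAGS_REG clobber
   inside the parallel.  */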
ix86_expand_move (machine_mode mode, rtx operands[])

  rtx tmp, addend = NULL_RTX;
  enum tls_model model;

  switch (GET_CODE (op1))

      if (GET_CODE (tmp) != PLUS
          || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)

      addend = XEXP (tmp, 1);

      model = SYMBOL_REF_TLS_MODEL (op1);

        op1 = legitimize_tls_address (op1, model, true);
      else if (ix86_force_load_from_GOT_p (op1))

          /* Load the external function address via GOT slot to avoid PLT.  */
          op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
          op1 = gen_rtx_CONST (Pmode, op1);
          op1 = gen_const_mem (Pmode, op1);
          set_mem_alias_set (op1, ix86_GOT_alias_set ());

          tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);

              op1 = force_operand (op1, NULL_RTX);
              op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
                                         op0, 1, OPTAB_DIRECT);

            op1 = force_operand (op1, op0);

        op1 = convert_to_mode (mode, op1, 1);

  if ((flag_pic || MACHOPIC_INDIRECT)
      && symbolic_operand (op1, mode))

      if (TARGET_MACHO && !TARGET_64BIT)

          if (MACHOPIC_INDIRECT)

              rtx temp = (op0 && REG_P (op0) && mode == Pmode)
                         ? op0 : gen_reg_rtx (Pmode);
              op1 = machopic_indirect_data_reference (op1, temp);

                op1 = machopic_legitimize_pic_address (op1, mode,
                                                       temp == op1 ? 0 : temp);

          if (op0 != op1 && GET_CODE (op0) != MEM)

              rtx insn = gen_rtx_SET (op0, op1);

          if (GET_CODE (op0) == MEM)
            op1 = force_reg (Pmode, op1);

              if (GET_CODE (temp) != REG)
                temp = gen_reg_rtx (Pmode);
              temp = legitimize_pic_address (op1, temp);

                op1 = force_reg (mode, op1);
      else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))

          rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
          op1 = legitimize_pic_address (op1, reg);

            op1 = convert_to_mode (mode, op1, 1);

          && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
              || !push_operand (op0, mode))
        op1 = force_reg (mode, op1);

      if (push_operand (op0, mode)
          && ! general_no_elim_operand (op1, mode))
        op1 = copy_to_mode_reg (mode, op1);

      /* Force large constants in 64bit compilation into register
         to get them CSEed.  */
      if (can_create_pseudo_p ()
          && (mode == DImode) && TARGET_64BIT
          && immediate_operand (op1, mode)
          && !x86_64_zext_immediate_operand (op1, VOIDmode)
          && !register_operand (op0, mode)
        op1 = copy_to_mode_reg (mode, op1);

      if (can_create_pseudo_p ()
          && CONST_DOUBLE_P (op1))

          /* If we are loading a floating point constant to a register,
             force the value to memory now, since we'll get better code
             out the back end.  */
          op1 = validize_mem (force_const_mem (mode, op1));
          if (!register_operand (op0, mode))

              rtx temp = gen_reg_rtx (mode);
              emit_insn (gen_rtx_SET (temp, op1));
              emit_move_insn (op0, temp);

  emit_insn (gen_rtx_SET (op0, op1));
ix86_expand_vector_move (machine_mode mode, rtx operands[])

  rtx op0 = operands[0], op1 = operands[1];
  /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
     psABI since the biggest alignment is 4 byte for IA MCU psABI.  */
  unsigned int align = (TARGET_IAMCU
                        ? GET_MODE_BITSIZE (mode)
                        : GET_MODE_ALIGNMENT (mode));

  if (push_operand (op0, VOIDmode))
    op0 = emit_move_resolve_push (mode, op0);

  /* Force constants other than zero into memory.  We do not know how
     the instructions used to build constants modify the upper 64 bits
     of the register, once we have that information we may be able
     to handle some of them more efficiently.  */
  if (can_create_pseudo_p ()
              && CONSTANT_P (SUBREG_REG (op1))))
      && ((register_operand (op0, mode)
           && !standard_sse_constant_p (op1, mode))
          /* ix86_expand_vector_move_misalign() does not like constants.  */
          || (SSE_REG_MODE_P (mode)
              && MEM_ALIGN (op0) < align)))

          machine_mode imode = GET_MODE (SUBREG_REG (op1));
          rtx r = force_const_mem (imode, SUBREG_REG (op1));
            r = validize_mem (r);
            r = force_reg (imode, SUBREG_REG (op1));
          op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));

        op1 = validize_mem (force_const_mem (mode, op1));

  /* We need to check memory alignment for SSE mode since attribute
     can make operands unaligned.  */
  if (can_create_pseudo_p ()
      && SSE_REG_MODE_P (mode)
      && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
          || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))

      /* ix86_expand_vector_move_misalign() does not like both
         arguments in memory.  */
      if (!register_operand (op0, mode)
          && !register_operand (op1, mode))
        op1 = force_reg (mode, op1);

      tmp[0] = op0; tmp[1] = op1;
      ix86_expand_vector_move_misalign (mode, tmp);

  /* Make operand1 a register if it isn't already.  */
  if (can_create_pseudo_p ()
      && !register_operand (op0, mode)
      && !register_operand (op1, mode))

      emit_move_insn (op0, force_reg (GET_MODE (op0), op1));

  emit_insn (gen_rtx_SET (op0, op1));
/* Split 32-byte AVX unaligned load and store if needed.  */

ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)

  rtx (*extract) (rtx, rtx, rtx);

  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))

      emit_insn (gen_rtx_SET (op0, op1));

  rtx orig_op0 = NULL_RTX;
  mode = GET_MODE (op0);
  switch (GET_MODE_CLASS (mode))

    case MODE_VECTOR_INT:
      if (mode != V32QImode)

          op0 = gen_reg_rtx (V32QImode);

          op0 = gen_lowpart (V32QImode, op0);
          op1 = gen_lowpart (V32QImode, op1);

    case MODE_VECTOR_FLOAT:

      extract = gen_avx_vextractf128v32qi;

      extract = gen_avx_vextractf128v8sf;

      extract = gen_avx_vextractf128v4df;

      rtx r = gen_reg_rtx (mode);
      m = adjust_address (op1, mode, 0);
      emit_move_insn (r, m);
      m = adjust_address (op1, mode, 16);
      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
      emit_move_insn (op0, r);

  else if (MEM_P (op0))

      m = adjust_address (op0, mode, 0);
      emit_insn (extract (m, op1, const0_rtx));
      m = adjust_address (op0, mode, 16);
      emit_insn (extract (m, copy_rtx (op1), const1_rtx));

    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
/* Implement the movmisalign patterns for SSE.  Non-SSE modes go
   straight to ix86_expand_vector_move.  */

/* Code generation for scalar reg-reg moves of single and double precision data:
     if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
     if (x86_sse_partial_reg_dependency == true)

   Code generation for scalar loads of double precision data:
     if (x86_sse_split_regs == true)
       movlpd mem, reg      (gas syntax)

   Code generation for unaligned packed loads of single precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
     if (x86_sse_unaligned_move_optimal)
     if (x86_sse_partial_reg_dependency == true)

   Code generation for unaligned packed loads of double precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
     if (x86_sse_unaligned_move_optimal)
     if (x86_sse_split_regs == true)
 */
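/* A hedged illustration of the split-load fallback sketched above (the
   exact sequence is chosen per-tune; this is only the shape of the
   V4SFmode case handled further below via gen_sse_loadlps/gen_sse_loadhps):

     xorps  %xmm0, %xmm0        ; optional, breaks the partial-reg dependency
     movlps mem,   %xmm0        ; low 8 bytes
     movhps mem+8, %xmm0        ; high 8 bytes  */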
ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])

  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())

      emit_insn (gen_rtx_SET (op0, op1));

      if (GET_MODE_SIZE (mode) == 32)
        ix86_avx256_split_vector_move_misalign (op0, op1);

        /* Always use 128-bit mov<mode>_internal pattern for AVX.  */
        emit_insn (gen_rtx_SET (op0, op1));

  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)

      emit_insn (gen_rtx_SET (op0, op1));

  /* ??? If we have typed data, then it would appear that using
     movdqu is the only way to get unaligned data loaded with
     integer registers.  */
  if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)

      emit_insn (gen_rtx_SET (op0, op1));

      if (TARGET_SSE2 && mode == V2DFmode)

          /* When SSE registers are split into halves, we can avoid
             writing to the top half twice.  */
          if (TARGET_SSE_SPLIT_REGS)

              /* ??? Not sure about the best option for the Intel chips.
                 The following would seem to satisfy; the register is
                 entirely cleared, breaking the dependency chain.  We
                 then store to the upper half, with a dependency depth
                 of one.  A rumor has it that Intel recommends two movsd
                 followed by an unpacklpd, but this is unconfirmed.  And
                 given that the dependency depth of the unpacklpd would
                 still be one, I'm not sure why this would be better.  */
              zero = CONST0_RTX (V2DFmode);

          m = adjust_address (op1, DFmode, 0);
          emit_insn (gen_sse2_loadlpd (op0, zero, m));
          m = adjust_address (op1, DFmode, 8);
          emit_insn (gen_sse2_loadhpd (op0, op0, m));

          if (mode != V4SFmode)
            t = gen_reg_rtx (V4SFmode);

          if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
            emit_move_insn (t, CONST0_RTX (V4SFmode));

          m = adjust_address (op1, V2SFmode, 0);
          emit_insn (gen_sse_loadlps (t, t, m));
          m = adjust_address (op1, V2SFmode, 8);
          emit_insn (gen_sse_loadhps (t, t, m));
          if (mode != V4SFmode)
            emit_move_insn (op0, gen_lowpart (mode, t));

  else if (MEM_P (op0))

      if (TARGET_SSE2 && mode == V2DFmode)

          m = adjust_address (op0, DFmode, 0);
          emit_insn (gen_sse2_storelpd (m, op1));
          m = adjust_address (op0, DFmode, 8);
          emit_insn (gen_sse2_storehpd (m, op1));

          if (mode != V4SFmode)
            op1 = gen_lowpart (V4SFmode, op1);

          m = adjust_address (op0, V2SFmode, 0);
          emit_insn (gen_sse_storelps (m, op1));
          m = adjust_address (op0, V2SFmode, 8);
          emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
/* Move bits 64:95 to bits 32:63.  */

ix86_move_vector_high_sse_to_mmx (rtx op)

  rtx mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4, GEN_INT (0), GEN_INT (2),
                                          GEN_INT (0), GEN_INT (0)));
  rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
  op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
  rtx insn = gen_rtx_SET (dest, op);
/* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */

ix86_split_mmx_pack (rtx operands[], enum rtx_code code)

  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];

  machine_mode dmode = GET_MODE (op0);
  machine_mode smode = GET_MODE (op1);
  machine_mode inner_dmode = GET_MODE_INNER (dmode);
  machine_mode inner_smode = GET_MODE_INNER (smode);

  /* Get the corresponding SSE mode for destination.  */
  int nunits = 16 / GET_MODE_SIZE (inner_dmode);
  machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
                                            nunits).require ();
  machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
                                                 nunits / 2).require ();

  /* Get the corresponding SSE mode for source.  */
  nunits = 16 / GET_MODE_SIZE (inner_smode);
  machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
                                            nunits).require ();

  /* Generate SSE pack with signed/unsigned saturation.  */
  rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));

  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
  rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
                                                    op1, op2));

  ix86_move_vector_high_sse_to_mmx (op0);
/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */

ix86_split_mmx_punpck (rtx operands[], bool high_p)

  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  machine_mode mode = GET_MODE (op0);

  /* The corresponding SSE mode.  */
  machine_mode sse_mode, double_sse_mode;

      sse_mode = V16QImode;
      double_sse_mode = V32QImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (16,
                                          GEN_INT (0), GEN_INT (16),
                                          GEN_INT (1), GEN_INT (17),
                                          GEN_INT (2), GEN_INT (18),
                                          GEN_INT (3), GEN_INT (19),
                                          GEN_INT (4), GEN_INT (20),
                                          GEN_INT (5), GEN_INT (21),
                                          GEN_INT (6), GEN_INT (22),
                                          GEN_INT (7), GEN_INT (23)));

      double_sse_mode = V16HImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (8,
                                          GEN_INT (0), GEN_INT (8),
                                          GEN_INT (1), GEN_INT (9),
                                          GEN_INT (2), GEN_INT (10),
                                          GEN_INT (3), GEN_INT (11)));

      double_sse_mode = V8SImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4,
                                          GEN_INT (0), GEN_INT (4),
                                          GEN_INT (1), GEN_INT (5)));

  /* Generate SSE punpcklXX.  */
  rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));

  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
  rtx insn = gen_rtx_SET (dest, op2);

      /* Move bits 64:127 to bits 0:63.  */
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4, GEN_INT (2), GEN_INT (3),
                                          GEN_INT (0), GEN_INT (0)));
      dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
      op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
      insn = gen_rtx_SET (dest, op1);
/* Helper function of ix86_fixup_binary_operands to canonicalize
   operand order.  Returns true if the operands should be swapped.  */

ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,

  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* If the operation is not commutative, we can't do anything.  */
  if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
      && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)

  /* Highest priority is that src1 should match dst.  */
  if (rtx_equal_p (dst, src1))

  if (rtx_equal_p (dst, src2))

  /* Next highest priority is that immediate constants come second.  */
  if (immediate_operand (src2, mode))

  if (immediate_operand (src1, mode))

  /* Lowest priority is that memory references should come second.  */
/* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
   destination to use for the operation.  If different from the true
   destination in operands[0], a copy operation will be required.  */

ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,

  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Canonicalize operand order.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))

      /* It is invalid to swap operands of different modes.  */
      gcc_assert (GET_MODE (src1) == GET_MODE (src2));

      std::swap (src1, src2);

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))

      /* Optimization: Only read from memory once.  */
      if (rtx_equal_p (src1, src2))

          src2 = force_reg (mode, src2);

      else if (rtx_equal_p (dst, src1))
        src2 = force_reg (mode, src2);

        src1 = force_reg (mode, src1);

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    dst = gen_reg_rtx (mode);

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    src1 = force_reg (mode, src1);

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    src1 = force_reg (mode, src1);

  /* Improve address combine.  */
      && GET_MODE_CLASS (mode) == MODE_INT
    src2 = force_reg (mode, src2);

/* Similarly, but assume that the destination has already been
   set up properly.  */

ix86_fixup_binary_operands_no_copy (enum rtx_code code,
                                    machine_mode mode, rtx operands[])

  rtx dst = ix86_fixup_binary_operands (code, mode, operands);
  gcc_assert (dst == operands[0]);
/* Attempt to expand a binary operator.  Make the expansion closer to the
   actual machine, than just general_operand, which will allow 3 separate
   memory references (one output, two input) in a single insn.  */

ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,

  rtx src1, src2, dst, op, clob;

  dst = ix86_fixup_binary_operands (code, mode, operands);

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));

      && !rtx_equal_p (dst, src1))

      /* This is going to be an LEA; avoid splitting it later.  */

      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
   the given OPERANDS.  */

ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,

  rtx op1 = NULL_RTX, op2 = NULL_RTX;
  if (SUBREG_P (operands[1]))

  else if (SUBREG_P (operands[2]))

  /* Optimize (__m128i) d | (__m128i) e and similar code
     when d and e are float vectors into float vector logical
     insn.  In C/C++ without using intrinsics there is no other way
     to express vector logical operation on float vectors than
     to cast them temporarily to integer vectors.  */
      && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
      && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
      && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
      && SUBREG_BYTE (op1) == 0
      && (GET_CODE (op2) == CONST_VECTOR
          || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
              && SUBREG_BYTE (op2) == 0))
      && can_create_pseudo_p ())

      switch (GET_MODE (SUBREG_REG (op1)))

      dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
      if (GET_CODE (op2) == CONST_VECTOR)

          op2 = gen_lowpart (GET_MODE (dst), op2);
          op2 = force_reg (GET_MODE (dst), op2);

          op2 = SUBREG_REG (operands[2]);
          if (!vector_operand (op2, GET_MODE (dst)))
            op2 = force_reg (GET_MODE (dst), op2);

      op1 = SUBREG_REG (op1);
      if (!vector_operand (op1, GET_MODE (dst)))
        op1 = force_reg (GET_MODE (dst), op1);

      emit_insn (gen_rtx_SET (dst,
                              gen_rtx_fmt_ee (code, GET_MODE (dst),
                                              op1, op2)));

      emit_move_insn (operands[0], gen_lowpart (mode, dst));

  if (!vector_operand (operands[1], mode))
    operands[1] = force_reg (mode, operands[1]);
  if (!vector_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  ix86_fixup_binary_operands_no_copy (code, mode, operands);
  emit_insn (gen_rtx_SET (operands[0],
                          gen_rtx_fmt_ee (code, mode, operands[1],
                                          operands[2])));
/* Return TRUE or FALSE depending on whether the binary operator meets the
   appropriate constraints.  */

ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,

  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))

  /* Canonicalize operand order for commutative operators.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    std::swap (src1, src2);

  /* If the destination is memory, we must have a matching source operand.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))

  /* Support "andhi/andsi/anddi" as a zero-extending move.  */
              || (TARGET_64BIT && mode == DImode))
             && satisfies_constraint_L (src2));
/* Attempt to expand a unary operator.  Make the expansion closer to the
   actual machine, than just general_operand, which will allow 2 separate
   memory references (one output, one input) in a single insn.  */

ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,

  bool matching_memory = false;
  rtx src, dst, op, clob;

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */

      if (rtx_equal_p (dst, src))
        matching_memory = true;

        dst = gen_reg_rtx (mode);

  /* When source operand is memory, destination must match.  */
  if (MEM_P (src) && !matching_memory)
    src = force_reg (mode, src);

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));

      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
/* Predict just emitted jump instruction to be taken with probability PROB.  */

predict_jump (int prob)

  rtx_insn *insn = get_last_insn ();
  gcc_assert (JUMP_P (insn));
  add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1124 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1125 divisor are within the range [0-255]. */
1128 ix86_split_idivmod (machine_mode mode
, rtx operands
[],
1131 rtx_code_label
*end_label
, *qimode_label
;
1134 rtx scratch
, tmp0
, tmp1
, tmp2
;
1135 rtx (*gen_divmod4_1
) (rtx
, rtx
, rtx
, rtx
);
1140 if (GET_MODE (operands
[0]) == SImode
)
1142 if (GET_MODE (operands
[1]) == SImode
)
1143 gen_divmod4_1
= unsigned_p
? gen_udivmodsi4_1
: gen_divmodsi4_1
;
1146 = unsigned_p
? gen_udivmodsi4_zext_2
: gen_divmodsi4_zext_2
;
1150 = unsigned_p
? gen_udivmodsi4_zext_1
: gen_divmodsi4_zext_1
;
1154 gen_divmod4_1
= unsigned_p
? gen_udivmoddi4_1
: gen_divmoddi4_1
;
1161 end_label
= gen_label_rtx ();
1162 qimode_label
= gen_label_rtx ();
1164 scratch
= gen_reg_rtx (mode
);
1166 /* Use 8bit unsigned divimod if dividend and divisor are within
1167 the range [0-255]. */
1168 emit_move_insn (scratch
, operands
[2]);
1169 scratch
= expand_simple_binop (mode
, IOR
, scratch
, operands
[3],
1170 scratch
, 1, OPTAB_DIRECT
);
1171 emit_insn (gen_test_ccno_1 (mode
, scratch
, GEN_INT (-0x100)));
1172 tmp0
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
1173 tmp0
= gen_rtx_EQ (VOIDmode
, tmp0
, const0_rtx
);
1174 tmp0
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp0
,
1175 gen_rtx_LABEL_REF (VOIDmode
, qimode_label
),
1177 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp0
));
1178 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
1179 JUMP_LABEL (insn
) = qimode_label
;
1181 /* Generate original signed/unsigned divimod. */
1182 emit_insn (gen_divmod4_1 (operands
[0], operands
[1],
1183 operands
[2], operands
[3]));
1185 /* Branch to the end. */
1186 emit_jump_insn (gen_jump (end_label
));
1189 /* Generate 8bit unsigned divide. */
1190 emit_label (qimode_label
);
1191 /* Don't use operands[0] for result of 8bit divide since not all
1192 registers support QImode ZERO_EXTRACT. */
1193 tmp0
= lowpart_subreg (HImode
, scratch
, mode
);
1194 tmp1
= lowpart_subreg (HImode
, operands
[2], mode
);
1195 tmp2
= lowpart_subreg (QImode
, operands
[3], mode
);
1196 emit_insn (gen_udivmodhiqi3 (tmp0
, tmp1
, tmp2
));
1200 div
= gen_rtx_UDIV (mode
, operands
[2], operands
[3]);
1201 mod
= gen_rtx_UMOD (mode
, operands
[2], operands
[3]);
1205 div
= gen_rtx_DIV (mode
, operands
[2], operands
[3]);
1206 mod
= gen_rtx_MOD (mode
, operands
[2], operands
[3]);
1210 if (GET_MODE (operands
[0]) != SImode
)
1211 div
= gen_rtx_ZERO_EXTEND (DImode
, div
);
1212 if (GET_MODE (operands
[1]) != SImode
)
1213 mod
= gen_rtx_ZERO_EXTEND (DImode
, mod
);
1216 /* Extract remainder from AH. */
1217 scratch
= gen_lowpart (GET_MODE (operands
[1]), scratch
);
1218 tmp1
= gen_rtx_ZERO_EXTRACT (GET_MODE (operands
[1]), scratch
,
1219 GEN_INT (8), GEN_INT (8));
1220 insn
= emit_move_insn (operands
[1], tmp1
);
1221 set_unique_reg_note (insn
, REG_EQUAL
, mod
);
1223 /* Zero extend quotient from AL. */
1224 tmp1
= gen_lowpart (QImode
, tmp0
);
1225 insn
= emit_insn (gen_extend_insn
1227 GET_MODE (operands
[0]), QImode
, 1));
1228 set_unique_reg_note (insn
, REG_EQUAL
, div
);
1230 emit_label (end_label
);
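/* Shape of the sequence emitted above (a hedged illustration in AT&T
   syntax; the register choices are only assumptions):

     orl    %esi, %ecx             ; scratch = dividend | divisor
     testl  $-0x100, %ecx
     je     .Lqimode                ; both values fit in 8 bits
     ...full-width div/idiv...      ; original signed/unsigned divmod
     jmp    .Lend
   .Lqimode:
     ...8-bit unsigned divb...      ; quotient in AL, remainder in AH
   .Lend:                                                              */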
1233 /* Emit x86 binary operand CODE in mode MODE, where the first operand
1234 matches destination. RTX includes clobber of FLAGS_REG. */
1237 ix86_emit_binop (enum rtx_code code
, machine_mode mode
,
1242 op
= gen_rtx_SET (dst
, gen_rtx_fmt_ee (code
, mode
, dst
, src
));
1243 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1245 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, op
, clob
)));
1248 /* Return true if regno1 def is nearest to the insn. */
1251 find_nearest_reg_def (rtx_insn
*insn
, int regno1
, int regno2
)
1253 rtx_insn
*prev
= insn
;
1254 rtx_insn
*start
= BB_HEAD (BLOCK_FOR_INSN (insn
));
1258 while (prev
&& prev
!= start
)
1260 if (!INSN_P (prev
) || !NONDEBUG_INSN_P (prev
))
1262 prev
= PREV_INSN (prev
);
1265 if (insn_defines_reg (regno1
, INVALID_REGNUM
, prev
))
1267 else if (insn_defines_reg (regno2
, INVALID_REGNUM
, prev
))
1269 prev
= PREV_INSN (prev
);
1272 /* None of the regs is defined in the bb. */
1276 /* Split lea instructions into a sequence of instructions
1277 which are executed on ALU to avoid AGU stalls.
1278 It is assumed that it is allowed to clobber flags register
1282 ix86_split_lea_for_addr (rtx_insn
*insn
, rtx operands
[], machine_mode mode
)
1284 unsigned int regno0
, regno1
, regno2
;
1285 struct ix86_address parts
;
1289 ok
= ix86_decompose_address (operands
[1], &parts
);
1292 target
= gen_lowpart (mode
, operands
[0]);
1294 regno0
= true_regnum (target
);
1295 regno1
= INVALID_REGNUM
;
1296 regno2
= INVALID_REGNUM
;
1300 parts
.base
= gen_lowpart (mode
, parts
.base
);
1301 regno1
= true_regnum (parts
.base
);
1306 parts
.index
= gen_lowpart (mode
, parts
.index
);
1307 regno2
= true_regnum (parts
.index
);
1311 parts
.disp
= gen_lowpart (mode
, parts
.disp
);
1313 if (parts
.scale
> 1)
1315 /* Case r1 = r1 + ... */
1316 if (regno1
== regno0
)
1318 /* If we have a case r1 = r1 + C * r2 then we
1319 should use multiplication which is very
1320 expensive. Assume cost model is wrong if we
1321 have such case here. */
1322 gcc_assert (regno2
!= regno0
);
1324 for (adds
= parts
.scale
; adds
> 0; adds
--)
1325 ix86_emit_binop (PLUS
, mode
, target
, parts
.index
);
1329 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1330 if (regno0
!= regno2
)
1331 emit_insn (gen_rtx_SET (target
, parts
.index
));
1333 /* Use shift for scaling. */
1334 ix86_emit_binop (ASHIFT
, mode
, target
,
1335 GEN_INT (exact_log2 (parts
.scale
)));
1338 ix86_emit_binop (PLUS
, mode
, target
, parts
.base
);
1340 if (parts
.disp
&& parts
.disp
!= const0_rtx
)
1341 ix86_emit_binop (PLUS
, mode
, target
, parts
.disp
);
1344 else if (!parts
.base
&& !parts
.index
)
1346 gcc_assert(parts
.disp
);
1347 emit_insn (gen_rtx_SET (target
, parts
.disp
));
1353 if (regno0
!= regno2
)
1354 emit_insn (gen_rtx_SET (target
, parts
.index
));
1356 else if (!parts
.index
)
1358 if (regno0
!= regno1
)
1359 emit_insn (gen_rtx_SET (target
, parts
.base
));
1363 if (regno0
== regno1
)
1365 else if (regno0
== regno2
)
1371 /* Find better operand for SET instruction, depending
1372 on which definition is farther from the insn. */
1373 if (find_nearest_reg_def (insn
, regno1
, regno2
))
1374 tmp
= parts
.index
, tmp1
= parts
.base
;
1376 tmp
= parts
.base
, tmp1
= parts
.index
;
1378 emit_insn (gen_rtx_SET (target
, tmp
));
1380 if (parts
.disp
&& parts
.disp
!= const0_rtx
)
1381 ix86_emit_binop (PLUS
, mode
, target
, parts
.disp
);
1383 ix86_emit_binop (PLUS
, mode
, target
, tmp1
);
1387 ix86_emit_binop (PLUS
, mode
, target
, tmp
);
1390 if (parts
.disp
&& parts
.disp
!= const0_rtx
)
1391 ix86_emit_binop (PLUS
, mode
, target
, parts
.disp
);
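/* Hedged example of the transformation performed above: an AGU-bound
   "lea 4(%rbx,%rcx,2), %rax" can be rewritten for the ALU roughly as

     movq %rcx, %rax
     addq %rax, %rax        ; or: salq $1, %rax
     addq %rbx, %rax
     addq $4,   %rax

   (register names are illustrative only; the adds and shifts clobber the
   flags, which is why ix86_emit_binop attaches a FLAGS_REG clobber).  */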
1395 /* Post-reload splitter for converting an SF or DFmode value in an
1396 SSE register into an unsigned SImode. */
1399 ix86_split_convert_uns_si_sse (rtx operands
[])
1401 machine_mode vecmode
;
1402 rtx value
, large
, zero_or_two31
, input
, two31
, x
;
1404 large
= operands
[1];
1405 zero_or_two31
= operands
[2];
1406 input
= operands
[3];
1407 two31
= operands
[4];
1408 vecmode
= GET_MODE (large
);
1409 value
= gen_rtx_REG (vecmode
, REGNO (operands
[0]));
1411 /* Load up the value into the low element. We must ensure that the other
1412 elements are valid floats -- zero is the easiest such value. */
1415 if (vecmode
== V4SFmode
)
1416 emit_insn (gen_vec_setv4sf_0 (value
, CONST0_RTX (V4SFmode
), input
));
1418 emit_insn (gen_sse2_loadlpd (value
, CONST0_RTX (V2DFmode
), input
));
1422 input
= gen_rtx_REG (vecmode
, REGNO (input
));
1423 emit_move_insn (value
, CONST0_RTX (vecmode
));
1424 if (vecmode
== V4SFmode
)
1425 emit_insn (gen_sse_movss (value
, value
, input
));
1427 emit_insn (gen_sse2_movsd (value
, value
, input
));
1430 emit_move_insn (large
, two31
);
1431 emit_move_insn (zero_or_two31
, MEM_P (two31
) ? large
: two31
);
1433 x
= gen_rtx_fmt_ee (LE
, vecmode
, large
, value
);
1434 emit_insn (gen_rtx_SET (large
, x
));
1436 x
= gen_rtx_AND (vecmode
, zero_or_two31
, large
);
1437 emit_insn (gen_rtx_SET (zero_or_two31
, x
));
1439 x
= gen_rtx_MINUS (vecmode
, value
, zero_or_two31
);
1440 emit_insn (gen_rtx_SET (value
, x
));
1442 large
= gen_rtx_REG (V4SImode
, REGNO (large
));
1443 emit_insn (gen_ashlv4si3 (large
, large
, GEN_INT (31)));
1445 x
= gen_rtx_REG (V4SImode
, REGNO (value
));
1446 if (vecmode
== V4SFmode
)
1447 emit_insn (gen_fix_truncv4sfv4si2 (x
, value
));
1449 emit_insn (gen_sse2_cvttpd2dq (x
, value
));
1452 emit_insn (gen_xorv4si3 (value
, value
, large
));
static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
                                                 machine_mode mode, rtx target,
                                                 rtx var, int one_var);

/* Convert an unsigned DImode value into a DFmode, using only SSE.
   Expects the 64-bit DImode to be supplied in a pair of integral
   registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
   -mfpmath=sse, !optimize_size only.  */

ix86_expand_convert_uns_didf_sse (rtx target, rtx input)

  REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
  rtx int_xmm, fp_xmm;
  rtx biases, exponents;

  int_xmm = gen_reg_rtx (V4SImode);
  if (TARGET_INTER_UNIT_MOVES_TO_VEC)
    emit_insn (gen_movdi_to_sse (int_xmm, input));
  else if (TARGET_SSE_SPLIT_REGS)

      emit_clobber (int_xmm);
      emit_move_insn (gen_lowpart (DImode, int_xmm), input);

      x = gen_reg_rtx (V2DImode);
      ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
      emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));

  x = gen_rtx_CONST_VECTOR (V4SImode,
                            gen_rtvec (4, GEN_INT (0x43300000UL),
                                       GEN_INT (0x45300000UL),
                                       const0_rtx, const0_rtx));
  exponents = validize_mem (force_const_mem (V4SImode, x));

  /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
  emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));

  /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
     yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
     Similarly (0x45300000UL ## fp_value_hi_xmm) yields
     (0x1.0p84 + double(fp_value_hi_xmm)).
     Note these exponents differ by 32.  */

  fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));

  /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
     in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
  real_ldexp (&bias_lo_rvt, &dconst1, 52);
  real_ldexp (&bias_hi_rvt, &dconst1, 84);
  biases = const_double_from_real_value (bias_lo_rvt, DFmode);
  x = const_double_from_real_value (bias_hi_rvt, DFmode);
  biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
  biases = validize_mem (force_const_mem (V2DFmode, biases));
  emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));

  /* Add the upper and lower DFmode values together.  */
    emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));

      x = copy_to_mode_reg (V2DFmode, fp_xmm);
      emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
      emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));

  ix86_expand_vector_extract (false, target, fp_xmm, 0);
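/* Worked example of the bias trick used above (hedged, illustrative
   numbers): for input 0x0000000500000003 (hi = 5, lo = 3), interleaving
   with the exponent words gives the doubles 0x4330000000000003 = 2^52 + 3
   and 0x4530000000000005 = 2^84 + 5*2^32.  Subtracting the 2^52 and 2^84
   biases and adding the two halves yields 3 + 5*2^32, i.e. the exact
   value of the unsigned 64-bit input as a double.  */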
/* Not used, but eases macroization of patterns.  */

ix86_expand_convert_uns_sixf_sse (rtx, rtx)

/* Convert an unsigned SImode value into a DFmode.  Only currently used
   for SSE, but applicable anywhere.  */

ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)

  REAL_VALUE_TYPE TWO31r;

  x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
                           NULL, 1, OPTAB_DIRECT);

  fp = gen_reg_rtx (DFmode);
  emit_insn (gen_floatsidf2 (fp, x));

  real_ldexp (&TWO31r, &dconst1, 31);
  x = const_double_from_real_value (TWO31r, DFmode);

  x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);

  emit_move_insn (target, x);
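/* Hedged numeric check of the unsigned-SI path above: for input
   0xFFFFFFFF (4294967295), adding -2^31 with wraparound gives 0x7FFFFFFF
   = 2147483647; converting that as a signed value to double and adding
   back 2^31 (2147483648.0) reproduces 4294967295.0 exactly, since doubles
   represent all 32-bit integers without rounding.  */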
1557 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1558 32-bit mode; otherwise we have a direct convert instruction. */
1561 ix86_expand_convert_sign_didf_sse (rtx target
, rtx input
)
1563 REAL_VALUE_TYPE TWO32r
;
1564 rtx fp_lo
, fp_hi
, x
;
1566 fp_lo
= gen_reg_rtx (DFmode
);
1567 fp_hi
= gen_reg_rtx (DFmode
);
1569 emit_insn (gen_floatsidf2 (fp_hi
, gen_highpart (SImode
, input
)));
1571 real_ldexp (&TWO32r
, &dconst1
, 32);
1572 x
= const_double_from_real_value (TWO32r
, DFmode
);
1573 fp_hi
= expand_simple_binop (DFmode
, MULT
, fp_hi
, x
, fp_hi
, 0, OPTAB_DIRECT
);
1575 ix86_expand_convert_uns_sidf_sse (fp_lo
, gen_lowpart (SImode
, input
));
1577 x
= expand_simple_binop (DFmode
, PLUS
, fp_hi
, fp_lo
, target
,
1580 emit_move_insn (target
, x
);
1583 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1584 For x86_32, -mfpmath=sse, !optimize_size only. */
1586 ix86_expand_convert_uns_sisf_sse (rtx target
, rtx input
)
1588 REAL_VALUE_TYPE ONE16r
;
1589 rtx fp_hi
, fp_lo
, int_hi
, int_lo
, x
;
1591 real_ldexp (&ONE16r
, &dconst1
, 16);
1592 x
= const_double_from_real_value (ONE16r
, SFmode
);
1593 int_lo
= expand_simple_binop (SImode
, AND
, input
, GEN_INT(0xffff),
1594 NULL
, 0, OPTAB_DIRECT
);
1595 int_hi
= expand_simple_binop (SImode
, LSHIFTRT
, input
, GEN_INT(16),
1596 NULL
, 0, OPTAB_DIRECT
);
1597 fp_hi
= gen_reg_rtx (SFmode
);
1598 fp_lo
= gen_reg_rtx (SFmode
);
1599 emit_insn (gen_floatsisf2 (fp_hi
, int_hi
));
1600 emit_insn (gen_floatsisf2 (fp_lo
, int_lo
));
1601 fp_hi
= expand_simple_binop (SFmode
, MULT
, fp_hi
, x
, fp_hi
,
1603 fp_hi
= expand_simple_binop (SFmode
, PLUS
, fp_hi
, fp_lo
, target
,
1605 if (!rtx_equal_p (target
, fp_hi
))
1606 emit_move_insn (target
, fp_hi
);
1609 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1610 a vector of unsigned ints VAL to vector of floats TARGET. */
1613 ix86_expand_vector_convert_uns_vsivsf (rtx target
, rtx val
)
1616 REAL_VALUE_TYPE TWO16r
;
1617 machine_mode intmode
= GET_MODE (val
);
1618 machine_mode fltmode
= GET_MODE (target
);
1619 rtx (*cvt
) (rtx
, rtx
);
1621 if (intmode
== V4SImode
)
1622 cvt
= gen_floatv4siv4sf2
;
1624 cvt
= gen_floatv8siv8sf2
;
1625 tmp
[0] = ix86_build_const_vector (intmode
, 1, GEN_INT (0xffff));
1626 tmp
[0] = force_reg (intmode
, tmp
[0]);
1627 tmp
[1] = expand_simple_binop (intmode
, AND
, val
, tmp
[0], NULL_RTX
, 1,
1629 tmp
[2] = expand_simple_binop (intmode
, LSHIFTRT
, val
, GEN_INT (16),
1630 NULL_RTX
, 1, OPTAB_DIRECT
);
1631 tmp
[3] = gen_reg_rtx (fltmode
);
1632 emit_insn (cvt (tmp
[3], tmp
[1]));
1633 tmp
[4] = gen_reg_rtx (fltmode
);
1634 emit_insn (cvt (tmp
[4], tmp
[2]));
1635 real_ldexp (&TWO16r
, &dconst1
, 16);
1636 tmp
[5] = const_double_from_real_value (TWO16r
, SFmode
);
1637 tmp
[5] = force_reg (fltmode
, ix86_build_const_vector (fltmode
, 1, tmp
[5]));
1638 tmp
[6] = expand_simple_binop (fltmode
, MULT
, tmp
[4], tmp
[5], NULL_RTX
, 1,
1640 tmp
[7] = expand_simple_binop (fltmode
, PLUS
, tmp
[3], tmp
[6], target
, 1,
1642 if (tmp
[7] != target
)
1643 emit_move_insn (target
, tmp
[7]);
1646 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1647 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1648 This is done by doing just signed conversion if < 0x1p31, and otherwise by
1649 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
1652 ix86_expand_adjust_ufix_to_sfix_si (rtx val
, rtx
*xorp
)
1654 REAL_VALUE_TYPE TWO31r
;
1656 machine_mode mode
= GET_MODE (val
);
1657 machine_mode scalarmode
= GET_MODE_INNER (mode
);
1658 machine_mode intmode
= GET_MODE_SIZE (mode
) == 32 ? V8SImode
: V4SImode
;
1659 rtx (*cmp
) (rtx
, rtx
, rtx
, rtx
);
1662 for (i
= 0; i
< 3; i
++)
1663 tmp
[i
] = gen_reg_rtx (mode
);
1664 real_ldexp (&TWO31r
, &dconst1
, 31);
1665 two31r
= const_double_from_real_value (TWO31r
, scalarmode
);
1666 two31r
= ix86_build_const_vector (mode
, 1, two31r
);
1667 two31r
= force_reg (mode
, two31r
);
1670 case E_V8SFmode
: cmp
= gen_avx_maskcmpv8sf3
; break;
1671 case E_V4SFmode
: cmp
= gen_sse_maskcmpv4sf3
; break;
1672 case E_V4DFmode
: cmp
= gen_avx_maskcmpv4df3
; break;
1673 case E_V2DFmode
: cmp
= gen_sse2_maskcmpv2df3
; break;
1674 default: gcc_unreachable ();
1676 tmp
[3] = gen_rtx_LE (mode
, two31r
, val
);
1677 emit_insn (cmp (tmp
[0], two31r
, val
, tmp
[3]));
1678 tmp
[1] = expand_simple_binop (mode
, AND
, tmp
[0], two31r
, tmp
[1],
1680 if (intmode
== V4SImode
|| TARGET_AVX2
)
1681 *xorp
= expand_simple_binop (intmode
, ASHIFT
,
1682 gen_lowpart (intmode
, tmp
[0]),
1683 GEN_INT (31), NULL_RTX
, 0,
1687 rtx two31
= gen_int_mode (HOST_WIDE_INT_1U
<< 31, SImode
);
1688 two31
= ix86_build_const_vector (intmode
, 1, two31
);
1689 *xorp
= expand_simple_binop (intmode
, AND
,
1690 gen_lowpart (intmode
, tmp
[0]),
1694 return expand_simple_binop (mode
, MINUS
, val
, tmp
[1], tmp
[2],
1698 /* Generate code for floating point ABS or NEG. */
1701 ix86_expand_fp_absneg_operator (enum rtx_code code
, machine_mode mode
,
1705 bool use_sse
= false;
1706 bool vector_mode
= VECTOR_MODE_P (mode
);
1707 machine_mode vmode
= mode
;
1710 if (vector_mode
|| mode
== TFmode
)
1712 else if (TARGET_SSE_MATH
)
1714 use_sse
= SSE_FLOAT_MODE_P (mode
);
1717 else if (mode
== DFmode
)
1724 set
= gen_rtx_fmt_e (code
, mode
, src
);
1725 set
= gen_rtx_SET (dst
, set
);
1729 rtx mask
, use
, clob
;
1731 /* NEG and ABS performed with SSE use bitwise mask operations.
1732 Create the appropriate mask now. */
1733 mask
= ix86_build_signbit_mask (vmode
, vector_mode
, code
== ABS
);
1734 use
= gen_rtx_USE (VOIDmode
, mask
);
1735 if (vector_mode
|| mode
== TFmode
)
1736 par
= gen_rtvec (2, set
, use
);
1739 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1740 par
= gen_rtvec (3, set
, use
, clob
);
1747 /* Changing of sign for FP values is doable using integer unit too. */
1748 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1749 par
= gen_rtvec (2, set
, clob
);
1752 emit_insn (gen_rtx_PARALLEL (VOIDmode
, par
));
1755 /* Deconstruct a floating point ABS or NEG operation
1756 with integer registers into integer operations. */
1759 ix86_split_fp_absneg_operator (enum rtx_code code
, machine_mode mode
,
1762 enum rtx_code absneg_op
;
1765 gcc_assert (operands_match_p (operands
[0], operands
[1]));
1770 dst
= gen_lowpart (SImode
, operands
[0]);
1774 set
= gen_int_mode (0x7fffffff, SImode
);
1779 set
= gen_int_mode (0x80000000, SImode
);
1782 set
= gen_rtx_fmt_ee (absneg_op
, SImode
, dst
, set
);
1788 dst
= gen_lowpart (DImode
, operands
[0]);
1789 dst
= gen_rtx_ZERO_EXTRACT (DImode
, dst
, const1_rtx
, GEN_INT (63));
1794 set
= gen_rtx_NOT (DImode
, dst
);
1798 dst
= gen_highpart (SImode
, operands
[0]);
1802 set
= gen_int_mode (0x7fffffff, SImode
);
1807 set
= gen_int_mode (0x80000000, SImode
);
1810 set
= gen_rtx_fmt_ee (absneg_op
, SImode
, dst
, set
);
1815 dst
= gen_rtx_REG (SImode
,
1816 REGNO (operands
[0]) + (TARGET_64BIT
? 1 : 2));
1819 set
= GEN_INT (0x7fff);
1824 set
= GEN_INT (0x8000);
1827 set
= gen_rtx_fmt_ee (absneg_op
, SImode
, dst
, set
);
1834 set
= gen_rtx_SET (dst
, set
);
1836 rtx clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1837 rtvec par
= gen_rtvec (2, set
, clob
);
1839 emit_insn (gen_rtx_PARALLEL (VOIDmode
, par
));
1842 /* Expand a copysign operation. Special case operand 0 being a constant. */
1845 ix86_expand_copysign (rtx operands
[])
1847 machine_mode mode
, vmode
;
1848 rtx dest
, op0
, op1
, mask
;
1854 mode
= GET_MODE (dest
);
1858 else if (mode
== DFmode
)
1860 else if (mode
== TFmode
)
1865 mask
= ix86_build_signbit_mask (vmode
, 0, 0);
1867 if (CONST_DOUBLE_P (op0
))
1869 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0
)))
1870 op0
= simplify_unary_operation (ABS
, mode
, op0
, mode
);
1872 if (mode
== SFmode
|| mode
== DFmode
)
1874 if (op0
== CONST0_RTX (mode
))
1875 op0
= CONST0_RTX (vmode
);
1878 rtx v
= ix86_build_const_vector (vmode
, false, op0
);
1880 op0
= force_reg (vmode
, v
);
1883 else if (op0
!= CONST0_RTX (mode
))
1884 op0
= force_reg (mode
, op0
);
1886 emit_insn (gen_copysign3_const (mode
, dest
, op0
, op1
, mask
));
1890 rtx nmask
= ix86_build_signbit_mask (vmode
, 0, 1);
1892 emit_insn (gen_copysign3_var
1893 (mode
, dest
, NULL_RTX
, op0
, op1
, nmask
, mask
));
1897 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
1898 be a constant, and so has already been expanded into a vector constant. */
1901 ix86_split_copysign_const (rtx operands
[])
1903 machine_mode mode
, vmode
;
1904 rtx dest
, op0
, mask
, x
;
1910 mode
= GET_MODE (dest
);
1911 vmode
= GET_MODE (mask
);
1913 dest
= lowpart_subreg (vmode
, dest
, mode
);
1914 x
= gen_rtx_AND (vmode
, dest
, mask
);
1915 emit_insn (gen_rtx_SET (dest
, x
));
1917 if (op0
!= CONST0_RTX (vmode
))
1919 x
= gen_rtx_IOR (vmode
, dest
, op0
);
1920 emit_insn (gen_rtx_SET (dest
, x
));
1924 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
1925 so we have to do two masks. */
1928 ix86_split_copysign_var (rtx operands
[])
1930 machine_mode mode
, vmode
;
1931 rtx dest
, scratch
, op0
, op1
, mask
, nmask
, x
;
1934 scratch
= operands
[1];
1937 nmask
= operands
[4];
1940 mode
= GET_MODE (dest
);
1941 vmode
= GET_MODE (mask
);
1943 if (rtx_equal_p (op0
, op1
))
1945 /* Shouldn't happen often (it's useless, obviously), but when it does
1946 we'd generate incorrect code if we continue below. */
1947 emit_move_insn (dest
, op0
);
1951 if (REG_P (mask
) && REGNO (dest
) == REGNO (mask
)) /* alternative 0 */
1953 gcc_assert (REGNO (op1
) == REGNO (scratch
));
1955 x
= gen_rtx_AND (vmode
, scratch
, mask
);
1956 emit_insn (gen_rtx_SET (scratch
, x
));
1959 op0
= lowpart_subreg (vmode
, op0
, mode
);
1960 x
= gen_rtx_NOT (vmode
, dest
);
1961 x
= gen_rtx_AND (vmode
, x
, op0
);
1962 emit_insn (gen_rtx_SET (dest
, x
));
1966 if (REGNO (op1
) == REGNO (scratch
)) /* alternative 1,3 */
1968 x
= gen_rtx_AND (vmode
, scratch
, mask
);
1970 else /* alternative 2,4 */
1972 gcc_assert (REGNO (mask
) == REGNO (scratch
));
1973 op1
= lowpart_subreg (vmode
, op1
, mode
);
1974 x
= gen_rtx_AND (vmode
, scratch
, op1
);
1976 emit_insn (gen_rtx_SET (scratch
, x
));
1978 if (REGNO (op0
) == REGNO (dest
)) /* alternative 1,2 */
1980 dest
= lowpart_subreg (vmode
, op0
, mode
);
1981 x
= gen_rtx_AND (vmode
, dest
, nmask
);
1983 else /* alternative 3,4 */
1985 gcc_assert (REGNO (nmask
) == REGNO (dest
));
1987 op0
= lowpart_subreg (vmode
, op0
, mode
);
1988 x
= gen_rtx_AND (vmode
, dest
, op0
);
1990 emit_insn (gen_rtx_SET (dest
, x
));
1993 x
= gen_rtx_IOR (vmode
, dest
, scratch
);
1994 emit_insn (gen_rtx_SET (dest
, x
));
1997 /* Expand an xorsign operation. */
2000 ix86_expand_xorsign (rtx operands
[])
2002 machine_mode mode
, vmode
;
2003 rtx dest
, op0
, op1
, mask
;
2009 mode
= GET_MODE (dest
);
2013 else if (mode
== DFmode
)
2018 mask
= ix86_build_signbit_mask (vmode
, 0, 0);
2020 emit_insn (gen_xorsign3_1 (mode
, dest
, op0
, op1
, mask
));
2023 /* Deconstruct an xorsign operation into bit masks. */
2026 ix86_split_xorsign (rtx operands
[])
2028 machine_mode mode
, vmode
;
2029 rtx dest
, op0
, mask
, x
;
2035 mode
= GET_MODE (dest
);
2036 vmode
= GET_MODE (mask
);
2038 dest
= lowpart_subreg (vmode
, dest
, mode
);
2039 x
= gen_rtx_AND (vmode
, dest
, mask
);
2040 emit_insn (gen_rtx_SET (dest
, x
));
2042 op0
= lowpart_subreg (vmode
, op0
, mode
);
2043 x
= gen_rtx_XOR (vmode
, dest
, op0
);
2044 emit_insn (gen_rtx_SET (dest
, x
));
2047 static rtx
ix86_expand_compare (enum rtx_code code
, rtx op0
, rtx op1
);
2050 ix86_expand_branch (enum rtx_code code
, rtx op0
, rtx op1
, rtx label
)
2052 machine_mode mode
= GET_MODE (op0
);
2055 /* Handle special case - vector comparsion with boolean result, transform
2056 it using ptest instruction. */
2057 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
)
2059 rtx flag
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
2060 machine_mode p_mode
= GET_MODE_SIZE (mode
) == 32 ? V4DImode
: V2DImode
;
2062 gcc_assert (code
== EQ
|| code
== NE
);
2063 /* Generate XOR since we can't check that one operand is zero vector. */
2064 tmp
= gen_reg_rtx (mode
);
2065 emit_insn (gen_rtx_SET (tmp
, gen_rtx_XOR (mode
, op0
, op1
)));
2066 tmp
= gen_lowpart (p_mode
, tmp
);
2067 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode
, FLAGS_REG
),
2068 gen_rtx_UNSPEC (CCmode
,
2069 gen_rtvec (2, tmp
, tmp
),
2071 tmp
= gen_rtx_fmt_ee (code
, VOIDmode
, flag
, const0_rtx
);
2072 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
2073 gen_rtx_LABEL_REF (VOIDmode
, label
),
2075 emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
2088 tmp
= ix86_expand_compare (code
, op0
, op1
);
2089 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
2090 gen_rtx_LABEL_REF (VOIDmode
, label
),
2092 emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
2098 /* For 32-bit target DI comparison may be performed on
2099 SSE registers. To allow this we should avoid split
2100 to SI mode which is achieved by doing xor in DI mode
2101 and then comparing with zero (which is recognized by
2102 STV pass). We don't compare using xor when optimizing
2104 if (!optimize_insn_for_size_p ()
2106 && (code
== EQ
|| code
== NE
))
2108 op0
= force_reg (mode
, gen_rtx_XOR (mode
, op0
, op1
));
2113 /* Expand DImode branch into multiple compare+branch. */
2116 rtx_code_label
*label2
;
2117 enum rtx_code code1
, code2
, code3
;
2118 machine_mode submode
;
2120 if (CONSTANT_P (op0
) && !CONSTANT_P (op1
))
2122 std::swap (op0
, op1
);
2123 code
= swap_condition (code
);
2126 split_double_mode (mode
, &op0
, 1, lo
+0, hi
+0);
2127 split_double_mode (mode
, &op1
, 1, lo
+1, hi
+1);
2129 submode
= mode
== DImode
? SImode
: DImode
;
2131 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2132 avoid two branches. This costs one extra insn, so disable when
2133 optimizing for size. */
2135 if ((code
== EQ
|| code
== NE
)
2136 && (!optimize_insn_for_size_p ()
2137 || hi
[1] == const0_rtx
|| lo
[1] == const0_rtx
))
2142 if (hi
[1] != const0_rtx
)
2143 xor1
= expand_binop (submode
, xor_optab
, xor1
, hi
[1],
2144 NULL_RTX
, 0, OPTAB_WIDEN
);
2147 if (lo
[1] != const0_rtx
)
2148 xor0
= expand_binop (submode
, xor_optab
, xor0
, lo
[1],
2149 NULL_RTX
, 0, OPTAB_WIDEN
);
2151 tmp
= expand_binop (submode
, ior_optab
, xor1
, xor0
,
2152 NULL_RTX
, 0, OPTAB_WIDEN
);
2154 ix86_expand_branch (code
, tmp
, const0_rtx
, label
);
2158 /* Otherwise, if we are doing less-than or greater-or-equal-than,
2159 op1 is a constant and the low word is zero, then we can just
2160 examine the high word. Similarly for low word -1 and
2161 less-or-equal-than or greater-than. */
2163 if (CONST_INT_P (hi
[1]))
2166 case LT
: case LTU
: case GE
: case GEU
:
2167 if (lo
[1] == const0_rtx
)
2169 ix86_expand_branch (code
, hi
[0], hi
[1], label
);
2173 case LE
: case LEU
: case GT
: case GTU
:
2174 if (lo
[1] == constm1_rtx
)
2176 ix86_expand_branch (code
, hi
[0], hi
[1], label
);
2184 /* Emulate comparisons that do not depend on Zero flag with
2185 double-word subtraction. Note that only Overflow, Sign
2186 and Carry flags are valid, so swap arguments and condition
2187 of comparisons that would otherwise test Zero flag. */
2191 case LE
: case LEU
: case GT
: case GTU
:
2192 std::swap (lo
[0], lo
[1]);
2193 std::swap (hi
[0], hi
[1]);
2194 code
= swap_condition (code
);
2197 case LT
: case LTU
: case GE
: case GEU
:
2199 bool uns
= (code
== LTU
|| code
== GEU
);
2200 rtx (*sbb_insn
) (machine_mode
, rtx
, rtx
, rtx
)
2201 = uns
? gen_sub3_carry_ccc
: gen_sub3_carry_ccgz
;
2203 if (!nonimmediate_operand (lo
[0], submode
))
2204 lo
[0] = force_reg (submode
, lo
[0]);
2205 if (!x86_64_general_operand (lo
[1], submode
))
2206 lo
[1] = force_reg (submode
, lo
[1]);
2208 if (!register_operand (hi
[0], submode
))
2209 hi
[0] = force_reg (submode
, hi
[0]);
2210 if ((uns
&& !nonimmediate_operand (hi
[1], submode
))
2211 || (!uns
&& !x86_64_general_operand (hi
[1], submode
)))
2212 hi
[1] = force_reg (submode
, hi
[1]);
2214 emit_insn (gen_cmp_1 (submode
, lo
[0], lo
[1]));
2216 tmp
= gen_rtx_SCRATCH (submode
);
2217 emit_insn (sbb_insn (submode
, tmp
, hi
[0], hi
[1]));
2219 tmp
= gen_rtx_REG (uns
? CCCmode
: CCGZmode
, FLAGS_REG
);
2220 ix86_expand_branch (code
, tmp
, const0_rtx
, label
);
2228 /* Otherwise, we need two or three jumps. */
2230 label2
= gen_label_rtx ();
2233 code2
= swap_condition (code
);
2234 code3
= unsigned_condition (code
);
2238 case LT
: case GT
: case LTU
: case GTU
:
2241 case LE
: code1
= LT
; code2
= GT
; break;
2242 case GE
: code1
= GT
; code2
= LT
; break;
2243 case LEU
: code1
= LTU
; code2
= GTU
; break;
2244 case GEU
: code1
= GTU
; code2
= LTU
; break;
2246 case EQ
: code1
= UNKNOWN
; code2
= NE
; break;
2247 case NE
: code2
= UNKNOWN
; break;
2255 * if (hi(a) < hi(b)) goto true;
2256 * if (hi(a) > hi(b)) goto false;
2257 * if (lo(a) < lo(b)) goto true;
2261 if (code1
!= UNKNOWN
)
2262 ix86_expand_branch (code1
, hi
[0], hi
[1], label
);
2263 if (code2
!= UNKNOWN
)
2264 ix86_expand_branch (code2
, hi
[0], hi
[1], label2
);
2266 ix86_expand_branch (code3
, lo
[0], lo
[1], label
);
2268 if (code2
!= UNKNOWN
)
2269 emit_label (label2
);
2274 gcc_assert (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_CC
);
2279 /* Figure out whether to use unordered fp comparisons. */
2282 ix86_unordered_fp_compare (enum rtx_code code
)
2284 if (!TARGET_IEEE_FP
)
2313 /* Return a comparison we can do and that it is equivalent to
2314 swap_condition (code) apart possibly from orderedness.
2315 But, never change orderedness if TARGET_IEEE_FP, returning
2316 UNKNOWN in that case if necessary. */
2318 static enum rtx_code
2319 ix86_fp_swap_condition (enum rtx_code code
)
2323 case GT
: /* GTU - CF=0 & ZF=0 */
2324 return TARGET_IEEE_FP
? UNKNOWN
: UNLT
;
2325 case GE
: /* GEU - CF=0 */
2326 return TARGET_IEEE_FP
? UNKNOWN
: UNLE
;
2327 case UNLT
: /* LTU - CF=1 */
2328 return TARGET_IEEE_FP
? UNKNOWN
: GT
;
2329 case UNLE
: /* LEU - CF=1 | ZF=1 */
2330 return TARGET_IEEE_FP
? UNKNOWN
: GE
;
2332 return swap_condition (code
);
2336 /* Return cost of comparison CODE using the best strategy for performance.
2337 All following functions do use number of instructions as a cost metrics.
2338 In future this should be tweaked to compute bytes for optimize_size and
2339 take into account performance of various instructions on various CPUs. */
2342 ix86_fp_comparison_cost (enum rtx_code code
)
2346 /* The cost of code using bit-twiddling on %ah. */
2363 arith_cost
= TARGET_IEEE_FP
? 5 : 4;
2367 arith_cost
= TARGET_IEEE_FP
? 6 : 4;
2373 switch (ix86_fp_comparison_strategy (code
))
2375 case IX86_FPCMP_COMI
:
2376 return arith_cost
> 4 ? 3 : 2;
2377 case IX86_FPCMP_SAHF
:
2378 return arith_cost
> 4 ? 4 : 3;
2384 /* Swap, force into registers, or otherwise massage the two operands
2385 to a fp comparison. The operands are updated in place; the new
2386 comparison code is returned. */
2388 static enum rtx_code
2389 ix86_prepare_fp_compare_args (enum rtx_code code
, rtx
*pop0
, rtx
*pop1
)
2391 bool unordered_compare
= ix86_unordered_fp_compare (code
);
2392 rtx op0
= *pop0
, op1
= *pop1
;
2393 machine_mode op_mode
= GET_MODE (op0
);
2394 bool is_sse
= TARGET_SSE_MATH
&& SSE_FLOAT_MODE_P (op_mode
);
2396 /* All of the unordered compare instructions only work on registers.
2397 The same is true of the fcomi compare instructions. The XFmode
2398 compare instructions require registers except when comparing
2399 against zero or when converting operand 1 from fixed point to
2403 && (unordered_compare
2404 || (op_mode
== XFmode
2405 && ! (standard_80387_constant_p (op0
) == 1
2406 || standard_80387_constant_p (op1
) == 1)
2407 && GET_CODE (op1
) != FLOAT
)
2408 || ix86_fp_comparison_strategy (code
) == IX86_FPCMP_COMI
))
2410 op0
= force_reg (op_mode
, op0
);
2411 op1
= force_reg (op_mode
, op1
);
2415 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2416 things around if they appear profitable, otherwise force op0
2419 if (standard_80387_constant_p (op0
) == 0
2421 && ! (standard_80387_constant_p (op1
) == 0
2424 enum rtx_code new_code
= ix86_fp_swap_condition (code
);
2425 if (new_code
!= UNKNOWN
)
2427 std::swap (op0
, op1
);
2433 op0
= force_reg (op_mode
, op0
);
2435 if (CONSTANT_P (op1
))
2437 int tmp
= standard_80387_constant_p (op1
);
2439 op1
= validize_mem (force_const_mem (op_mode
, op1
));
2443 op1
= force_reg (op_mode
, op1
);
2446 op1
= force_reg (op_mode
, op1
);
  /* Try to rearrange the comparison to make it cheaper.  */
  if (ix86_fp_comparison_cost (code)
      > ix86_fp_comparison_cost (swap_condition (code))
      && (REG_P (op1) || can_create_pseudo_p ()))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);
    }
/* Generate insn patterns to do a floating point compare of OPERANDS.  */

static rtx
ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx tmp, scratch;
  machine_mode cmp_mode;

  code = ix86_prepare_fp_compare_args (code, &op0, &op1);

  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
  /* Do fcomi/sahf based test when profitable.  */
  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      cmp_mode = CCFPmode;
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
      break;

    case IX86_FPCMP_SAHF:
      cmp_mode = CCFPmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));
      emit_insn (gen_x86_sahf_1 (scratch));
      break;
    case IX86_FPCMP_ARITH:
      cmp_mode = CCNOmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));
2503 /* In the unordered case, we have to check C2 for NaN's, which
2504 doesn't happen to work out to anything nice combination-wise.
2505 So do some bit twiddling on the value we've got in AH to come
2506 up with an appropriate set of condition codes. */
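      /* For reference: after FNSTSW, the high byte of the status word holds
	 C0 in bit 0 (0x01), C2 in bit 2 (0x04) and C3 in bit 6 (0x40), so
	 the 0x45 masks below select C3|C2|C0; C2 is only set for an
	 unordered (NaN) result.  */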
2512 if (code
== GT
|| !TARGET_IEEE_FP
)
2514 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x45)));
2519 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2520 emit_insn (gen_addqi_ext_1 (scratch
, scratch
, constm1_rtx
));
2521 emit_insn (gen_cmpqi_ext_3 (scratch
, GEN_INT (0x44)));
2528 if (code
== LT
&& TARGET_IEEE_FP
)
2530 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2531 emit_insn (gen_cmpqi_ext_3 (scratch
, const1_rtx
));
2537 emit_insn (gen_testqi_ext_1_ccno (scratch
, const1_rtx
));
2543 if (code
== GE
|| !TARGET_IEEE_FP
)
2545 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x05)));
2550 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2551 emit_insn (gen_xorqi_ext_1_cc (scratch
, scratch
, const1_rtx
));
2557 if (code
== LE
&& TARGET_IEEE_FP
)
2559 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2560 emit_insn (gen_addqi_ext_1 (scratch
, scratch
, constm1_rtx
));
2561 emit_insn (gen_cmpqi_ext_3 (scratch
, GEN_INT (0x40)));
2567 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x45)));
2573 if (code
== EQ
&& TARGET_IEEE_FP
)
2575 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2576 emit_insn (gen_cmpqi_ext_3 (scratch
, GEN_INT (0x40)));
2582 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x40)));
2588 if (code
== NE
&& TARGET_IEEE_FP
)
2590 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2591 emit_insn (gen_xorqi_ext_1_cc (scratch
, scratch
,
2597 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x40)));
2603 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x04)));
2607 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x04)));
  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode,
			 gen_rtx_REG (cmp_mode, FLAGS_REG),
			 const0_rtx);
}
/* Generate insn patterns to do an integer compare of OPERANDS.  */

static rtx
ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
{
  machine_mode cmpmode;
  rtx tmp, flags;

  cmpmode = SELECT_CC_MODE (code, op0, op1);
  flags = gen_rtx_REG (cmpmode, FLAGS_REG);

  /* This is very simple, but making the interface the same as in the
     FP case makes the rest of the code easier.  */
  tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
  emit_insn (gen_rtx_SET (flags, tmp));

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
}
rtx
ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
{
  rtx ret;

  if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
    ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
  else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
    {
      gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
      ret = ix86_expand_fp_compare (code, op0, op1);
    }
  else
    ret = ix86_expand_int_compare (code, op0, op1);

  return ret;
}
void
ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
{
  rtx ret;

  gcc_assert (GET_MODE (dest) == QImode);

  ret = ix86_expand_compare (code, op0, op1);
  PUT_MODE (ret, QImode);
  emit_insn (gen_rtx_SET (dest, ret));
}
/* Expand a comparison setting or clearing the carry flag.  Return true
   when successful and set *POP to the comparison operation.  */
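
/* A compare of op0 with op1 sets the carry flag exactly when op0 < op1 as
   unsigned values, so only comparisons that can be rewritten as LTU or GEU
   (possibly by adjusting a constant operand, as the code below does) can be
   expressed as a pure carry-flag test.  */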
2682 ix86_expand_carry_flag_compare (enum rtx_code code
, rtx op0
, rtx op1
, rtx
*pop
)
2685 = GET_MODE (op0
) != VOIDmode
? GET_MODE (op0
) : GET_MODE (op1
);
  /* Do not handle double-mode compares that go through the special path.  */
2688 if (mode
== (TARGET_64BIT
? TImode
: DImode
))
2691 if (SCALAR_FLOAT_MODE_P (mode
))
2694 rtx_insn
*compare_seq
;
2696 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode
));
      /* Shortcut: the following common codes never translate
	 into carry flag compares.  */
2700 if (code
== EQ
|| code
== NE
|| code
== UNEQ
|| code
== LTGT
2701 || code
== ORDERED
|| code
== UNORDERED
)
      /* These comparisons require the zero flag; swap operands so they
	 won't.  */
2705 if ((code
== GT
|| code
== UNLE
|| code
== LE
|| code
== UNGT
)
2708 std::swap (op0
, op1
);
2709 code
= swap_condition (code
);
      /* Try to expand the comparison and verify that we end up with
	 a carry flag based comparison.  This fails to be true only when
	 we decide to expand the comparison using arithmetic, which is
	 not too common a scenario.  */
2717 compare_op
= ix86_expand_fp_compare (code
, op0
, op1
);
2718 compare_seq
= get_insns ();
2721 if (GET_MODE (XEXP (compare_op
, 0)) == CCFPmode
)
2722 code
= ix86_fp_compare_code_to_integer (GET_CODE (compare_op
));
2724 code
= GET_CODE (compare_op
);
2726 if (code
!= LTU
&& code
!= GEU
)
2729 emit_insn (compare_seq
);
2734 if (!INTEGRAL_MODE_P (mode
))
2743 /* Convert a==0 into (unsigned)a<1. */
2746 if (op1
!= const0_rtx
)
2749 code
= (code
== EQ
? LTU
: GEU
);
2752 /* Convert a>b into b<a or a>=b-1. */
2755 if (CONST_INT_P (op1
))
2757 op1
= gen_int_mode (INTVAL (op1
) + 1, GET_MODE (op0
));
	  /* Bail out on overflow.  We still can swap operands, but that
	     would force loading of the constant into a register.  */
2760 if (op1
== const0_rtx
2761 || !x86_64_immediate_operand (op1
, GET_MODE (op1
)))
2763 code
= (code
== GTU
? GEU
: LTU
);
2767 std::swap (op0
, op1
);
2768 code
= (code
== GTU
? LTU
: GEU
);
2772 /* Convert a>=0 into (unsigned)a<0x80000000. */
2775 if (mode
== DImode
|| op1
!= const0_rtx
)
2777 op1
= gen_int_mode (1 << (GET_MODE_BITSIZE (mode
) - 1), mode
);
2778 code
= (code
== LT
? GEU
: LTU
);
2782 if (mode
== DImode
|| op1
!= constm1_rtx
)
2784 op1
= gen_int_mode (1 << (GET_MODE_BITSIZE (mode
) - 1), mode
);
2785 code
= (code
== LE
? GEU
: LTU
);
  /* Swapping operands may cause a constant to appear as the first operand.  */
2792 if (!nonimmediate_operand (op0
, VOIDmode
))
2794 if (!can_create_pseudo_p ())
2796 op0
= force_reg (mode
, op0
);
2798 *pop
= ix86_expand_compare (code
, op0
, op1
);
2799 gcc_assert (GET_CODE (*pop
) == LTU
|| GET_CODE (*pop
) == GEU
);
/* Expand conditional increment or decrement using adc/sbb instructions.
   The default case using setcc followed by the conditional move can be
   done by generic code.  */
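
/* For example, "x += (unsigned) (a < b)" can be emitted as a compare of A
   and B followed by "adc $0, x": the compare leaves the carry flag set
   exactly when a < b (unsigned), and adc adds that carry into x.  The sbb
   form handles the conditional-decrement case the same way.  */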
2807 ix86_expand_int_addcc (rtx operands
[])
2809 enum rtx_code code
= GET_CODE (operands
[1]);
2811 rtx (*insn
) (machine_mode
, rtx
, rtx
, rtx
, rtx
, rtx
);
2813 rtx val
= const0_rtx
;
2816 rtx op0
= XEXP (operands
[1], 0);
2817 rtx op1
= XEXP (operands
[1], 1);
2819 if (operands
[3] != const1_rtx
2820 && operands
[3] != constm1_rtx
)
2822 if (!ix86_expand_carry_flag_compare (code
, op0
, op1
, &compare_op
))
2824 code
= GET_CODE (compare_op
);
2826 flags
= XEXP (compare_op
, 0);
2828 if (GET_MODE (flags
) == CCFPmode
)
2831 code
= ix86_fp_compare_code_to_integer (code
);
2838 PUT_CODE (compare_op
,
2839 reverse_condition_maybe_unordered
2840 (GET_CODE (compare_op
)));
2842 PUT_CODE (compare_op
, reverse_condition (GET_CODE (compare_op
)));
2845 mode
= GET_MODE (operands
[0]);
2847 /* Construct either adc or sbb insn. */
2848 if ((code
== LTU
) == (operands
[3] == constm1_rtx
))
2849 insn
= gen_sub3_carry
;
2851 insn
= gen_add3_carry
;
2853 emit_insn (insn (mode
, operands
[0], operands
[2], val
, flags
, compare_op
));
2859 ix86_expand_int_movcc (rtx operands
[])
2861 enum rtx_code code
= GET_CODE (operands
[1]), compare_code
;
2862 rtx_insn
*compare_seq
;
2864 machine_mode mode
= GET_MODE (operands
[0]);
2865 bool sign_bit_compare_p
= false;
2866 rtx op0
= XEXP (operands
[1], 0);
2867 rtx op1
= XEXP (operands
[1], 1);
2869 if (GET_MODE (op0
) == TImode
2870 || (GET_MODE (op0
) == DImode
2875 compare_op
= ix86_expand_compare (code
, op0
, op1
);
2876 compare_seq
= get_insns ();
2879 compare_code
= GET_CODE (compare_op
);
2881 if ((op1
== const0_rtx
&& (code
== GE
|| code
== LT
))
2882 || (op1
== constm1_rtx
&& (code
== GT
|| code
== LE
)))
2883 sign_bit_compare_p
= true;
2885 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
2886 HImode insns, we'd be swallowed in word prefix ops. */
2888 if ((mode
!= HImode
|| TARGET_FAST_PREFIX
)
2889 && (mode
!= (TARGET_64BIT
? TImode
: DImode
))
2890 && CONST_INT_P (operands
[2])
2891 && CONST_INT_P (operands
[3]))
2893 rtx out
= operands
[0];
2894 HOST_WIDE_INT ct
= INTVAL (operands
[2]);
2895 HOST_WIDE_INT cf
= INTVAL (operands
[3]);
      /* Sign bit compares are better done using shifts than we do by using
	 sbb.  */
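      /* e.g. "dest = (x < 0) ? -1 : 0" is just an arithmetic right shift by
	 31 (or 63 for DImode), with no flags or branches involved.  */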
2901 if (sign_bit_compare_p
2902 || ix86_expand_carry_flag_compare (code
, op0
, op1
, &compare_op
))
2904 /* Detect overlap between destination and compare sources. */
2907 if (!sign_bit_compare_p
)
2912 compare_code
= GET_CODE (compare_op
);
2914 flags
= XEXP (compare_op
, 0);
2916 if (GET_MODE (flags
) == CCFPmode
)
2920 = ix86_fp_compare_code_to_integer (compare_code
);
	  /* To simplify the rest of the code, restrict to the GEU case.  */
2924 if (compare_code
== LTU
)
2927 compare_code
= reverse_condition (compare_code
);
2928 code
= reverse_condition (code
);
2933 PUT_CODE (compare_op
,
2934 reverse_condition_maybe_unordered
2935 (GET_CODE (compare_op
)));
2937 PUT_CODE (compare_op
,
2938 reverse_condition (GET_CODE (compare_op
)));
2942 if (reg_overlap_mentioned_p (out
, op0
)
2943 || reg_overlap_mentioned_p (out
, op1
))
2944 tmp
= gen_reg_rtx (mode
);
2947 emit_insn (gen_x86_movdicc_0_m1 (tmp
, flags
, compare_op
));
2949 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode
, tmp
),
2950 flags
, compare_op
));
2954 if (code
== GT
|| code
== GE
)
2955 code
= reverse_condition (code
);
2961 tmp
= emit_store_flag (tmp
, code
, op0
, op1
, VOIDmode
, 0, -1);
2974 tmp
= expand_simple_binop (mode
, PLUS
,
2976 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
2987 tmp
= expand_simple_binop (mode
, IOR
,
2989 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
2991 else if (diff
== -1 && ct
)
3001 tmp
= expand_simple_unop (mode
, NOT
, tmp
, copy_rtx (tmp
), 1);
3003 tmp
= expand_simple_binop (mode
, PLUS
,
3004 copy_rtx (tmp
), GEN_INT (cf
),
3005 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3013 * andl cf - ct, dest
3023 tmp
= expand_simple_unop (mode
, NOT
, tmp
, copy_rtx (tmp
), 1);
3026 tmp
= expand_simple_binop (mode
, AND
,
3028 gen_int_mode (cf
- ct
, mode
),
3029 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3031 tmp
= expand_simple_binop (mode
, PLUS
,
3032 copy_rtx (tmp
), GEN_INT (ct
),
3033 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3036 if (!rtx_equal_p (tmp
, out
))
3037 emit_move_insn (copy_rtx (out
), copy_rtx (tmp
));
3044 machine_mode cmp_mode
= GET_MODE (op0
);
3045 enum rtx_code new_code
;
3047 if (SCALAR_FLOAT_MODE_P (cmp_mode
))
3049 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode
));
3051 /* We may be reversing a non-trapping
3052 comparison to a trapping comparison. */
3053 if (HONOR_NANS (cmp_mode
) && flag_trapping_math
3054 && code
!= EQ
&& code
!= NE
3055 && code
!= ORDERED
&& code
!= UNORDERED
)
3058 new_code
= reverse_condition_maybe_unordered (code
);
3061 new_code
= ix86_reverse_condition (code
, cmp_mode
);
3062 if (new_code
!= UNKNOWN
)
3070 compare_code
= UNKNOWN
;
3071 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
3072 && CONST_INT_P (op1
))
3074 if (op1
== const0_rtx
3075 && (code
== LT
|| code
== GE
))
3076 compare_code
= code
;
3077 else if (op1
== constm1_rtx
)
3081 else if (code
== GT
)
3086 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3087 if (compare_code
!= UNKNOWN
3088 && GET_MODE (op0
) == GET_MODE (out
)
3089 && (cf
== -1 || ct
== -1))
3091 /* If lea code below could be used, only optimize
3092 if it results in a 2 insn sequence. */
3094 if (! (diff
== 1 || diff
== 2 || diff
== 4 || diff
== 8
3095 || diff
== 3 || diff
== 5 || diff
== 9)
3096 || (compare_code
== LT
&& ct
== -1)
3097 || (compare_code
== GE
&& cf
== -1))
3100 * notl op1 (if necessary)
3108 code
= reverse_condition (code
);
3111 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, -1);
3113 out
= expand_simple_binop (mode
, IOR
,
3115 out
, 1, OPTAB_DIRECT
);
3116 if (out
!= operands
[0])
3117 emit_move_insn (operands
[0], out
);
3124 if ((diff
== 1 || diff
== 2 || diff
== 4 || diff
== 8
3125 || diff
== 3 || diff
== 5 || diff
== 9)
3126 && ((mode
!= QImode
&& mode
!= HImode
) || !TARGET_PARTIAL_REG_STALL
)
3128 || x86_64_immediate_operand (GEN_INT (cf
), VOIDmode
)))
3134 * lea cf(dest*(ct-cf)),dest
3138 * This also catches the degenerate setcc-only case.
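	   * For instance, "dest = cond ? 5 : 1" can become
	   *    setcc dest ; lea 1(,dest,4), dest
	   * because ct - cf == 4 is a valid lea scale and cf == 1 fits in
	   * the displacement.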
3144 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, 1);
3147 /* On x86_64 the lea instruction operates on Pmode, so we need
3148 to get arithmetics done in proper mode to match. */
3150 tmp
= copy_rtx (out
);
3154 out1
= copy_rtx (out
);
3155 tmp
= gen_rtx_MULT (mode
, out1
, GEN_INT (diff
& ~1));
3159 tmp
= gen_rtx_PLUS (mode
, tmp
, out1
);
3165 tmp
= plus_constant (mode
, tmp
, cf
);
3168 if (!rtx_equal_p (tmp
, out
))
3171 out
= force_operand (tmp
, copy_rtx (out
));
3173 emit_insn (gen_rtx_SET (copy_rtx (out
), copy_rtx (tmp
)));
3175 if (!rtx_equal_p (out
, operands
[0]))
3176 emit_move_insn (operands
[0], copy_rtx (out
));
3182 * General case: Jumpful:
3183 * xorl dest,dest cmpl op1, op2
3184 * cmpl op1, op2 movl ct, dest
3186 * decl dest movl cf, dest
3187 * andl (cf-ct),dest 1:
	 * This is reasonably steep, but branch mispredict costs are
	 * high on modern cpus, so consider failing only if optimizing
	 * for space.
	 */
3197 if ((!TARGET_CMOVE
|| (mode
== QImode
&& TARGET_PARTIAL_REG_STALL
))
3198 && BRANCH_COST (optimize_insn_for_speed_p (),
3203 machine_mode cmp_mode
= GET_MODE (op0
);
3204 enum rtx_code new_code
;
3206 if (SCALAR_FLOAT_MODE_P (cmp_mode
))
3208 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode
));
3210 /* We may be reversing a non-trapping
3211 comparison to a trapping comparison. */
3212 if (HONOR_NANS (cmp_mode
) && flag_trapping_math
3213 && code
!= EQ
&& code
!= NE
3214 && code
!= ORDERED
&& code
!= UNORDERED
)
3217 new_code
= reverse_condition_maybe_unordered (code
);
3222 new_code
= ix86_reverse_condition (code
, cmp_mode
);
3223 if (compare_code
!= UNKNOWN
&& new_code
!= UNKNOWN
)
3224 compare_code
= reverse_condition (compare_code
);
3227 if (new_code
!= UNKNOWN
)
3235 if (compare_code
!= UNKNOWN
)
3237 /* notl op1 (if needed)
3242 For x < 0 (resp. x <= -1) there will be no notl,
3243 so if possible swap the constants to get rid of the
3245 True/false will be -1/0 while code below (store flag
3246 followed by decrement) is 0/-1, so the constants need
3247 to be exchanged once more. */
3249 if (compare_code
== GE
|| !cf
)
3251 code
= reverse_condition (code
);
3257 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, -1);
3261 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, 1);
3263 out
= expand_simple_binop (mode
, PLUS
, copy_rtx (out
),
3265 copy_rtx (out
), 1, OPTAB_DIRECT
);
3268 out
= expand_simple_binop (mode
, AND
, copy_rtx (out
),
3269 gen_int_mode (cf
- ct
, mode
),
3270 copy_rtx (out
), 1, OPTAB_DIRECT
);
3272 out
= expand_simple_binop (mode
, PLUS
, copy_rtx (out
), GEN_INT (ct
),
3273 copy_rtx (out
), 1, OPTAB_DIRECT
);
3274 if (!rtx_equal_p (out
, operands
[0]))
3275 emit_move_insn (operands
[0], copy_rtx (out
));
3281 if (!TARGET_CMOVE
|| (mode
== QImode
&& TARGET_PARTIAL_REG_STALL
))
      /* Try a few more things with specific constants and a variable.  */
3286 rtx var
, orig_out
, out
, tmp
;
3288 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3291 /* If one of the two operands is an interesting constant, load a
3292 constant with the above and mask it in with a logical operation. */
3294 if (CONST_INT_P (operands
[2]))
3297 if (INTVAL (operands
[2]) == 0 && operands
[3] != constm1_rtx
)
3298 operands
[3] = constm1_rtx
, op
= and_optab
;
3299 else if (INTVAL (operands
[2]) == -1 && operands
[3] != const0_rtx
)
3300 operands
[3] = const0_rtx
, op
= ior_optab
;
3304 else if (CONST_INT_P (operands
[3]))
3307 if (INTVAL (operands
[3]) == 0 && operands
[2] != constm1_rtx
)
3308 operands
[2] = constm1_rtx
, op
= and_optab
;
3309 else if (INTVAL (operands
[3]) == -1 && operands
[3] != const0_rtx
)
3310 operands
[2] = const0_rtx
, op
= ior_optab
;
3317 orig_out
= operands
[0];
3318 tmp
= gen_reg_rtx (mode
);
3321 /* Recurse to get the constant loaded. */
3322 if (!ix86_expand_int_movcc (operands
))
3325 /* Mask in the interesting variable. */
3326 out
= expand_binop (mode
, op
, var
, tmp
, orig_out
, 0,
3328 if (!rtx_equal_p (out
, orig_out
))
3329 emit_move_insn (copy_rtx (orig_out
), copy_rtx (out
));
3335 * For comparison with above,
3345 if (! nonimmediate_operand (operands
[2], mode
))
3346 operands
[2] = force_reg (mode
, operands
[2]);
3347 if (! nonimmediate_operand (operands
[3], mode
))
3348 operands
[3] = force_reg (mode
, operands
[3]);
3350 if (! register_operand (operands
[2], VOIDmode
)
3352 || ! register_operand (operands
[3], VOIDmode
)))
3353 operands
[2] = force_reg (mode
, operands
[2]);
3356 && ! register_operand (operands
[3], VOIDmode
))
3357 operands
[3] = force_reg (mode
, operands
[3]);
3359 emit_insn (compare_seq
);
3360 emit_insn (gen_rtx_SET (operands
[0],
3361 gen_rtx_IF_THEN_ELSE (mode
,
3362 compare_op
, operands
[2],
3367 /* Detect conditional moves that exactly match min/max operational
3368 semantics. Note that this is IEEE safe, as long as we don't
3369 interchange the operands.
3371 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3372 and TRUE if the operation is successful and instructions are emitted. */
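
/* For example, "x < y ? x : y" maps directly to minss/minsd (or their
   packed forms); the operand order matters because these instructions
   return the second operand whenever one of the inputs is a NaN.  */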
3375 ix86_expand_sse_fp_minmax (rtx dest
, enum rtx_code code
, rtx cmp_op0
,
3376 rtx cmp_op1
, rtx if_true
, rtx if_false
)
3384 else if (code
== UNGE
)
3385 std::swap (if_true
, if_false
);
3389 if (rtx_equal_p (cmp_op0
, if_true
) && rtx_equal_p (cmp_op1
, if_false
))
3391 else if (rtx_equal_p (cmp_op1
, if_true
) && rtx_equal_p (cmp_op0
, if_false
))
3396 mode
= GET_MODE (dest
);
3398 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3399 but MODE may be a vector mode and thus not appropriate. */
3400 if (!flag_finite_math_only
|| flag_signed_zeros
)
3402 int u
= is_min
? UNSPEC_IEEE_MIN
: UNSPEC_IEEE_MAX
;
3405 if_true
= force_reg (mode
, if_true
);
3406 v
= gen_rtvec (2, if_true
, if_false
);
3407 tmp
= gen_rtx_UNSPEC (mode
, v
, u
);
3411 code
= is_min
? SMIN
: SMAX
;
3412 if (MEM_P (if_true
) && MEM_P (if_false
))
3413 if_true
= force_reg (mode
, if_true
);
3414 tmp
= gen_rtx_fmt_ee (code
, mode
, if_true
, if_false
);
3417 emit_insn (gen_rtx_SET (dest
, tmp
));
/* Return true if MODE is valid for vector compare to mask register;
   same result for conditional vector move with mask register.  */
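/* e.g. V16SImode always qualifies under AVX512F, V8SImode additionally
   needs AVX512VL, and V32QImode needs AVX512BW as well as AVX512VL.  */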
3424 ix86_valid_mask_cmp_mode (machine_mode mode
)
3426 /* XOP has its own vector conditional movement. */
3427 if (TARGET_XOP
&& !TARGET_AVX512F
)
3430 /* AVX512F is needed for mask operation. */
3431 if (!(TARGET_AVX512F
&& VECTOR_MODE_P (mode
)))
3434 /* AVX512BW is needed for vector QI/HImode,
3435 AVX512VL is needed for 128/256-bit vector. */
3436 machine_mode inner_mode
= GET_MODE_INNER (mode
);
3437 int vector_size
= GET_MODE_SIZE (mode
);
3438 if ((inner_mode
== QImode
|| inner_mode
== HImode
) && !TARGET_AVX512BW
)
3441 return vector_size
== 64 || TARGET_AVX512VL
;
3444 /* Expand an SSE comparison. Return the register with the result. */
3447 ix86_expand_sse_cmp (rtx dest
, enum rtx_code code
, rtx cmp_op0
, rtx cmp_op1
,
3448 rtx op_true
, rtx op_false
)
3450 machine_mode mode
= GET_MODE (dest
);
3451 machine_mode cmp_ops_mode
= GET_MODE (cmp_op0
);
  /* In the general case the result of the comparison can differ from the
     operands' type.  */
3454 machine_mode cmp_mode
;
3456 /* In AVX512F the result of comparison is an integer mask. */
3457 bool maskcmp
= false;
3460 if (ix86_valid_mask_cmp_mode (cmp_ops_mode
))
3462 unsigned int nbits
= GET_MODE_NUNITS (cmp_ops_mode
);
3464 cmp_mode
= nbits
> 8 ? int_mode_for_size (nbits
, 0).require () : E_QImode
;
3467 cmp_mode
= cmp_ops_mode
;
3469 cmp_op0
= force_reg (cmp_ops_mode
, cmp_op0
);
3471 int (*op1_predicate
)(rtx
, machine_mode
)
3472 = VECTOR_MODE_P (cmp_ops_mode
) ? vector_operand
: nonimmediate_operand
;
3474 if (!op1_predicate (cmp_op1
, cmp_ops_mode
))
3475 cmp_op1
= force_reg (cmp_ops_mode
, cmp_op1
);
3478 || (maskcmp
&& cmp_mode
!= mode
)
3479 || (op_true
&& reg_overlap_mentioned_p (dest
, op_true
))
3480 || (op_false
&& reg_overlap_mentioned_p (dest
, op_false
)))
3481 dest
= gen_reg_rtx (maskcmp
? cmp_mode
: mode
);
3483 x
= gen_rtx_fmt_ee (code
, cmp_mode
, cmp_op0
, cmp_op1
);
3485 if (cmp_mode
!= mode
&& !maskcmp
)
3487 x
= force_reg (cmp_ops_mode
, x
);
3488 convert_move (dest
, x
, false);
3491 emit_insn (gen_rtx_SET (dest
, x
));
3496 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3497 operations. This is used for both scalar and vector conditional moves. */
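
/* When no blend instruction applies, this boils down to the classic mask
   dance "dest = (cmp & op_true) | (~cmp & op_false)", which is what the
   AND/ANDN/IOR fallback at the end of this function emits.  */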
3500 ix86_expand_sse_movcc (rtx dest
, rtx cmp
, rtx op_true
, rtx op_false
)
3502 machine_mode mode
= GET_MODE (dest
);
3503 machine_mode cmpmode
= GET_MODE (cmp
);
3505 /* In AVX512F the result of comparison is an integer mask. */
3506 bool maskcmp
= mode
!= cmpmode
&& ix86_valid_mask_cmp_mode (mode
);
3510 /* If we have an integer mask and FP value then we need
3511 to cast mask to FP mode. */
3512 if (mode
!= cmpmode
&& VECTOR_MODE_P (cmpmode
))
3514 cmp
= force_reg (cmpmode
, cmp
);
3515 cmp
= gen_rtx_SUBREG (mode
, cmp
, 0);
3520 /* Using vector move with mask register. */
3521 cmp
= force_reg (cmpmode
, cmp
);
3522 /* Optimize for mask zero. */
3523 op_true
= (op_true
!= CONST0_RTX (mode
)
3524 ? force_reg (mode
, op_true
) : op_true
);
3525 op_false
= (op_false
!= CONST0_RTX (mode
)
3526 ? force_reg (mode
, op_false
) : op_false
);
3527 if (op_true
== CONST0_RTX (mode
))
3529 rtx (*gen_not
) (rtx
, rtx
);
3532 case E_QImode
: gen_not
= gen_knotqi
; break;
3533 case E_HImode
: gen_not
= gen_knothi
; break;
3534 case E_SImode
: gen_not
= gen_knotsi
; break;
3535 case E_DImode
: gen_not
= gen_knotdi
; break;
3536 default: gcc_unreachable ();
3538 rtx n
= gen_reg_rtx (cmpmode
);
3539 emit_insn (gen_not (n
, cmp
));
3541 /* Reverse op_true op_false. */
3542 std::swap (op_true
, op_false
);
3545 rtx vec_merge
= gen_rtx_VEC_MERGE (mode
, op_true
, op_false
, cmp
);
3546 emit_insn (gen_rtx_SET (dest
, vec_merge
));
3549 else if (vector_all_ones_operand (op_true
, mode
)
3550 && op_false
== CONST0_RTX (mode
))
3552 emit_insn (gen_rtx_SET (dest
, cmp
));
3555 else if (op_false
== CONST0_RTX (mode
))
3557 op_true
= force_reg (mode
, op_true
);
3558 x
= gen_rtx_AND (mode
, cmp
, op_true
);
3559 emit_insn (gen_rtx_SET (dest
, x
));
3562 else if (op_true
== CONST0_RTX (mode
))
3564 op_false
= force_reg (mode
, op_false
);
3565 x
= gen_rtx_NOT (mode
, cmp
);
3566 x
= gen_rtx_AND (mode
, x
, op_false
);
3567 emit_insn (gen_rtx_SET (dest
, x
));
3570 else if (INTEGRAL_MODE_P (mode
) && op_true
== CONSTM1_RTX (mode
))
3572 op_false
= force_reg (mode
, op_false
);
3573 x
= gen_rtx_IOR (mode
, cmp
, op_false
);
3574 emit_insn (gen_rtx_SET (dest
, x
));
3577 else if (TARGET_XOP
)
3579 op_true
= force_reg (mode
, op_true
);
3581 if (!nonimmediate_operand (op_false
, mode
))
3582 op_false
= force_reg (mode
, op_false
);
3584 emit_insn (gen_rtx_SET (dest
, gen_rtx_IF_THEN_ELSE (mode
, cmp
,
3590 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
3593 if (!vector_operand (op_true
, mode
))
3594 op_true
= force_reg (mode
, op_true
);
3596 op_false
= force_reg (mode
, op_false
);
3602 gen
= gen_sse4_1_blendvps
;
3606 gen
= gen_sse4_1_blendvpd
;
3611 gen
= gen_sse4_1_blendvss
;
3612 op_true
= force_reg (mode
, op_true
);
3618 gen
= gen_sse4_1_blendvsd
;
3619 op_true
= force_reg (mode
, op_true
);
3628 gen
= gen_sse4_1_pblendvb
;
3629 if (mode
!= V16QImode
)
3630 d
= gen_reg_rtx (V16QImode
);
3631 op_false
= gen_lowpart (V16QImode
, op_false
);
3632 op_true
= gen_lowpart (V16QImode
, op_true
);
3633 cmp
= gen_lowpart (V16QImode
, cmp
);
3638 gen
= gen_avx_blendvps256
;
3642 gen
= gen_avx_blendvpd256
;
3650 gen
= gen_avx2_pblendvb
;
3651 if (mode
!= V32QImode
)
3652 d
= gen_reg_rtx (V32QImode
);
3653 op_false
= gen_lowpart (V32QImode
, op_false
);
3654 op_true
= gen_lowpart (V32QImode
, op_true
);
3655 cmp
= gen_lowpart (V32QImode
, cmp
);
3660 gen
= gen_avx512bw_blendmv64qi
;
3663 gen
= gen_avx512bw_blendmv32hi
;
3666 gen
= gen_avx512f_blendmv16si
;
3669 gen
= gen_avx512f_blendmv8di
;
3672 gen
= gen_avx512f_blendmv8df
;
3675 gen
= gen_avx512f_blendmv16sf
;
3684 emit_insn (gen (d
, op_false
, op_true
, cmp
));
3686 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), d
));
3690 op_true
= force_reg (mode
, op_true
);
3692 t2
= gen_reg_rtx (mode
);
3694 t3
= gen_reg_rtx (mode
);
3698 x
= gen_rtx_AND (mode
, op_true
, cmp
);
3699 emit_insn (gen_rtx_SET (t2
, x
));
3701 x
= gen_rtx_NOT (mode
, cmp
);
3702 x
= gen_rtx_AND (mode
, x
, op_false
);
3703 emit_insn (gen_rtx_SET (t3
, x
));
3705 x
= gen_rtx_IOR (mode
, t3
, t2
);
3706 emit_insn (gen_rtx_SET (dest
, x
));
3710 /* Swap, force into registers, or otherwise massage the two operands
3711 to an sse comparison with a mask result. Thus we differ a bit from
3712 ix86_prepare_fp_compare_args which expects to produce a flags result.
3714 The DEST operand exists to help determine whether to commute commutative
3715 operators. The POP0/POP1 operands are updated in place. The new
3716 comparison code is returned, or UNKNOWN if not implementable. */
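
/* For example, GE and GT have no direct pre-AVX cmpps/cmpss predicate, so
   they are rewritten here as LE and LT with the operands swapped.  */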
3718 static enum rtx_code
3719 ix86_prepare_sse_fp_compare_args (rtx dest
, enum rtx_code code
,
3720 rtx
*pop0
, rtx
*pop1
)
3726 /* AVX supports all the needed comparisons. */
3729 /* We have no LTGT as an operator. We could implement it with
3730 NE & ORDERED, but this requires an extra temporary. It's
3731 not clear that it's worth it. */
3738 /* These are supported directly. */
3745 /* AVX has 3 operand comparisons, no need to swap anything. */
3748 /* For commutative operators, try to canonicalize the destination
3749 operand to be first in the comparison - this helps reload to
3750 avoid extra moves. */
3751 if (!dest
|| !rtx_equal_p (dest
, *pop1
))
      /* These are not supported directly before AVX, and furthermore
	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
	 comparison operands to transform into something that is
	 supported.  */
3763 std::swap (*pop0
, *pop1
);
3764 code
= swap_condition (code
);
3774 /* Expand a floating-point conditional move. Return true if successful. */
3777 ix86_expand_fp_movcc (rtx operands
[])
3779 machine_mode mode
= GET_MODE (operands
[0]);
3780 enum rtx_code code
= GET_CODE (operands
[1]);
3781 rtx tmp
, compare_op
;
3782 rtx op0
= XEXP (operands
[1], 0);
3783 rtx op1
= XEXP (operands
[1], 1);
3785 if (TARGET_SSE_MATH
&& SSE_FLOAT_MODE_P (mode
))
      /* Since we have no cmove for SSE registers, don't force bad register
	 allocation just to gain access to it.  Deny movcc when the
	 comparison mode doesn't match the move mode.  */
3792 cmode
= GET_MODE (op0
);
3793 if (cmode
== VOIDmode
)
3794 cmode
= GET_MODE (op1
);
3798 code
= ix86_prepare_sse_fp_compare_args (operands
[0], code
, &op0
, &op1
);
3799 if (code
== UNKNOWN
)
3802 if (ix86_expand_sse_fp_minmax (operands
[0], code
, op0
, op1
,
3803 operands
[2], operands
[3]))
3806 tmp
= ix86_expand_sse_cmp (operands
[0], code
, op0
, op1
,
3807 operands
[2], operands
[3]);
3808 ix86_expand_sse_movcc (operands
[0], tmp
, operands
[2], operands
[3]);
3812 if (GET_MODE (op0
) == TImode
3813 || (GET_MODE (op0
) == DImode
3817 /* The floating point conditional move instructions don't directly
3818 support conditions resulting from a signed integer comparison. */
3820 compare_op
= ix86_expand_compare (code
, op0
, op1
);
3821 if (!fcmov_comparison_operator (compare_op
, VOIDmode
))
3823 tmp
= gen_reg_rtx (QImode
);
3824 ix86_expand_setcc (tmp
, code
, op0
, op1
);
3826 compare_op
= ix86_expand_compare (NE
, tmp
, const0_rtx
);
3829 emit_insn (gen_rtx_SET (operands
[0],
3830 gen_rtx_IF_THEN_ELSE (mode
, compare_op
,
3831 operands
[2], operands
[3])));
3836 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
3839 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code
)
3864 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
3867 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code
)
3904 /* Return immediate value to be used in UNSPEC_PCMP
3905 for comparison CODE in MODE. */
3908 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code
, machine_mode mode
)
3910 if (FLOAT_MODE_P (mode
))
3911 return ix86_fp_cmp_code_to_pcmp_immediate (code
);
3912 return ix86_int_cmp_code_to_pcmp_immediate (code
);
3915 /* Expand AVX-512 vector comparison. */
3918 ix86_expand_mask_vec_cmp (rtx operands
[])
3920 machine_mode mask_mode
= GET_MODE (operands
[0]);
3921 machine_mode cmp_mode
= GET_MODE (operands
[2]);
3922 enum rtx_code code
= GET_CODE (operands
[1]);
3923 rtx imm
= GEN_INT (ix86_cmp_code_to_pcmp_immediate (code
, cmp_mode
));
3933 unspec_code
= UNSPEC_UNSIGNED_PCMP
;
3937 unspec_code
= UNSPEC_PCMP
;
3940 unspec
= gen_rtx_UNSPEC (mask_mode
, gen_rtvec (3, operands
[2],
3943 emit_insn (gen_rtx_SET (operands
[0], unspec
));
3948 /* Expand fp vector comparison. */
3951 ix86_expand_fp_vec_cmp (rtx operands
[])
3953 enum rtx_code code
= GET_CODE (operands
[1]);
3956 code
= ix86_prepare_sse_fp_compare_args (operands
[0], code
,
3957 &operands
[2], &operands
[3]);
3958 if (code
== UNKNOWN
)
3961 switch (GET_CODE (operands
[1]))
3964 temp
= ix86_expand_sse_cmp (operands
[0], ORDERED
, operands
[2],
3965 operands
[3], NULL
, NULL
);
3966 cmp
= ix86_expand_sse_cmp (operands
[0], NE
, operands
[2],
3967 operands
[3], NULL
, NULL
);
3971 temp
= ix86_expand_sse_cmp (operands
[0], UNORDERED
, operands
[2],
3972 operands
[3], NULL
, NULL
);
3973 cmp
= ix86_expand_sse_cmp (operands
[0], EQ
, operands
[2],
3974 operands
[3], NULL
, NULL
);
3980 cmp
= expand_simple_binop (GET_MODE (cmp
), code
, temp
, cmp
, cmp
, 1,
3984 cmp
= ix86_expand_sse_cmp (operands
[0], code
, operands
[2], operands
[3],
3985 operands
[1], operands
[2]);
3987 if (operands
[0] != cmp
)
3988 emit_move_insn (operands
[0], cmp
);
3994 ix86_expand_int_sse_cmp (rtx dest
, enum rtx_code code
, rtx cop0
, rtx cop1
,
3995 rtx op_true
, rtx op_false
, bool *negate
)
3997 machine_mode data_mode
= GET_MODE (dest
);
3998 machine_mode mode
= GET_MODE (cop0
);
4003 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4005 && (mode
== V16QImode
|| mode
== V8HImode
4006 || mode
== V4SImode
|| mode
== V2DImode
))
  /* AVX512F supports all of the comparisons
     on all 128/256/512-bit vector int types.  */
4010 else if (ix86_valid_mask_cmp_mode (mode
))
4014 /* Canonicalize the comparison to EQ, GT, GTU. */
4025 code
= reverse_condition (code
);
4031 code
= reverse_condition (code
);
4037 std::swap (cop0
, cop1
);
4038 code
= swap_condition (code
);
	  /* Only SSE4.1/SSE4.2 support V2DImode.  */
4046 if (mode
== V2DImode
)
4051 /* SSE4.1 supports EQ. */
4058 /* SSE4.2 supports GT/GTU. */
4068 rtx optrue
= op_true
? op_true
: CONSTM1_RTX (data_mode
);
4069 rtx opfalse
= op_false
? op_false
: CONST0_RTX (data_mode
);
4071 std::swap (optrue
, opfalse
);
4073 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4074 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4075 min (x, y) == x). While we add one instruction (the minimum),
4076 we remove the need for two instructions in the negation, as the
4077 result is done this way.
4078 When using masks, do it for SI/DImode element types, as it is shorter
4079 than the two subtractions. */
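	  /* e.g. for unsigned elements, "x <= y" becomes "min (x, y) == x"
	     (pminu followed by pcmpeq), avoiding the sign-bias trick used
	     further below for a plain unsigned greater-than.  */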
4081 && GET_MODE_SIZE (mode
) != 64
4082 && vector_all_ones_operand (opfalse
, data_mode
)
4083 && optrue
== CONST0_RTX (data_mode
))
4085 && GET_MODE_SIZE (GET_MODE_INNER (mode
)) >= 4
4086 /* Don't do it if not using integer masks and we'd end up with
4087 the right values in the registers though. */
4088 && (GET_MODE_SIZE (mode
) == 64
4089 || !vector_all_ones_operand (optrue
, data_mode
)
4090 || opfalse
!= CONST0_RTX (data_mode
))))
4092 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
4097 gen
= (code
== GTU
) ? gen_uminv16si3
: gen_sminv16si3
;
4100 gen
= (code
== GTU
) ? gen_uminv8di3
: gen_sminv8di3
;
4101 cop0
= force_reg (mode
, cop0
);
4102 cop1
= force_reg (mode
, cop1
);
4106 gen
= (code
== GTU
) ? gen_uminv32qi3
: gen_sminv32qi3
;
4110 gen
= (code
== GTU
) ? gen_uminv16hi3
: gen_sminv16hi3
;
4114 gen
= (code
== GTU
) ? gen_uminv8si3
: gen_sminv8si3
;
4117 if (TARGET_AVX512VL
)
4119 gen
= (code
== GTU
) ? gen_uminv4di3
: gen_sminv4di3
;
4120 cop0
= force_reg (mode
, cop0
);
4121 cop1
= force_reg (mode
, cop1
);
4125 if (code
== GTU
&& TARGET_SSE2
)
4126 gen
= gen_uminv16qi3
;
4127 else if (code
== GT
&& TARGET_SSE4_1
)
4128 gen
= gen_sminv16qi3
;
4131 if (code
== GTU
&& TARGET_SSE4_1
)
4132 gen
= gen_uminv8hi3
;
4133 else if (code
== GT
&& TARGET_SSE2
)
4134 gen
= gen_sminv8hi3
;
4138 gen
= (code
== GTU
) ? gen_uminv4si3
: gen_sminv4si3
;
4141 if (TARGET_AVX512VL
)
4143 gen
= (code
== GTU
) ? gen_uminv2di3
: gen_sminv2di3
;
4144 cop0
= force_reg (mode
, cop0
);
4145 cop1
= force_reg (mode
, cop1
);
4154 rtx tem
= gen_reg_rtx (mode
);
4155 if (!vector_operand (cop0
, mode
))
4156 cop0
= force_reg (mode
, cop0
);
4157 if (!vector_operand (cop1
, mode
))
4158 cop1
= force_reg (mode
, cop1
);
4160 emit_insn (gen (tem
, cop0
, cop1
));
      /* Unsigned parallel compare is not supported by the hardware.
	 Play some tricks to turn this into a signed comparison
	 against 0.  */
4171 cop0
= force_reg (mode
, cop0
);
	  /* Subtract (-(INT MAX) - 1) from both operands to make
	     them signed.  */
4186 mask
= ix86_build_signbit_mask (mode
, true, false);
4187 t1
= gen_reg_rtx (mode
);
4188 emit_insn (gen_sub3_insn (t1
, cop0
, mask
));
4190 t2
= gen_reg_rtx (mode
);
4191 emit_insn (gen_sub3_insn (t2
, cop1
, mask
));
4205 /* Perform a parallel unsigned saturating subtraction. */
4206 x
= gen_reg_rtx (mode
);
4207 emit_insn (gen_rtx_SET
4208 (x
, gen_rtx_US_MINUS (mode
, cop0
, cop1
)));
4210 cop1
= CONST0_RTX (mode
);
4222 std::swap (op_true
, op_false
);
4224 /* Allow the comparison to be done in one mode, but the movcc to
4225 happen in another mode. */
4226 if (data_mode
== mode
)
4228 x
= ix86_expand_sse_cmp (dest
, code
, cop0
, cop1
,
4233 gcc_assert (GET_MODE_SIZE (data_mode
) == GET_MODE_SIZE (mode
));
4234 x
= ix86_expand_sse_cmp (gen_reg_rtx (mode
), code
, cop0
, cop1
,
4236 if (GET_MODE (x
) == mode
)
4237 x
= gen_lowpart (data_mode
, x
);
4243 /* Expand integer vector comparison. */
4246 ix86_expand_int_vec_cmp (rtx operands
[])
4248 rtx_code code
= GET_CODE (operands
[1]);
4249 bool negate
= false;
4250 rtx cmp
= ix86_expand_int_sse_cmp (operands
[0], code
, operands
[2],
4251 operands
[3], NULL
, NULL
, &negate
);
4257 cmp
= ix86_expand_int_sse_cmp (operands
[0], EQ
, cmp
,
4258 CONST0_RTX (GET_MODE (cmp
)),
4259 NULL
, NULL
, &negate
);
4261 gcc_assert (!negate
);
4263 if (operands
[0] != cmp
)
4264 emit_move_insn (operands
[0], cmp
);
4269 /* Expand a floating-point vector conditional move; a vcond operation
4270 rather than a movcc operation. */
4273 ix86_expand_fp_vcond (rtx operands
[])
4275 enum rtx_code code
= GET_CODE (operands
[3]);
4278 code
= ix86_prepare_sse_fp_compare_args (operands
[0], code
,
4279 &operands
[4], &operands
[5]);
4280 if (code
== UNKNOWN
)
4283 switch (GET_CODE (operands
[3]))
4286 temp
= ix86_expand_sse_cmp (operands
[0], ORDERED
, operands
[4],
4287 operands
[5], operands
[0], operands
[0]);
4288 cmp
= ix86_expand_sse_cmp (operands
[0], NE
, operands
[4],
4289 operands
[5], operands
[1], operands
[2]);
4293 temp
= ix86_expand_sse_cmp (operands
[0], UNORDERED
, operands
[4],
4294 operands
[5], operands
[0], operands
[0]);
4295 cmp
= ix86_expand_sse_cmp (operands
[0], EQ
, operands
[4],
4296 operands
[5], operands
[1], operands
[2]);
4302 cmp
= expand_simple_binop (GET_MODE (cmp
), code
, temp
, cmp
, cmp
, 1,
4304 ix86_expand_sse_movcc (operands
[0], cmp
, operands
[1], operands
[2]);
4308 if (ix86_expand_sse_fp_minmax (operands
[0], code
, operands
[4],
4309 operands
[5], operands
[1], operands
[2]))
4312 cmp
= ix86_expand_sse_cmp (operands
[0], code
, operands
[4], operands
[5],
4313 operands
[1], operands
[2]);
4314 ix86_expand_sse_movcc (operands
[0], cmp
, operands
[1], operands
[2]);
4318 /* Expand a signed/unsigned integral vector conditional move. */
4321 ix86_expand_int_vcond (rtx operands
[])
4323 machine_mode data_mode
= GET_MODE (operands
[0]);
4324 machine_mode mode
= GET_MODE (operands
[4]);
4325 enum rtx_code code
= GET_CODE (operands
[3]);
4326 bool negate
= false;
4332 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4333 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
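  /* e.g. for V4SImode, "x < 0 ? -1 : 0" is psrad $31 and
     "x < 0 ? 1 : 0" is psrld $31.  */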
4334 if ((code
== LT
|| code
== GE
)
4335 && data_mode
== mode
4336 && cop1
== CONST0_RTX (mode
)
4337 && operands
[1 + (code
== LT
)] == CONST0_RTX (data_mode
)
4338 && GET_MODE_UNIT_SIZE (data_mode
) > 1
4339 && GET_MODE_UNIT_SIZE (data_mode
) <= 8
4340 && (GET_MODE_SIZE (data_mode
) == 16
4341 || (TARGET_AVX2
&& GET_MODE_SIZE (data_mode
) == 32)))
4343 rtx negop
= operands
[2 - (code
== LT
)];
4344 int shift
= GET_MODE_UNIT_BITSIZE (data_mode
) - 1;
4345 if (negop
== CONST1_RTX (data_mode
))
4347 rtx res
= expand_simple_binop (mode
, LSHIFTRT
, cop0
, GEN_INT (shift
),
4348 operands
[0], 1, OPTAB_DIRECT
);
4349 if (res
!= operands
[0])
4350 emit_move_insn (operands
[0], res
);
4353 else if (GET_MODE_INNER (data_mode
) != DImode
4354 && vector_all_ones_operand (negop
, data_mode
))
4356 rtx res
= expand_simple_binop (mode
, ASHIFTRT
, cop0
, GEN_INT (shift
),
4357 operands
[0], 0, OPTAB_DIRECT
);
4358 if (res
!= operands
[0])
4359 emit_move_insn (operands
[0], res
);
4364 if (!nonimmediate_operand (cop1
, mode
))
4365 cop1
= force_reg (mode
, cop1
);
4366 if (!general_operand (operands
[1], data_mode
))
4367 operands
[1] = force_reg (data_mode
, operands
[1]);
4368 if (!general_operand (operands
[2], data_mode
))
4369 operands
[2] = force_reg (data_mode
, operands
[2]);
4371 x
= ix86_expand_int_sse_cmp (operands
[0], code
, cop0
, cop1
,
4372 operands
[1], operands
[2], &negate
);
4377 ix86_expand_sse_movcc (operands
[0], x
, operands
[1+negate
],
4378 operands
[2-negate
]);
4383 ix86_expand_vec_perm_vpermt2 (rtx target
, rtx mask
, rtx op0
, rtx op1
,
4384 struct expand_vec_perm_d
*d
)
4386 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4387 expander, so args are either in d, or in op0, op1 etc. */
4388 machine_mode mode
= GET_MODE (d
? d
->op0
: op0
);
4389 machine_mode maskmode
= mode
;
4390 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
4395 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
4396 gen
= gen_avx512vl_vpermt2varv8hi3
;
4399 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
4400 gen
= gen_avx512vl_vpermt2varv16hi3
;
4403 if (TARGET_AVX512VBMI
)
4404 gen
= gen_avx512bw_vpermt2varv64qi3
;
4407 if (TARGET_AVX512BW
)
4408 gen
= gen_avx512bw_vpermt2varv32hi3
;
4411 if (TARGET_AVX512VL
)
4412 gen
= gen_avx512vl_vpermt2varv4si3
;
4415 if (TARGET_AVX512VL
)
4416 gen
= gen_avx512vl_vpermt2varv8si3
;
4420 gen
= gen_avx512f_vpermt2varv16si3
;
4423 if (TARGET_AVX512VL
)
4425 gen
= gen_avx512vl_vpermt2varv4sf3
;
4426 maskmode
= V4SImode
;
4430 if (TARGET_AVX512VL
)
4432 gen
= gen_avx512vl_vpermt2varv8sf3
;
4433 maskmode
= V8SImode
;
4439 gen
= gen_avx512f_vpermt2varv16sf3
;
4440 maskmode
= V16SImode
;
4444 if (TARGET_AVX512VL
)
4445 gen
= gen_avx512vl_vpermt2varv2di3
;
4448 if (TARGET_AVX512VL
)
4449 gen
= gen_avx512vl_vpermt2varv4di3
;
4453 gen
= gen_avx512f_vpermt2varv8di3
;
4456 if (TARGET_AVX512VL
)
4458 gen
= gen_avx512vl_vpermt2varv2df3
;
4459 maskmode
= V2DImode
;
4463 if (TARGET_AVX512VL
)
4465 gen
= gen_avx512vl_vpermt2varv4df3
;
4466 maskmode
= V4DImode
;
4472 gen
= gen_avx512f_vpermt2varv8df3
;
4473 maskmode
= V8DImode
;
4483 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4484 expander, so args are either in d, or in op0, op1 etc. */
4491 for (int i
= 0; i
< d
->nelt
; ++i
)
4492 vec
[i
] = GEN_INT (d
->perm
[i
]);
4493 mask
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (d
->nelt
, vec
));
4496 emit_insn (gen (target
, force_reg (maskmode
, mask
), op0
, op1
));
4500 /* Expand a variable vector permutation. */
4503 ix86_expand_vec_perm (rtx operands
[])
4505 rtx target
= operands
[0];
4506 rtx op0
= operands
[1];
4507 rtx op1
= operands
[2];
4508 rtx mask
= operands
[3];
4509 rtx t1
, t2
, t3
, t4
, t5
, t6
, t7
, t8
, vt
, vt2
, vec
[32];
4510 machine_mode mode
= GET_MODE (op0
);
4511 machine_mode maskmode
= GET_MODE (mask
);
4513 bool one_operand_shuffle
= rtx_equal_p (op0
, op1
);
4515 /* Number of elements in the vector. */
4516 w
= GET_MODE_NUNITS (mode
);
4517 e
= GET_MODE_UNIT_SIZE (mode
);
4518 gcc_assert (w
<= 64);
4520 if (TARGET_AVX512F
&& one_operand_shuffle
)
4522 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
4526 gen
=gen_avx512f_permvarv16si
;
4529 gen
= gen_avx512f_permvarv16sf
;
4532 gen
= gen_avx512f_permvarv8di
;
4535 gen
= gen_avx512f_permvarv8df
;
4542 emit_insn (gen (target
, op0
, mask
));
4547 if (ix86_expand_vec_perm_vpermt2 (target
, mask
, op0
, op1
, NULL
))
4552 if (mode
== V4DImode
|| mode
== V4DFmode
|| mode
== V16HImode
)
      /* Unfortunately, the VPERMQ and VPERMPD instructions only support
	 a constant shuffle operand.  With a tiny bit of effort we can
	 use VPERMD instead.  A re-interpretation stall for V4DFmode is
	 unfortunate but there's no avoiding it.
	 Similarly for V16HImode we don't have instructions for variable
	 shuffling, while for V32QImode we can use vpshufb; vpshufb;
	 vpermq; vpor after preparing suitable masks.  */
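      /* In other words, a variable V4DImode permute is rewritten as a
	 V8SImode VPERMD whose control vector repeats each 64-bit index i
	 as the pair of 32-bit indices 2*i and 2*i+1.  */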
4562 if (mode
== V16HImode
)
4564 maskmode
= mode
= V32QImode
;
4570 maskmode
= mode
= V8SImode
;
4574 t1
= gen_reg_rtx (maskmode
);
4576 /* Replicate the low bits of the V4DImode mask into V8SImode:
4578 t1 = { A A B B C C D D }. */
4579 for (i
= 0; i
< w
/ 2; ++i
)
4580 vec
[i
*2 + 1] = vec
[i
*2] = GEN_INT (i
* 2);
4581 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
4582 vt
= force_reg (maskmode
, vt
);
4583 mask
= gen_lowpart (maskmode
, mask
);
4584 if (maskmode
== V8SImode
)
4585 emit_insn (gen_avx2_permvarv8si (t1
, mask
, vt
));
4587 emit_insn (gen_avx2_pshufbv32qi3 (t1
, mask
, vt
));
      /* Multiply the shuffle indices by two.  */
4590 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, t1
, t1
, 1,
      /* Add one to the odd shuffle indices:
	 t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
4595 for (i
= 0; i
< w
/ 2; ++i
)
4597 vec
[i
* 2] = const0_rtx
;
4598 vec
[i
* 2 + 1] = const1_rtx
;
4600 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
4601 vt
= validize_mem (force_const_mem (maskmode
, vt
));
4602 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, vt
, t1
, 1,
4605 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
4606 operands
[3] = mask
= t1
;
4607 target
= gen_reg_rtx (mode
);
4608 op0
= gen_lowpart (mode
, op0
);
4609 op1
= gen_lowpart (mode
, op1
);
4615 /* The VPERMD and VPERMPS instructions already properly ignore
4616 the high bits of the shuffle elements. No need for us to
4617 perform an AND ourselves. */
4618 if (one_operand_shuffle
)
4620 emit_insn (gen_avx2_permvarv8si (target
, op0
, mask
));
4621 if (target
!= operands
[0])
4622 emit_move_insn (operands
[0],
4623 gen_lowpart (GET_MODE (operands
[0]), target
));
4627 t1
= gen_reg_rtx (V8SImode
);
4628 t2
= gen_reg_rtx (V8SImode
);
4629 emit_insn (gen_avx2_permvarv8si (t1
, op0
, mask
));
4630 emit_insn (gen_avx2_permvarv8si (t2
, op1
, mask
));
4636 mask
= gen_lowpart (V8SImode
, mask
);
4637 if (one_operand_shuffle
)
4638 emit_insn (gen_avx2_permvarv8sf (target
, op0
, mask
));
4641 t1
= gen_reg_rtx (V8SFmode
);
4642 t2
= gen_reg_rtx (V8SFmode
);
4643 emit_insn (gen_avx2_permvarv8sf (t1
, op0
, mask
));
4644 emit_insn (gen_avx2_permvarv8sf (t2
, op1
, mask
));
4650 /* By combining the two 128-bit input vectors into one 256-bit
4651 input vector, we can use VPERMD and VPERMPS for the full
4652 two-operand shuffle. */
4653 t1
= gen_reg_rtx (V8SImode
);
4654 t2
= gen_reg_rtx (V8SImode
);
4655 emit_insn (gen_avx_vec_concatv8si (t1
, op0
, op1
));
4656 emit_insn (gen_avx_vec_concatv8si (t2
, mask
, mask
));
4657 emit_insn (gen_avx2_permvarv8si (t1
, t1
, t2
));
4658 emit_insn (gen_avx_vextractf128v8si (target
, t1
, const0_rtx
));
4662 t1
= gen_reg_rtx (V8SFmode
);
4663 t2
= gen_reg_rtx (V8SImode
);
4664 mask
= gen_lowpart (V4SImode
, mask
);
4665 emit_insn (gen_avx_vec_concatv8sf (t1
, op0
, op1
));
4666 emit_insn (gen_avx_vec_concatv8si (t2
, mask
, mask
));
4667 emit_insn (gen_avx2_permvarv8sf (t1
, t1
, t2
));
4668 emit_insn (gen_avx_vextractf128v8sf (target
, t1
, const0_rtx
));
4672 t1
= gen_reg_rtx (V32QImode
);
4673 t2
= gen_reg_rtx (V32QImode
);
4674 t3
= gen_reg_rtx (V32QImode
);
4675 vt2
= GEN_INT (-128);
4676 vt
= gen_const_vec_duplicate (V32QImode
, vt2
);
4677 vt
= force_reg (V32QImode
, vt
);
4678 for (i
= 0; i
< 32; i
++)
4679 vec
[i
] = i
< 16 ? vt2
: const0_rtx
;
4680 vt2
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, vec
));
4681 vt2
= force_reg (V32QImode
, vt2
);
4682 /* From mask create two adjusted masks, which contain the same
4683 bits as mask in the low 7 bits of each vector element.
4684 The first mask will have the most significant bit clear
4685 if it requests element from the same 128-bit lane
4686 and MSB set if it requests element from the other 128-bit lane.
4687 The second mask will have the opposite values of the MSB,
4688 and additionally will have its 128-bit lanes swapped.
4689 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
4690 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
4691 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
4692 stands for other 12 bytes. */
      /* The bit that tells whether an element is from the same lane or the
	 other lane is bit 4, so shift it up by 3 to the MSB position.  */
4695 t5
= gen_reg_rtx (V4DImode
);
4696 emit_insn (gen_ashlv4di3 (t5
, gen_lowpart (V4DImode
, mask
),
4698 /* Clear MSB bits from the mask just in case it had them set. */
4699 emit_insn (gen_avx2_andnotv32qi3 (t2
, vt
, mask
));
4700 /* After this t1 will have MSB set for elements from other lane. */
4701 emit_insn (gen_xorv32qi3 (t1
, gen_lowpart (V32QImode
, t5
), vt2
));
4702 /* Clear bits other than MSB. */
4703 emit_insn (gen_andv32qi3 (t1
, t1
, vt
));
4704 /* Or in the lower bits from mask into t3. */
4705 emit_insn (gen_iorv32qi3 (t3
, t1
, t2
));
      /* And invert MSB bits in t1, so MSB is set for elements from the
	 same lane.  */
4708 emit_insn (gen_xorv32qi3 (t1
, t1
, vt
));
4709 /* Swap 128-bit lanes in t3. */
4710 t6
= gen_reg_rtx (V4DImode
);
4711 emit_insn (gen_avx2_permv4di_1 (t6
, gen_lowpart (V4DImode
, t3
),
4712 const2_rtx
, GEN_INT (3),
4713 const0_rtx
, const1_rtx
));
4714 /* And or in the lower bits from mask into t1. */
4715 emit_insn (gen_iorv32qi3 (t1
, t1
, t2
));
4716 if (one_operand_shuffle
)
4718 /* Each of these shuffles will put 0s in places where
4719 element from the other 128-bit lane is needed, otherwise
4720 will shuffle in the requested value. */
4721 emit_insn (gen_avx2_pshufbv32qi3 (t3
, op0
,
4722 gen_lowpart (V32QImode
, t6
)));
4723 emit_insn (gen_avx2_pshufbv32qi3 (t1
, op0
, t1
));
4724 /* For t3 the 128-bit lanes are swapped again. */
4725 t7
= gen_reg_rtx (V4DImode
);
4726 emit_insn (gen_avx2_permv4di_1 (t7
, gen_lowpart (V4DImode
, t3
),
4727 const2_rtx
, GEN_INT (3),
4728 const0_rtx
, const1_rtx
));
4729 /* And oring both together leads to the result. */
4730 emit_insn (gen_iorv32qi3 (target
, t1
,
4731 gen_lowpart (V32QImode
, t7
)));
4732 if (target
!= operands
[0])
4733 emit_move_insn (operands
[0],
4734 gen_lowpart (GET_MODE (operands
[0]), target
));
4738 t4
= gen_reg_rtx (V32QImode
);
      /* Similar to the above one_operand_shuffle code, just repeated
	 twice for each operand.  The merge_two: code below will merge
	 the two results together.  */
4742 emit_insn (gen_avx2_pshufbv32qi3 (t4
, op0
,
4743 gen_lowpart (V32QImode
, t6
)));
4744 emit_insn (gen_avx2_pshufbv32qi3 (t3
, op1
,
4745 gen_lowpart (V32QImode
, t6
)));
4746 emit_insn (gen_avx2_pshufbv32qi3 (t2
, op0
, t1
));
4747 emit_insn (gen_avx2_pshufbv32qi3 (t1
, op1
, t1
));
4748 t7
= gen_reg_rtx (V4DImode
);
4749 emit_insn (gen_avx2_permv4di_1 (t7
, gen_lowpart (V4DImode
, t4
),
4750 const2_rtx
, GEN_INT (3),
4751 const0_rtx
, const1_rtx
));
4752 t8
= gen_reg_rtx (V4DImode
);
4753 emit_insn (gen_avx2_permv4di_1 (t8
, gen_lowpart (V4DImode
, t3
),
4754 const2_rtx
, GEN_INT (3),
4755 const0_rtx
, const1_rtx
));
4756 emit_insn (gen_iorv32qi3 (t4
, t2
, gen_lowpart (V32QImode
, t7
)));
4757 emit_insn (gen_iorv32qi3 (t3
, t1
, gen_lowpart (V32QImode
, t8
)));
4763 gcc_assert (GET_MODE_SIZE (mode
) <= 16);
4770 /* The XOP VPPERM insn supports three inputs. By ignoring the
4771 one_operand_shuffle special case, we avoid creating another
4772 set of constant vectors in memory. */
4773 one_operand_shuffle
= false;
4775 /* mask = mask & {2*w-1, ...} */
4776 vt
= GEN_INT (2*w
- 1);
4780 /* mask = mask & {w-1, ...} */
4781 vt
= GEN_INT (w
- 1);
4784 vt
= gen_const_vec_duplicate (maskmode
, vt
);
4785 mask
= expand_simple_binop (maskmode
, AND
, mask
, vt
,
4786 NULL_RTX
, 0, OPTAB_DIRECT
);
4788 /* For non-QImode operations, convert the word permutation control
4789 into a byte permutation control. */
4790 if (mode
!= V16QImode
)
4792 mask
= expand_simple_binop (maskmode
, ASHIFT
, mask
,
4793 GEN_INT (exact_log2 (e
)),
4794 NULL_RTX
, 0, OPTAB_DIRECT
);
4796 /* Convert mask to vector of chars. */
4797 mask
= force_reg (V16QImode
, gen_lowpart (V16QImode
, mask
));
4799 /* Replicate each of the input bytes into byte positions:
4800 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
4801 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
4802 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
4803 for (i
= 0; i
< 16; ++i
)
4804 vec
[i
] = GEN_INT (i
/e
* e
);
4805 vt
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, vec
));
4806 vt
= validize_mem (force_const_mem (V16QImode
, vt
));
4808 emit_insn (gen_xop_pperm (mask
, mask
, mask
, vt
));
4810 emit_insn (gen_ssse3_pshufbv16qi3 (mask
, mask
, vt
));
4812 /* Convert it into the byte positions by doing
4813 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
4814 for (i
= 0; i
< 16; ++i
)
4815 vec
[i
] = GEN_INT (i
% e
);
4816 vt
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, vec
));
4817 vt
= validize_mem (force_const_mem (V16QImode
, vt
));
4818 emit_insn (gen_addv16qi3 (mask
, mask
, vt
));
4821 /* The actual shuffle operations all operate on V16QImode. */
4822 op0
= gen_lowpart (V16QImode
, op0
);
4823 op1
= gen_lowpart (V16QImode
, op1
);
4827 if (GET_MODE (target
) != V16QImode
)
4828 target
= gen_reg_rtx (V16QImode
);
4829 emit_insn (gen_xop_pperm (target
, op0
, op1
, mask
));
4830 if (target
!= operands
[0])
4831 emit_move_insn (operands
[0],
4832 gen_lowpart (GET_MODE (operands
[0]), target
));
4834 else if (one_operand_shuffle
)
4836 if (GET_MODE (target
) != V16QImode
)
4837 target
= gen_reg_rtx (V16QImode
);
4838 emit_insn (gen_ssse3_pshufbv16qi3 (target
, op0
, mask
));
4839 if (target
!= operands
[0])
4840 emit_move_insn (operands
[0],
4841 gen_lowpart (GET_MODE (operands
[0]), target
));
4848 /* Shuffle the two input vectors independently. */
4849 t1
= gen_reg_rtx (V16QImode
);
4850 t2
= gen_reg_rtx (V16QImode
);
4851 emit_insn (gen_ssse3_pshufbv16qi3 (t1
, op0
, mask
));
4852 emit_insn (gen_ssse3_pshufbv16qi3 (t2
, op1
, mask
));
4855 /* Then merge them together. The key is whether any given control
4856 element contained a bit set that indicates the second word. */
4859 if (maskmode
== V2DImode
&& !TARGET_SSE4_1
)
	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
	     more shuffle to convert the V2DI input mask into a V4SI
	     input mask.  At that point the masking performed via
	     ix86_expand_int_vcond will work as desired.  */
4865 rtx t3
= gen_reg_rtx (V4SImode
);
4866 emit_insn (gen_sse2_pshufd_1 (t3
, gen_lowpart (V4SImode
, mask
),
4867 const0_rtx
, const0_rtx
,
4868 const2_rtx
, const2_rtx
));
4870 maskmode
= V4SImode
;
4874 vt
= gen_const_vec_duplicate (maskmode
, vt
);
4875 vt
= force_reg (maskmode
, vt
);
4876 mask
= expand_simple_binop (maskmode
, AND
, mask
, vt
,
4877 NULL_RTX
, 0, OPTAB_DIRECT
);
4879 if (GET_MODE (target
) != mode
)
4880 target
= gen_reg_rtx (mode
);
4882 xops
[1] = gen_lowpart (mode
, t2
);
4883 xops
[2] = gen_lowpart (mode
, t1
);
4884 xops
[3] = gen_rtx_EQ (maskmode
, mask
, vt
);
4887 ok
= ix86_expand_int_vcond (xops
);
4889 if (target
!= operands
[0])
4890 emit_move_insn (operands
[0],
4891 gen_lowpart (GET_MODE (operands
[0]), target
));
4895 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
4896 true if we should do zero extension, else sign extension. HIGH_P is
4897 true if we want the N/2 high elements, else the low elements. */
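
/* For example, zero-extending the low half of a V16QImode SRC into a
   V8HImode DEST uses pmovzxbw on SSE4.1 and newer, and otherwise falls
   back to punpcklbw against a zero (or sign-mask) vector.  */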
4900 ix86_expand_sse_unpack (rtx dest
, rtx src
, bool unsigned_p
, bool high_p
)
4902 machine_mode imode
= GET_MODE (src
);
4907 rtx (*unpack
)(rtx
, rtx
);
4908 rtx (*extract
)(rtx
, rtx
) = NULL
;
4909 machine_mode halfmode
= BLKmode
;
4915 unpack
= gen_avx512bw_zero_extendv32qiv32hi2
;
4917 unpack
= gen_avx512bw_sign_extendv32qiv32hi2
;
4918 halfmode
= V32QImode
;
4920 = high_p
? gen_vec_extract_hi_v64qi
: gen_vec_extract_lo_v64qi
;
4924 unpack
= gen_avx2_zero_extendv16qiv16hi2
;
4926 unpack
= gen_avx2_sign_extendv16qiv16hi2
;
4927 halfmode
= V16QImode
;
4929 = high_p
? gen_vec_extract_hi_v32qi
: gen_vec_extract_lo_v32qi
;
4933 unpack
= gen_avx512f_zero_extendv16hiv16si2
;
4935 unpack
= gen_avx512f_sign_extendv16hiv16si2
;
4936 halfmode
= V16HImode
;
4938 = high_p
? gen_vec_extract_hi_v32hi
: gen_vec_extract_lo_v32hi
;
4942 unpack
= gen_avx2_zero_extendv8hiv8si2
;
4944 unpack
= gen_avx2_sign_extendv8hiv8si2
;
4945 halfmode
= V8HImode
;
4947 = high_p
? gen_vec_extract_hi_v16hi
: gen_vec_extract_lo_v16hi
;
4951 unpack
= gen_avx512f_zero_extendv8siv8di2
;
4953 unpack
= gen_avx512f_sign_extendv8siv8di2
;
4954 halfmode
= V8SImode
;
4956 = high_p
? gen_vec_extract_hi_v16si
: gen_vec_extract_lo_v16si
;
4960 unpack
= gen_avx2_zero_extendv4siv4di2
;
4962 unpack
= gen_avx2_sign_extendv4siv4di2
;
4963 halfmode
= V4SImode
;
4965 = high_p
? gen_vec_extract_hi_v8si
: gen_vec_extract_lo_v8si
;
4969 unpack
= gen_sse4_1_zero_extendv8qiv8hi2
;
4971 unpack
= gen_sse4_1_sign_extendv8qiv8hi2
;
4975 unpack
= gen_sse4_1_zero_extendv4hiv4si2
;
4977 unpack
= gen_sse4_1_sign_extendv4hiv4si2
;
4981 unpack
= gen_sse4_1_zero_extendv2siv2di2
;
4983 unpack
= gen_sse4_1_sign_extendv2siv2di2
;
4989 if (GET_MODE_SIZE (imode
) >= 32)
4991 tmp
= gen_reg_rtx (halfmode
);
4992 emit_insn (extract (tmp
, src
));
4996 /* Shift higher 8 bytes to lower 8 bytes. */
4997 tmp
= gen_reg_rtx (V1TImode
);
4998 emit_insn (gen_sse2_lshrv1ti3 (tmp
, gen_lowpart (V1TImode
, src
),
5000 tmp
= gen_lowpart (imode
, tmp
);
5005 emit_insn (unpack (dest
, tmp
));
5009 rtx (*unpack
)(rtx
, rtx
, rtx
);
5015 unpack
= gen_vec_interleave_highv16qi
;
5017 unpack
= gen_vec_interleave_lowv16qi
;
5021 unpack
= gen_vec_interleave_highv8hi
;
5023 unpack
= gen_vec_interleave_lowv8hi
;
5027 unpack
= gen_vec_interleave_highv4si
;
5029 unpack
= gen_vec_interleave_lowv4si
;
5036 tmp
= force_reg (imode
, CONST0_RTX (imode
));
5038 tmp
= ix86_expand_sse_cmp (gen_reg_rtx (imode
), GT
, CONST0_RTX (imode
),
5039 src
, pc_rtx
, pc_rtx
);
5041 rtx tmp2
= gen_reg_rtx (imode
);
5042 emit_insn (unpack (tmp2
, src
, tmp
));
5043 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), tmp2
));
/* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
   but works for floating point parameters and non-offsettable memories.
   For pushes, it returns just stack offsets; the values will be saved
   in the right order.  Maximally three parts are generated.  */
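/* Illustrative sketch, not part of GCC: splitting a 64-bit value into two
   32-bit parts in the low/high order the splitters below rely on.  */
static void
sketch_split_di (unsigned long long val, unsigned int parts[2])
{
  parts[0] = (unsigned int) (val & 0xffffffffu);	/* low half  */
  parts[1] = (unsigned int) (val >> 32);		/* high half */
}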
5053 ix86_split_to_parts (rtx operand
, rtx
*parts
, machine_mode mode
)
5058 size
= mode
==XFmode
? 3 : GET_MODE_SIZE (mode
) / 4;
5060 size
= (GET_MODE_SIZE (mode
) + 4) / 8;
5062 gcc_assert (!REG_P (operand
) || !MMX_REGNO_P (REGNO (operand
)));
5063 gcc_assert (size
>= 2 && size
<= 4);
5065 /* Optimize constant pool reference to immediates. This is used by fp
5066 moves, that force all constants to memory to allow combining. */
5067 if (MEM_P (operand
) && MEM_READONLY_P (operand
))
5068 operand
= avoid_constant_pool_reference (operand
);
5070 if (MEM_P (operand
) && !offsettable_memref_p (operand
))
/* The only non-offsettable memories we handle are pushes.  */
5073 int ok
= push_operand (operand
, VOIDmode
);
5077 operand
= copy_rtx (operand
);
5078 PUT_MODE (operand
, word_mode
);
5079 parts
[0] = parts
[1] = parts
[2] = parts
[3] = operand
;
5083 if (GET_CODE (operand
) == CONST_VECTOR
)
5085 scalar_int_mode imode
= int_mode_for_mode (mode
).require ();
5086 /* Caution: if we looked through a constant pool memory above,
5087 the operand may actually have a different mode now. That's
5088 ok, since we want to pun this all the way back to an integer. */
5089 operand
= simplify_subreg (imode
, operand
, GET_MODE (operand
), 0);
5090 gcc_assert (operand
!= NULL
);
5097 split_double_mode (mode
, &operand
, 1, &parts
[0], &parts
[1]);
5102 if (REG_P (operand
))
5104 gcc_assert (reload_completed
);
5105 for (i
= 0; i
< size
; i
++)
5106 parts
[i
] = gen_rtx_REG (SImode
, REGNO (operand
) + i
);
5108 else if (offsettable_memref_p (operand
))
5110 operand
= adjust_address (operand
, SImode
, 0);
5112 for (i
= 1; i
< size
; i
++)
5113 parts
[i
] = adjust_address (operand
, SImode
, 4 * i
);
5115 else if (CONST_DOUBLE_P (operand
))
5117 const REAL_VALUE_TYPE
*r
;
5120 r
= CONST_DOUBLE_REAL_VALUE (operand
);
5124 real_to_target (l
, r
, mode
);
5125 parts
[3] = gen_int_mode (l
[3], SImode
);
5126 parts
[2] = gen_int_mode (l
[2], SImode
);
5129 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5130 long double may not be 80-bit. */
5131 real_to_target (l
, r
, mode
);
5132 parts
[2] = gen_int_mode (l
[2], SImode
);
5135 REAL_VALUE_TO_TARGET_DOUBLE (*r
, l
);
5140 parts
[1] = gen_int_mode (l
[1], SImode
);
5141 parts
[0] = gen_int_mode (l
[0], SImode
);
5150 split_double_mode (mode
, &operand
, 1, &parts
[0], &parts
[1]);
5151 if (mode
== XFmode
|| mode
== TFmode
)
5153 machine_mode upper_mode
= mode
==XFmode
? SImode
: DImode
;
5154 if (REG_P (operand
))
5156 gcc_assert (reload_completed
);
5157 parts
[0] = gen_rtx_REG (DImode
, REGNO (operand
) + 0);
5158 parts
[1] = gen_rtx_REG (upper_mode
, REGNO (operand
) + 1);
5160 else if (offsettable_memref_p (operand
))
5162 operand
= adjust_address (operand
, DImode
, 0);
5164 parts
[1] = adjust_address (operand
, upper_mode
, 8);
5166 else if (CONST_DOUBLE_P (operand
))
5170 real_to_target (l
, CONST_DOUBLE_REAL_VALUE (operand
), mode
);
5172 /* real_to_target puts 32-bit pieces in each long. */
5173 parts
[0] = gen_int_mode ((l
[0] & HOST_WIDE_INT_C (0xffffffff))
5174 | ((l
[1] & HOST_WIDE_INT_C (0xffffffff))
5177 if (upper_mode
== SImode
)
5178 parts
[1] = gen_int_mode (l
[2], SImode
);
5181 = gen_int_mode ((l
[2] & HOST_WIDE_INT_C (0xffffffff))
5182 | ((l
[3] & HOST_WIDE_INT_C (0xffffffff))
/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
   Return false when normal moves are needed; true when all required
   insns have been emitted.  Operands 2-4 contain the input values
   in the correct order; operands 5-7 contain the output values.  */
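/* Illustrative sketch, not part of GCC: why the splitter below cares about
   ordering.  When destination words overlap source words, each source word
   must be read before it is overwritten, so the copy direction matters.
   (Pointer comparison here is for illustration only.)  */
static void
sketch_ordered_copy (unsigned int *dst, const unsigned int *src, int nparts)
{
  if (dst > src && dst < src + nparts)
    for (int i = nparts - 1; i >= 0; i--)	/* overlap: high part first */
      dst[i] = src[i];
  else
    for (int i = 0; i < nparts; i++)		/* otherwise low part first */
      dst[i] = src[i];
}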
5199 ix86_split_long_move (rtx operands
[])
5205 machine_mode mode
= GET_MODE (operands
[0]);
5206 bool collisionparts
[4];
/* The DFmode expanders may ask us to move double.
     For a 64-bit target this is a single move.  By hiding the fact
     here we simplify i386.md splitters.  */
5211 if (TARGET_64BIT
&& GET_MODE_SIZE (GET_MODE (operands
[0])) == 8)
5213 /* Optimize constant pool reference to immediates. This is used by
5214 fp moves, that force all constants to memory to allow combining. */
5216 if (MEM_P (operands
[1])
5217 && GET_CODE (XEXP (operands
[1], 0)) == SYMBOL_REF
5218 && CONSTANT_POOL_ADDRESS_P (XEXP (operands
[1], 0)))
5219 operands
[1] = get_pool_constant (XEXP (operands
[1], 0));
5220 if (push_operand (operands
[0], VOIDmode
))
5222 operands
[0] = copy_rtx (operands
[0]);
5223 PUT_MODE (operands
[0], word_mode
);
5226 operands
[0] = gen_lowpart (DImode
, operands
[0]);
5227 operands
[1] = gen_lowpart (DImode
, operands
[1]);
5228 emit_move_insn (operands
[0], operands
[1]);
5232 /* The only non-offsettable memory we handle is push. */
5233 if (push_operand (operands
[0], VOIDmode
))
5236 gcc_assert (!MEM_P (operands
[0])
5237 || offsettable_memref_p (operands
[0]));
5239 nparts
= ix86_split_to_parts (operands
[1], part
[1], GET_MODE (operands
[0]));
5240 ix86_split_to_parts (operands
[0], part
[0], GET_MODE (operands
[0]));
5242 /* When emitting push, take care for source operands on the stack. */
5243 if (push
&& MEM_P (operands
[1])
5244 && reg_overlap_mentioned_p (stack_pointer_rtx
, operands
[1]))
5246 rtx src_base
= XEXP (part
[1][nparts
- 1], 0);
5248 /* Compensate for the stack decrement by 4. */
5249 if (!TARGET_64BIT
&& nparts
== 3
5250 && mode
== XFmode
&& TARGET_128BIT_LONG_DOUBLE
)
5251 src_base
= plus_constant (Pmode
, src_base
, 4);
5253 /* src_base refers to the stack pointer and is
5254 automatically decreased by emitted push. */
5255 for (i
= 0; i
< nparts
; i
++)
5256 part
[1][i
] = change_address (part
[1][i
],
5257 GET_MODE (part
[1][i
]), src_base
);
5260 /* We need to do copy in the right order in case an address register
5261 of the source overlaps the destination. */
5262 if (REG_P (part
[0][0]) && MEM_P (part
[1][0]))
5266 for (i
= 0; i
< nparts
; i
++)
5269 = reg_overlap_mentioned_p (part
[0][i
], XEXP (part
[1][0], 0));
5270 if (collisionparts
[i
])
5274 /* Collision in the middle part can be handled by reordering. */
5275 if (collisions
== 1 && nparts
== 3 && collisionparts
[1])
5277 std::swap (part
[0][1], part
[0][2]);
5278 std::swap (part
[1][1], part
[1][2]);
5280 else if (collisions
== 1
5282 && (collisionparts
[1] || collisionparts
[2]))
5284 if (collisionparts
[1])
5286 std::swap (part
[0][1], part
[0][2]);
5287 std::swap (part
[1][1], part
[1][2]);
5291 std::swap (part
[0][2], part
[0][3]);
5292 std::swap (part
[1][2], part
[1][3]);
5296 /* If there are more collisions, we can't handle it by reordering.
5297 Do an lea to the last part and use only one colliding move. */
5298 else if (collisions
> 1)
5304 base
= part
[0][nparts
- 1];
5306 /* Handle the case when the last part isn't valid for lea.
5307 Happens in 64-bit mode storing the 12-byte XFmode. */
5308 if (GET_MODE (base
) != Pmode
)
5309 base
= gen_rtx_REG (Pmode
, REGNO (base
));
5311 addr
= XEXP (part
[1][0], 0);
5312 if (TARGET_TLS_DIRECT_SEG_REFS
)
5314 struct ix86_address parts
;
5315 int ok
= ix86_decompose_address (addr
, &parts
);
5317 /* It is not valid to use %gs: or %fs: in lea. */
5318 gcc_assert (parts
.seg
== ADDR_SPACE_GENERIC
);
5320 emit_insn (gen_rtx_SET (base
, addr
));
5321 part
[1][0] = replace_equiv_address (part
[1][0], base
);
5322 for (i
= 1; i
< nparts
; i
++)
5324 tmp
= plus_constant (Pmode
, base
, UNITS_PER_WORD
* i
);
5325 part
[1][i
] = replace_equiv_address (part
[1][i
], tmp
);
5336 if (TARGET_128BIT_LONG_DOUBLE
&& mode
== XFmode
)
5337 emit_insn (gen_add2_insn (stack_pointer_rtx
, GEN_INT (-4)));
5338 emit_move_insn (part
[0][2], part
[1][2]);
5340 else if (nparts
== 4)
5342 emit_move_insn (part
[0][3], part
[1][3]);
5343 emit_move_insn (part
[0][2], part
[1][2]);
/* In 64-bit mode we don't have a 32-bit push available.  In case this is a
     register, it is OK - we will just use the larger counterpart.  We also
     retype memory - this comes from an attempt to avoid the REX prefix on
     moving the second half of a TFmode value.  */
5352 if (GET_MODE (part
[1][1]) == SImode
)
5354 switch (GET_CODE (part
[1][1]))
5357 part
[1][1] = adjust_address (part
[1][1], DImode
, 0);
5361 part
[1][1] = gen_rtx_REG (DImode
, REGNO (part
[1][1]));
5368 if (GET_MODE (part
[1][0]) == SImode
)
5369 part
[1][0] = part
[1][1];
5372 emit_move_insn (part
[0][1], part
[1][1]);
5373 emit_move_insn (part
[0][0], part
[1][0]);
5377 /* Choose correct order to not overwrite the source before it is copied. */
5378 if ((REG_P (part
[0][0])
5379 && REG_P (part
[1][1])
5380 && (REGNO (part
[0][0]) == REGNO (part
[1][1])
5382 && REGNO (part
[0][0]) == REGNO (part
[1][2]))
5384 && REGNO (part
[0][0]) == REGNO (part
[1][3]))))
5386 && reg_overlap_mentioned_p (part
[0][0], XEXP (part
[1][0], 0))))
5388 for (i
= 0, j
= nparts
- 1; i
< nparts
; i
++, j
--)
5390 operands
[2 + i
] = part
[0][j
];
5391 operands
[6 + i
] = part
[1][j
];
5396 for (i
= 0; i
< nparts
; i
++)
5398 operands
[2 + i
] = part
[0][i
];
5399 operands
[6 + i
] = part
[1][i
];
5403 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
5404 if (optimize_insn_for_size_p ())
5406 for (j
= 0; j
< nparts
- 1; j
++)
5407 if (CONST_INT_P (operands
[6 + j
])
5408 && operands
[6 + j
] != const0_rtx
5409 && REG_P (operands
[2 + j
]))
5410 for (i
= j
; i
< nparts
- 1; i
++)
5411 if (CONST_INT_P (operands
[7 + i
])
5412 && INTVAL (operands
[7 + i
]) == INTVAL (operands
[6 + j
]))
5413 operands
[7 + i
] = operands
[2 + j
];
5416 for (i
= 0; i
< nparts
; i
++)
5417 emit_move_insn (operands
[2 + i
], operands
[6 + i
]);
5422 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
5423 left shift by a constant, either using a single shift or
5424 a sequence of add instructions. */
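/* Illustrative sketch, not part of GCC: a left shift by a small constant can
   be synthesized as repeated additions, because x + x == x << 1.  */
static unsigned int
sketch_shift_by_adds (unsigned int x, int count)
{
  for (int i = 0; i < count; i++)
    x += x;			/* each add shifts one more bit */
  return x;
}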
5427 ix86_expand_ashl_const (rtx operand
, int count
, machine_mode mode
)
5430 || (count
* ix86_cost
->add
<= ix86_cost
->shift_const
5431 && !optimize_insn_for_size_p ()))
5434 emit_insn (gen_add2_insn (operand
, operand
));
5438 rtx (*insn
)(rtx
, rtx
, rtx
);
5440 insn
= mode
== DImode
? gen_ashlsi3
: gen_ashldi3
;
5441 emit_insn (insn (operand
, operand
, GEN_INT (count
)));
5446 ix86_split_ashl (rtx
*operands
, rtx scratch
, machine_mode mode
)
5448 rtx (*gen_ashl3
)(rtx
, rtx
, rtx
);
5449 rtx (*gen_shld
)(rtx
, rtx
, rtx
);
5450 int half_width
= GET_MODE_BITSIZE (mode
) >> 1;
5451 machine_mode half_mode
;
5453 rtx low
[2], high
[2];
5456 if (CONST_INT_P (operands
[2]))
5458 split_double_mode (mode
, operands
, 2, low
, high
);
5459 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (mode
) - 1);
5461 if (count
>= half_width
)
5463 emit_move_insn (high
[0], low
[1]);
5464 emit_move_insn (low
[0], const0_rtx
);
5466 if (count
> half_width
)
5467 ix86_expand_ashl_const (high
[0], count
- half_width
, mode
);
5471 gen_shld
= mode
== DImode
? gen_x86_shld
: gen_x86_64_shld
;
5473 if (!rtx_equal_p (operands
[0], operands
[1]))
5474 emit_move_insn (operands
[0], operands
[1]);
5476 emit_insn (gen_shld (high
[0], low
[0], GEN_INT (count
)));
5477 ix86_expand_ashl_const (low
[0], count
, mode
);
5482 split_double_mode (mode
, operands
, 1, low
, high
);
5483 half_mode
= mode
== DImode
? SImode
: DImode
;
5485 gen_ashl3
= mode
== DImode
? gen_ashlsi3
: gen_ashldi3
;
5487 if (operands
[1] == const1_rtx
)
/* Assuming we've chosen QImode-capable registers, 1 << N
	 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
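	  /* Worked example (illustration): for a 64-bit 1 << N split into
	     32-bit halves, testing bit 5 of N decides which half gets the 1:
	     N = 3  -> low = 0x00000008, high = 0x00000000
	     N = 35 -> low = 0x00000000, high = 0x00000008.  */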
5491 if (ANY_QI_REG_P (low
[0]) && ANY_QI_REG_P (high
[0]))
5493 rtx s
, d
, flags
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
5495 ix86_expand_clear (low
[0]);
5496 ix86_expand_clear (high
[0]);
5497 emit_insn (gen_testqi_ccz_1 (operands
[2], GEN_INT (half_width
)));
5499 d
= gen_lowpart (QImode
, low
[0]);
5500 d
= gen_rtx_STRICT_LOW_PART (VOIDmode
, d
);
5501 s
= gen_rtx_EQ (QImode
, flags
, const0_rtx
);
5502 emit_insn (gen_rtx_SET (d
, s
));
5504 d
= gen_lowpart (QImode
, high
[0]);
5505 d
= gen_rtx_STRICT_LOW_PART (VOIDmode
, d
);
5506 s
= gen_rtx_NE (QImode
, flags
, const0_rtx
);
5507 emit_insn (gen_rtx_SET (d
, s
));
5510 /* Otherwise, we can get the same results by manually performing
5511 a bit extract operation on bit 5/6, and then performing the two
5512 shifts. The two methods of getting 0/1 into low/high are exactly
5513 the same size. Avoiding the shift in the bit extract case helps
5514 pentium4 a bit; no one else seems to care much either way. */
5517 rtx (*gen_lshr3
)(rtx
, rtx
, rtx
);
5518 rtx (*gen_and3
)(rtx
, rtx
, rtx
);
5519 rtx (*gen_xor3
)(rtx
, rtx
, rtx
);
5525 gen_lshr3
= gen_lshrsi3
;
5526 gen_and3
= gen_andsi3
;
5527 gen_xor3
= gen_xorsi3
;
5532 gen_lshr3
= gen_lshrdi3
;
5533 gen_and3
= gen_anddi3
;
5534 gen_xor3
= gen_xordi3
;
5538 if (TARGET_PARTIAL_REG_STALL
&& !optimize_insn_for_size_p ())
5539 x
= gen_rtx_ZERO_EXTEND (half_mode
, operands
[2]);
5541 x
= gen_lowpart (half_mode
, operands
[2]);
5542 emit_insn (gen_rtx_SET (high
[0], x
));
5544 emit_insn (gen_lshr3 (high
[0], high
[0], GEN_INT (bits
)));
5545 emit_insn (gen_and3 (high
[0], high
[0], const1_rtx
));
5546 emit_move_insn (low
[0], high
[0]);
5547 emit_insn (gen_xor3 (low
[0], low
[0], const1_rtx
));
5550 emit_insn (gen_ashl3 (low
[0], low
[0], operands
[2]));
5551 emit_insn (gen_ashl3 (high
[0], high
[0], operands
[2]));
5555 if (operands
[1] == constm1_rtx
)
5557 /* For -1 << N, we can avoid the shld instruction, because we
5558 know that we're shifting 0...31/63 ones into a -1. */
5559 emit_move_insn (low
[0], constm1_rtx
);
5560 if (optimize_insn_for_size_p ())
5561 emit_move_insn (high
[0], low
[0]);
5563 emit_move_insn (high
[0], constm1_rtx
);
5567 gen_shld
= mode
== DImode
? gen_x86_shld
: gen_x86_64_shld
;
5569 if (!rtx_equal_p (operands
[0], operands
[1]))
5570 emit_move_insn (operands
[0], operands
[1]);
5572 split_double_mode (mode
, operands
, 1, low
, high
);
5573 emit_insn (gen_shld (high
[0], low
[0], operands
[2]));
5576 emit_insn (gen_ashl3 (low
[0], low
[0], operands
[2]));
5578 if (TARGET_CMOVE
&& scratch
)
5580 ix86_expand_clear (scratch
);
5581 emit_insn (gen_x86_shift_adj_1
5582 (half_mode
, high
[0], low
[0], operands
[2], scratch
));
5585 emit_insn (gen_x86_shift_adj_2 (half_mode
, high
[0], low
[0], operands
[2]));
5589 ix86_split_ashr (rtx
*operands
, rtx scratch
, machine_mode mode
)
5591 rtx (*gen_ashr3
)(rtx
, rtx
, rtx
)
5592 = mode
== DImode
? gen_ashrsi3
: gen_ashrdi3
;
5593 rtx (*gen_shrd
)(rtx
, rtx
, rtx
);
5594 int half_width
= GET_MODE_BITSIZE (mode
) >> 1;
5596 rtx low
[2], high
[2];
5599 if (CONST_INT_P (operands
[2]))
5601 split_double_mode (mode
, operands
, 2, low
, high
);
5602 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (mode
) - 1);
5604 if (count
== GET_MODE_BITSIZE (mode
) - 1)
5606 emit_move_insn (high
[0], high
[1]);
5607 emit_insn (gen_ashr3 (high
[0], high
[0],
5608 GEN_INT (half_width
- 1)));
5609 emit_move_insn (low
[0], high
[0]);
5612 else if (count
>= half_width
)
5614 emit_move_insn (low
[0], high
[1]);
5615 emit_move_insn (high
[0], low
[0]);
5616 emit_insn (gen_ashr3 (high
[0], high
[0],
5617 GEN_INT (half_width
- 1)));
5619 if (count
> half_width
)
5620 emit_insn (gen_ashr3 (low
[0], low
[0],
5621 GEN_INT (count
- half_width
)));
5625 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
5627 if (!rtx_equal_p (operands
[0], operands
[1]))
5628 emit_move_insn (operands
[0], operands
[1]);
5630 emit_insn (gen_shrd (low
[0], high
[0], GEN_INT (count
)));
5631 emit_insn (gen_ashr3 (high
[0], high
[0], GEN_INT (count
)));
5636 machine_mode half_mode
;
5638 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
5640 if (!rtx_equal_p (operands
[0], operands
[1]))
5641 emit_move_insn (operands
[0], operands
[1]);
5643 split_double_mode (mode
, operands
, 1, low
, high
);
5644 half_mode
= mode
== DImode
? SImode
: DImode
;
5646 emit_insn (gen_shrd (low
[0], high
[0], operands
[2]));
5647 emit_insn (gen_ashr3 (high
[0], high
[0], operands
[2]));
5649 if (TARGET_CMOVE
&& scratch
)
5651 emit_move_insn (scratch
, high
[0]);
5652 emit_insn (gen_ashr3 (scratch
, scratch
,
5653 GEN_INT (half_width
- 1)));
5654 emit_insn (gen_x86_shift_adj_1
5655 (half_mode
, low
[0], high
[0], operands
[2], scratch
));
5658 emit_insn (gen_x86_shift_adj_3
5659 (half_mode
, low
[0], high
[0], operands
[2]));
5664 ix86_split_lshr (rtx
*operands
, rtx scratch
, machine_mode mode
)
5666 rtx (*gen_lshr3
)(rtx
, rtx
, rtx
)
5667 = mode
== DImode
? gen_lshrsi3
: gen_lshrdi3
;
5668 rtx (*gen_shrd
)(rtx
, rtx
, rtx
);
5669 int half_width
= GET_MODE_BITSIZE (mode
) >> 1;
5671 rtx low
[2], high
[2];
5674 if (CONST_INT_P (operands
[2]))
5676 split_double_mode (mode
, operands
, 2, low
, high
);
5677 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (mode
) - 1);
5679 if (count
>= half_width
)
5681 emit_move_insn (low
[0], high
[1]);
5682 ix86_expand_clear (high
[0]);
5684 if (count
> half_width
)
5685 emit_insn (gen_lshr3 (low
[0], low
[0],
5686 GEN_INT (count
- half_width
)));
5690 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
5692 if (!rtx_equal_p (operands
[0], operands
[1]))
5693 emit_move_insn (operands
[0], operands
[1]);
5695 emit_insn (gen_shrd (low
[0], high
[0], GEN_INT (count
)));
5696 emit_insn (gen_lshr3 (high
[0], high
[0], GEN_INT (count
)));
5701 machine_mode half_mode
;
5703 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
5705 if (!rtx_equal_p (operands
[0], operands
[1]))
5706 emit_move_insn (operands
[0], operands
[1]);
5708 split_double_mode (mode
, operands
, 1, low
, high
);
5709 half_mode
= mode
== DImode
? SImode
: DImode
;
5711 emit_insn (gen_shrd (low
[0], high
[0], operands
[2]));
5712 emit_insn (gen_lshr3 (high
[0], high
[0], operands
[2]));
5714 if (TARGET_CMOVE
&& scratch
)
5716 ix86_expand_clear (scratch
);
5717 emit_insn (gen_x86_shift_adj_1
5718 (half_mode
, low
[0], high
[0], operands
[2], scratch
));
5721 emit_insn (gen_x86_shift_adj_2
5722 (half_mode
, low
[0], high
[0], operands
[2]));
5726 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
5727 DImode for constant loop counts. */
static machine_mode
counter_mode (rtx count_exp)
{
  if (GET_MODE (count_exp) != VOIDmode)
    return GET_MODE (count_exp);
  if (!CONST_INT_P (count_exp))
    return Pmode;
  if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
    return DImode;
  return SImode;
}
/* When ISSETMEM is FALSE, output a simple loop to copy memory from SRCPTR
   to DESTPTR via chunks of MODE unrolled UNROLL times; the overall size is
   COUNT specified in bytes.  When ISSETMEM is TRUE, output the equivalent
   loop to set memory with VALUE (supposed to be in MODE).

   The size is rounded down to a whole number of chunks moved at once.
   SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info.  */
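/* Illustrative sketch, not part of GCC: the shape of the loop emitted below,
   copying UNROLL pieces of CHUNK bytes per iteration and rounding the byte
   count down to a whole number of iterations (CHUNK * UNROLL is assumed to
   be a power of two, as the expander asserts).  */
static void
sketch_chunked_copy (unsigned char *dst, const unsigned char *src,
		     unsigned long count, unsigned long chunk,
		     unsigned long unroll)
{
  unsigned long step = chunk * unroll;
  unsigned long rounded = count & ~(step - 1);	/* whole iterations only */
  for (unsigned long i = 0; i < rounded; i += step)
    for (unsigned long j = 0; j < step; j++)
      dst[i + j] = src[i + j];
}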
5751 expand_set_or_cpymem_via_loop (rtx destmem
, rtx srcmem
,
5752 rtx destptr
, rtx srcptr
, rtx value
,
5753 rtx count
, machine_mode mode
, int unroll
,
5754 int expected_size
, bool issetmem
)
5756 rtx_code_label
*out_label
, *top_label
;
5758 machine_mode iter_mode
= counter_mode (count
);
5759 int piece_size_n
= GET_MODE_SIZE (mode
) * unroll
;
5760 rtx piece_size
= GEN_INT (piece_size_n
);
5761 rtx piece_size_mask
= GEN_INT (~((GET_MODE_SIZE (mode
) * unroll
) - 1));
5765 top_label
= gen_label_rtx ();
5766 out_label
= gen_label_rtx ();
5767 iter
= gen_reg_rtx (iter_mode
);
5769 size
= expand_simple_binop (iter_mode
, AND
, count
, piece_size_mask
,
5770 NULL
, 1, OPTAB_DIRECT
);
5771 /* Those two should combine. */
5772 if (piece_size
== const1_rtx
)
5774 emit_cmp_and_jump_insns (size
, const0_rtx
, EQ
, NULL_RTX
, iter_mode
,
5776 predict_jump (REG_BR_PROB_BASE
* 10 / 100);
5778 emit_move_insn (iter
, const0_rtx
);
5780 emit_label (top_label
);
5782 tmp
= convert_modes (Pmode
, iter_mode
, iter
, true);
/* This assert could be relaxed - in this case we'll need to compute the
     smallest power of two containing PIECE_SIZE_N and pass it to
     offset_address.  */
5787 gcc_assert ((piece_size_n
& (piece_size_n
- 1)) == 0);
5788 destmem
= offset_address (destmem
, tmp
, piece_size_n
);
5789 destmem
= adjust_address (destmem
, mode
, 0);
5793 srcmem
= offset_address (srcmem
, copy_rtx (tmp
), piece_size_n
);
5794 srcmem
= adjust_address (srcmem
, mode
, 0);
/* When unrolling for chips that reorder memory reads and writes,
	 we can save registers by using a single temporary.
	 Also using 4 temporaries is overkill in 32-bit mode.  */
5799 if (!TARGET_64BIT
&& 0)
5801 for (i
= 0; i
< unroll
; i
++)
5805 destmem
= adjust_address (copy_rtx (destmem
), mode
,
5806 GET_MODE_SIZE (mode
));
5807 srcmem
= adjust_address (copy_rtx (srcmem
), mode
,
5808 GET_MODE_SIZE (mode
));
5810 emit_move_insn (destmem
, srcmem
);
5816 gcc_assert (unroll
<= 4);
5817 for (i
= 0; i
< unroll
; i
++)
5819 tmpreg
[i
] = gen_reg_rtx (mode
);
5821 srcmem
= adjust_address (copy_rtx (srcmem
), mode
,
5822 GET_MODE_SIZE (mode
));
5823 emit_move_insn (tmpreg
[i
], srcmem
);
5825 for (i
= 0; i
< unroll
; i
++)
5828 destmem
= adjust_address (copy_rtx (destmem
), mode
,
5829 GET_MODE_SIZE (mode
));
5830 emit_move_insn (destmem
, tmpreg
[i
]);
5835 for (i
= 0; i
< unroll
; i
++)
5838 destmem
= adjust_address (copy_rtx (destmem
), mode
,
5839 GET_MODE_SIZE (mode
));
5840 emit_move_insn (destmem
, value
);
5843 tmp
= expand_simple_binop (iter_mode
, PLUS
, iter
, piece_size
, iter
,
5844 true, OPTAB_LIB_WIDEN
);
5846 emit_move_insn (iter
, tmp
);
5848 emit_cmp_and_jump_insns (iter
, size
, LT
, NULL_RTX
, iter_mode
,
5850 if (expected_size
!= -1)
5852 expected_size
/= GET_MODE_SIZE (mode
) * unroll
;
5853 if (expected_size
== 0)
5855 else if (expected_size
> REG_BR_PROB_BASE
)
5856 predict_jump (REG_BR_PROB_BASE
- 1);
5858 predict_jump (REG_BR_PROB_BASE
- (REG_BR_PROB_BASE
+ expected_size
/ 2)
5862 predict_jump (REG_BR_PROB_BASE
* 80 / 100);
5863 iter
= ix86_zero_extend_to_Pmode (iter
);
5864 tmp
= expand_simple_binop (Pmode
, PLUS
, destptr
, iter
, destptr
,
5865 true, OPTAB_LIB_WIDEN
);
5867 emit_move_insn (destptr
, tmp
);
5870 tmp
= expand_simple_binop (Pmode
, PLUS
, srcptr
, iter
, srcptr
,
5871 true, OPTAB_LIB_WIDEN
);
5873 emit_move_insn (srcptr
, tmp
);
5875 emit_label (out_label
);
5878 /* Divide COUNTREG by SCALE. */
5880 scale_counter (rtx countreg
, int scale
)
5886 if (CONST_INT_P (countreg
))
5887 return GEN_INT (INTVAL (countreg
) / scale
);
5888 gcc_assert (REG_P (countreg
));
5890 sc
= expand_simple_binop (GET_MODE (countreg
), LSHIFTRT
, countreg
,
5891 GEN_INT (exact_log2 (scale
)),
5892 NULL
, 1, OPTAB_DIRECT
);
5896 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
5897 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
5898 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
5900 ORIG_VALUE is the original value passed to memset to fill the memory with.
5901 Other arguments have same meaning as for previous function. */
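/* Illustrative sketch, not part of GCC: how the rep expansion below sizes
   its count register.  A "rep movsl" copying COUNT bytes runs COUNT / 4
   iterations and advances both pointers by 4 * (COUNT / 4) bytes.  */
static unsigned long
sketch_rep_final_offset (unsigned long count, unsigned long piece_size)
{
  unsigned long iterations = count / piece_size;  /* value placed in ecx/rcx */
  return iterations * piece_size;	/* pointer advance after the rep insn */
}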
5904 expand_set_or_cpymem_via_rep (rtx destmem
, rtx srcmem
,
5905 rtx destptr
, rtx srcptr
, rtx value
, rtx orig_value
,
5907 machine_mode mode
, bool issetmem
)
5912 HOST_WIDE_INT rounded_count
;
5914 /* If possible, it is shorter to use rep movs.
5915 TODO: Maybe it is better to move this logic to decide_alg. */
5916 if (mode
== QImode
&& CONST_INT_P (count
) && !(INTVAL (count
) & 3)
5917 && (!issetmem
|| orig_value
== const0_rtx
))
5920 if (destptr
!= XEXP (destmem
, 0) || GET_MODE (destmem
) != BLKmode
)
5921 destmem
= adjust_automodify_address_nv (destmem
, BLKmode
, destptr
, 0);
5923 countreg
= ix86_zero_extend_to_Pmode (scale_counter (count
,
5924 GET_MODE_SIZE (mode
)));
5927 destexp
= gen_rtx_ASHIFT (Pmode
, countreg
,
5928 GEN_INT (exact_log2 (GET_MODE_SIZE (mode
))));
5929 destexp
= gen_rtx_PLUS (Pmode
, destexp
, destptr
);
5932 destexp
= gen_rtx_PLUS (Pmode
, destptr
, countreg
);
5933 if ((!issetmem
|| orig_value
== const0_rtx
) && CONST_INT_P (count
))
5936 = ROUND_DOWN (INTVAL (count
), (HOST_WIDE_INT
) GET_MODE_SIZE (mode
));
5937 destmem
= shallow_copy_rtx (destmem
);
5938 set_mem_size (destmem
, rounded_count
);
5940 else if (MEM_SIZE_KNOWN_P (destmem
))
5941 clear_mem_size (destmem
);
5945 value
= force_reg (mode
, gen_lowpart (mode
, value
));
5946 emit_insn (gen_rep_stos (destptr
, countreg
, destmem
, value
, destexp
));
5950 if (srcptr
!= XEXP (srcmem
, 0) || GET_MODE (srcmem
) != BLKmode
)
5951 srcmem
= adjust_automodify_address_nv (srcmem
, BLKmode
, srcptr
, 0);
5954 srcexp
= gen_rtx_ASHIFT (Pmode
, countreg
,
5955 GEN_INT (exact_log2 (GET_MODE_SIZE (mode
))));
5956 srcexp
= gen_rtx_PLUS (Pmode
, srcexp
, srcptr
);
5959 srcexp
= gen_rtx_PLUS (Pmode
, srcptr
, countreg
);
5960 if (CONST_INT_P (count
))
5963 = ROUND_DOWN (INTVAL (count
), (HOST_WIDE_INT
) GET_MODE_SIZE (mode
));
5964 srcmem
= shallow_copy_rtx (srcmem
);
5965 set_mem_size (srcmem
, rounded_count
);
5969 if (MEM_SIZE_KNOWN_P (srcmem
))
5970 clear_mem_size (srcmem
);
5972 emit_insn (gen_rep_mov (destptr
, destmem
, srcptr
, srcmem
, countreg
,
/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
   DESTMEM.  SRC is passed by pointer to be updated on return.
   Return value is the updated DST.  */
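/* Illustrative sketch, not part of GCC: picking the widest power-of-two piece
   size not larger than the bytes left (SIZE_TO_MOVE >= 1 assumed), mirroring
   the floor_log2 search that emit_memmov performs below.  */
static unsigned long
sketch_widest_piece (unsigned long size_to_move)
{
  unsigned long piece = 1;
  while (piece * 2 <= size_to_move)
    piece *= 2;
  return piece;
}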
5982 emit_memmov (rtx destmem
, rtx
*srcmem
, rtx destptr
, rtx srcptr
,
5983 HOST_WIDE_INT size_to_move
)
5985 rtx dst
= destmem
, src
= *srcmem
, tempreg
;
5986 enum insn_code code
;
5987 machine_mode move_mode
;
5990 /* Find the widest mode in which we could perform moves.
5991 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
5992 it until move of such size is supported. */
5993 piece_size
= 1 << floor_log2 (size_to_move
);
5994 while (!int_mode_for_size (piece_size
* BITS_PER_UNIT
, 0).exists (&move_mode
)
5995 || (code
= optab_handler (mov_optab
, move_mode
)) == CODE_FOR_nothing
)
5997 gcc_assert (piece_size
> 1);
6001 /* Find the corresponding vector mode with the same size as MOVE_MODE.
6002 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
6003 if (GET_MODE_SIZE (move_mode
) > GET_MODE_SIZE (word_mode
))
6005 int nunits
= GET_MODE_SIZE (move_mode
) / GET_MODE_SIZE (word_mode
);
6006 if (!mode_for_vector (word_mode
, nunits
).exists (&move_mode
)
6007 || (code
= optab_handler (mov_optab
, move_mode
)) == CODE_FOR_nothing
)
6009 move_mode
= word_mode
;
6010 piece_size
= GET_MODE_SIZE (move_mode
);
6011 code
= optab_handler (mov_optab
, move_mode
);
6014 gcc_assert (code
!= CODE_FOR_nothing
);
6016 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
, 0);
6017 src
= adjust_automodify_address_nv (src
, move_mode
, srcptr
, 0);
6019 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
6020 gcc_assert (size_to_move
% piece_size
== 0);
6022 for (i
= 0; i
< size_to_move
; i
+= piece_size
)
6024 /* We move from memory to memory, so we'll need to do it via
6025 a temporary register. */
6026 tempreg
= gen_reg_rtx (move_mode
);
6027 emit_insn (GEN_FCN (code
) (tempreg
, src
));
6028 emit_insn (GEN_FCN (code
) (dst
, tempreg
));
6030 emit_move_insn (destptr
,
6031 plus_constant (Pmode
, copy_rtx (destptr
), piece_size
));
6032 emit_move_insn (srcptr
,
6033 plus_constant (Pmode
, copy_rtx (srcptr
), piece_size
));
6035 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
,
6037 src
= adjust_automodify_address_nv (src
, move_mode
, srcptr
,
6041 /* Update DST and SRC rtx. */
/* Helper function for the string operations below.  Test VARIABLE against
   the VALUE bits; the emitted code jumps to the returned label when
   (VARIABLE & VALUE) is zero, so the caller can skip the following moves.  */
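/* Illustrative sketch, not part of GCC: the test the helper below emits.
   The generated code jumps past the following move when the tested bits of
   the counter are clear.  */
static int
sketch_skip_piece_p (unsigned long variable, unsigned long value)
{
  return (variable & value) == 0;	/* skip the move for this piece */
}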
6049 static rtx_code_label
*
6050 ix86_expand_aligntest (rtx variable
, int value
, bool epilogue
)
6052 rtx_code_label
*label
= gen_label_rtx ();
6053 rtx tmpcount
= gen_reg_rtx (GET_MODE (variable
));
6054 if (GET_MODE (variable
) == DImode
)
6055 emit_insn (gen_anddi3 (tmpcount
, variable
, GEN_INT (value
)));
6057 emit_insn (gen_andsi3 (tmpcount
, variable
, GEN_INT (value
)));
6058 emit_cmp_and_jump_insns (tmpcount
, const0_rtx
, EQ
, 0, GET_MODE (variable
),
6061 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
6063 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
6068 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
6071 expand_cpymem_epilogue (rtx destmem
, rtx srcmem
,
6072 rtx destptr
, rtx srcptr
, rtx count
, int max_size
)
6075 if (CONST_INT_P (count
))
6077 HOST_WIDE_INT countval
= INTVAL (count
);
6078 HOST_WIDE_INT epilogue_size
= countval
% max_size
;
6081 /* For now MAX_SIZE should be a power of 2. This assert could be
6082 relaxed, but it'll require a bit more complicated epilogue
6084 gcc_assert ((max_size
& (max_size
- 1)) == 0);
6085 for (i
= max_size
; i
>= 1; i
>>= 1)
6087 if (epilogue_size
& i
)
6088 destmem
= emit_memmov (destmem
, &srcmem
, destptr
, srcptr
, i
);
6094 count
= expand_simple_binop (GET_MODE (count
), AND
, count
, GEN_INT (max_size
- 1),
6095 count
, 1, OPTAB_DIRECT
);
6096 expand_set_or_cpymem_via_loop (destmem
, srcmem
, destptr
, srcptr
, NULL
,
6097 count
, QImode
, 1, 4, false);
6101 /* When there are stringops, we can cheaply increase dest and src pointers.
6102 Otherwise we save code size by maintaining offset (zero is readily
6103 available from preceding rep operation) and using x86 addressing modes.
6105 if (TARGET_SINGLE_STRINGOP
)
6109 rtx_code_label
*label
= ix86_expand_aligntest (count
, 4, true);
6110 src
= change_address (srcmem
, SImode
, srcptr
);
6111 dest
= change_address (destmem
, SImode
, destptr
);
6112 emit_insn (gen_strmov (destptr
, dest
, srcptr
, src
));
6114 LABEL_NUSES (label
) = 1;
6118 rtx_code_label
*label
= ix86_expand_aligntest (count
, 2, true);
6119 src
= change_address (srcmem
, HImode
, srcptr
);
6120 dest
= change_address (destmem
, HImode
, destptr
);
6121 emit_insn (gen_strmov (destptr
, dest
, srcptr
, src
));
6123 LABEL_NUSES (label
) = 1;
6127 rtx_code_label
*label
= ix86_expand_aligntest (count
, 1, true);
6128 src
= change_address (srcmem
, QImode
, srcptr
);
6129 dest
= change_address (destmem
, QImode
, destptr
);
6130 emit_insn (gen_strmov (destptr
, dest
, srcptr
, src
));
6132 LABEL_NUSES (label
) = 1;
6137 rtx offset
= force_reg (Pmode
, const0_rtx
);
6142 rtx_code_label
*label
= ix86_expand_aligntest (count
, 4, true);
6143 src
= change_address (srcmem
, SImode
, srcptr
);
6144 dest
= change_address (destmem
, SImode
, destptr
);
6145 emit_move_insn (dest
, src
);
6146 tmp
= expand_simple_binop (Pmode
, PLUS
, offset
, GEN_INT (4), NULL
,
6147 true, OPTAB_LIB_WIDEN
);
6149 emit_move_insn (offset
, tmp
);
6151 LABEL_NUSES (label
) = 1;
6155 rtx_code_label
*label
= ix86_expand_aligntest (count
, 2, true);
6156 tmp
= gen_rtx_PLUS (Pmode
, srcptr
, offset
);
6157 src
= change_address (srcmem
, HImode
, tmp
);
6158 tmp
= gen_rtx_PLUS (Pmode
, destptr
, offset
);
6159 dest
= change_address (destmem
, HImode
, tmp
);
6160 emit_move_insn (dest
, src
);
6161 tmp
= expand_simple_binop (Pmode
, PLUS
, offset
, GEN_INT (2), tmp
,
6162 true, OPTAB_LIB_WIDEN
);
6164 emit_move_insn (offset
, tmp
);
6166 LABEL_NUSES (label
) = 1;
6170 rtx_code_label
*label
= ix86_expand_aligntest (count
, 1, true);
6171 tmp
= gen_rtx_PLUS (Pmode
, srcptr
, offset
);
6172 src
= change_address (srcmem
, QImode
, tmp
);
6173 tmp
= gen_rtx_PLUS (Pmode
, destptr
, offset
);
6174 dest
= change_address (destmem
, QImode
, tmp
);
6175 emit_move_insn (dest
, src
);
6177 LABEL_NUSES (label
) = 1;
/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
   with value PROMOTED_VAL.
   Return value is the updated DST.  */
6187 emit_memset (rtx destmem
, rtx destptr
, rtx promoted_val
,
6188 HOST_WIDE_INT size_to_move
)
6191 enum insn_code code
;
6192 machine_mode move_mode
;
6195 /* Find the widest mode in which we could perform moves.
6196 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
6197 it until move of such size is supported. */
6198 move_mode
= GET_MODE (promoted_val
);
6199 if (move_mode
== VOIDmode
)
6201 if (size_to_move
< GET_MODE_SIZE (move_mode
))
6203 unsigned int move_bits
= size_to_move
* BITS_PER_UNIT
;
6204 move_mode
= int_mode_for_size (move_bits
, 0).require ();
6205 promoted_val
= gen_lowpart (move_mode
, promoted_val
);
6207 piece_size
= GET_MODE_SIZE (move_mode
);
6208 code
= optab_handler (mov_optab
, move_mode
);
6209 gcc_assert (code
!= CODE_FOR_nothing
&& promoted_val
!= NULL_RTX
);
6211 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
, 0);
6213 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
6214 gcc_assert (size_to_move
% piece_size
== 0);
6216 for (i
= 0; i
< size_to_move
; i
+= piece_size
)
6218 if (piece_size
<= GET_MODE_SIZE (word_mode
))
6220 emit_insn (gen_strset (destptr
, dst
, promoted_val
));
6221 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
,
6226 emit_insn (GEN_FCN (code
) (dst
, promoted_val
));
6228 emit_move_insn (destptr
,
6229 plus_constant (Pmode
, copy_rtx (destptr
), piece_size
));
6231 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
,
6235 /* Update DST rtx. */
/* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
6240 expand_setmem_epilogue_via_loop (rtx destmem
, rtx destptr
, rtx value
,
6241 rtx count
, int max_size
)
6243 count
= expand_simple_binop (counter_mode (count
), AND
, count
,
6244 GEN_INT (max_size
- 1), count
, 1, OPTAB_DIRECT
);
6245 expand_set_or_cpymem_via_loop (destmem
, NULL
, destptr
, NULL
,
6246 gen_lowpart (QImode
, value
), count
, QImode
,
6247 1, max_size
/ 2, true);
/* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
6252 expand_setmem_epilogue (rtx destmem
, rtx destptr
, rtx value
, rtx vec_value
,
6253 rtx count
, int max_size
)
6257 if (CONST_INT_P (count
))
6259 HOST_WIDE_INT countval
= INTVAL (count
);
6260 HOST_WIDE_INT epilogue_size
= countval
% max_size
;
6263 /* For now MAX_SIZE should be a power of 2. This assert could be
6264 relaxed, but it'll require a bit more complicated epilogue
6266 gcc_assert ((max_size
& (max_size
- 1)) == 0);
6267 for (i
= max_size
; i
>= 1; i
>>= 1)
6269 if (epilogue_size
& i
)
6271 if (vec_value
&& i
> GET_MODE_SIZE (GET_MODE (value
)))
6272 destmem
= emit_memset (destmem
, destptr
, vec_value
, i
);
6274 destmem
= emit_memset (destmem
, destptr
, value
, i
);
6281 expand_setmem_epilogue_via_loop (destmem
, destptr
, value
, count
, max_size
);
6286 rtx_code_label
*label
= ix86_expand_aligntest (count
, 16, true);
6289 dest
= change_address (destmem
, DImode
, destptr
);
6290 emit_insn (gen_strset (destptr
, dest
, value
));
6291 dest
= adjust_automodify_address_nv (dest
, DImode
, destptr
, 8);
6292 emit_insn (gen_strset (destptr
, dest
, value
));
6296 dest
= change_address (destmem
, SImode
, destptr
);
6297 emit_insn (gen_strset (destptr
, dest
, value
));
6298 dest
= adjust_automodify_address_nv (dest
, SImode
, destptr
, 4);
6299 emit_insn (gen_strset (destptr
, dest
, value
));
6300 dest
= adjust_automodify_address_nv (dest
, SImode
, destptr
, 8);
6301 emit_insn (gen_strset (destptr
, dest
, value
));
6302 dest
= adjust_automodify_address_nv (dest
, SImode
, destptr
, 12);
6303 emit_insn (gen_strset (destptr
, dest
, value
));
6306 LABEL_NUSES (label
) = 1;
6310 rtx_code_label
*label
= ix86_expand_aligntest (count
, 8, true);
6313 dest
= change_address (destmem
, DImode
, destptr
);
6314 emit_insn (gen_strset (destptr
, dest
, value
));
6318 dest
= change_address (destmem
, SImode
, destptr
);
6319 emit_insn (gen_strset (destptr
, dest
, value
));
6320 dest
= adjust_automodify_address_nv (dest
, SImode
, destptr
, 4);
6321 emit_insn (gen_strset (destptr
, dest
, value
));
6324 LABEL_NUSES (label
) = 1;
6328 rtx_code_label
*label
= ix86_expand_aligntest (count
, 4, true);
6329 dest
= change_address (destmem
, SImode
, destptr
);
6330 emit_insn (gen_strset (destptr
, dest
, gen_lowpart (SImode
, value
)));
6332 LABEL_NUSES (label
) = 1;
6336 rtx_code_label
*label
= ix86_expand_aligntest (count
, 2, true);
6337 dest
= change_address (destmem
, HImode
, destptr
);
6338 emit_insn (gen_strset (destptr
, dest
, gen_lowpart (HImode
, value
)));
6340 LABEL_NUSES (label
) = 1;
6344 rtx_code_label
*label
= ix86_expand_aligntest (count
, 1, true);
6345 dest
= change_address (destmem
, QImode
, destptr
);
6346 emit_insn (gen_strset (destptr
, dest
, gen_lowpart (QImode
, value
)));
6348 LABEL_NUSES (label
) = 1;
6352 /* Adjust COUNTER by the VALUE. */
static void
ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
{
  emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
}
/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
   DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
   Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE
   are used.
   Return value is updated DESTMEM.  */
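/* Illustrative sketch, not part of GCC: how many bytes the prologue below has
   to copy or set so that DESTPTR becomes DESIRED_ALIGN-aligned (DESIRED_ALIGN
   is assumed to be a power of two).  */
static unsigned long
sketch_prologue_bytes (unsigned long destptr, unsigned long desired_align)
{
  return (desired_align - (destptr & (desired_align - 1)))
	 & (desired_align - 1);		/* 0 when already aligned */
}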
6366 expand_set_or_cpymem_prologue (rtx destmem
, rtx srcmem
,
6367 rtx destptr
, rtx srcptr
, rtx value
,
6368 rtx vec_value
, rtx count
, int align
,
6369 int desired_alignment
, bool issetmem
)
6372 for (i
= 1; i
< desired_alignment
; i
<<= 1)
6376 rtx_code_label
*label
= ix86_expand_aligntest (destptr
, i
, false);
6379 if (vec_value
&& i
> GET_MODE_SIZE (GET_MODE (value
)))
6380 destmem
= emit_memset (destmem
, destptr
, vec_value
, i
);
6382 destmem
= emit_memset (destmem
, destptr
, value
, i
);
6385 destmem
= emit_memmov (destmem
, &srcmem
, destptr
, srcptr
, i
);
6386 ix86_adjust_counter (count
, i
);
6388 LABEL_NUSES (label
) = 1;
6389 set_mem_align (destmem
, i
* 2 * BITS_PER_UNIT
);
/* Test if COUNT&SIZE is nonzero and if so, expand a cpymem
   or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
   and jump to DONE_LABEL.  */
6399 expand_small_cpymem_or_setmem (rtx destmem
, rtx srcmem
,
6400 rtx destptr
, rtx srcptr
,
6401 rtx value
, rtx vec_value
,
6402 rtx count
, int size
,
6403 rtx done_label
, bool issetmem
)
6405 rtx_code_label
*label
= ix86_expand_aligntest (count
, size
, false);
6406 machine_mode mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 1).else_blk ();
6410 /* If we do not have vector value to copy, we must reduce size. */
6415 if (GET_MODE (value
) == VOIDmode
&& size
> 8)
6417 else if (GET_MODE_SIZE (mode
) > GET_MODE_SIZE (GET_MODE (value
)))
6418 mode
= GET_MODE (value
);
6421 mode
= GET_MODE (vec_value
), value
= vec_value
;
6425 /* Choose appropriate vector mode. */
6427 mode
= TARGET_AVX
? V32QImode
: TARGET_SSE
? V16QImode
: DImode
;
6428 else if (size
>= 16)
6429 mode
= TARGET_SSE
? V16QImode
: DImode
;
6430 srcmem
= change_address (srcmem
, mode
, srcptr
);
6432 destmem
= change_address (destmem
, mode
, destptr
);
6433 modesize
= GEN_INT (GET_MODE_SIZE (mode
));
6434 gcc_assert (GET_MODE_SIZE (mode
) <= size
);
6435 for (n
= 0; n
* GET_MODE_SIZE (mode
) < size
; n
++)
6438 emit_move_insn (destmem
, gen_lowpart (mode
, value
));
6441 emit_move_insn (destmem
, srcmem
);
6442 srcmem
= offset_address (srcmem
, modesize
, GET_MODE_SIZE (mode
));
6444 destmem
= offset_address (destmem
, modesize
, GET_MODE_SIZE (mode
));
6447 destmem
= offset_address (destmem
, count
, 1);
6448 destmem
= offset_address (destmem
, GEN_INT (-2 * size
),
6449 GET_MODE_SIZE (mode
));
6452 srcmem
= offset_address (srcmem
, count
, 1);
6453 srcmem
= offset_address (srcmem
, GEN_INT (-2 * size
),
6454 GET_MODE_SIZE (mode
));
6456 for (n
= 0; n
* GET_MODE_SIZE (mode
) < size
; n
++)
6459 emit_move_insn (destmem
, gen_lowpart (mode
, value
));
6462 emit_move_insn (destmem
, srcmem
);
6463 srcmem
= offset_address (srcmem
, modesize
, GET_MODE_SIZE (mode
));
6465 destmem
= offset_address (destmem
, modesize
, GET_MODE_SIZE (mode
));
6467 emit_jump_insn (gen_jump (done_label
));
6471 LABEL_NUSES (label
) = 1;
/* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
   and get ready for the main memcpy loop by copying the initial
   DESIRED_ALIGN-ALIGN bytes and the last SIZE bytes, adjusting
   DESTPTR/SRCPTR/COUNT in a way that lets us proceed with a loop copying
   SIZE bytes at once.  Do moves in MODE.
6478 DONE_LABEL is a label after the whole copying sequence. The label is created
6479 on demand if *DONE_LABEL is NULL.
6480 MIN_SIZE is minimal size of block copied. This value gets adjusted for new
6481 bounds after the initial copies.
   DESTMEM/SRCMEM are memory expressions pointing to the copied block,
   DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
   we will dispatch to a library call for large blocks.
6487 In pseudocode we do:
6491 Assume that SIZE is 4. Bigger sizes are handled analogously
6494 copy 4 bytes from SRCPTR to DESTPTR
6495 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
6500 copy 1 byte from SRCPTR to DESTPTR
6503 copy 2 bytes from SRCPTR to DESTPTR
6504 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
6509 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
6510 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
       OLD_DESTPTR = DESTPTR;
6513 Align DESTPTR up to DESIRED_ALIGN
6514 SRCPTR += DESTPTR - OLD_DESTPTR
       COUNT -= DESTPTR - OLD_DESTPTR
6517 Round COUNT down to multiple of SIZE
6518 << optional caller supplied zero size guard is here >>
6519 << optional caller supplied dynamic check is here >>
6520 << caller supplied main copy loop is here >>
6525 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem
, rtx srcmem
,
6526 rtx
*destptr
, rtx
*srcptr
,
6528 rtx value
, rtx vec_value
,
6530 rtx_code_label
**done_label
,
6534 unsigned HOST_WIDE_INT
*min_size
,
6538 rtx_code_label
*loop_label
= NULL
, *label
;
6541 int prolog_size
= 0;
/* Choose the proper value to copy.  */
6545 if (issetmem
&& VECTOR_MODE_P (mode
))
6546 mode_value
= vec_value
;
6549 gcc_assert (GET_MODE_SIZE (mode
) <= size
);
6551 /* See if block is big or small, handle small blocks. */
6552 if (!CONST_INT_P (*count
) && *min_size
< (unsigned HOST_WIDE_INT
)size
)
6555 loop_label
= gen_label_rtx ();
6558 *done_label
= gen_label_rtx ();
6560 emit_cmp_and_jump_insns (*count
, GEN_INT (size2
), GE
, 0, GET_MODE (*count
),
6564 /* Handle sizes > 3. */
6565 for (;size2
> 2; size2
>>= 1)
6566 expand_small_cpymem_or_setmem (destmem
, srcmem
,
6570 size2
, *done_label
, issetmem
);
6571 /* Nothing to copy? Jump to DONE_LABEL if so */
6572 emit_cmp_and_jump_insns (*count
, const0_rtx
, EQ
, 0, GET_MODE (*count
),
6575 /* Do a byte copy. */
6576 destmem
= change_address (destmem
, QImode
, *destptr
);
6578 emit_move_insn (destmem
, gen_lowpart (QImode
, value
));
6581 srcmem
= change_address (srcmem
, QImode
, *srcptr
);
6582 emit_move_insn (destmem
, srcmem
);
6585 /* Handle sizes 2 and 3. */
6586 label
= ix86_expand_aligntest (*count
, 2, false);
6587 destmem
= change_address (destmem
, HImode
, *destptr
);
6588 destmem
= offset_address (destmem
, *count
, 1);
6589 destmem
= offset_address (destmem
, GEN_INT (-2), 2);
6591 emit_move_insn (destmem
, gen_lowpart (HImode
, value
));
6594 srcmem
= change_address (srcmem
, HImode
, *srcptr
);
6595 srcmem
= offset_address (srcmem
, *count
, 1);
6596 srcmem
= offset_address (srcmem
, GEN_INT (-2), 2);
6597 emit_move_insn (destmem
, srcmem
);
6601 LABEL_NUSES (label
) = 1;
6602 emit_jump_insn (gen_jump (*done_label
));
6606 gcc_assert (*min_size
>= (unsigned HOST_WIDE_INT
)size
6607 || UINTVAL (*count
) >= (unsigned HOST_WIDE_INT
)size
);
6609 /* Start memcpy for COUNT >= SIZE. */
6612 emit_label (loop_label
);
6613 LABEL_NUSES (loop_label
) = 1;
6616 /* Copy first desired_align bytes. */
6618 srcmem
= change_address (srcmem
, mode
, *srcptr
);
6619 destmem
= change_address (destmem
, mode
, *destptr
);
6620 modesize
= GEN_INT (GET_MODE_SIZE (mode
));
6621 for (n
= 0; prolog_size
< desired_align
- align
; n
++)
6624 emit_move_insn (destmem
, mode_value
);
6627 emit_move_insn (destmem
, srcmem
);
6628 srcmem
= offset_address (srcmem
, modesize
, GET_MODE_SIZE (mode
));
6630 destmem
= offset_address (destmem
, modesize
, GET_MODE_SIZE (mode
));
6631 prolog_size
+= GET_MODE_SIZE (mode
);
6635 /* Copy last SIZE bytes. */
6636 destmem
= offset_address (destmem
, *count
, 1);
6637 destmem
= offset_address (destmem
,
6638 GEN_INT (-size
- prolog_size
),
6641 emit_move_insn (destmem
, mode_value
);
6644 srcmem
= offset_address (srcmem
, *count
, 1);
6645 srcmem
= offset_address (srcmem
,
6646 GEN_INT (-size
- prolog_size
),
6648 emit_move_insn (destmem
, srcmem
);
6650 for (n
= 1; n
* GET_MODE_SIZE (mode
) < size
; n
++)
6652 destmem
= offset_address (destmem
, modesize
, 1);
6654 emit_move_insn (destmem
, mode_value
);
6657 srcmem
= offset_address (srcmem
, modesize
, 1);
6658 emit_move_insn (destmem
, srcmem
);
6662 /* Align destination. */
6663 if (desired_align
> 1 && desired_align
> align
)
6665 rtx saveddest
= *destptr
;
6667 gcc_assert (desired_align
<= size
);
6668 /* Align destptr up, place it to new register. */
6669 *destptr
= expand_simple_binop (GET_MODE (*destptr
), PLUS
, *destptr
,
6670 GEN_INT (prolog_size
),
6671 NULL_RTX
, 1, OPTAB_DIRECT
);
6672 if (REG_P (*destptr
) && REG_P (saveddest
) && REG_POINTER (saveddest
))
6673 REG_POINTER (*destptr
) = 1;
6674 *destptr
= expand_simple_binop (GET_MODE (*destptr
), AND
, *destptr
,
6675 GEN_INT (-desired_align
),
6676 *destptr
, 1, OPTAB_DIRECT
);
6677 /* See how many bytes we skipped. */
6678 saveddest
= expand_simple_binop (GET_MODE (*destptr
), MINUS
, saveddest
,
6680 saveddest
, 1, OPTAB_DIRECT
);
6681 /* Adjust srcptr and count. */
6683 *srcptr
= expand_simple_binop (GET_MODE (*srcptr
), MINUS
, *srcptr
,
6684 saveddest
, *srcptr
, 1, OPTAB_DIRECT
);
6685 *count
= expand_simple_binop (GET_MODE (*count
), PLUS
, *count
,
6686 saveddest
, *count
, 1, OPTAB_DIRECT
);
6687 /* We copied at most size + prolog_size. */
6688 if (*min_size
> (unsigned HOST_WIDE_INT
)(size
+ prolog_size
))
6690 = ROUND_DOWN (*min_size
- size
, (unsigned HOST_WIDE_INT
)size
);
6694 /* Our loops always round down the block size, but for dispatch to
6695 library we need precise value. */
6697 *count
= expand_simple_binop (GET_MODE (*count
), AND
, *count
,
6698 GEN_INT (-size
), *count
, 1, OPTAB_DIRECT
);
6702 gcc_assert (prolog_size
== 0);
6703 /* Decrease count, so we won't end up copying last word twice. */
6704 if (!CONST_INT_P (*count
))
6705 *count
= expand_simple_binop (GET_MODE (*count
), PLUS
, *count
,
6706 constm1_rtx
, *count
, 1, OPTAB_DIRECT
);
6708 *count
= GEN_INT (ROUND_DOWN (UINTVAL (*count
) - 1,
6709 (unsigned HOST_WIDE_INT
)size
));
6711 *min_size
= ROUND_DOWN (*min_size
- 1, (unsigned HOST_WIDE_INT
)size
);
/* This function is like the previous one, except here we know how many bytes
   need to be copied.  That allows us to update alignment not only of DST, which
   is returned, but also of SRC, which is passed as a pointer for that
   reason.  */
6721 expand_set_or_cpymem_constant_prologue (rtx dst
, rtx
*srcp
, rtx destreg
,
6722 rtx srcreg
, rtx value
, rtx vec_value
,
6723 int desired_align
, int align_bytes
,
6728 rtx orig_src
= NULL
;
6730 int copied_bytes
= 0;
6734 gcc_assert (srcp
!= NULL
);
6739 for (piece_size
= 1;
6740 piece_size
<= desired_align
&& copied_bytes
< align_bytes
;
6743 if (align_bytes
& piece_size
)
6747 if (vec_value
&& piece_size
> GET_MODE_SIZE (GET_MODE (value
)))
6748 dst
= emit_memset (dst
, destreg
, vec_value
, piece_size
);
6750 dst
= emit_memset (dst
, destreg
, value
, piece_size
);
6753 dst
= emit_memmov (dst
, &src
, destreg
, srcreg
, piece_size
);
6754 copied_bytes
+= piece_size
;
6757 if (MEM_ALIGN (dst
) < (unsigned int) desired_align
* BITS_PER_UNIT
)
6758 set_mem_align (dst
, desired_align
* BITS_PER_UNIT
);
6759 if (MEM_SIZE_KNOWN_P (orig_dst
))
6760 set_mem_size (dst
, MEM_SIZE (orig_dst
) - align_bytes
);
6764 int src_align_bytes
= get_mem_align_offset (src
, desired_align
6766 if (src_align_bytes
>= 0)
6767 src_align_bytes
= desired_align
- src_align_bytes
;
6768 if (src_align_bytes
>= 0)
6770 unsigned int src_align
;
6771 for (src_align
= desired_align
; src_align
>= 2; src_align
>>= 1)
6773 if ((src_align_bytes
& (src_align
- 1))
6774 == (align_bytes
& (src_align
- 1)))
6777 if (src_align
> (unsigned int) desired_align
)
6778 src_align
= desired_align
;
6779 if (MEM_ALIGN (src
) < src_align
* BITS_PER_UNIT
)
6780 set_mem_align (src
, src_align
* BITS_PER_UNIT
);
6782 if (MEM_SIZE_KNOWN_P (orig_src
))
6783 set_mem_size (src
, MEM_SIZE (orig_src
) - align_bytes
);
6790 /* Return true if ALG can be used in current context.
6791 Assume we expand memset if MEMSET is true. */
6793 alg_usable_p (enum stringop_alg alg
, bool memset
, bool have_as
)
6795 if (alg
== no_stringop
)
6797 if (alg
== vector_loop
)
6798 return TARGET_SSE
|| TARGET_AVX
;
6799 /* Algorithms using the rep prefix want at least edi and ecx;
6800 additionally, memset wants eax and memcpy wants esi. Don't
6801 consider such algorithms if the user has appropriated those
6802 registers for their own purposes, or if we have a non-default
6803 address space, since some string insns cannot override the segment. */
6804 if (alg
== rep_prefix_1_byte
6805 || alg
== rep_prefix_4_byte
6806 || alg
== rep_prefix_8_byte
)
6810 if (fixed_regs
[CX_REG
]
6811 || fixed_regs
[DI_REG
]
6812 || (memset
? fixed_regs
[AX_REG
] : fixed_regs
[SI_REG
]))
6818 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
6819 static enum stringop_alg
6820 decide_alg (HOST_WIDE_INT count
, HOST_WIDE_INT expected_size
,
6821 unsigned HOST_WIDE_INT min_size
, unsigned HOST_WIDE_INT max_size
,
6822 bool memset
, bool zero_memset
, bool have_as
,
6823 int *dynamic_check
, bool *noalign
, bool recur
)
6825 const struct stringop_algs
*algs
;
6826 bool optimize_for_speed
;
6828 const struct processor_costs
*cost
;
6830 bool any_alg_usable_p
= false;
6833 *dynamic_check
= -1;
6835 /* Even if the string operation call is cold, we still might spend a lot
6836 of time processing large blocks. */
6837 if (optimize_function_for_size_p (cfun
)
6838 || (optimize_insn_for_size_p ()
6840 || (expected_size
!= -1 && expected_size
< 256))))
6841 optimize_for_speed
= false;
6843 optimize_for_speed
= true;
6845 cost
= optimize_for_speed
? ix86_cost
: &ix86_size_cost
;
6847 algs
= &cost
->memset
[TARGET_64BIT
!= 0];
6849 algs
= &cost
->memcpy
[TARGET_64BIT
!= 0];
6851 /* See maximal size for user defined algorithm. */
6852 for (i
= 0; i
< MAX_STRINGOP_ALGS
; i
++)
6854 enum stringop_alg candidate
= algs
->size
[i
].alg
;
6855 bool usable
= alg_usable_p (candidate
, memset
, have_as
);
6856 any_alg_usable_p
|= usable
;
6858 if (candidate
!= libcall
&& candidate
&& usable
)
6859 max
= algs
->size
[i
].max
;
/* If the expected size is not known but the max size is small enough
     so that the inline version is a win, set the expected size into
     the range.  */
6865 if (((max
> 1 && (unsigned HOST_WIDE_INT
) max
>= max_size
) || max
== -1)
6866 && expected_size
== -1)
6867 expected_size
= min_size
/ 2 + max_size
/ 2;
6869 /* If user specified the algorithm, honor it if possible. */
6870 if (ix86_stringop_alg
!= no_stringop
6871 && alg_usable_p (ix86_stringop_alg
, memset
, have_as
))
6872 return ix86_stringop_alg
;
6873 /* rep; movq or rep; movl is the smallest variant. */
6874 else if (!optimize_for_speed
)
6877 if (!count
|| (count
& 3) || (memset
&& !zero_memset
))
6878 return alg_usable_p (rep_prefix_1_byte
, memset
, have_as
)
6879 ? rep_prefix_1_byte
: loop_1_byte
;
6881 return alg_usable_p (rep_prefix_4_byte
, memset
, have_as
)
6882 ? rep_prefix_4_byte
: loop
;
/* Very tiny blocks are best handled via the loop, REP is expensive to
     set up.  */
6886 else if (expected_size
!= -1 && expected_size
< 4)
6888 else if (expected_size
!= -1)
6890 enum stringop_alg alg
= libcall
;
6891 bool alg_noalign
= false;
6892 for (i
= 0; i
< MAX_STRINGOP_ALGS
; i
++)
6894 /* We get here if the algorithms that were not libcall-based
6895 were rep-prefix based and we are unable to use rep prefixes
6896 based on global register usage. Break out of the loop and
6897 use the heuristic below. */
6898 if (algs
->size
[i
].max
== 0)
6900 if (algs
->size
[i
].max
>= expected_size
|| algs
->size
[i
].max
== -1)
6902 enum stringop_alg candidate
= algs
->size
[i
].alg
;
6904 if (candidate
!= libcall
6905 && alg_usable_p (candidate
, memset
, have_as
))
6908 alg_noalign
= algs
->size
[i
].noalign
;
6910 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
6911 last non-libcall inline algorithm. */
6912 if (TARGET_INLINE_ALL_STRINGOPS
)
6914 /* When the current size is best to be copied by a libcall,
6915 but we are still forced to inline, run the heuristic below
6916 that will pick code for medium sized blocks. */
6919 *noalign
= alg_noalign
;
6922 else if (!any_alg_usable_p
)
6925 else if (alg_usable_p (candidate
, memset
, have_as
))
6927 *noalign
= algs
->size
[i
].noalign
;
/* When asked to inline the call anyway, try to pick a meaningful choice.
     We look for the maximal size of a block that is faster to copy by hand
     and take blocks of at most that size, guessing that the average size
     will be roughly half of the block.

     If this turns out to be bad, we might simply specify the preferred
     choice in ix86_costs.  */
6940 if ((TARGET_INLINE_ALL_STRINGOPS
|| TARGET_INLINE_STRINGOPS_DYNAMICALLY
)
6941 && (algs
->unknown_size
== libcall
6942 || !alg_usable_p (algs
->unknown_size
, memset
, have_as
)))
6944 enum stringop_alg alg
;
6945 HOST_WIDE_INT new_expected_size
= (max
> 0 ? max
: 4096) / 2;
6947 /* If there aren't any usable algorithms or if recursing already,
6948 then recursing on smaller sizes or same size isn't going to
6949 find anything. Just return the simple byte-at-a-time copy loop. */
6950 if (!any_alg_usable_p
|| recur
)
6952 /* Pick something reasonable. */
6953 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY
&& !recur
)
6954 *dynamic_check
= 128;
6957 alg
= decide_alg (count
, new_expected_size
, min_size
, max_size
, memset
,
6958 zero_memset
, have_as
, dynamic_check
, noalign
, true);
6959 gcc_assert (*dynamic_check
== -1);
6960 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY
)
6961 *dynamic_check
= max
;
6963 gcc_assert (alg
!= libcall
);
6966 return (alg_usable_p (algs
->unknown_size
, memset
, have_as
)
6967 ? algs
->unknown_size
: libcall
);
6970 /* Decide on alignment. We know that the operand is already aligned to ALIGN
6971 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
6973 decide_alignment (int align
,
6974 enum stringop_alg alg
,
6976 machine_mode move_mode
)
6978 int desired_align
= 0;
6980 gcc_assert (alg
!= no_stringop
);
6984 if (move_mode
== VOIDmode
)
6987 desired_align
= GET_MODE_SIZE (move_mode
);
/* PentiumPro has special logic triggering for 8-byte aligned blocks,
     copying a whole cacheline at once.  */
6990 if (TARGET_PENTIUMPRO
6991 && (alg
== rep_prefix_4_byte
|| alg
== rep_prefix_1_byte
))
6996 if (desired_align
< align
)
6997 desired_align
= align
;
6998 if (expected_size
!= -1 && expected_size
< 4)
6999 desired_align
= align
;
7001 return desired_align
;
/* Helper function for memcpy.  For QImode value 0xXY produce
   0xXYXYXYXY of the width specified by MODE.  This is essentially
   a * 0x01010101, but we can do slightly better than
   synth_mult by unwinding the sequence by hand on CPUs with
   slow multiply.  */
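/* Illustrative sketch, not part of GCC: the shift-and-or sequence that
   replicates a byte across a 4- or 8-byte word, matching the arithmetic
   described above.  */
static unsigned long long
sketch_promote_byte (unsigned char byte, int bytes)
{
  unsigned long long v = byte;
  v |= v << 8;			/* 0x000000XY -> 0x0000XYXY */
  v |= v << 16;			/* 0x0000XYXY -> 0xXYXYXYXY */
  if (bytes == 8)
    v |= v << 32;		/* widen to 0xXYXYXYXYXYXYXYXY */
  return v;
}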
static rtx
promote_duplicated_reg (machine_mode mode, rtx val)
{
  machine_mode valmode = GET_MODE (val);
  rtx tmp;
  int nops = mode == DImode ? 3 : 2;

  gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
  if (val == const0_rtx)
    return copy_to_mode_reg (mode, CONST0_RTX (mode));
  if (CONST_INT_P (val))
    {
      HOST_WIDE_INT v = INTVAL (val) & 255;

      v |= v << 8;
      v |= v << 16;
      if (mode == DImode)
        v |= (v << 16) << 16;
      return copy_to_mode_reg (mode, gen_int_mode (v, mode));
    }

  if (valmode == VOIDmode)
    valmode = QImode;
  if (valmode != QImode)
    val = gen_lowpart (QImode, val);
  if (mode == QImode)
    return val;

  if (!TARGET_PARTIAL_REG_STALL)
    nops--;
  if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
      + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
      <= (ix86_cost->shift_const + ix86_cost->add) * nops
         + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
    {
      rtx reg = convert_modes (mode, QImode, val, true);
      tmp = promote_duplicated_reg (mode, const1_rtx);
      return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
                                  OPTAB_DIRECT);
    }
  else
    {
      rtx reg = convert_modes (mode, QImode, val, true);

      if (!TARGET_PARTIAL_REG_STALL)
        emit_insn (gen_insv_1 (mode, reg, reg));
      else
        {
          tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
                                     NULL, 1, OPTAB_DIRECT);
          reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
                                     OPTAB_DIRECT);
        }
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
                                 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      if (mode == SImode)
        return reg;
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
                                 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      return reg;
    }
}
/* Duplicate value VAL using promote_duplicated_reg into maximal size that will
   be needed by main loop copying SIZE_NEEDED chunks and prologue getting
   alignment from ALIGN to DESIRED_ALIGN.  */

static rtx
promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
                                int align)
{
  rtx promoted_val;

  if (TARGET_64BIT
      && (size_needed > 4 || (desired_align > align && desired_align > 4)))
    promoted_val = promote_duplicated_reg (DImode, val);
  else if (size_needed > 2 || (desired_align > align && desired_align > 2))
    promoted_val = promote_duplicated_reg (SImode, val);
  else if (size_needed > 1 || (desired_align > align && desired_align > 1))
    promoted_val = promote_duplicated_reg (HImode, val);
  else
    promoted_val = val;

  return promoted_val;
}
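/* Illustrative example (not part of the original source): with
   size_needed == 16, desired_align == 8 and align == 1 on a 64-bit target,
   the value is broadcast into a DImode register, so both the main loop and
   the alignment prologue can reuse the same promoted constant.  */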
/* Copy the address to a Pmode register.  This is used for x32 to
   truncate DImode TLS address to a SImode register.  */

static rtx
ix86_copy_addr_to_reg (rtx addr)
{
  rtx reg;
  if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
    {
      reg = copy_addr_to_reg (addr);
      REG_POINTER (reg) = 1;
      return reg;
    }
  else
    {
      gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
      reg = copy_to_mode_reg (DImode, addr);
      REG_POINTER (reg) = 1;
      return gen_rtx_SUBREG (SImode, reg, 0);
    }
}
/* Expand string move (memcpy) or store (memset) operation.  Use i386 string
   operations when profitable.  The code depends upon architecture, block size
   and alignment, but always has one of the following overall structures:

   Aligned move sequence:

     1) Prologue guard: Conditional that jumps up to epilogues for small
        blocks that can be handled by epilogue alone.  This is faster
        but also needed for correctness, since the prologue assumes the block
        is larger than the desired alignment.

        Optional dynamic check for size and libcall for large
        blocks is emitted here too, with -minline-stringops-dynamically.

     2) Prologue: copy first few bytes in order to get destination
        aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
        than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
        copied.  We emit either a jump tree on power of two sized
        blocks, or a byte loop.

     3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
        with specified algorithm.

     4) Epilogue: code copying tail of the block that is too small to be
        handled by main body (or up to size guarded by prologue guard).

   Misaligned move sequence:

     1) Misaligned move prologue/epilogue containing:
        a) Prologue handling small memory blocks and jumping to done_label
           (skipped if blocks are known to be large enough)
        b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
           needed by single possibly misaligned move
           (skipped if alignment is not needed)
        c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves

     2) Zero size guard dispatching to done_label, if needed

     3) Dispatch to library call, if needed

     4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
        with specified algorithm.  */
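/* Illustrative example (not part of the original source): for a 100-byte
   memcpy expanded with 16-byte chunks, DESIRED_ALIGN == 16 and ALIGN == 1,
   the aligned sequence above is roughly: the prologue guard branches to the
   epilogue for blocks the epilogue alone can handle, the prologue copies up
   to 15 bytes to reach a 16-byte aligned destination, the main loop then
   copies 16 bytes per iteration, and the epilogue picks up the remaining
   tail bytes.  */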
7161 ix86_expand_set_or_cpymem (rtx dst
, rtx src
, rtx count_exp
, rtx val_exp
,
7162 rtx align_exp
, rtx expected_align_exp
,
7163 rtx expected_size_exp
, rtx min_size_exp
,
7164 rtx max_size_exp
, rtx probable_max_size_exp
,
7169 rtx_code_label
*label
= NULL
;
7171 rtx_code_label
*jump_around_label
= NULL
;
7172 HOST_WIDE_INT align
= 1;
7173 unsigned HOST_WIDE_INT count
= 0;
7174 HOST_WIDE_INT expected_size
= -1;
7175 int size_needed
= 0, epilogue_size_needed
;
7176 int desired_align
= 0, align_bytes
= 0;
7177 enum stringop_alg alg
;
7178 rtx promoted_val
= NULL
;
7179 rtx vec_promoted_val
= NULL
;
7180 bool force_loopy_epilogue
= false;
7182 bool need_zero_guard
= false;
7184 machine_mode move_mode
= VOIDmode
;
7185 machine_mode wider_mode
;
7186 int unroll_factor
= 1;
7187 /* TODO: Once value ranges are available, fill in proper data. */
7188 unsigned HOST_WIDE_INT min_size
= 0;
7189 unsigned HOST_WIDE_INT max_size
= -1;
7190 unsigned HOST_WIDE_INT probable_max_size
= -1;
7191 bool misaligned_prologue_used
= false;
7194 if (CONST_INT_P (align_exp
))
7195 align
= INTVAL (align_exp
);
  /* i386 can do misaligned access at a reasonably increased cost.  */
7197 if (CONST_INT_P (expected_align_exp
)
7198 && INTVAL (expected_align_exp
) > align
)
7199 align
= INTVAL (expected_align_exp
);
7200 /* ALIGN is the minimum of destination and source alignment, but we care here
7201 just about destination alignment. */
7203 && MEM_ALIGN (dst
) > (unsigned HOST_WIDE_INT
) align
* BITS_PER_UNIT
)
7204 align
= MEM_ALIGN (dst
) / BITS_PER_UNIT
;
7206 if (CONST_INT_P (count_exp
))
7208 min_size
= max_size
= probable_max_size
= count
= expected_size
7209 = INTVAL (count_exp
);
7210 /* When COUNT is 0, there is nothing to do. */
7217 min_size
= INTVAL (min_size_exp
);
7219 max_size
= INTVAL (max_size_exp
);
7220 if (probable_max_size_exp
)
7221 probable_max_size
= INTVAL (probable_max_size_exp
);
7222 if (CONST_INT_P (expected_size_exp
))
7223 expected_size
= INTVAL (expected_size_exp
);
7226 /* Make sure we don't need to care about overflow later on. */
7227 if (count
> (HOST_WIDE_INT_1U
<< 30))
7230 have_as
= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst
));
7232 have_as
|= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src
));
7234 /* Step 0: Decide on preferred algorithm, desired alignment and
7235 size of chunks to be copied by main loop. */
7236 alg
= decide_alg (count
, expected_size
, min_size
, probable_max_size
,
7238 issetmem
&& val_exp
== const0_rtx
, have_as
,
7239 &dynamic_check
, &noalign
, false);
7242 fprintf (dump_file
, "Selected stringop expansion strategy: %s\n",
7243 stringop_alg_names
[alg
]);
7247 gcc_assert (alg
!= no_stringop
);
7249 /* For now vector-version of memset is generated only for memory zeroing, as
7250 creating of promoted vector value is very cheap in this case. */
7251 if (issetmem
&& alg
== vector_loop
&& val_exp
!= const0_rtx
)
7252 alg
= unrolled_loop
;
7255 count_exp
= copy_to_mode_reg (GET_MODE (count_exp
), count_exp
);
7256 destreg
= ix86_copy_addr_to_reg (XEXP (dst
, 0));
7258 srcreg
= ix86_copy_addr_to_reg (XEXP (src
, 0));
7261 move_mode
= word_mode
;
7269 need_zero_guard
= true;
7273 need_zero_guard
= true;
7276 need_zero_guard
= true;
7277 unroll_factor
= (TARGET_64BIT
? 4 : 2);
7280 need_zero_guard
= true;
7282 /* Find the widest supported mode. */
7283 move_mode
= word_mode
;
7284 while (GET_MODE_WIDER_MODE (move_mode
).exists (&wider_mode
)
7285 && optab_handler (mov_optab
, wider_mode
) != CODE_FOR_nothing
)
7286 move_mode
= wider_mode
;
7288 if (TARGET_AVX256_SPLIT_REGS
&& GET_MODE_BITSIZE (move_mode
) > 128)
7291 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7292 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7293 if (GET_MODE_SIZE (move_mode
) > GET_MODE_SIZE (word_mode
))
7295 int nunits
= GET_MODE_SIZE (move_mode
) / GET_MODE_SIZE (word_mode
);
7296 if (!mode_for_vector (word_mode
, nunits
).exists (&move_mode
)
7297 || optab_handler (mov_optab
, move_mode
) == CODE_FOR_nothing
)
7298 move_mode
= word_mode
;
7300 gcc_assert (optab_handler (mov_optab
, move_mode
) != CODE_FOR_nothing
);
7302 case rep_prefix_8_byte
:
7305 case rep_prefix_4_byte
:
7308 case rep_prefix_1_byte
:
7312 size_needed
= GET_MODE_SIZE (move_mode
) * unroll_factor
;
7313 epilogue_size_needed
= size_needed
;
7315 /* If we are going to call any library calls conditionally, make sure any
7316 pending stack adjustment happen before the first conditional branch,
7317 otherwise they will be emitted before the library call only and won't
7318 happen from the other branches. */
7319 if (dynamic_check
!= -1)
7320 do_pending_stack_adjust ();
7322 desired_align
= decide_alignment (align
, alg
, expected_size
, move_mode
);
7323 if (!TARGET_ALIGN_STRINGOPS
|| noalign
)
7324 align
= desired_align
;
7326 /* Step 1: Prologue guard. */
7328 /* Alignment code needs count to be in register. */
7329 if (CONST_INT_P (count_exp
) && desired_align
> align
)
7331 if (INTVAL (count_exp
) > desired_align
7332 && INTVAL (count_exp
) > size_needed
)
7335 = get_mem_align_offset (dst
, desired_align
* BITS_PER_UNIT
);
7336 if (align_bytes
<= 0)
7339 align_bytes
= desired_align
- align_bytes
;
7341 if (align_bytes
== 0)
7342 count_exp
= force_reg (counter_mode (count_exp
), count_exp
);
7344 gcc_assert (desired_align
>= 1 && align
>= 1);
  /* Misaligned move sequences handle both prologue and epilogue at once.
     Default code generation results in smaller code for large alignments
     and also avoids redundant work when sizes are known precisely.  */
7349 misaligned_prologue_used
7350 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7351 && MAX (desired_align
, epilogue_size_needed
) <= 32
7352 && desired_align
<= epilogue_size_needed
7353 && ((desired_align
> align
&& !align_bytes
)
7354 || (!count
&& epilogue_size_needed
> 1)));
  /* Do the cheap promotion to allow better CSE across the
     main loop and epilogue (i.e. one load of the big constant in the
     front of all code).
     For now the misaligned move sequences do not have a fast path
     without broadcasting.  */
7361 if (issetmem
&& ((CONST_INT_P (val_exp
) || misaligned_prologue_used
)))
7363 if (alg
== vector_loop
)
7365 gcc_assert (val_exp
== const0_rtx
);
7366 vec_promoted_val
= promote_duplicated_reg (move_mode
, val_exp
);
7367 promoted_val
= promote_duplicated_reg_to_size (val_exp
,
7368 GET_MODE_SIZE (word_mode
),
7369 desired_align
, align
);
7373 promoted_val
= promote_duplicated_reg_to_size (val_exp
, size_needed
,
7374 desired_align
, align
);
  /* Misaligned move sequences handle both prologues and epilogues at once.
     Default code generation results in smaller code for large alignments and
     also avoids redundant work when sizes are known precisely.  */
7380 if (misaligned_prologue_used
)
7382 /* Misaligned move prologue handled small blocks by itself. */
7383 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
7384 (dst
, src
, &destreg
, &srcreg
,
7385 move_mode
, promoted_val
, vec_promoted_val
,
7388 desired_align
< align
7389 ? MAX (desired_align
, epilogue_size_needed
) : epilogue_size_needed
,
7390 desired_align
, align
, &min_size
, dynamic_check
, issetmem
);
7392 src
= change_address (src
, BLKmode
, srcreg
);
7393 dst
= change_address (dst
, BLKmode
, destreg
);
7394 set_mem_align (dst
, desired_align
* BITS_PER_UNIT
);
7395 epilogue_size_needed
= 0;
7397 && min_size
< (unsigned HOST_WIDE_INT
) size_needed
)
7399 /* It is possible that we copied enough so the main loop will not
7401 gcc_assert (size_needed
> 1);
7402 if (jump_around_label
== NULL_RTX
)
7403 jump_around_label
= gen_label_rtx ();
7404 emit_cmp_and_jump_insns (count_exp
,
7405 GEN_INT (size_needed
),
7406 LTU
, 0, counter_mode (count_exp
), 1, jump_around_label
);
7407 if (expected_size
== -1
7408 || expected_size
< (desired_align
- align
) / 2 + size_needed
)
7409 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
7411 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
7414 /* Ensure that alignment prologue won't copy past end of block. */
7415 else if (size_needed
> 1 || (desired_align
> 1 && desired_align
> align
))
7417 epilogue_size_needed
= MAX (size_needed
- 1, desired_align
- align
);
7418 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
7419 Make sure it is power of 2. */
7420 epilogue_size_needed
= 1 << (floor_log2 (epilogue_size_needed
) + 1);
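      /* Illustrative note (not part of the original source):
         1 << (floor_log2 (x) + 1) is the smallest power of two strictly
         greater than x, e.g. 15 -> 16 and 16 -> 32.  */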
      /* To improve performance of small blocks, we jump around the VAL
         promoting mode.  This means that if the promoted VAL is not constant,
         we might not use it in the epilogue and have to use a byte
         loop instead.  */
7426 if (issetmem
&& epilogue_size_needed
> 2 && !promoted_val
)
7427 force_loopy_epilogue
= true;
7428 if ((count
&& count
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
7429 || max_size
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
7431 /* If main algorithm works on QImode, no epilogue is needed.
7432 For small sizes just don't align anything. */
7433 if (size_needed
== 1)
7434 desired_align
= align
;
7439 && min_size
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
7441 label
= gen_label_rtx ();
7442 emit_cmp_and_jump_insns (count_exp
,
7443 GEN_INT (epilogue_size_needed
),
7444 LTU
, 0, counter_mode (count_exp
), 1, label
);
7445 if (expected_size
== -1 || expected_size
< epilogue_size_needed
)
7446 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
7448 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
7452 /* Emit code to decide on runtime whether library call or inline should be
7454 if (dynamic_check
!= -1)
7456 if (!issetmem
&& CONST_INT_P (count_exp
))
7458 if (UINTVAL (count_exp
) >= (unsigned HOST_WIDE_INT
)dynamic_check
)
7460 emit_block_copy_via_libcall (dst
, src
, count_exp
);
7461 count_exp
= const0_rtx
;
7467 rtx_code_label
*hot_label
= gen_label_rtx ();
7468 if (jump_around_label
== NULL_RTX
)
7469 jump_around_label
= gen_label_rtx ();
7470 emit_cmp_and_jump_insns (count_exp
, GEN_INT (dynamic_check
- 1),
7471 LEU
, 0, counter_mode (count_exp
),
7473 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
7475 set_storage_via_libcall (dst
, count_exp
, val_exp
);
7477 emit_block_copy_via_libcall (dst
, src
, count_exp
);
7478 emit_jump (jump_around_label
);
7479 emit_label (hot_label
);
7483 /* Step 2: Alignment prologue. */
7484 /* Do the expensive promotion once we branched off the small blocks. */
7485 if (issetmem
&& !promoted_val
)
7486 promoted_val
= promote_duplicated_reg_to_size (val_exp
, size_needed
,
7487 desired_align
, align
);
7489 if (desired_align
> align
&& !misaligned_prologue_used
)
7491 if (align_bytes
== 0)
          /* Except for the first move in prologue, we no longer know
             constant offset in aliasing info.  It doesn't seem worth
             the pain to maintain it for the first move, so throw away
             the info early.  */
7497 dst
= change_address (dst
, BLKmode
, destreg
);
7499 src
= change_address (src
, BLKmode
, srcreg
);
7500 dst
= expand_set_or_cpymem_prologue (dst
, src
, destreg
, srcreg
,
7501 promoted_val
, vec_promoted_val
,
7502 count_exp
, align
, desired_align
,
7504 /* At most desired_align - align bytes are copied. */
7505 if (min_size
< (unsigned)(desired_align
- align
))
7508 min_size
-= desired_align
- align
;
7512 /* If we know how many bytes need to be stored before dst is
7513 sufficiently aligned, maintain aliasing info accurately. */
7514 dst
= expand_set_or_cpymem_constant_prologue (dst
, &src
, destreg
,
7522 count_exp
= plus_constant (counter_mode (count_exp
),
7523 count_exp
, -align_bytes
);
7524 count
-= align_bytes
;
7525 min_size
-= align_bytes
;
7526 max_size
-= align_bytes
;
7529 && min_size
< (unsigned HOST_WIDE_INT
) size_needed
7530 && (count
< (unsigned HOST_WIDE_INT
) size_needed
7531 || (align_bytes
== 0
7532 && count
< ((unsigned HOST_WIDE_INT
) size_needed
7533 + desired_align
- align
))))
7535 /* It is possible that we copied enough so the main loop will not
7537 gcc_assert (size_needed
> 1);
7538 if (label
== NULL_RTX
)
7539 label
= gen_label_rtx ();
7540 emit_cmp_and_jump_insns (count_exp
,
7541 GEN_INT (size_needed
),
7542 LTU
, 0, counter_mode (count_exp
), 1, label
);
7543 if (expected_size
== -1
7544 || expected_size
< (desired_align
- align
) / 2 + size_needed
)
7545 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
7547 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
7550 if (label
&& size_needed
== 1)
7553 LABEL_NUSES (label
) = 1;
7555 epilogue_size_needed
= 1;
7557 promoted_val
= val_exp
;
7559 else if (label
== NULL_RTX
&& !misaligned_prologue_used
)
7560 epilogue_size_needed
= size_needed
;
7562 /* Step 3: Main loop. */
7573 expand_set_or_cpymem_via_loop (dst
, src
, destreg
, srcreg
, promoted_val
,
7574 count_exp
, move_mode
, unroll_factor
,
7575 expected_size
, issetmem
);
7578 expand_set_or_cpymem_via_loop (dst
, src
, destreg
, srcreg
,
7579 vec_promoted_val
, count_exp
, move_mode
,
7580 unroll_factor
, expected_size
, issetmem
);
7582 case rep_prefix_8_byte
:
7583 case rep_prefix_4_byte
:
7584 case rep_prefix_1_byte
:
7585 expand_set_or_cpymem_via_rep (dst
, src
, destreg
, srcreg
, promoted_val
,
7586 val_exp
, count_exp
, move_mode
, issetmem
);
7589 /* Adjust properly the offset of src and dest memory for aliasing. */
7590 if (CONST_INT_P (count_exp
))
7593 src
= adjust_automodify_address_nv (src
, BLKmode
, srcreg
,
7594 (count
/ size_needed
) * size_needed
);
7595 dst
= adjust_automodify_address_nv (dst
, BLKmode
, destreg
,
7596 (count
/ size_needed
) * size_needed
);
7601 src
= change_address (src
, BLKmode
, srcreg
);
7602 dst
= change_address (dst
, BLKmode
, destreg
);
7605 /* Step 4: Epilogue to copy the remaining bytes. */
7609 /* When the main loop is done, COUNT_EXP might hold original count,
7610 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
7611 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
7612 bytes. Compensate if needed. */
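      /* Illustrative example (not part of the original source): for
         count == 100 and size_needed == 16 the main loop has copied 96
         bytes, and count_exp & (size_needed - 1) == 4 bytes are left for
         the epilogue.  */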
7614 if (size_needed
< epilogue_size_needed
)
7616 tmp
= expand_simple_binop (counter_mode (count_exp
), AND
, count_exp
,
7617 GEN_INT (size_needed
- 1), count_exp
, 1,
7619 if (tmp
!= count_exp
)
7620 emit_move_insn (count_exp
, tmp
);
7623 LABEL_NUSES (label
) = 1;
7626 if (count_exp
!= const0_rtx
&& epilogue_size_needed
> 1)
7628 if (force_loopy_epilogue
)
7629 expand_setmem_epilogue_via_loop (dst
, destreg
, val_exp
, count_exp
,
7630 epilogue_size_needed
);
7634 expand_setmem_epilogue (dst
, destreg
, promoted_val
,
7635 vec_promoted_val
, count_exp
,
7636 epilogue_size_needed
);
7638 expand_cpymem_epilogue (dst
, src
, destreg
, srcreg
, count_exp
,
7639 epilogue_size_needed
);
7642 if (jump_around_label
)
7643 emit_label (jump_around_label
);
/* Expand the appropriate insns for doing strlen if not just doing
   repnz; scasb

   out = result, initialized with the start address
   align_rtx = alignment of the address.
   scratch = scratch register, initialized with the start address when
        not aligned, otherwise undefined

   This is just the body.  It needs the initializations mentioned above and
   some address computing at the end.  These things are done in i386.md.  */
7660 ix86_expand_strlensi_unroll_1 (rtx out
, rtx src
, rtx align_rtx
)
7664 rtx_code_label
*align_2_label
= NULL
;
7665 rtx_code_label
*align_3_label
= NULL
;
7666 rtx_code_label
*align_4_label
= gen_label_rtx ();
7667 rtx_code_label
*end_0_label
= gen_label_rtx ();
7669 rtx tmpreg
= gen_reg_rtx (SImode
);
7670 rtx scratch
= gen_reg_rtx (SImode
);
7674 if (CONST_INT_P (align_rtx
))
7675 align
= INTVAL (align_rtx
);
7677 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
7679 /* Is there a known alignment and is it less than 4? */
7682 rtx scratch1
= gen_reg_rtx (Pmode
);
7683 emit_move_insn (scratch1
, out
);
7684 /* Is there a known alignment and is it not 2? */
7687 align_3_label
= gen_label_rtx (); /* Label when aligned to 3-byte */
7688 align_2_label
= gen_label_rtx (); /* Label when aligned to 2-byte */
7690 /* Leave just the 3 lower bits. */
7691 align_rtx
= expand_binop (Pmode
, and_optab
, scratch1
, GEN_INT (3),
7692 NULL_RTX
, 0, OPTAB_WIDEN
);
7694 emit_cmp_and_jump_insns (align_rtx
, const0_rtx
, EQ
, NULL
,
7695 Pmode
, 1, align_4_label
);
7696 emit_cmp_and_jump_insns (align_rtx
, const2_rtx
, EQ
, NULL
,
7697 Pmode
, 1, align_2_label
);
7698 emit_cmp_and_jump_insns (align_rtx
, const2_rtx
, GTU
, NULL
,
7699 Pmode
, 1, align_3_label
);
          /* Since the alignment is 2, we have to check 2 or 0 bytes;
             check whether it is aligned to 4 bytes.  */
7706 align_rtx
= expand_binop (Pmode
, and_optab
, scratch1
, const2_rtx
,
7707 NULL_RTX
, 0, OPTAB_WIDEN
);
7709 emit_cmp_and_jump_insns (align_rtx
, const0_rtx
, EQ
, NULL
,
7710 Pmode
, 1, align_4_label
);
7713 mem
= change_address (src
, QImode
, out
);
      /* Now compare the bytes.  */

      /* Compare the first n unaligned bytes on a byte-by-byte basis.  */
7718 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
,
7719 QImode
, 1, end_0_label
);
7721 /* Increment the address. */
7722 emit_insn (gen_add2_insn (out
, const1_rtx
));
7724 /* Not needed with an alignment of 2 */
7727 emit_label (align_2_label
);
7729 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
, QImode
, 1,
7732 emit_insn (gen_add2_insn (out
, const1_rtx
));
7734 emit_label (align_3_label
);
7737 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
, QImode
, 1,
7740 emit_insn (gen_add2_insn (out
, const1_rtx
));
  /* Generate loop to check 4 bytes at a time.  It is not a good idea to
     align this loop.  It gives only huge programs, but does not help to
     converge.  */
7746 emit_label (align_4_label
);
7748 mem
= change_address (src
, SImode
, out
);
7749 emit_move_insn (scratch
, mem
);
7750 emit_insn (gen_add2_insn (out
, GEN_INT (4)));
  /* This formula yields a nonzero result iff one of the bytes is zero.
     This saves three branches inside the loop and many cycles.  */
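  /* Illustrative example (not part of the original source): the test below
     computes (x - 0x01010101) & ~x & 0x80808080.  For x = 0x41424344 (no
     zero byte) the result is 0; for x = 0x41004242 it is 0x00800000,
     flagging the zero byte.  */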
7755 emit_insn (gen_addsi3 (tmpreg
, scratch
, GEN_INT (-0x01010101)));
7756 emit_insn (gen_one_cmplsi2 (scratch
, scratch
));
7757 emit_insn (gen_andsi3 (tmpreg
, tmpreg
, scratch
));
7758 emit_insn (gen_andsi3 (tmpreg
, tmpreg
,
7759 gen_int_mode (0x80808080, SImode
)));
7760 emit_cmp_and_jump_insns (tmpreg
, const0_rtx
, EQ
, 0, SImode
, 1,
7765 rtx reg
= gen_reg_rtx (SImode
);
7766 rtx reg2
= gen_reg_rtx (Pmode
);
7767 emit_move_insn (reg
, tmpreg
);
7768 emit_insn (gen_lshrsi3 (reg
, reg
, GEN_INT (16)));
7770 /* If zero is not in the first two bytes, move two bytes forward. */
7771 emit_insn (gen_testsi_ccno_1 (tmpreg
, GEN_INT (0x8080)));
7772 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
7773 tmp
= gen_rtx_EQ (VOIDmode
, tmp
, const0_rtx
);
7774 emit_insn (gen_rtx_SET (tmpreg
,
7775 gen_rtx_IF_THEN_ELSE (SImode
, tmp
,
7778 /* Emit lea manually to avoid clobbering of flags. */
7779 emit_insn (gen_rtx_SET (reg2
, plus_constant (Pmode
, out
, 2)));
7781 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
7782 tmp
= gen_rtx_EQ (VOIDmode
, tmp
, const0_rtx
);
7783 emit_insn (gen_rtx_SET (out
,
7784 gen_rtx_IF_THEN_ELSE (Pmode
, tmp
,
7790 rtx_code_label
*end_2_label
= gen_label_rtx ();
7791 /* Is zero in the first two bytes? */
7793 emit_insn (gen_testsi_ccno_1 (tmpreg
, GEN_INT (0x8080)));
7794 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
7795 tmp
= gen_rtx_NE (VOIDmode
, tmp
, const0_rtx
);
7796 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
7797 gen_rtx_LABEL_REF (VOIDmode
, end_2_label
),
7799 tmp
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
7800 JUMP_LABEL (tmp
) = end_2_label
;
7802 /* Not in the first two. Move two bytes forward. */
7803 emit_insn (gen_lshrsi3 (tmpreg
, tmpreg
, GEN_INT (16)));
7804 emit_insn (gen_add2_insn (out
, const2_rtx
));
7806 emit_label (end_2_label
);
7810 /* Avoid branch in fixing the byte. */
7811 tmpreg
= gen_lowpart (QImode
, tmpreg
);
7812 emit_insn (gen_addqi3_cconly_overflow (tmpreg
, tmpreg
));
7813 tmp
= gen_rtx_REG (CCmode
, FLAGS_REG
);
7814 cmp
= gen_rtx_LTU (VOIDmode
, tmp
, const0_rtx
);
7815 emit_insn (gen_sub3_carry (Pmode
, out
, out
, GEN_INT (3), tmp
, cmp
));
7817 emit_label (end_0_label
);
/* Expand strlen.  */

bool
ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
{
  if (TARGET_UNROLL_STRLEN
      && TARGET_INLINE_ALL_STRINGOPS
      && eoschar == const0_rtx
      && optimize > 1)
    {
      /* The generic case of strlen expander is long.  Avoid expanding it
         unless TARGET_INLINE_ALL_STRINGOPS.  */
      rtx addr = force_reg (Pmode, XEXP (src, 0));
      /* Well it seems that some optimizer does not combine a call like
         foo (strlen (bar), strlen (bar));
         when the move and the subtraction are done here.  It does calculate
         the length just once when these instructions are done inside of
         output_strlen_unroll ().  But I think since &bar[strlen (bar)] is
         often used and I use one fewer register for the lifetime of
         output_strlen_unroll () this is better.  */

      emit_move_insn (out, addr);

      ix86_expand_strlensi_unroll_1 (out, src, align);

      /* strlensi_unroll_1 returns the address of the zero at the end of
         the string, like memchr (), so compute the length by subtracting
         the start address.  */
      emit_insn (gen_sub2_insn (out, addr));
      return true;
    }
  else
    return false;
}
/* For a given symbol (function) construct code to compute the address of its
   PLT entry in large x86-64 PIC model.  */

static rtx
construct_plt_address (rtx symbol)
{
  rtx tmp, unspec;

  gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
  gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
  gcc_assert (Pmode == DImode);

  tmp = gen_reg_rtx (Pmode);
  unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);

  emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
  emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
  return tmp;
}
/* Additional registers that are clobbered by SYSV calls.  */

static int const x86_64_ms_sysv_extra_clobbered_registers
                 [NUM_X86_64_MS_CLOBBERED_REGS] =
{
  SI_REG, DI_REG,
  XMM6_REG, XMM7_REG,
  XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
  XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
};
, rtx fnaddr
, rtx callarg1
,
7889 rtx pop
, bool sibcall
)
7892 rtx use
= NULL
, call
;
7893 unsigned int vec_len
= 0;
7896 if (GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
)
7898 fndecl
= SYMBOL_REF_DECL (XEXP (fnaddr
, 0));
7900 && (lookup_attribute ("interrupt",
7901 TYPE_ATTRIBUTES (TREE_TYPE (fndecl
)))))
7902 error ("interrupt service routine cannot be called directly");
7907 if (pop
== const0_rtx
)
7909 gcc_assert (!TARGET_64BIT
|| !pop
);
7911 if (TARGET_MACHO
&& !TARGET_64BIT
)
7914 if (flag_pic
&& GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
)
7915 fnaddr
= machopic_indirect_call_target (fnaddr
);
7920 /* Static functions and indirect calls don't need the pic register. Also,
7921 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
7922 it an indirect call. */
7923 rtx addr
= XEXP (fnaddr
, 0);
7925 && GET_CODE (addr
) == SYMBOL_REF
7926 && !SYMBOL_REF_LOCAL_P (addr
))
7929 && (SYMBOL_REF_DECL (addr
) == NULL_TREE
7930 || !lookup_attribute ("noplt",
7931 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr
)))))
7934 || (ix86_cmodel
== CM_LARGE_PIC
7935 && DEFAULT_ABI
!= MS_ABI
))
7937 use_reg (&use
, gen_rtx_REG (Pmode
,
7938 REAL_PIC_OFFSET_TABLE_REGNUM
));
7939 if (ix86_use_pseudo_pic_reg ())
7940 emit_move_insn (gen_rtx_REG (Pmode
,
7941 REAL_PIC_OFFSET_TABLE_REGNUM
),
7942 pic_offset_table_rtx
);
7945 else if (!TARGET_PECOFF
&& !TARGET_MACHO
)
7949 fnaddr
= gen_rtx_UNSPEC (Pmode
,
7950 gen_rtvec (1, addr
),
7952 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
7956 fnaddr
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, addr
),
7958 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
7959 fnaddr
= gen_rtx_PLUS (Pmode
, pic_offset_table_rtx
,
7962 fnaddr
= gen_const_mem (Pmode
, fnaddr
);
7963 /* Pmode may not be the same as word_mode for x32, which
7964 doesn't support indirect branch via 32-bit memory slot.
7965 Since x32 GOT slot is 64 bit with zero upper 32 bits,
7966 indirect branch via x32 GOT slot is OK. */
7967 if (GET_MODE (fnaddr
) != word_mode
)
7968 fnaddr
= gen_rtx_ZERO_EXTEND (word_mode
, fnaddr
);
7969 fnaddr
= gen_rtx_MEM (QImode
, fnaddr
);
7974 /* Skip setting up RAX register for -mskip-rax-setup when there are no
7975 parameters passed in vector registers. */
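  /* Background note (an assumption based on the SysV x86-64 calling
     convention, not from the original source): for variadic callees AL must
     carry an upper bound on the number of vector registers used for
     arguments, e.g. 1 for printf ("%f", x); CALLARG2 holds that count and is
     moved into AL below.  */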
7977 && (INTVAL (callarg2
) > 0
7978 || (INTVAL (callarg2
) == 0
7979 && (TARGET_SSE
|| !flag_skip_rax_setup
))))
7981 rtx al
= gen_rtx_REG (QImode
, AX_REG
);
7982 emit_move_insn (al
, callarg2
);
7986 if (ix86_cmodel
== CM_LARGE_PIC
7989 && GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
7990 && !local_symbolic_operand (XEXP (fnaddr
, 0), VOIDmode
))
7991 fnaddr
= gen_rtx_MEM (QImode
, construct_plt_address (XEXP (fnaddr
, 0)));
7992 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
7993 branch via x32 GOT slot is OK. */
7994 else if (!(TARGET_X32
7996 && GET_CODE (XEXP (fnaddr
, 0)) == ZERO_EXTEND
7997 && GOT_memory_operand (XEXP (XEXP (fnaddr
, 0), 0), Pmode
))
7999 ? !sibcall_insn_operand (XEXP (fnaddr
, 0), word_mode
)
8000 : !call_insn_operand (XEXP (fnaddr
, 0), word_mode
)))
8002 fnaddr
= convert_to_mode (word_mode
, XEXP (fnaddr
, 0), 1);
8003 fnaddr
= gen_rtx_MEM (QImode
, copy_to_mode_reg (word_mode
, fnaddr
));
8006 call
= gen_rtx_CALL (VOIDmode
, fnaddr
, callarg1
);
8009 call
= gen_rtx_SET (retval
, call
);
8010 vec
[vec_len
++] = call
;
8014 pop
= gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, pop
);
8015 pop
= gen_rtx_SET (stack_pointer_rtx
, pop
);
8016 vec
[vec_len
++] = pop
;
8019 if (cfun
->machine
->no_caller_saved_registers
8021 || (!TREE_THIS_VOLATILE (fndecl
)
8022 && !lookup_attribute ("no_caller_saved_registers",
8023 TYPE_ATTRIBUTES (TREE_TYPE (fndecl
))))))
8025 static const char ix86_call_used_regs
[] = CALL_USED_REGISTERS
;
8026 bool is_64bit_ms_abi
= (TARGET_64BIT
8027 && ix86_function_abi (fndecl
) == MS_ABI
);
8028 char c_mask
= CALL_USED_REGISTERS_MASK (is_64bit_ms_abi
);
8030 /* If there are no caller-saved registers, add all registers
8031 that are clobbered by the call which returns. */
8032 for (int i
= 0; i
< FIRST_PSEUDO_REGISTER
; i
++)
8034 && (ix86_call_used_regs
[i
] == 1
8035 || (ix86_call_used_regs
[i
] & c_mask
))
8036 && !STACK_REGNO_P (i
)
8037 && !MMX_REGNO_P (i
))
8039 gen_rtx_REG (GET_MODE (regno_reg_rtx
[i
]), i
));
8041 else if (TARGET_64BIT_MS_ABI
8042 && (!callarg2
|| INTVAL (callarg2
) != -2))
8046 for (i
= 0; i
< NUM_X86_64_MS_CLOBBERED_REGS
; i
++)
8048 int regno
= x86_64_ms_sysv_extra_clobbered_registers
[i
];
8049 machine_mode mode
= SSE_REGNO_P (regno
) ? TImode
: DImode
;
8051 clobber_reg (&use
, gen_rtx_REG (mode
, regno
));
8054 /* Set here, but it may get cleared later. */
8055 if (TARGET_CALL_MS2SYSV_XLOGUES
)
8060 /* Don't break hot-patched functions. */
8061 else if (ix86_function_ms_hook_prologue (current_function_decl
))
8064 /* TODO: Cases not yet examined. */
8065 else if (flag_split_stack
)
8066 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8070 gcc_assert (!reload_completed
);
8071 cfun
->machine
->call_ms2sysv
= true;
8077 call
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec_v (vec_len
, vec
));
8078 rtx_insn
*call_insn
= emit_call_insn (call
);
8080 CALL_INSN_FUNCTION_USAGE (call_insn
) = use
;
/* Split simple return with popping POPC bytes from stack to indirect
   branch with stack adjustment.  */
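/* Illustrative sketch (not part of the original source): for POPC == 8 the
   emitted sequence is roughly
        popl  %ecx              // return address -> ECX
        addl  $8, %esp          // drop the 8 bytes of stack arguments
        jmp   *%ecx             // return
   instead of a plain "ret $8".  */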
void
ix86_split_simple_return_pop_internal (rtx popc)
{
  struct machine_function *m = cfun->machine;
  rtx ecx = gen_rtx_REG (SImode, CX_REG);
  rtx_insn *insn;

  /* There is no "pascal" calling convention in any 64bit ABI.  */
  gcc_assert (!TARGET_64BIT);

  insn = emit_insn (gen_pop (ecx));
  m->fs.cfa_offset -= UNITS_PER_WORD;
  m->fs.sp_offset -= UNITS_PER_WORD;

  rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
  RTX_FRAME_RELATED_P (insn) = 1;

  x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  insn = emit_insn (x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  RTX_FRAME_RELATED_P (insn) = 1;

  /* Now return address is in ECX.  */
  emit_jump_insn (gen_simple_return_indirect_internal (ecx));
}
/* Errors in the source file can cause expand_expr to return const0_rtx
   where we expect a vector.  To avoid crashing, use one of the vector
   clear instructions.  */

static rtx
safe_vector_operand (rtx x, machine_mode mode)
{
  if (x == const0_rtx)
    x = CONST0_RTX (mode);
  return x;
}
/* Subroutine of ix86_expand_builtin to take care of binop insns.  */

static rtx
ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  machine_mode tmode = insn_data[icode].operand[0].mode;
  machine_mode mode0 = insn_data[icode].operand[1].mode;
  machine_mode mode1 = insn_data[icode].operand[2].mode;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if (GET_MODE (op1) == SImode && mode1 == TImode)
    {
      rtx x = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_loadd (x, op1));
      op1 = gen_lowpart (TImode, x);
    }

  if (!insn_data[icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if (!insn_data[icode].operand[2].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (icode) (target, op0, op1);
  if (! pat)
    return 0;

  emit_insn (pat);

  return target;
}
8175 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
8178 ix86_expand_multi_arg_builtin (enum insn_code icode
, tree exp
, rtx target
,
8179 enum ix86_builtin_func_type m_type
,
8180 enum rtx_code sub_code
)
8185 bool comparison_p
= false;
8187 bool last_arg_constant
= false;
8194 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
8198 case MULTI_ARG_4_DF2_DI_I
:
8199 case MULTI_ARG_4_DF2_DI_I1
:
8200 case MULTI_ARG_4_SF2_SI_I
:
8201 case MULTI_ARG_4_SF2_SI_I1
:
8203 last_arg_constant
= true;
8206 case MULTI_ARG_3_SF
:
8207 case MULTI_ARG_3_DF
:
8208 case MULTI_ARG_3_SF2
:
8209 case MULTI_ARG_3_DF2
:
8210 case MULTI_ARG_3_DI
:
8211 case MULTI_ARG_3_SI
:
8212 case MULTI_ARG_3_SI_DI
:
8213 case MULTI_ARG_3_HI
:
8214 case MULTI_ARG_3_HI_SI
:
8215 case MULTI_ARG_3_QI
:
8216 case MULTI_ARG_3_DI2
:
8217 case MULTI_ARG_3_SI2
:
8218 case MULTI_ARG_3_HI2
:
8219 case MULTI_ARG_3_QI2
:
8223 case MULTI_ARG_2_SF
:
8224 case MULTI_ARG_2_DF
:
8225 case MULTI_ARG_2_DI
:
8226 case MULTI_ARG_2_SI
:
8227 case MULTI_ARG_2_HI
:
8228 case MULTI_ARG_2_QI
:
8232 case MULTI_ARG_2_DI_IMM
:
8233 case MULTI_ARG_2_SI_IMM
:
8234 case MULTI_ARG_2_HI_IMM
:
8235 case MULTI_ARG_2_QI_IMM
:
8237 last_arg_constant
= true;
8240 case MULTI_ARG_1_SF
:
8241 case MULTI_ARG_1_DF
:
8242 case MULTI_ARG_1_SF2
:
8243 case MULTI_ARG_1_DF2
:
8244 case MULTI_ARG_1_DI
:
8245 case MULTI_ARG_1_SI
:
8246 case MULTI_ARG_1_HI
:
8247 case MULTI_ARG_1_QI
:
8248 case MULTI_ARG_1_SI_DI
:
8249 case MULTI_ARG_1_HI_DI
:
8250 case MULTI_ARG_1_HI_SI
:
8251 case MULTI_ARG_1_QI_DI
:
8252 case MULTI_ARG_1_QI_SI
:
8253 case MULTI_ARG_1_QI_HI
:
8257 case MULTI_ARG_2_DI_CMP
:
8258 case MULTI_ARG_2_SI_CMP
:
8259 case MULTI_ARG_2_HI_CMP
:
8260 case MULTI_ARG_2_QI_CMP
:
8262 comparison_p
= true;
8265 case MULTI_ARG_2_SF_TF
:
8266 case MULTI_ARG_2_DF_TF
:
8267 case MULTI_ARG_2_DI_TF
:
8268 case MULTI_ARG_2_SI_TF
:
8269 case MULTI_ARG_2_HI_TF
:
8270 case MULTI_ARG_2_QI_TF
:
8279 if (optimize
|| !target
8280 || GET_MODE (target
) != tmode
8281 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
8282 target
= gen_reg_rtx (tmode
);
8283 else if (memory_operand (target
, tmode
))
8286 gcc_assert (nargs
<= 4);
8288 for (i
= 0; i
< nargs
; i
++)
8290 tree arg
= CALL_EXPR_ARG (exp
, i
);
8291 rtx op
= expand_normal (arg
);
8292 int adjust
= (comparison_p
) ? 1 : 0;
8293 machine_mode mode
= insn_data
[icode
].operand
[i
+adjust
+1].mode
;
8295 if (last_arg_constant
&& i
== nargs
- 1)
8297 if (!insn_data
[icode
].operand
[i
+ 1].predicate (op
, mode
))
8299 enum insn_code new_icode
= icode
;
8302 case CODE_FOR_xop_vpermil2v2df3
:
8303 case CODE_FOR_xop_vpermil2v4sf3
:
8304 case CODE_FOR_xop_vpermil2v4df3
:
8305 case CODE_FOR_xop_vpermil2v8sf3
:
8306 error ("the last argument must be a 2-bit immediate");
8307 return gen_reg_rtx (tmode
);
8308 case CODE_FOR_xop_rotlv2di3
:
8309 new_icode
= CODE_FOR_rotlv2di3
;
8311 case CODE_FOR_xop_rotlv4si3
:
8312 new_icode
= CODE_FOR_rotlv4si3
;
8314 case CODE_FOR_xop_rotlv8hi3
:
8315 new_icode
= CODE_FOR_rotlv8hi3
;
8317 case CODE_FOR_xop_rotlv16qi3
:
8318 new_icode
= CODE_FOR_rotlv16qi3
;
8320 if (CONST_INT_P (op
))
8322 int mask
= GET_MODE_UNIT_BITSIZE (tmode
) - 1;
8323 op
= GEN_INT (INTVAL (op
) & mask
);
8325 (insn_data
[icode
].operand
[i
+ 1].predicate (op
, mode
));
8331 && insn_data
[new_icode
].operand
[0].mode
== tmode
8332 && insn_data
[new_icode
].operand
[1].mode
== tmode
8333 && insn_data
[new_icode
].operand
[2].mode
== mode
8334 && insn_data
[new_icode
].operand
[0].predicate
8335 == insn_data
[icode
].operand
[0].predicate
8336 && insn_data
[new_icode
].operand
[1].predicate
8337 == insn_data
[icode
].operand
[1].predicate
);
8350 if (VECTOR_MODE_P (mode
))
8351 op
= safe_vector_operand (op
, mode
);
8353 /* If we aren't optimizing, only allow one memory operand to be
8355 if (memory_operand (op
, mode
))
8358 gcc_assert (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
);
8361 || !insn_data
[icode
].operand
[i
+adjust
+1].predicate (op
, mode
)
8363 op
= force_reg (mode
, op
);
8367 args
[i
].mode
= mode
;
8373 pat
= GEN_FCN (icode
) (target
, args
[0].op
);
8378 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
,
8379 GEN_INT ((int)sub_code
));
8380 else if (! comparison_p
)
8381 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
);
8384 rtx cmp_op
= gen_rtx_fmt_ee (sub_code
, GET_MODE (target
),
8388 pat
= GEN_FCN (icode
) (target
, cmp_op
, args
[0].op
, args
[1].op
);
8393 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
, args
[2].op
);
8397 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
, args
[2].op
, args
[3].op
);
/* Subroutine of ix86_expand_args_builtin to take care of scalar unop
   insns with vec_merge.  */

static rtx
ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
                                    rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  rtx op1, op0 = expand_normal (arg0);
  machine_mode tmode = insn_data[icode].operand[0].mode;
  machine_mode mode0 = insn_data[icode].operand[1].mode;

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);

  op1 = op0;
  if (!insn_data[icode].operand[2].predicate (op1, mode0))
    op1 = copy_to_mode_reg (mode0, op1);

  pat = GEN_FCN (icode) (target, op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  return target;
}
8447 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
8450 ix86_expand_sse_compare (const struct builtin_description
*d
,
8451 tree exp
, rtx target
, bool swap
)
8454 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8455 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8456 rtx op0
= expand_normal (arg0
);
8457 rtx op1
= expand_normal (arg1
);
8459 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
8460 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
8461 machine_mode mode1
= insn_data
[d
->icode
].operand
[2].mode
;
8462 enum rtx_code comparison
= d
->comparison
;
8464 if (VECTOR_MODE_P (mode0
))
8465 op0
= safe_vector_operand (op0
, mode0
);
8466 if (VECTOR_MODE_P (mode1
))
8467 op1
= safe_vector_operand (op1
, mode1
);
8469 /* Swap operands if we have a comparison that isn't available in
8472 std::swap (op0
, op1
);
8474 if (optimize
|| !target
8475 || GET_MODE (target
) != tmode
8476 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
8477 target
= gen_reg_rtx (tmode
);
8479 if ((optimize
&& !register_operand (op0
, mode0
))
8480 || !insn_data
[d
->icode
].operand
[1].predicate (op0
, mode0
))
8481 op0
= copy_to_mode_reg (mode0
, op0
);
8482 if ((optimize
&& !register_operand (op1
, mode1
))
8483 || !insn_data
[d
->icode
].operand
[2].predicate (op1
, mode1
))
8484 op1
= copy_to_mode_reg (mode1
, op1
);
8486 op2
= gen_rtx_fmt_ee (comparison
, mode0
, op0
, op1
);
8487 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
, op2
);
8494 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
8497 ix86_expand_sse_comi (const struct builtin_description
*d
, tree exp
,
8501 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8502 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8503 rtx op0
= expand_normal (arg0
);
8504 rtx op1
= expand_normal (arg1
);
8505 machine_mode mode0
= insn_data
[d
->icode
].operand
[0].mode
;
8506 machine_mode mode1
= insn_data
[d
->icode
].operand
[1].mode
;
8507 enum rtx_code comparison
= d
->comparison
;
8509 if (VECTOR_MODE_P (mode0
))
8510 op0
= safe_vector_operand (op0
, mode0
);
8511 if (VECTOR_MODE_P (mode1
))
8512 op1
= safe_vector_operand (op1
, mode1
);
8514 /* Swap operands if we have a comparison that isn't available in
8516 if (d
->flag
& BUILTIN_DESC_SWAP_OPERANDS
)
8517 std::swap (op0
, op1
);
8519 target
= gen_reg_rtx (SImode
);
8520 emit_move_insn (target
, const0_rtx
);
8521 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8523 if ((optimize
&& !register_operand (op0
, mode0
))
8524 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8525 op0
= copy_to_mode_reg (mode0
, op0
);
8526 if ((optimize
&& !register_operand (op1
, mode1
))
8527 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
8528 op1
= copy_to_mode_reg (mode1
, op1
);
8530 pat
= GEN_FCN (d
->icode
) (op0
, op1
);
8534 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8535 gen_rtx_fmt_ee (comparison
, QImode
,
8539 return SUBREG_REG (target
);
8542 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
8545 ix86_expand_sse_round (const struct builtin_description
*d
, tree exp
,
8549 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8550 rtx op1
, op0
= expand_normal (arg0
);
8551 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
8552 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
8554 if (optimize
|| target
== 0
8555 || GET_MODE (target
) != tmode
8556 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
8557 target
= gen_reg_rtx (tmode
);
8559 if (VECTOR_MODE_P (mode0
))
8560 op0
= safe_vector_operand (op0
, mode0
);
8562 if ((optimize
&& !register_operand (op0
, mode0
))
8563 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8564 op0
= copy_to_mode_reg (mode0
, op0
);
8566 op1
= GEN_INT (d
->comparison
);
8568 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
);
8576 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description
*d
,
8577 tree exp
, rtx target
)
8580 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8581 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8582 rtx op0
= expand_normal (arg0
);
8583 rtx op1
= expand_normal (arg1
);
8585 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
8586 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
8587 machine_mode mode1
= insn_data
[d
->icode
].operand
[2].mode
;
8589 if (optimize
|| target
== 0
8590 || GET_MODE (target
) != tmode
8591 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
8592 target
= gen_reg_rtx (tmode
);
8594 op0
= safe_vector_operand (op0
, mode0
);
8595 op1
= safe_vector_operand (op1
, mode1
);
8597 if ((optimize
&& !register_operand (op0
, mode0
))
8598 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8599 op0
= copy_to_mode_reg (mode0
, op0
);
8600 if ((optimize
&& !register_operand (op1
, mode1
))
8601 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
8602 op1
= copy_to_mode_reg (mode1
, op1
);
8604 op2
= GEN_INT (d
->comparison
);
8606 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
, op2
);
8613 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
8616 ix86_expand_sse_ptest (const struct builtin_description
*d
, tree exp
,
8620 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8621 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8622 rtx op0
= expand_normal (arg0
);
8623 rtx op1
= expand_normal (arg1
);
8624 machine_mode mode0
= insn_data
[d
->icode
].operand
[0].mode
;
8625 machine_mode mode1
= insn_data
[d
->icode
].operand
[1].mode
;
8626 enum rtx_code comparison
= d
->comparison
;
8628 if (VECTOR_MODE_P (mode0
))
8629 op0
= safe_vector_operand (op0
, mode0
);
8630 if (VECTOR_MODE_P (mode1
))
8631 op1
= safe_vector_operand (op1
, mode1
);
8633 target
= gen_reg_rtx (SImode
);
8634 emit_move_insn (target
, const0_rtx
);
8635 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8637 if ((optimize
&& !register_operand (op0
, mode0
))
8638 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8639 op0
= copy_to_mode_reg (mode0
, op0
);
8640 if ((optimize
&& !register_operand (op1
, mode1
))
8641 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
8642 op1
= copy_to_mode_reg (mode1
, op1
);
8644 pat
= GEN_FCN (d
->icode
) (op0
, op1
);
8648 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8649 gen_rtx_fmt_ee (comparison
, QImode
,
8653 return SUBREG_REG (target
);
8656 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
8659 ix86_expand_sse_pcmpestr (const struct builtin_description
*d
,
8660 tree exp
, rtx target
)
8663 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8664 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8665 tree arg2
= CALL_EXPR_ARG (exp
, 2);
8666 tree arg3
= CALL_EXPR_ARG (exp
, 3);
8667 tree arg4
= CALL_EXPR_ARG (exp
, 4);
8668 rtx scratch0
, scratch1
;
8669 rtx op0
= expand_normal (arg0
);
8670 rtx op1
= expand_normal (arg1
);
8671 rtx op2
= expand_normal (arg2
);
8672 rtx op3
= expand_normal (arg3
);
8673 rtx op4
= expand_normal (arg4
);
8674 machine_mode tmode0
, tmode1
, modev2
, modei3
, modev4
, modei5
, modeimm
;
8676 tmode0
= insn_data
[d
->icode
].operand
[0].mode
;
8677 tmode1
= insn_data
[d
->icode
].operand
[1].mode
;
8678 modev2
= insn_data
[d
->icode
].operand
[2].mode
;
8679 modei3
= insn_data
[d
->icode
].operand
[3].mode
;
8680 modev4
= insn_data
[d
->icode
].operand
[4].mode
;
8681 modei5
= insn_data
[d
->icode
].operand
[5].mode
;
8682 modeimm
= insn_data
[d
->icode
].operand
[6].mode
;
8684 if (VECTOR_MODE_P (modev2
))
8685 op0
= safe_vector_operand (op0
, modev2
);
8686 if (VECTOR_MODE_P (modev4
))
8687 op2
= safe_vector_operand (op2
, modev4
);
8689 if (!insn_data
[d
->icode
].operand
[2].predicate (op0
, modev2
))
8690 op0
= copy_to_mode_reg (modev2
, op0
);
8691 if (!insn_data
[d
->icode
].operand
[3].predicate (op1
, modei3
))
8692 op1
= copy_to_mode_reg (modei3
, op1
);
8693 if ((optimize
&& !register_operand (op2
, modev4
))
8694 || !insn_data
[d
->icode
].operand
[4].predicate (op2
, modev4
))
8695 op2
= copy_to_mode_reg (modev4
, op2
);
8696 if (!insn_data
[d
->icode
].operand
[5].predicate (op3
, modei5
))
8697 op3
= copy_to_mode_reg (modei5
, op3
);
8699 if (!insn_data
[d
->icode
].operand
[6].predicate (op4
, modeimm
))
8701 error ("the fifth argument must be an 8-bit immediate");
8705 if (d
->code
== IX86_BUILTIN_PCMPESTRI128
)
8707 if (optimize
|| !target
8708 || GET_MODE (target
) != tmode0
8709 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode0
))
8710 target
= gen_reg_rtx (tmode0
);
8712 scratch1
= gen_reg_rtx (tmode1
);
8714 pat
= GEN_FCN (d
->icode
) (target
, scratch1
, op0
, op1
, op2
, op3
, op4
);
8716 else if (d
->code
== IX86_BUILTIN_PCMPESTRM128
)
8718 if (optimize
|| !target
8719 || GET_MODE (target
) != tmode1
8720 || !insn_data
[d
->icode
].operand
[1].predicate (target
, tmode1
))
8721 target
= gen_reg_rtx (tmode1
);
8723 scratch0
= gen_reg_rtx (tmode0
);
8725 pat
= GEN_FCN (d
->icode
) (scratch0
, target
, op0
, op1
, op2
, op3
, op4
);
8729 gcc_assert (d
->flag
);
8731 scratch0
= gen_reg_rtx (tmode0
);
8732 scratch1
= gen_reg_rtx (tmode1
);
8734 pat
= GEN_FCN (d
->icode
) (scratch0
, scratch1
, op0
, op1
, op2
, op3
, op4
);
8744 target
= gen_reg_rtx (SImode
);
8745 emit_move_insn (target
, const0_rtx
);
8746 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8749 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8750 gen_rtx_fmt_ee (EQ
, QImode
,
8751 gen_rtx_REG ((machine_mode
) d
->flag
,
8754 return SUBREG_REG (target
);
8761 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
8764 ix86_expand_sse_pcmpistr (const struct builtin_description
*d
,
8765 tree exp
, rtx target
)
8768 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8769 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8770 tree arg2
= CALL_EXPR_ARG (exp
, 2);
8771 rtx scratch0
, scratch1
;
8772 rtx op0
= expand_normal (arg0
);
8773 rtx op1
= expand_normal (arg1
);
8774 rtx op2
= expand_normal (arg2
);
8775 machine_mode tmode0
, tmode1
, modev2
, modev3
, modeimm
;
8777 tmode0
= insn_data
[d
->icode
].operand
[0].mode
;
8778 tmode1
= insn_data
[d
->icode
].operand
[1].mode
;
8779 modev2
= insn_data
[d
->icode
].operand
[2].mode
;
8780 modev3
= insn_data
[d
->icode
].operand
[3].mode
;
8781 modeimm
= insn_data
[d
->icode
].operand
[4].mode
;
8783 if (VECTOR_MODE_P (modev2
))
8784 op0
= safe_vector_operand (op0
, modev2
);
8785 if (VECTOR_MODE_P (modev3
))
8786 op1
= safe_vector_operand (op1
, modev3
);
8788 if (!insn_data
[d
->icode
].operand
[2].predicate (op0
, modev2
))
8789 op0
= copy_to_mode_reg (modev2
, op0
);
8790 if ((optimize
&& !register_operand (op1
, modev3
))
8791 || !insn_data
[d
->icode
].operand
[3].predicate (op1
, modev3
))
8792 op1
= copy_to_mode_reg (modev3
, op1
);
8794 if (!insn_data
[d
->icode
].operand
[4].predicate (op2
, modeimm
))
8796 error ("the third argument must be an 8-bit immediate");
8800 if (d
->code
== IX86_BUILTIN_PCMPISTRI128
)
8802 if (optimize
|| !target
8803 || GET_MODE (target
) != tmode0
8804 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode0
))
8805 target
= gen_reg_rtx (tmode0
);
8807 scratch1
= gen_reg_rtx (tmode1
);
8809 pat
= GEN_FCN (d
->icode
) (target
, scratch1
, op0
, op1
, op2
);
8811 else if (d
->code
== IX86_BUILTIN_PCMPISTRM128
)
8813 if (optimize
|| !target
8814 || GET_MODE (target
) != tmode1
8815 || !insn_data
[d
->icode
].operand
[1].predicate (target
, tmode1
))
8816 target
= gen_reg_rtx (tmode1
);
8818 scratch0
= gen_reg_rtx (tmode0
);
8820 pat
= GEN_FCN (d
->icode
) (scratch0
, target
, op0
, op1
, op2
);
8824 gcc_assert (d
->flag
);
8826 scratch0
= gen_reg_rtx (tmode0
);
8827 scratch1
= gen_reg_rtx (tmode1
);
8829 pat
= GEN_FCN (d
->icode
) (scratch0
, scratch1
, op0
, op1
, op2
);
8839 target
= gen_reg_rtx (SImode
);
8840 emit_move_insn (target
, const0_rtx
);
8841 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8844 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8845 gen_rtx_fmt_ee (EQ
, QImode
,
8846 gen_rtx_REG ((machine_mode
) d
->flag
,
8849 return SUBREG_REG (target
);
/* Fixup modeless constants to fit required mode.  */

static rtx
fixup_modeless_constant (rtx x, machine_mode mode)
{
  if (GET_MODE (x) == VOIDmode)
    x = convert_to_mode (mode, x, 1);
  return x;
}
8865 /* Subroutine of ix86_expand_builtin to take care of insns with
8866 variable number of operands. */
8869 ix86_expand_args_builtin (const struct builtin_description
*d
,
8870 tree exp
, rtx target
)
8872 rtx pat
, real_target
;
8873 unsigned int i
, nargs
;
8874 unsigned int nargs_constant
= 0;
8875 unsigned int mask_pos
= 0;
8882 bool second_arg_count
= false;
8883 enum insn_code icode
= d
->icode
;
8884 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
8885 machine_mode tmode
= insn_p
->operand
[0].mode
;
8886 machine_mode rmode
= VOIDmode
;
8888 enum rtx_code comparison
= d
->comparison
;
8890 switch ((enum ix86_builtin_func_type
) d
->flag
)
8892 case V2DF_FTYPE_V2DF_ROUND
:
8893 case V4DF_FTYPE_V4DF_ROUND
:
8894 case V8DF_FTYPE_V8DF_ROUND
:
8895 case V4SF_FTYPE_V4SF_ROUND
:
8896 case V8SF_FTYPE_V8SF_ROUND
:
8897 case V16SF_FTYPE_V16SF_ROUND
:
8898 case V4SI_FTYPE_V4SF_ROUND
:
8899 case V8SI_FTYPE_V8SF_ROUND
:
8900 case V16SI_FTYPE_V16SF_ROUND
:
8901 return ix86_expand_sse_round (d
, exp
, target
);
8902 case V4SI_FTYPE_V2DF_V2DF_ROUND
:
8903 case V8SI_FTYPE_V4DF_V4DF_ROUND
:
8904 case V16SI_FTYPE_V8DF_V8DF_ROUND
:
8905 return ix86_expand_sse_round_vec_pack_sfix (d
, exp
, target
);
8906 case INT_FTYPE_V8SF_V8SF_PTEST
:
8907 case INT_FTYPE_V4DI_V4DI_PTEST
:
8908 case INT_FTYPE_V4DF_V4DF_PTEST
:
8909 case INT_FTYPE_V4SF_V4SF_PTEST
:
8910 case INT_FTYPE_V2DI_V2DI_PTEST
:
8911 case INT_FTYPE_V2DF_V2DF_PTEST
:
8912 return ix86_expand_sse_ptest (d
, exp
, target
);
8913 case FLOAT128_FTYPE_FLOAT128
:
8914 case FLOAT_FTYPE_FLOAT
:
8916 case UINT_FTYPE_UINT
:
8917 case UINT16_FTYPE_UINT16
:
8918 case UINT64_FTYPE_INT
:
8919 case UINT64_FTYPE_UINT64
:
8920 case INT64_FTYPE_INT64
:
8921 case INT64_FTYPE_V4SF
:
8922 case INT64_FTYPE_V2DF
:
8923 case INT_FTYPE_V16QI
:
8924 case INT_FTYPE_V8QI
:
8925 case INT_FTYPE_V8SF
:
8926 case INT_FTYPE_V4DF
:
8927 case INT_FTYPE_V4SF
:
8928 case INT_FTYPE_V2DF
:
8929 case INT_FTYPE_V32QI
:
8930 case V16QI_FTYPE_V16QI
:
8931 case V8SI_FTYPE_V8SF
:
8932 case V8SI_FTYPE_V4SI
:
8933 case V8HI_FTYPE_V8HI
:
8934 case V8HI_FTYPE_V16QI
:
8935 case V8QI_FTYPE_V8QI
:
8936 case V8SF_FTYPE_V8SF
:
8937 case V8SF_FTYPE_V8SI
:
8938 case V8SF_FTYPE_V4SF
:
8939 case V8SF_FTYPE_V8HI
:
8940 case V4SI_FTYPE_V4SI
:
8941 case V4SI_FTYPE_V16QI
:
8942 case V4SI_FTYPE_V4SF
:
8943 case V4SI_FTYPE_V8SI
:
8944 case V4SI_FTYPE_V8HI
:
8945 case V4SI_FTYPE_V4DF
:
8946 case V4SI_FTYPE_V2DF
:
8947 case V4HI_FTYPE_V4HI
:
8948 case V4DF_FTYPE_V4DF
:
8949 case V4DF_FTYPE_V4SI
:
8950 case V4DF_FTYPE_V4SF
:
8951 case V4DF_FTYPE_V2DF
:
8952 case V4SF_FTYPE_V4SF
:
8953 case V4SF_FTYPE_V4SI
:
8954 case V4SF_FTYPE_V8SF
:
8955 case V4SF_FTYPE_V4DF
:
8956 case V4SF_FTYPE_V8HI
:
8957 case V4SF_FTYPE_V2DF
:
8958 case V2DI_FTYPE_V2DI
:
8959 case V2DI_FTYPE_V16QI
:
8960 case V2DI_FTYPE_V8HI
:
8961 case V2DI_FTYPE_V4SI
:
8962 case V2DF_FTYPE_V2DF
:
8963 case V2DF_FTYPE_V4SI
:
8964 case V2DF_FTYPE_V4DF
:
8965 case V2DF_FTYPE_V4SF
:
8966 case V2DF_FTYPE_V2SI
:
8967 case V2SI_FTYPE_V2SI
:
8968 case V2SI_FTYPE_V4SF
:
8969 case V2SI_FTYPE_V2SF
:
8970 case V2SI_FTYPE_V2DF
:
8971 case V2SF_FTYPE_V2SF
:
8972 case V2SF_FTYPE_V2SI
:
8973 case V32QI_FTYPE_V32QI
:
8974 case V32QI_FTYPE_V16QI
:
8975 case V16HI_FTYPE_V16HI
:
8976 case V16HI_FTYPE_V8HI
:
8977 case V8SI_FTYPE_V8SI
:
8978 case V16HI_FTYPE_V16QI
:
8979 case V8SI_FTYPE_V16QI
:
8980 case V4DI_FTYPE_V16QI
:
8981 case V8SI_FTYPE_V8HI
:
8982 case V4DI_FTYPE_V8HI
:
8983 case V4DI_FTYPE_V4SI
:
8984 case V4DI_FTYPE_V2DI
:
8991 case UHI_FTYPE_V16QI
:
8992 case USI_FTYPE_V32QI
:
8993 case UDI_FTYPE_V64QI
:
8994 case V16QI_FTYPE_UHI
:
8995 case V32QI_FTYPE_USI
:
8996 case V64QI_FTYPE_UDI
:
8997 case V8HI_FTYPE_UQI
:
8998 case V16HI_FTYPE_UHI
:
8999 case V32HI_FTYPE_USI
:
9000 case V4SI_FTYPE_UQI
:
9001 case V8SI_FTYPE_UQI
:
9002 case V4SI_FTYPE_UHI
:
9003 case V8SI_FTYPE_UHI
:
9004 case UQI_FTYPE_V8HI
:
9005 case UHI_FTYPE_V16HI
:
9006 case USI_FTYPE_V32HI
:
9007 case UQI_FTYPE_V4SI
:
9008 case UQI_FTYPE_V8SI
:
9009 case UHI_FTYPE_V16SI
:
9010 case UQI_FTYPE_V2DI
:
9011 case UQI_FTYPE_V4DI
:
9012 case UQI_FTYPE_V8DI
:
9013 case V16SI_FTYPE_UHI
:
9014 case V2DI_FTYPE_UQI
:
9015 case V4DI_FTYPE_UQI
:
9016 case V16SI_FTYPE_INT
:
9017 case V16SF_FTYPE_V8SF
:
9018 case V16SI_FTYPE_V8SI
:
9019 case V16SF_FTYPE_V4SF
:
9020 case V16SI_FTYPE_V4SI
:
9021 case V16SI_FTYPE_V16SF
:
9022 case V16SI_FTYPE_V16SI
:
9023 case V64QI_FTYPE_V64QI
:
9024 case V32HI_FTYPE_V32HI
:
9025 case V16SF_FTYPE_V16SF
:
9026 case V8DI_FTYPE_UQI
:
9027 case V8DI_FTYPE_V8DI
:
9028 case V8DF_FTYPE_V4DF
:
9029 case V8DF_FTYPE_V2DF
:
9030 case V8DF_FTYPE_V8DF
:
9031 case V4DI_FTYPE_V4DI
:
9032 case V16HI_FTYPE_V16SF
:
9033 case V8HI_FTYPE_V8SF
:
9034 case V8HI_FTYPE_V4SF
:
9037 case V4SF_FTYPE_V4SF_VEC_MERGE
:
9038 case V2DF_FTYPE_V2DF_VEC_MERGE
:
9039 return ix86_expand_unop_vec_merge_builtin (icode
, exp
, target
);
9040 case FLOAT128_FTYPE_FLOAT128_FLOAT128
:
9041 case V16QI_FTYPE_V16QI_V16QI
:
9042 case V16QI_FTYPE_V8HI_V8HI
:
9043 case V16SF_FTYPE_V16SF_V16SF
:
9044 case V8QI_FTYPE_V8QI_V8QI
:
9045 case V8QI_FTYPE_V4HI_V4HI
:
9046 case V8HI_FTYPE_V8HI_V8HI
:
9047 case V8HI_FTYPE_V16QI_V16QI
:
9048 case V8HI_FTYPE_V4SI_V4SI
:
9049 case V8SF_FTYPE_V8SF_V8SF
:
9050 case V8SF_FTYPE_V8SF_V8SI
:
9051 case V8DF_FTYPE_V8DF_V8DF
:
9052 case V4SI_FTYPE_V4SI_V4SI
:
9053 case V4SI_FTYPE_V8HI_V8HI
:
9054 case V4SI_FTYPE_V2DF_V2DF
:
9055 case V4HI_FTYPE_V4HI_V4HI
:
9056 case V4HI_FTYPE_V8QI_V8QI
:
9057 case V4HI_FTYPE_V2SI_V2SI
:
9058 case V4DF_FTYPE_V4DF_V4DF
:
9059 case V4DF_FTYPE_V4DF_V4DI
:
9060 case V4SF_FTYPE_V4SF_V4SF
:
9061 case V4SF_FTYPE_V4SF_V4SI
:
9062 case V4SF_FTYPE_V4SF_V2SI
:
9063 case V4SF_FTYPE_V4SF_V2DF
:
9064 case V4SF_FTYPE_V4SF_UINT
:
9065 case V4SF_FTYPE_V4SF_DI
:
9066 case V4SF_FTYPE_V4SF_SI
:
9067 case V2DI_FTYPE_V2DI_V2DI
:
9068 case V2DI_FTYPE_V16QI_V16QI
:
9069 case V2DI_FTYPE_V4SI_V4SI
:
9070 case V2DI_FTYPE_V2DI_V16QI
:
9071 case V2SI_FTYPE_V2SI_V2SI
:
9072 case V2SI_FTYPE_V4HI_V4HI
:
9073 case V2SI_FTYPE_V2SF_V2SF
:
9074 case V2DF_FTYPE_V2DF_V2DF
:
9075 case V2DF_FTYPE_V2DF_V4SF
:
9076 case V2DF_FTYPE_V2DF_V2DI
:
9077 case V2DF_FTYPE_V2DF_DI
:
9078 case V2DF_FTYPE_V2DF_SI
:
9079 case V2DF_FTYPE_V2DF_UINT
:
9080 case V2SF_FTYPE_V2SF_V2SF
:
9081 case V1DI_FTYPE_V1DI_V1DI
:
9082 case V1DI_FTYPE_V8QI_V8QI
:
9083 case V1DI_FTYPE_V2SI_V2SI
:
9084 case V32QI_FTYPE_V16HI_V16HI
:
9085 case V16HI_FTYPE_V8SI_V8SI
:
9086 case V64QI_FTYPE_V64QI_V64QI
:
9087 case V32QI_FTYPE_V32QI_V32QI
:
9088 case V16HI_FTYPE_V32QI_V32QI
:
9089 case V16HI_FTYPE_V16HI_V16HI
:
9090 case V8SI_FTYPE_V4DF_V4DF
:
9091 case V8SI_FTYPE_V8SI_V8SI
:
9092 case V8SI_FTYPE_V16HI_V16HI
:
9093 case V4DI_FTYPE_V4DI_V4DI
:
9094 case V4DI_FTYPE_V8SI_V8SI
:
9095 case V8DI_FTYPE_V64QI_V64QI
:
      if (comparison == UNKNOWN)
	return ix86_expand_binop_builtin (icode, exp, target);
      nargs = 2;
      break;
    case V4SF_FTYPE_V4SF_V4SF_SWAP:
    case V2DF_FTYPE_V2DF_V2DF_SWAP:
      gcc_assert (comparison != UNKNOWN);
      nargs = 2;
      swap = true;
      break;
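    /* The *_COUNT cases are vector shift builtins whose second operand is
       a shift count rather than an ordinary vector operand.  */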
9106 case V16HI_FTYPE_V16HI_V8HI_COUNT
:
9107 case V16HI_FTYPE_V16HI_SI_COUNT
:
9108 case V8SI_FTYPE_V8SI_V4SI_COUNT
:
9109 case V8SI_FTYPE_V8SI_SI_COUNT
:
9110 case V4DI_FTYPE_V4DI_V2DI_COUNT
:
9111 case V4DI_FTYPE_V4DI_INT_COUNT
:
9112 case V8HI_FTYPE_V8HI_V8HI_COUNT
:
9113 case V8HI_FTYPE_V8HI_SI_COUNT
:
9114 case V4SI_FTYPE_V4SI_V4SI_COUNT
:
9115 case V4SI_FTYPE_V4SI_SI_COUNT
:
9116 case V4HI_FTYPE_V4HI_V4HI_COUNT
:
9117 case V4HI_FTYPE_V4HI_SI_COUNT
:
9118 case V2DI_FTYPE_V2DI_V2DI_COUNT
:
9119 case V2DI_FTYPE_V2DI_SI_COUNT
:
9120 case V2SI_FTYPE_V2SI_V2SI_COUNT
:
9121 case V2SI_FTYPE_V2SI_SI_COUNT
:
    case V1DI_FTYPE_V1DI_V1DI_COUNT:
    case V1DI_FTYPE_V1DI_SI_COUNT:
      second_arg_count = true;
9127 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT
:
9128 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT
:
9129 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT
:
9130 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT
:
9131 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT
:
9132 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT
:
9133 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT
:
9134 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT
:
9135 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT
:
9136 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT
:
9137 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT
:
9138 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT
:
9139 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT
:
9140 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT
:
9141 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT
:
9142 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT
:
9143 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT
:
9144 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT
:
      second_arg_count = true;
9148 case UINT64_FTYPE_UINT64_UINT64
:
9149 case UINT_FTYPE_UINT_UINT
:
9150 case UINT_FTYPE_UINT_USHORT
:
9151 case UINT_FTYPE_UINT_UCHAR
:
9152 case UINT16_FTYPE_UINT16_INT
:
9153 case UINT8_FTYPE_UINT8_INT
:
9154 case UQI_FTYPE_UQI_UQI
:
9155 case UHI_FTYPE_UHI_UHI
:
9156 case USI_FTYPE_USI_USI
:
9157 case UDI_FTYPE_UDI_UDI
:
9158 case V16SI_FTYPE_V8DF_V8DF
:
9159 case V32HI_FTYPE_V16SF_V16SF
:
9160 case V16HI_FTYPE_V8SF_V8SF
:
9161 case V8HI_FTYPE_V4SF_V4SF
:
9162 case V16HI_FTYPE_V16SF_UHI
:
9163 case V8HI_FTYPE_V8SF_UQI
:
9164 case V8HI_FTYPE_V4SF_UQI
:
9167 case V2DI_FTYPE_V2DI_INT_CONVERT
:
9172 case V4DI_FTYPE_V4DI_INT_CONVERT
:
9177 case V8DI_FTYPE_V8DI_INT_CONVERT
:
9182 case V8HI_FTYPE_V8HI_INT
:
9183 case V8HI_FTYPE_V8SF_INT
:
9184 case V16HI_FTYPE_V16SF_INT
:
9185 case V8HI_FTYPE_V4SF_INT
:
9186 case V8SF_FTYPE_V8SF_INT
:
9187 case V4SF_FTYPE_V16SF_INT
:
9188 case V16SF_FTYPE_V16SF_INT
:
9189 case V4SI_FTYPE_V4SI_INT
:
9190 case V4SI_FTYPE_V8SI_INT
:
9191 case V4HI_FTYPE_V4HI_INT
:
9192 case V4DF_FTYPE_V4DF_INT
:
9193 case V4DF_FTYPE_V8DF_INT
:
9194 case V4SF_FTYPE_V4SF_INT
:
9195 case V4SF_FTYPE_V8SF_INT
:
9196 case V2DI_FTYPE_V2DI_INT
:
9197 case V2DF_FTYPE_V2DF_INT
:
9198 case V2DF_FTYPE_V4DF_INT
:
9199 case V16HI_FTYPE_V16HI_INT
:
9200 case V8SI_FTYPE_V8SI_INT
:
9201 case V16SI_FTYPE_V16SI_INT
:
9202 case V4SI_FTYPE_V16SI_INT
:
9203 case V4DI_FTYPE_V4DI_INT
:
9204 case V2DI_FTYPE_V4DI_INT
:
9205 case V4DI_FTYPE_V8DI_INT
:
9206 case UQI_FTYPE_UQI_UQI_CONST
:
9207 case UHI_FTYPE_UHI_UQI
:
9208 case USI_FTYPE_USI_UQI
:
9209 case UDI_FTYPE_UDI_UQI
:
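    /* Three-operand forms, including the AVX-512 masked variants.  */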
9213 case V16QI_FTYPE_V16QI_V16QI_V16QI
:
9214 case V8SF_FTYPE_V8SF_V8SF_V8SF
:
9215 case V4DF_FTYPE_V4DF_V4DF_V4DF
:
9216 case V4SF_FTYPE_V4SF_V4SF_V4SF
:
9217 case V2DF_FTYPE_V2DF_V2DF_V2DF
:
9218 case V32QI_FTYPE_V32QI_V32QI_V32QI
:
9219 case UHI_FTYPE_V16SI_V16SI_UHI
:
9220 case UQI_FTYPE_V8DI_V8DI_UQI
:
9221 case V16HI_FTYPE_V16SI_V16HI_UHI
:
9222 case V16QI_FTYPE_V16SI_V16QI_UHI
:
9223 case V16QI_FTYPE_V8DI_V16QI_UQI
:
9224 case V16SF_FTYPE_V16SF_V16SF_UHI
:
9225 case V16SF_FTYPE_V4SF_V16SF_UHI
:
9226 case V16SI_FTYPE_SI_V16SI_UHI
:
9227 case V16SI_FTYPE_V16HI_V16SI_UHI
:
9228 case V16SI_FTYPE_V16QI_V16SI_UHI
:
9229 case V8SF_FTYPE_V4SF_V8SF_UQI
:
9230 case V4DF_FTYPE_V2DF_V4DF_UQI
:
9231 case V8SI_FTYPE_V4SI_V8SI_UQI
:
9232 case V8SI_FTYPE_SI_V8SI_UQI
:
9233 case V4SI_FTYPE_V4SI_V4SI_UQI
:
9234 case V4SI_FTYPE_SI_V4SI_UQI
:
9235 case V4DI_FTYPE_V2DI_V4DI_UQI
:
9236 case V4DI_FTYPE_DI_V4DI_UQI
:
9237 case V2DI_FTYPE_V2DI_V2DI_UQI
:
9238 case V2DI_FTYPE_DI_V2DI_UQI
:
9239 case V64QI_FTYPE_V64QI_V64QI_UDI
:
9240 case V64QI_FTYPE_V16QI_V64QI_UDI
:
9241 case V64QI_FTYPE_QI_V64QI_UDI
:
9242 case V32QI_FTYPE_V32QI_V32QI_USI
:
9243 case V32QI_FTYPE_V16QI_V32QI_USI
:
9244 case V32QI_FTYPE_QI_V32QI_USI
:
9245 case V16QI_FTYPE_V16QI_V16QI_UHI
:
9246 case V16QI_FTYPE_QI_V16QI_UHI
:
9247 case V32HI_FTYPE_V8HI_V32HI_USI
:
9248 case V32HI_FTYPE_HI_V32HI_USI
:
9249 case V16HI_FTYPE_V8HI_V16HI_UHI
:
9250 case V16HI_FTYPE_HI_V16HI_UHI
:
9251 case V8HI_FTYPE_V8HI_V8HI_UQI
:
9252 case V8HI_FTYPE_HI_V8HI_UQI
:
9253 case V8SF_FTYPE_V8HI_V8SF_UQI
:
9254 case V4SF_FTYPE_V8HI_V4SF_UQI
:
9255 case V8SI_FTYPE_V8SF_V8SI_UQI
:
9256 case V4SI_FTYPE_V4SF_V4SI_UQI
:
9257 case V4DI_FTYPE_V4SF_V4DI_UQI
:
9258 case V2DI_FTYPE_V4SF_V2DI_UQI
:
9259 case V4SF_FTYPE_V4DI_V4SF_UQI
:
9260 case V4SF_FTYPE_V2DI_V4SF_UQI
:
9261 case V4DF_FTYPE_V4DI_V4DF_UQI
:
9262 case V2DF_FTYPE_V2DI_V2DF_UQI
:
9263 case V16QI_FTYPE_V8HI_V16QI_UQI
:
9264 case V16QI_FTYPE_V16HI_V16QI_UHI
:
9265 case V16QI_FTYPE_V4SI_V16QI_UQI
:
9266 case V16QI_FTYPE_V8SI_V16QI_UQI
:
9267 case V8HI_FTYPE_V4SI_V8HI_UQI
:
9268 case V8HI_FTYPE_V8SI_V8HI_UQI
:
9269 case V16QI_FTYPE_V2DI_V16QI_UQI
:
9270 case V16QI_FTYPE_V4DI_V16QI_UQI
:
9271 case V8HI_FTYPE_V2DI_V8HI_UQI
:
9272 case V8HI_FTYPE_V4DI_V8HI_UQI
:
9273 case V4SI_FTYPE_V2DI_V4SI_UQI
:
9274 case V4SI_FTYPE_V4DI_V4SI_UQI
:
9275 case V32QI_FTYPE_V32HI_V32QI_USI
:
9276 case UHI_FTYPE_V16QI_V16QI_UHI
:
9277 case USI_FTYPE_V32QI_V32QI_USI
:
9278 case UDI_FTYPE_V64QI_V64QI_UDI
:
9279 case UQI_FTYPE_V8HI_V8HI_UQI
:
9280 case UHI_FTYPE_V16HI_V16HI_UHI
:
9281 case USI_FTYPE_V32HI_V32HI_USI
:
9282 case UQI_FTYPE_V4SI_V4SI_UQI
:
9283 case UQI_FTYPE_V8SI_V8SI_UQI
:
9284 case UQI_FTYPE_V2DI_V2DI_UQI
:
9285 case UQI_FTYPE_V4DI_V4DI_UQI
:
9286 case V4SF_FTYPE_V2DF_V4SF_UQI
:
9287 case V4SF_FTYPE_V4DF_V4SF_UQI
:
9288 case V16SI_FTYPE_V16SI_V16SI_UHI
:
9289 case V16SI_FTYPE_V4SI_V16SI_UHI
:
9290 case V2DI_FTYPE_V4SI_V2DI_UQI
:
9291 case V2DI_FTYPE_V8HI_V2DI_UQI
:
9292 case V2DI_FTYPE_V16QI_V2DI_UQI
:
9293 case V4DI_FTYPE_V4DI_V4DI_UQI
:
9294 case V4DI_FTYPE_V4SI_V4DI_UQI
:
9295 case V4DI_FTYPE_V8HI_V4DI_UQI
:
9296 case V4DI_FTYPE_V16QI_V4DI_UQI
:
9297 case V4DI_FTYPE_V4DF_V4DI_UQI
:
9298 case V2DI_FTYPE_V2DF_V2DI_UQI
:
9299 case V4SI_FTYPE_V4DF_V4SI_UQI
:
9300 case V4SI_FTYPE_V2DF_V4SI_UQI
:
9301 case V4SI_FTYPE_V8HI_V4SI_UQI
:
9302 case V4SI_FTYPE_V16QI_V4SI_UQI
:
9303 case V4DI_FTYPE_V4DI_V4DI_V4DI
:
9304 case V8DF_FTYPE_V2DF_V8DF_UQI
:
9305 case V8DF_FTYPE_V4DF_V8DF_UQI
:
9306 case V8DF_FTYPE_V8DF_V8DF_UQI
:
9307 case V8SF_FTYPE_V8SF_V8SF_UQI
:
9308 case V8SF_FTYPE_V8SI_V8SF_UQI
:
9309 case V4DF_FTYPE_V4DF_V4DF_UQI
:
9310 case V4SF_FTYPE_V4SF_V4SF_UQI
:
9311 case V2DF_FTYPE_V2DF_V2DF_UQI
:
9312 case V2DF_FTYPE_V4SF_V2DF_UQI
:
9313 case V2DF_FTYPE_V4SI_V2DF_UQI
:
9314 case V4SF_FTYPE_V4SI_V4SF_UQI
:
9315 case V4DF_FTYPE_V4SF_V4DF_UQI
:
9316 case V4DF_FTYPE_V4SI_V4DF_UQI
:
9317 case V8SI_FTYPE_V8SI_V8SI_UQI
:
9318 case V8SI_FTYPE_V8HI_V8SI_UQI
:
9319 case V8SI_FTYPE_V16QI_V8SI_UQI
:
9320 case V8DF_FTYPE_V8SI_V8DF_UQI
:
9321 case V8DI_FTYPE_DI_V8DI_UQI
:
9322 case V16SF_FTYPE_V8SF_V16SF_UHI
:
9323 case V16SI_FTYPE_V8SI_V16SI_UHI
:
9324 case V16HI_FTYPE_V16HI_V16HI_UHI
:
9325 case V8HI_FTYPE_V16QI_V8HI_UQI
:
9326 case V16HI_FTYPE_V16QI_V16HI_UHI
:
9327 case V32HI_FTYPE_V32HI_V32HI_USI
:
9328 case V32HI_FTYPE_V32QI_V32HI_USI
:
9329 case V8DI_FTYPE_V16QI_V8DI_UQI
:
9330 case V8DI_FTYPE_V2DI_V8DI_UQI
:
9331 case V8DI_FTYPE_V4DI_V8DI_UQI
:
9332 case V8DI_FTYPE_V8DI_V8DI_UQI
:
9333 case V8DI_FTYPE_V8HI_V8DI_UQI
:
9334 case V8DI_FTYPE_V8SI_V8DI_UQI
:
9335 case V8HI_FTYPE_V8DI_V8HI_UQI
:
9336 case V8SI_FTYPE_V8DI_V8SI_UQI
:
9337 case V4SI_FTYPE_V4SI_V4SI_V4SI
:
9338 case V16SI_FTYPE_V16SI_V16SI_V16SI
:
9339 case V8DI_FTYPE_V8DI_V8DI_V8DI
:
9340 case V32HI_FTYPE_V32HI_V32HI_V32HI
:
9341 case V2DI_FTYPE_V2DI_V2DI_V2DI
:
9342 case V16HI_FTYPE_V16HI_V16HI_V16HI
:
9343 case V8SI_FTYPE_V8SI_V8SI_V8SI
:
9344 case V8HI_FTYPE_V8HI_V8HI_V8HI
:
9345 case V32HI_FTYPE_V16SF_V16SF_USI
:
9346 case V16HI_FTYPE_V8SF_V8SF_UHI
:
9347 case V8HI_FTYPE_V4SF_V4SF_UQI
:
9348 case V16HI_FTYPE_V16SF_V16HI_UHI
:
9349 case V8HI_FTYPE_V8SF_V8HI_UQI
:
9350 case V8HI_FTYPE_V4SF_V8HI_UQI
:
9351 case V16SF_FTYPE_V16SF_V32HI_V32HI
:
9352 case V8SF_FTYPE_V8SF_V16HI_V16HI
:
9353 case V4SF_FTYPE_V4SF_V8HI_V8HI
:
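    /* Forms taking two vector operands plus an immediate operand.  */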
9356 case V32QI_FTYPE_V32QI_V32QI_INT
:
9357 case V16HI_FTYPE_V16HI_V16HI_INT
:
9358 case V16QI_FTYPE_V16QI_V16QI_INT
:
9359 case V4DI_FTYPE_V4DI_V4DI_INT
:
9360 case V8HI_FTYPE_V8HI_V8HI_INT
:
9361 case V8SI_FTYPE_V8SI_V8SI_INT
:
9362 case V8SI_FTYPE_V8SI_V4SI_INT
:
9363 case V8SF_FTYPE_V8SF_V8SF_INT
:
9364 case V8SF_FTYPE_V8SF_V4SF_INT
:
9365 case V4SI_FTYPE_V4SI_V4SI_INT
:
9366 case V4DF_FTYPE_V4DF_V4DF_INT
:
9367 case V16SF_FTYPE_V16SF_V16SF_INT
:
9368 case V16SF_FTYPE_V16SF_V4SF_INT
:
9369 case V16SI_FTYPE_V16SI_V4SI_INT
:
9370 case V4DF_FTYPE_V4DF_V2DF_INT
:
9371 case V4SF_FTYPE_V4SF_V4SF_INT
:
9372 case V2DI_FTYPE_V2DI_V2DI_INT
:
9373 case V4DI_FTYPE_V4DI_V2DI_INT
:
9374 case V2DF_FTYPE_V2DF_V2DF_INT
:
9375 case UQI_FTYPE_V8DI_V8UDI_INT
:
9376 case UQI_FTYPE_V8DF_V8DF_INT
:
9377 case UQI_FTYPE_V2DF_V2DF_INT
:
9378 case UQI_FTYPE_V4SF_V4SF_INT
:
9379 case UHI_FTYPE_V16SI_V16SI_INT
:
9380 case UHI_FTYPE_V16SF_V16SF_INT
:
9381 case V64QI_FTYPE_V64QI_V64QI_INT
:
9382 case V32HI_FTYPE_V32HI_V32HI_INT
:
9383 case V16SI_FTYPE_V16SI_V16SI_INT
:
9384 case V8DI_FTYPE_V8DI_V8DI_INT
:
9388 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT
:
9393 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT
:
9398 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT
:
9403 case V2DI_FTYPE_V2DI_UINT_UINT
:
9407 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT
:
9412 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT
:
9418 case QI_FTYPE_V8DF_INT_UQI
:
9419 case QI_FTYPE_V4DF_INT_UQI
:
9420 case QI_FTYPE_V2DF_INT_UQI
:
9421 case HI_FTYPE_V16SF_INT_UHI
:
9422 case QI_FTYPE_V8SF_INT_UQI
:
9423 case QI_FTYPE_V4SF_INT_UQI
:
9424 case V4SI_FTYPE_V4SI_V4SI_UHI
:
9425 case V8SI_FTYPE_V8SI_V8SI_UHI
:
9430 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT
:
9436 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT
:
9442 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI
:
9443 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI
:
9444 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI
:
9445 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI
:
9446 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI
:
9447 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI
:
9448 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI
:
9449 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI
:
9450 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI
:
9451 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI
:
9452 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI
:
9453 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI
:
9454 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI
:
9455 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI
:
9456 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI
:
9457 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI
:
9458 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI
:
9459 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI
:
9460 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI
:
9461 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI
:
9462 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI
:
9463 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI
:
9464 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI
:
9465 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI
:
9466 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI
:
9467 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI
:
9468 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI
:
9469 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI
:
9470 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI
:
9471 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI
:
9472 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI
:
9473 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI
:
9474 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI
:
9475 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI
:
9476 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI
:
9477 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI
:
9478 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI
:
9479 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI
:
9480 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI
:
9481 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI
:
9482 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI
:
9483 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI
:
9484 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI
:
9485 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI
:
9486 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI
:
9487 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI
:
9488 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI
:
9489 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI
:
9490 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI
:
9491 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI
:
9492 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI
:
9493 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI
:
9494 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI
:
9495 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI
:
9498 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT
:
9499 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT
:
9500 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT
:
9501 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT
:
9502 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT
:
9506 case UQI_FTYPE_V4DI_V4DI_INT_UQI
:
9507 case UQI_FTYPE_V8SI_V8SI_INT_UQI
:
9508 case QI_FTYPE_V4DF_V4DF_INT_UQI
:
9509 case QI_FTYPE_V8SF_V8SF_INT_UQI
:
9510 case UQI_FTYPE_V2DI_V2DI_INT_UQI
:
9511 case UQI_FTYPE_V4SI_V4SI_INT_UQI
:
9512 case UQI_FTYPE_V2DF_V2DF_INT_UQI
:
9513 case UQI_FTYPE_V4SF_V4SF_INT_UQI
:
9514 case UDI_FTYPE_V64QI_V64QI_INT_UDI
:
9515 case USI_FTYPE_V32QI_V32QI_INT_USI
:
9516 case UHI_FTYPE_V16QI_V16QI_INT_UHI
:
9517 case USI_FTYPE_V32HI_V32HI_INT_USI
:
9518 case UHI_FTYPE_V16HI_V16HI_INT_UHI
:
9519 case UQI_FTYPE_V8HI_V8HI_INT_UQI
:
9524 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT
:
9528 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED
:
9529 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG
:
9530 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI
:
9531 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI
:
9532 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI
:
9535 case UQI_FTYPE_V8DI_V8DI_INT_UQI
:
9536 case UHI_FTYPE_V16SI_V16SI_INT_UHI
:
9541 case V8SF_FTYPE_V8SF_INT_V8SF_UQI
:
9542 case V4SF_FTYPE_V4SF_INT_V4SF_UQI
:
9543 case V2DF_FTYPE_V4DF_INT_V2DF_UQI
:
9544 case V2DI_FTYPE_V4DI_INT_V2DI_UQI
:
9545 case V8SF_FTYPE_V16SF_INT_V8SF_UQI
:
9546 case V8SI_FTYPE_V16SI_INT_V8SI_UQI
:
9547 case V2DF_FTYPE_V8DF_INT_V2DF_UQI
:
9548 case V2DI_FTYPE_V8DI_INT_V2DI_UQI
:
9549 case V4SF_FTYPE_V8SF_INT_V4SF_UQI
:
9550 case V4SI_FTYPE_V8SI_INT_V4SI_UQI
:
9551 case V8HI_FTYPE_V8SF_INT_V8HI_UQI
:
9552 case V8HI_FTYPE_V4SF_INT_V8HI_UQI
:
9553 case V32HI_FTYPE_V32HI_INT_V32HI_USI
:
9554 case V16HI_FTYPE_V16HI_INT_V16HI_UHI
:
9555 case V8HI_FTYPE_V8HI_INT_V8HI_UQI
:
9556 case V4DI_FTYPE_V4DI_INT_V4DI_UQI
:
9557 case V2DI_FTYPE_V2DI_INT_V2DI_UQI
:
9558 case V8SI_FTYPE_V8SI_INT_V8SI_UQI
:
9559 case V4SI_FTYPE_V4SI_INT_V4SI_UQI
:
9560 case V4DF_FTYPE_V4DF_INT_V4DF_UQI
:
9561 case V2DF_FTYPE_V2DF_INT_V2DF_UQI
:
9562 case V8DF_FTYPE_V8DF_INT_V8DF_UQI
:
9563 case V16SF_FTYPE_V16SF_INT_V16SF_UHI
:
9564 case V16HI_FTYPE_V16SF_INT_V16HI_UHI
:
9565 case V16SI_FTYPE_V16SI_INT_V16SI_UHI
:
9566 case V4SI_FTYPE_V16SI_INT_V4SI_UQI
:
9567 case V4DI_FTYPE_V8DI_INT_V4DI_UQI
:
9568 case V4DF_FTYPE_V8DF_INT_V4DF_UQI
:
9569 case V4SF_FTYPE_V16SF_INT_V4SF_UQI
:
9570 case V8DI_FTYPE_V8DI_INT_V8DI_UQI
:
9575 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI
:
9576 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI
:
9577 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI
:
9578 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI
:
9579 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI
:
9580 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI
:
9581 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI
:
9582 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI
:
9583 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI
:
9584 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI
:
9585 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI
:
9586 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI
:
9587 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI
:
9588 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI
:
9589 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI
:
9590 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI
:
9591 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI
:
9592 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI
:
9593 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI
:
9594 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI
:
9595 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI
:
9596 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI
:
9597 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI
:
9598 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI
:
9599 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI
:
9600 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI
:
9601 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI
:
9606 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI
:
9607 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI
:
9608 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI
:
9609 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI
:
9610 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI
:
9611 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI
:
9612 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI
:
9613 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI
:
9614 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI
:
9615 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI
:
9620 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI
:
9621 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI
:
9622 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI
:
9623 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT
:
9624 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT
:
9625 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT
:
9626 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT
:
9627 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT
:
9628 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT
:
9629 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT
:
9630 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT
:
9631 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT
:
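  /* Common tail: expand each call argument, force it into the mode the
     insn pattern expects, and diagnose out-of-range immediates.  */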
  gcc_assert (nargs <= ARRAY_SIZE (args));

  if (comparison != UNKNOWN)
    {
      gcc_assert (nargs == 2);
      return ix86_expand_sse_compare (d, exp, target, swap);
    }

  if (rmode == VOIDmode || rmode == tmode)
    {
      if (optimize
	  || target == 0
	  || GET_MODE (target) != tmode
	  || !insn_p->operand[0].predicate (target, tmode))
	target = gen_reg_rtx (tmode);
      else if (memory_operand (target, tmode))
	num_memory++;
      real_target = target;
    }
  else
    {
      real_target = gen_reg_rtx (tmode);
      target = lowpart_subreg (rmode, real_target, tmode);
    }

  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      machine_mode mode = insn_p->operand[i + 1].mode;
      bool match = insn_p->operand[i + 1].predicate (op, mode);

      if (second_arg_count && i == 1)
	{
	  /* SIMD shift insns take either an 8-bit immediate or
	     register as count.  But builtin functions take int as
	     count.  If count doesn't match, we put it in register.
	     The instructions are using 64-bit count, if op is just
	     32-bit, zero-extend it, as negative shift counts
	     are undefined behavior and zero-extension is more
	     efficient.  */
	  if (!match)
	    {
	      if (SCALAR_INT_MODE_P (GET_MODE (op)))
		op = convert_modes (mode, GET_MODE (op), op, 1);
	      else
		op = lowpart_subreg (mode, op, GET_MODE (op));
	      if (!insn_p->operand[i + 1].predicate (op, mode))
		op = copy_to_reg (op);
	    }
	}
      else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
	       (!mask_pos && (nargs - i) <= nargs_constant))
	{
	  if (!match)
	    switch (icode)
	      {
	      case CODE_FOR_avx_vinsertf128v4di:
	      case CODE_FOR_avx_vextractf128v4di:
		error ("the last argument must be an 1-bit immediate");
		return const0_rtx;

	      case CODE_FOR_avx512f_cmpv8di3_mask:
	      case CODE_FOR_avx512f_cmpv16si3_mask:
	      case CODE_FOR_avx512f_ucmpv8di3_mask:
	      case CODE_FOR_avx512f_ucmpv16si3_mask:
	      case CODE_FOR_avx512vl_cmpv4di3_mask:
	      case CODE_FOR_avx512vl_cmpv8si3_mask:
	      case CODE_FOR_avx512vl_ucmpv4di3_mask:
	      case CODE_FOR_avx512vl_ucmpv8si3_mask:
	      case CODE_FOR_avx512vl_cmpv2di3_mask:
	      case CODE_FOR_avx512vl_cmpv4si3_mask:
	      case CODE_FOR_avx512vl_ucmpv2di3_mask:
	      case CODE_FOR_avx512vl_ucmpv4si3_mask:
		error ("the last argument must be a 3-bit immediate");
		return const0_rtx;

	      case CODE_FOR_sse4_1_roundsd:
	      case CODE_FOR_sse4_1_roundss:

	      case CODE_FOR_sse4_1_roundpd:
	      case CODE_FOR_sse4_1_roundps:
	      case CODE_FOR_avx_roundpd256:
	      case CODE_FOR_avx_roundps256:

	      case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
	      case CODE_FOR_sse4_1_roundps_sfix:
	      case CODE_FOR_avx_roundpd_vec_pack_sfix256:
	      case CODE_FOR_avx_roundps_sfix256:

	      case CODE_FOR_sse4_1_blendps:
	      case CODE_FOR_avx_blendpd256:
	      case CODE_FOR_avx_vpermilv4df:
	      case CODE_FOR_avx_vpermilv4df_mask:
	      case CODE_FOR_avx512f_getmantv8df_mask:
	      case CODE_FOR_avx512f_getmantv16sf_mask:
	      case CODE_FOR_avx512vl_getmantv8sf_mask:
	      case CODE_FOR_avx512vl_getmantv4df_mask:
	      case CODE_FOR_avx512vl_getmantv4sf_mask:
	      case CODE_FOR_avx512vl_getmantv2df_mask:
	      case CODE_FOR_avx512dq_rangepv8df_mask_round:
	      case CODE_FOR_avx512dq_rangepv16sf_mask_round:
	      case CODE_FOR_avx512dq_rangepv4df_mask:
	      case CODE_FOR_avx512dq_rangepv8sf_mask:
	      case CODE_FOR_avx512dq_rangepv2df_mask:
	      case CODE_FOR_avx512dq_rangepv4sf_mask:
	      case CODE_FOR_avx_shufpd256_mask:
		error ("the last argument must be a 4-bit immediate");
		return const0_rtx;

	      case CODE_FOR_sha1rnds4:
	      case CODE_FOR_sse4_1_blendpd:
	      case CODE_FOR_avx_vpermilv2df:
	      case CODE_FOR_avx_vpermilv2df_mask:
	      case CODE_FOR_xop_vpermil2v2df3:
	      case CODE_FOR_xop_vpermil2v4sf3:
	      case CODE_FOR_xop_vpermil2v4df3:
	      case CODE_FOR_xop_vpermil2v8sf3:
	      case CODE_FOR_avx512f_vinsertf32x4_mask:
	      case CODE_FOR_avx512f_vinserti32x4_mask:
	      case CODE_FOR_avx512f_vextractf32x4_mask:
	      case CODE_FOR_avx512f_vextracti32x4_mask:
	      case CODE_FOR_sse2_shufpd:
	      case CODE_FOR_sse2_shufpd_mask:
	      case CODE_FOR_avx512dq_shuf_f64x2_mask:
	      case CODE_FOR_avx512dq_shuf_i64x2_mask:
	      case CODE_FOR_avx512vl_shuf_i32x4_mask:
	      case CODE_FOR_avx512vl_shuf_f32x4_mask:
		error ("the last argument must be a 2-bit immediate");
		return const0_rtx;

	      case CODE_FOR_avx_vextractf128v4df:
	      case CODE_FOR_avx_vextractf128v8sf:
	      case CODE_FOR_avx_vextractf128v8si:
	      case CODE_FOR_avx_vinsertf128v4df:
	      case CODE_FOR_avx_vinsertf128v8sf:
	      case CODE_FOR_avx_vinsertf128v8si:
	      case CODE_FOR_avx512f_vinsertf64x4_mask:
	      case CODE_FOR_avx512f_vinserti64x4_mask:
	      case CODE_FOR_avx512f_vextractf64x4_mask:
	      case CODE_FOR_avx512f_vextracti64x4_mask:
	      case CODE_FOR_avx512dq_vinsertf32x8_mask:
	      case CODE_FOR_avx512dq_vinserti32x8_mask:
	      case CODE_FOR_avx512vl_vinsertv4df:
	      case CODE_FOR_avx512vl_vinsertv4di:
	      case CODE_FOR_avx512vl_vinsertv8sf:
	      case CODE_FOR_avx512vl_vinsertv8si:
		error ("the last argument must be a 1-bit immediate");
		return const0_rtx;

	      case CODE_FOR_avx_vmcmpv2df3:
	      case CODE_FOR_avx_vmcmpv4sf3:
	      case CODE_FOR_avx_cmpv2df3:
	      case CODE_FOR_avx_cmpv4sf3:
	      case CODE_FOR_avx_cmpv4df3:
	      case CODE_FOR_avx_cmpv8sf3:
	      case CODE_FOR_avx512f_cmpv8df3_mask:
	      case CODE_FOR_avx512f_cmpv16sf3_mask:
	      case CODE_FOR_avx512f_vmcmpv2df3_mask:
	      case CODE_FOR_avx512f_vmcmpv4sf3_mask:
		error ("the last argument must be a 5-bit immediate");
		return const0_rtx;

	      default:
		switch (nargs_constant)
		  {
		  case 2:
		    if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
			(!mask_pos && (nargs - i) == nargs_constant))
		      {
			error ("the next to last argument must be an 8-bit immediate");
			break;
		      }
		    /* FALLTHRU */
		  case 1:
		    error ("the last argument must be an 8-bit immediate");
		    break;
		  default:
		    gcc_unreachable ();
		  }
		return const0_rtx;
	      }
	}
      else
	{
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  /* If we aren't optimizing, only allow one memory operand to
	     be generated.  */
	  if (memory_operand (op, mode))
	    num_memory++;

	  op = fixup_modeless_constant (op, mode);

	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
	    {
	      if (optimize || !match || num_memory > 1)
		op = copy_to_mode_reg (mode, op);
	    }
	  else
	    {
	      op = copy_to_reg (op);
	      op = lowpart_subreg (mode, op, GET_MODE (op));
	    }
	}

      args[i].op = op;
      args[i].mode = mode;
    }

  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (real_target, args[0].op);
      break;
    case 2:
      pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
      break;
    case 3:
      pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
			     args[2].op);
      break;
    case 4:
      pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
			     args[2].op, args[3].op);
      break;
    case 5:
      pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
			     args[2].op, args[3].op, args[4].op);
      break;
    case 6:
      pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
			     args[2].op, args[3].op, args[4].op,
			     args[5].op);
      break;
    default:
      gcc_unreachable ();
    }

  if (! pat)
    return 0;

  emit_insn (pat);
  return target;
}
/* Transform pattern of following layout:
     (set A
       (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
   into:
     (set (A B))  */

static rtx
ix86_erase_embedded_rounding (rtx pat)
{
  if (GET_CODE (pat) == INSN)
    pat = PATTERN (pat);

  gcc_assert (GET_CODE (pat) == SET);
  rtx src = SET_SRC (pat);
  gcc_assert (XVECLEN (src, 0) == 2);
  rtx p0 = XVECEXP (src, 0, 0);
  gcc_assert (GET_CODE (src) == UNSPEC
	      && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
  rtx res = gen_rtx_SET (SET_DEST (pat), p0);
  return res;
}
/* Subroutine of ix86_expand_round_builtin to take care of comi insns
   with embedded rounding.  */

static rtx
ix86_expand_sse_comi_round (const struct builtin_description *d,
			    tree exp, rtx target)
{
  rtx pat, set_dst;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode mode0 = insn_p->operand[0].mode;
  machine_mode mode1 = insn_p->operand[1].mode;

  /* See avxintrin.h for values.  */
  static const enum rtx_code comparisons[32] =
    {
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
    };
  static const bool ordereds[32] =
    {
      true, true, true, false, false, false, false, true,
      false, false, false, true, true, true, true, false,
      true, true, true, false, false, false, false, true,
      false, false, false, true, true, true, true, false
    };
  static const bool non_signalings[32] =
    {
      true, false, false, true, true, false, false, true,
      true, false, false, true, true, false, false, true,
      false, true, true, false, false, true, true, false,
      false, true, true, false, false, true, true, false
    };

  if (!CONST_INT_P (op2))
    {
      error ("the third argument must be comparison constant");
      return const0_rtx;
    }
  if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
    {
      error ("incorrect comparison mode");
      return const0_rtx;
    }

  if (!insn_p->operand[2].predicate (op3, SImode))
    {
      error ("incorrect rounding operand");
      return const0_rtx;
    }

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  enum rtx_code comparison = comparisons[INTVAL (op2)];
  bool ordered = ordereds[INTVAL (op2)];
  bool non_signaling = non_signalings[INTVAL (op2)];
  rtx const_val = const0_rtx;

  bool check_unordered = false;
  machine_mode mode = CCFPmode;
9988 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
9995 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
10005 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
10012 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
10013 if (!non_signaling
)
10020 case LE
: /* -> GE */
10021 case LT
: /* -> GT */
10022 case UNGE
: /* -> UNLE */
10023 case UNGT
: /* -> UNLT */
10024 std::swap (op0
, op1
);
10025 comparison
= swap_condition (comparison
);
10033 /* These are supported by CCFPmode. NB: Use ordered/signaling
10034 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
10035 with NAN operands. */
10036 if (ordered
== non_signaling
)
10037 ordered
= !ordered
;
10040 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10041 _CMP_EQ_OQ/_CMP_EQ_OS. */
10042 check_unordered
= true;
10046 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10047 _CMP_NEQ_UQ/_CMP_NEQ_US. */
10048 gcc_assert (!ordered
);
10049 check_unordered
= true;
10051 const_val
= const1_rtx
;
10054 gcc_unreachable ();
  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const_val);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_p->operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_p->operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  /*
     1. COMI: ordered and signaling.
     2. UCOMI: unordered and non-signaling.  */
  if (non_signaling)
    icode = (icode == CODE_FOR_sse_comi_round
	     ? CODE_FOR_sse_ucomi_round
	     : CODE_FOR_sse2_ucomi_round);

  pat = GEN_FCN (icode) (op0, op1, op3);
  if (! pat)
    return 0;

  /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point.  */
  if (INTVAL (op3) == NO_ROUND)
    {
      pat = ix86_erase_embedded_rounding (pat);
      if (! pat)
	return 0;

      set_dst = SET_DEST (pat);
    }
  else
    {
      gcc_assert (GET_CODE (pat) == SET);
      set_dst = SET_DEST (pat);
    }

  emit_insn (pat);

  rtx_code_label *label = NULL;

  /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
     with NAN operands.  */
  if (check_unordered)
    {
      gcc_assert (comparison == EQ || comparison == NE);

      rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
      label = gen_label_rtx ();
      rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    }

  /* NB: Set CCFPmode and check a different CCmode which is in subset
     of CCFPmode.  */
  if (GET_MODE (set_dst) != mode)
    {
      gcc_assert (mode == CCAmode || mode == CCCmode
		  || mode == CCOmode || mode == CCPmode
		  || mode == CCSmode || mode == CCZmode);
      set_dst = gen_rtx_REG (mode, FLAGS_REG);
    }

  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  set_dst,
					  const0_rtx)));

  if (label)
    emit_label (label);

  return SUBREG_REG (target);
}
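/* Expand a builtin whose trailing operand is an embedded-rounding / SAE
   immediate: like ix86_expand_args_builtin, but when the rounding operand
   is NO_ROUND the embedded rounding is erased and the normal form of the
   pattern is used.  */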
static rtx
ix86_expand_round_builtin (const struct builtin_description *d,
			   tree exp, rtx target)
{
  rtx pat;
  unsigned int i, nargs;
  struct
    {
      rtx op;
      machine_mode mode;
    } args[6];
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  unsigned int nargs_constant = 0;
  unsigned int redundant_embed_rnd = 0;

  switch ((enum ix86_builtin_func_type) d->flag)
    {
10155 case UINT64_FTYPE_V2DF_INT
:
10156 case UINT64_FTYPE_V4SF_INT
:
10157 case UINT_FTYPE_V2DF_INT
:
10158 case UINT_FTYPE_V4SF_INT
:
10159 case INT64_FTYPE_V2DF_INT
:
10160 case INT64_FTYPE_V4SF_INT
:
10161 case INT_FTYPE_V2DF_INT
:
10162 case INT_FTYPE_V4SF_INT
:
10165 case V4SF_FTYPE_V4SF_UINT_INT
:
10166 case V4SF_FTYPE_V4SF_UINT64_INT
:
10167 case V2DF_FTYPE_V2DF_UINT64_INT
:
10168 case V4SF_FTYPE_V4SF_INT_INT
:
10169 case V4SF_FTYPE_V4SF_INT64_INT
:
10170 case V2DF_FTYPE_V2DF_INT64_INT
:
10171 case V4SF_FTYPE_V4SF_V4SF_INT
:
10172 case V2DF_FTYPE_V2DF_V2DF_INT
:
10173 case V4SF_FTYPE_V4SF_V2DF_INT
:
10174 case V2DF_FTYPE_V2DF_V4SF_INT
:
10177 case V8SF_FTYPE_V8DF_V8SF_QI_INT
:
10178 case V8DF_FTYPE_V8DF_V8DF_QI_INT
:
10179 case V8SI_FTYPE_V8DF_V8SI_QI_INT
:
10180 case V8DI_FTYPE_V8DF_V8DI_QI_INT
:
10181 case V8SF_FTYPE_V8DI_V8SF_QI_INT
:
10182 case V8DF_FTYPE_V8DI_V8DF_QI_INT
:
10183 case V16SF_FTYPE_V16SF_V16SF_HI_INT
:
10184 case V8DI_FTYPE_V8SF_V8DI_QI_INT
:
10185 case V16SF_FTYPE_V16SI_V16SF_HI_INT
:
10186 case V16SI_FTYPE_V16SF_V16SI_HI_INT
:
10187 case V8DF_FTYPE_V8SF_V8DF_QI_INT
:
10188 case V16SF_FTYPE_V16HI_V16SF_HI_INT
:
10189 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT
:
10190 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT
:
10193 case V4SF_FTYPE_V4SF_V4SF_INT_INT
:
10194 case V2DF_FTYPE_V2DF_V2DF_INT_INT
:
10195 nargs_constant
= 2;
10198 case INT_FTYPE_V4SF_V4SF_INT_INT
:
10199 case INT_FTYPE_V2DF_V2DF_INT_INT
:
10200 return ix86_expand_sse_comi_round (d
, exp
, target
);
10201 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT
:
10202 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT
:
10203 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT
:
10204 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT
:
10205 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT
:
10206 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT
:
10207 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT
:
10208 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT
:
10211 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT
:
10212 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT
:
10213 nargs_constant
= 4;
10216 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT
:
10217 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT
:
10218 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT
:
10219 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT
:
10220 nargs_constant
= 3;
10223 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT
:
10224 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT
:
10225 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT
:
10226 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT
:
10227 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT
:
10228 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT
:
10230 nargs_constant
= 4;
10232 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT
:
10233 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT
:
10234 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT
:
10235 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT
:
10237 nargs_constant
= 3;
10240 gcc_unreachable ();
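  /* Expand the arguments; the trailing operand is the rounding immediate,
     and NO_ROUND makes the embedded rounding redundant.  */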
  gcc_assert (nargs <= ARRAY_SIZE (args));

  if (optimize
      || target == 0
      || GET_MODE (target) != tmode
      || !insn_p->operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      machine_mode mode = insn_p->operand[i + 1].mode;
      bool match = insn_p->operand[i + 1].predicate (op, mode);

      if (i == nargs - nargs_constant)
	{
	  if (!match)
	    {
	      switch (icode)
		{
		case CODE_FOR_avx512f_getmantv8df_mask_round:
		case CODE_FOR_avx512f_getmantv16sf_mask_round:
		case CODE_FOR_avx512f_vgetmantv2df_round:
		case CODE_FOR_avx512f_vgetmantv2df_mask_round:
		case CODE_FOR_avx512f_vgetmantv4sf_round:
		case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
		  error ("the immediate argument must be a 4-bit immediate");
		  return const0_rtx;
		case CODE_FOR_avx512f_cmpv8df3_mask_round:
		case CODE_FOR_avx512f_cmpv16sf3_mask_round:
		case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
		case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
		  error ("the immediate argument must be a 5-bit immediate");
		  return const0_rtx;
		default:
		  error ("the immediate argument must be an 8-bit immediate");
		  return const0_rtx;
		}
	    }
	}
      else if (i == nargs-1)
	{
	  if (!insn_p->operand[nargs].predicate (op, SImode))
	    {
	      error ("incorrect rounding operand");
	      return const0_rtx;
	    }

	  /* If there is no rounding use normal version of the pattern.  */
	  if (INTVAL (op) == NO_ROUND)
	    redundant_embed_rnd = 1;
	}
      else
	{
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  op = fixup_modeless_constant (op, mode);

	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
	    {
	      if (optimize || !match)
		op = copy_to_mode_reg (mode, op);
	    }
	  else
	    {
	      op = copy_to_reg (op);
	      op = lowpart_subreg (mode, op, GET_MODE (op));
	    }
	}

      args[i].op = op;
      args[i].mode = mode;
    }

  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (target, args[0].op);
      break;
    case 2:
      pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
      break;
    case 3:
      pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
			     args[2].op);
      break;
    case 4:
      pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
			     args[2].op, args[3].op);
      break;
    case 5:
      pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
			     args[2].op, args[3].op, args[4].op);
      break;
    case 6:
      pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
			     args[2].op, args[3].op, args[4].op,
			     args[5].op);
      break;
    default:
      gcc_unreachable ();
    }

  if (!pat)
    return 0;

  if (redundant_embed_rnd)
    pat = ix86_erase_embedded_rounding (pat);

  emit_insn (pat);
  return target;
}
10358 with variable number of operands. */
10361 ix86_expand_special_args_builtin (const struct builtin_description
*d
,
10362 tree exp
, rtx target
)
10366 unsigned int i
, nargs
, arg_adjust
, memory
;
10367 bool aligned_mem
= false;
10373 enum insn_code icode
= d
->icode
;
10374 bool last_arg_constant
= false;
10375 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
10376 machine_mode tmode
= insn_p
->operand
[0].mode
;
10377 enum { load
, store
} klass
;
10379 switch ((enum ix86_builtin_func_type
) d
->flag
)
10381 case VOID_FTYPE_VOID
:
10382 emit_insn (GEN_FCN (icode
) (target
));
10384 case VOID_FTYPE_UINT64
:
10385 case VOID_FTYPE_UNSIGNED
:
10391 case INT_FTYPE_VOID
:
10392 case USHORT_FTYPE_VOID
:
10393 case UINT64_FTYPE_VOID
:
10394 case UINT_FTYPE_VOID
:
10395 case UNSIGNED_FTYPE_VOID
:
10400 case UINT64_FTYPE_PUNSIGNED
:
10401 case V2DI_FTYPE_PV2DI
:
10402 case V4DI_FTYPE_PV4DI
:
10403 case V32QI_FTYPE_PCCHAR
:
10404 case V16QI_FTYPE_PCCHAR
:
10405 case V8SF_FTYPE_PCV4SF
:
10406 case V8SF_FTYPE_PCFLOAT
:
10407 case V4SF_FTYPE_PCFLOAT
:
10408 case V4DF_FTYPE_PCV2DF
:
10409 case V4DF_FTYPE_PCDOUBLE
:
10410 case V2DF_FTYPE_PCDOUBLE
:
10411 case VOID_FTYPE_PVOID
:
10412 case V8DI_FTYPE_PV8DI
:
10418 case CODE_FOR_sse4_1_movntdqa
:
10419 case CODE_FOR_avx2_movntdqa
:
10420 case CODE_FOR_avx512f_movntdqa
:
10421 aligned_mem
= true;
10427 case VOID_FTYPE_PV2SF_V4SF
:
10428 case VOID_FTYPE_PV8DI_V8DI
:
10429 case VOID_FTYPE_PV4DI_V4DI
:
10430 case VOID_FTYPE_PV2DI_V2DI
:
10431 case VOID_FTYPE_PCHAR_V32QI
:
10432 case VOID_FTYPE_PCHAR_V16QI
:
10433 case VOID_FTYPE_PFLOAT_V16SF
:
10434 case VOID_FTYPE_PFLOAT_V8SF
:
10435 case VOID_FTYPE_PFLOAT_V4SF
:
10436 case VOID_FTYPE_PDOUBLE_V8DF
:
10437 case VOID_FTYPE_PDOUBLE_V4DF
:
10438 case VOID_FTYPE_PDOUBLE_V2DF
:
10439 case VOID_FTYPE_PLONGLONG_LONGLONG
:
10440 case VOID_FTYPE_PULONGLONG_ULONGLONG
:
10441 case VOID_FTYPE_PUNSIGNED_UNSIGNED
:
10442 case VOID_FTYPE_PINT_INT
:
10445 /* Reserve memory operand for target. */
10446 memory
= ARRAY_SIZE (args
);
10449 /* These builtins and instructions require the memory
10450 to be properly aligned. */
10451 case CODE_FOR_avx_movntv4di
:
10452 case CODE_FOR_sse2_movntv2di
:
10453 case CODE_FOR_avx_movntv8sf
:
10454 case CODE_FOR_sse_movntv4sf
:
10455 case CODE_FOR_sse4a_vmmovntv4sf
:
10456 case CODE_FOR_avx_movntv4df
:
10457 case CODE_FOR_sse2_movntv2df
:
10458 case CODE_FOR_sse4a_vmmovntv2df
:
10459 case CODE_FOR_sse2_movntidi
:
10460 case CODE_FOR_sse_movntq
:
10461 case CODE_FOR_sse2_movntisi
:
10462 case CODE_FOR_avx512f_movntv16sf
:
10463 case CODE_FOR_avx512f_movntv8df
:
10464 case CODE_FOR_avx512f_movntv8di
:
10465 aligned_mem
= true;
10471 case VOID_FTYPE_PVOID_PCVOID
:
10477 case V4SF_FTYPE_V4SF_PCV2SF
:
10478 case V2DF_FTYPE_V2DF_PCDOUBLE
:
10483 case V8SF_FTYPE_PCV8SF_V8SI
:
10484 case V4DF_FTYPE_PCV4DF_V4DI
:
10485 case V4SF_FTYPE_PCV4SF_V4SI
:
10486 case V2DF_FTYPE_PCV2DF_V2DI
:
10487 case V8SI_FTYPE_PCV8SI_V8SI
:
10488 case V4DI_FTYPE_PCV4DI_V4DI
:
10489 case V4SI_FTYPE_PCV4SI_V4SI
:
10490 case V2DI_FTYPE_PCV2DI_V2DI
:
10491 case VOID_FTYPE_INT_INT64
:
10496 case VOID_FTYPE_PV8DF_V8DF_UQI
:
10497 case VOID_FTYPE_PV4DF_V4DF_UQI
:
10498 case VOID_FTYPE_PV2DF_V2DF_UQI
:
10499 case VOID_FTYPE_PV16SF_V16SF_UHI
:
10500 case VOID_FTYPE_PV8SF_V8SF_UQI
:
10501 case VOID_FTYPE_PV4SF_V4SF_UQI
:
10502 case VOID_FTYPE_PV8DI_V8DI_UQI
:
10503 case VOID_FTYPE_PV4DI_V4DI_UQI
:
10504 case VOID_FTYPE_PV2DI_V2DI_UQI
:
10505 case VOID_FTYPE_PV16SI_V16SI_UHI
:
10506 case VOID_FTYPE_PV8SI_V8SI_UQI
:
10507 case VOID_FTYPE_PV4SI_V4SI_UQI
:
10508 case VOID_FTYPE_PV64QI_V64QI_UDI
:
10509 case VOID_FTYPE_PV32HI_V32HI_USI
:
10510 case VOID_FTYPE_PV32QI_V32QI_USI
:
10511 case VOID_FTYPE_PV16QI_V16QI_UHI
:
10512 case VOID_FTYPE_PV16HI_V16HI_UHI
:
10513 case VOID_FTYPE_PV8HI_V8HI_UQI
:
10516 /* These builtins and instructions require the memory
10517 to be properly aligned. */
10518 case CODE_FOR_avx512f_storev16sf_mask
:
10519 case CODE_FOR_avx512f_storev16si_mask
:
10520 case CODE_FOR_avx512f_storev8df_mask
:
10521 case CODE_FOR_avx512f_storev8di_mask
:
10522 case CODE_FOR_avx512vl_storev8sf_mask
:
10523 case CODE_FOR_avx512vl_storev8si_mask
:
10524 case CODE_FOR_avx512vl_storev4df_mask
:
10525 case CODE_FOR_avx512vl_storev4di_mask
:
10526 case CODE_FOR_avx512vl_storev4sf_mask
:
10527 case CODE_FOR_avx512vl_storev4si_mask
:
10528 case CODE_FOR_avx512vl_storev2df_mask
:
10529 case CODE_FOR_avx512vl_storev2di_mask
:
10530 aligned_mem
= true;
10536 case VOID_FTYPE_PV8SF_V8SI_V8SF
:
10537 case VOID_FTYPE_PV4DF_V4DI_V4DF
:
10538 case VOID_FTYPE_PV4SF_V4SI_V4SF
:
10539 case VOID_FTYPE_PV2DF_V2DI_V2DF
:
10540 case VOID_FTYPE_PV8SI_V8SI_V8SI
:
10541 case VOID_FTYPE_PV4DI_V4DI_V4DI
:
10542 case VOID_FTYPE_PV4SI_V4SI_V4SI
:
10543 case VOID_FTYPE_PV2DI_V2DI_V2DI
:
10544 case VOID_FTYPE_PV8SI_V8DI_UQI
:
10545 case VOID_FTYPE_PV8HI_V8DI_UQI
:
10546 case VOID_FTYPE_PV16HI_V16SI_UHI
:
10547 case VOID_FTYPE_PUDI_V8DI_UQI
:
10548 case VOID_FTYPE_PV16QI_V16SI_UHI
:
10549 case VOID_FTYPE_PV4SI_V4DI_UQI
:
10550 case VOID_FTYPE_PUDI_V2DI_UQI
:
10551 case VOID_FTYPE_PUDI_V4DI_UQI
:
10552 case VOID_FTYPE_PUSI_V2DI_UQI
:
10553 case VOID_FTYPE_PV8HI_V8SI_UQI
:
10554 case VOID_FTYPE_PUDI_V4SI_UQI
:
10555 case VOID_FTYPE_PUSI_V4DI_UQI
:
10556 case VOID_FTYPE_PUHI_V2DI_UQI
:
10557 case VOID_FTYPE_PUDI_V8SI_UQI
:
10558 case VOID_FTYPE_PUSI_V4SI_UQI
:
10559 case VOID_FTYPE_PCHAR_V64QI_UDI
:
10560 case VOID_FTYPE_PCHAR_V32QI_USI
:
10561 case VOID_FTYPE_PCHAR_V16QI_UHI
:
10562 case VOID_FTYPE_PSHORT_V32HI_USI
:
10563 case VOID_FTYPE_PSHORT_V16HI_UHI
:
10564 case VOID_FTYPE_PSHORT_V8HI_UQI
:
10565 case VOID_FTYPE_PINT_V16SI_UHI
:
10566 case VOID_FTYPE_PINT_V8SI_UQI
:
10567 case VOID_FTYPE_PINT_V4SI_UQI
:
10568 case VOID_FTYPE_PINT64_V8DI_UQI
:
10569 case VOID_FTYPE_PINT64_V4DI_UQI
:
10570 case VOID_FTYPE_PINT64_V2DI_UQI
:
10571 case VOID_FTYPE_PDOUBLE_V8DF_UQI
:
10572 case VOID_FTYPE_PDOUBLE_V4DF_UQI
:
10573 case VOID_FTYPE_PDOUBLE_V2DF_UQI
:
10574 case VOID_FTYPE_PFLOAT_V16SF_UHI
:
10575 case VOID_FTYPE_PFLOAT_V8SF_UQI
:
10576 case VOID_FTYPE_PFLOAT_V4SF_UQI
:
10577 case VOID_FTYPE_PV32QI_V32HI_USI
:
10578 case VOID_FTYPE_PV16QI_V16HI_UHI
:
10579 case VOID_FTYPE_PUDI_V8HI_UQI
:
10582 /* Reserve memory operand for target. */
10583 memory
= ARRAY_SIZE (args
);
10585 case V4SF_FTYPE_PCV4SF_V4SF_UQI
:
10586 case V8SF_FTYPE_PCV8SF_V8SF_UQI
:
10587 case V16SF_FTYPE_PCV16SF_V16SF_UHI
:
10588 case V4SI_FTYPE_PCV4SI_V4SI_UQI
:
10589 case V8SI_FTYPE_PCV8SI_V8SI_UQI
:
10590 case V16SI_FTYPE_PCV16SI_V16SI_UHI
:
10591 case V2DF_FTYPE_PCV2DF_V2DF_UQI
:
10592 case V4DF_FTYPE_PCV4DF_V4DF_UQI
:
10593 case V8DF_FTYPE_PCV8DF_V8DF_UQI
:
10594 case V2DI_FTYPE_PCV2DI_V2DI_UQI
:
10595 case V4DI_FTYPE_PCV4DI_V4DI_UQI
:
10596 case V8DI_FTYPE_PCV8DI_V8DI_UQI
:
10597 case V64QI_FTYPE_PCV64QI_V64QI_UDI
:
10598 case V32HI_FTYPE_PCV32HI_V32HI_USI
:
10599 case V32QI_FTYPE_PCV32QI_V32QI_USI
:
10600 case V16QI_FTYPE_PCV16QI_V16QI_UHI
:
10601 case V16HI_FTYPE_PCV16HI_V16HI_UHI
:
10602 case V8HI_FTYPE_PCV8HI_V8HI_UQI
:
10605 /* These builtins and instructions require the memory
10606 to be properly aligned. */
10607 case CODE_FOR_avx512f_loadv16sf_mask
:
10608 case CODE_FOR_avx512f_loadv16si_mask
:
10609 case CODE_FOR_avx512f_loadv8df_mask
:
10610 case CODE_FOR_avx512f_loadv8di_mask
:
10611 case CODE_FOR_avx512vl_loadv8sf_mask
:
10612 case CODE_FOR_avx512vl_loadv8si_mask
:
10613 case CODE_FOR_avx512vl_loadv4df_mask
:
10614 case CODE_FOR_avx512vl_loadv4di_mask
:
10615 case CODE_FOR_avx512vl_loadv4sf_mask
:
10616 case CODE_FOR_avx512vl_loadv4si_mask
:
10617 case CODE_FOR_avx512vl_loadv2df_mask
:
10618 case CODE_FOR_avx512vl_loadv2di_mask
:
10619 case CODE_FOR_avx512bw_loadv64qi_mask
:
10620 case CODE_FOR_avx512vl_loadv32qi_mask
:
10621 case CODE_FOR_avx512vl_loadv16qi_mask
:
10622 case CODE_FOR_avx512bw_loadv32hi_mask
:
10623 case CODE_FOR_avx512vl_loadv16hi_mask
:
10624 case CODE_FOR_avx512vl_loadv8hi_mask
:
10625 aligned_mem
= true;
10631 case V64QI_FTYPE_PCCHAR_V64QI_UDI
:
10632 case V32QI_FTYPE_PCCHAR_V32QI_USI
:
10633 case V16QI_FTYPE_PCCHAR_V16QI_UHI
:
10634 case V32HI_FTYPE_PCSHORT_V32HI_USI
:
10635 case V16HI_FTYPE_PCSHORT_V16HI_UHI
:
10636 case V8HI_FTYPE_PCSHORT_V8HI_UQI
:
10637 case V16SI_FTYPE_PCINT_V16SI_UHI
:
10638 case V8SI_FTYPE_PCINT_V8SI_UQI
:
10639 case V4SI_FTYPE_PCINT_V4SI_UQI
:
10640 case V8DI_FTYPE_PCINT64_V8DI_UQI
:
10641 case V4DI_FTYPE_PCINT64_V4DI_UQI
:
10642 case V2DI_FTYPE_PCINT64_V2DI_UQI
:
10643 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI
:
10644 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI
:
10645 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI
:
10646 case V16SF_FTYPE_PCFLOAT_V16SF_UHI
:
10647 case V8SF_FTYPE_PCFLOAT_V8SF_UQI
:
10648 case V4SF_FTYPE_PCFLOAT_V4SF_UQI
:
10653 case VOID_FTYPE_UINT_UINT_UINT
:
10654 case VOID_FTYPE_UINT64_UINT_UINT
:
10655 case UCHAR_FTYPE_UINT_UINT_UINT
:
10656 case UCHAR_FTYPE_UINT64_UINT_UINT
:
10659 memory
= ARRAY_SIZE (args
);
10660 last_arg_constant
= true;
10663 gcc_unreachable ();
10666 gcc_assert (nargs
<= ARRAY_SIZE (args
));
10668 if (klass
== store
)
10670 arg
= CALL_EXPR_ARG (exp
, 0);
10671 op
= expand_normal (arg
);
10672 gcc_assert (target
== 0);
10675 op
= ix86_zero_extend_to_Pmode (op
);
10676 target
= gen_rtx_MEM (tmode
, op
);
10677 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
10678 on it. Try to improve it using get_pointer_alignment,
10679 and if the special builtin is one that requires strict
10680 mode alignment, also from it's GET_MODE_ALIGNMENT.
10681 Failure to do so could lead to ix86_legitimate_combined_insn
10682 rejecting all changes to such insns. */
10683 unsigned int align
= get_pointer_alignment (arg
);
10684 if (aligned_mem
&& align
< GET_MODE_ALIGNMENT (tmode
))
10685 align
= GET_MODE_ALIGNMENT (tmode
);
10686 if (MEM_ALIGN (target
) < align
)
10687 set_mem_align (target
, align
);
10690 target
= force_reg (tmode
, op
);
10698 || !register_operand (target
, tmode
)
10699 || GET_MODE (target
) != tmode
)
10700 target
= gen_reg_rtx (tmode
);
10703 for (i
= 0; i
< nargs
; i
++)
10705 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
10708 arg
= CALL_EXPR_ARG (exp
, i
+ arg_adjust
);
10709 op
= expand_normal (arg
);
10710 match
= insn_p
->operand
[i
+ 1].predicate (op
, mode
);
10712 if (last_arg_constant
&& (i
+ 1) == nargs
)
10716 if (icode
== CODE_FOR_lwp_lwpvalsi3
10717 || icode
== CODE_FOR_lwp_lwpinssi3
10718 || icode
== CODE_FOR_lwp_lwpvaldi3
10719 || icode
== CODE_FOR_lwp_lwpinsdi3
)
10720 error ("the last argument must be a 32-bit immediate");
10722 error ("the last argument must be an 8-bit immediate");
10730 /* This must be the memory operand. */
10731 op
= ix86_zero_extend_to_Pmode (op
);
10732 op
= gen_rtx_MEM (mode
, op
);
10733 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
10734 on it. Try to improve it using get_pointer_alignment,
10735 and if the special builtin is one that requires strict
10736 mode alignment, also from it's GET_MODE_ALIGNMENT.
10737 Failure to do so could lead to ix86_legitimate_combined_insn
10738 rejecting all changes to such insns. */
10739 unsigned int align
= get_pointer_alignment (arg
);
10740 if (aligned_mem
&& align
< GET_MODE_ALIGNMENT (mode
))
10741 align
= GET_MODE_ALIGNMENT (mode
);
10742 if (MEM_ALIGN (op
) < align
)
10743 set_mem_align (op
, align
);
10747 /* This must be register. */
10748 if (VECTOR_MODE_P (mode
))
10749 op
= safe_vector_operand (op
, mode
);
10751 op
= fixup_modeless_constant (op
, mode
);
10753 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
10754 op
= copy_to_mode_reg (mode
, op
);
10757 op
= copy_to_reg (op
);
10758 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
10764 args
[i
].mode
= mode
;
10770 pat
= GEN_FCN (icode
) (target
);
10773 pat
= GEN_FCN (icode
) (target
, args
[0].op
);
10776 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
);
10779 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
, args
[2].op
);
10782 gcc_unreachable ();
10788 return klass
== store
? 0 : target
;
/* Return the integer constant in ARG.  Constrain it to be in the range
   of the subparts of VEC_TYPE; issue an error if not.  */

static unsigned HOST_WIDE_INT
get_element_number (tree vec_type, tree arg)
{
  unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;

  if (!tree_fits_uhwi_p (arg)
      || (elt = tree_to_uhwi (arg), elt > max))
    {
      error ("selector must be an integer constant in the range "
	     "[0, %wi]", max);
      return 0;
    }

  return elt;
}

/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_init.  We DO have language-level syntax for this, in
   the form of (type){ init-list }.  Except that since we can't place emms
   instructions from inside the compiler, we can't allow the use of MMX
   registers unless the user explicitly asks for it.  So we do *not* define
   vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
   we have builtins invoked by mmintrin.h that gives us license to emit
   these sorts of instructions.  */

static rtx
ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
{
  machine_mode tmode = TYPE_MODE (type);
  machine_mode inner_mode = GET_MODE_INNER (tmode);
  int i, n_elt = GET_MODE_NUNITS (tmode);
  rtvec v = rtvec_alloc (n_elt);

  gcc_assert (VECTOR_MODE_P (tmode));
  gcc_assert (call_expr_nargs (exp) == n_elt);

  for (i = 0; i < n_elt; ++i)
    {
      rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
      RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
    }

  if (!target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
  return target;
}

/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
   had a language-level syntax for referencing vector elements.  */

static rtx
ix86_expand_vec_ext_builtin (tree exp, rtx target)
{
  machine_mode tmode, mode0;
  tree arg0, arg1;
  int elt;
  rtx op0;

  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);

  op0 = expand_normal (arg0);
  elt = get_element_number (TREE_TYPE (arg0), arg1);

  tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  mode0 = TYPE_MODE (TREE_TYPE (arg0));
  gcc_assert (VECTOR_MODE_P (mode0));

  op0 = force_reg (mode0, op0);

  if (optimize || !target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_extract (true, target, op0, elt);

  return target;
}

/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
   a language-level syntax for referencing vector elements.  */

static rtx
ix86_expand_vec_set_builtin (tree exp)
{
  machine_mode tmode, mode1;
  tree arg0, arg1, arg2;
  int elt;
  rtx op0, op1, target;

  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);
  arg2 = CALL_EXPR_ARG (exp, 2);

  tmode = TYPE_MODE (TREE_TYPE (arg0));
  mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  gcc_assert (VECTOR_MODE_P (tmode));

  op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
  op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
  elt = get_element_number (TREE_TYPE (arg0), arg2);

  if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
    op1 = convert_modes (mode1, GET_MODE (op1), op1, true);

  op0 = force_reg (tmode, op0);
  op1 = force_reg (mode1, op1);

  /* OP0 is the source of these builtin functions and shouldn't be
     modified.  Create a copy, use it and return it as target.  */
  target = gen_reg_rtx (tmode);
  emit_move_insn (target, op0);
  ix86_expand_vector_set (true, target, op1, elt);

  return target;
}
/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient
   (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.  */

rtx
ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
		     machine_mode mode, int ignore)
{
  size_t i;
  enum insn_code icode, icode2;
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  tree arg0, arg1, arg2, arg3, arg4;
  rtx op0, op1, op2, op3, op4, pat, pat2, insn;
  machine_mode mode0, mode1, mode2, mode3, mode4;
  unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);

  /* For CPU builtins that can be folded, fold first and expand the fold.  */
  switch (fcode)
    {
    case IX86_BUILTIN_CPU_INIT:
      {
	/* Make it call __cpu_indicator_init in libgcc.  */
	tree call_expr, fndecl, type;
	type = build_function_type_list (integer_type_node, NULL_TREE);
	fndecl = build_fn_decl ("__cpu_indicator_init", type);
	call_expr = build_call_expr (fndecl, 0);
	return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
      }
    case IX86_BUILTIN_CPU_IS:
    case IX86_BUILTIN_CPU_SUPPORTS:
      {
	tree arg0 = CALL_EXPR_ARG (exp, 0);
	tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
	gcc_assert (fold_expr != NULL_TREE);
	return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
      }
    }

  HOST_WIDE_INT isa = ix86_isa_flags;
  HOST_WIDE_INT isa2 = ix86_isa_flags2;
  HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
  HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
  /* The general case is we require all the ISAs specified in bisa{,2}
     to be enabled.
     The exceptions are:
     OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
     OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
     OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
     where for each such pair it is sufficient if either of the ISAs is
     enabled, plus if it is ored with other options also those others.
     OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE.  */
  if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
       == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
      && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
    isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
  if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
       == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
      && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
    isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
  if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
       == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
      && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
    isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
  if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE)
    {
      bisa &= ~OPTION_MASK_ISA_MMX;
      bisa |= OPTION_MASK_ISA_SSE2;
    }
  if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
    {
      bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
      if (TARGET_ABI_X32)
	bisa |= OPTION_MASK_ABI_X32;
      else
	bisa |= OPTION_MASK_ABI_64;
      char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
				       (enum fpmath_unit) 0,
				       (enum prefer_vector_width) 0,
				       false, add_abi_p);
      if (!opts)
	error ("%qE needs unknown isa option", fndecl);
      else
	{
	  gcc_assert (opts != NULL);
	  error ("%qE needs isa option %s", fndecl, opts);
	}
      return expand_call (exp, target, ignore);
    }

  switch (fcode)
    {
:
11009 case IX86_BUILTIN_MASKMOVDQU
:
11010 icode
= (fcode
== IX86_BUILTIN_MASKMOVQ
11011 ? CODE_FOR_mmx_maskmovq
11012 : CODE_FOR_sse2_maskmovdqu
);
11013 /* Note the arg order is different from the operand order. */
11014 arg1
= CALL_EXPR_ARG (exp
, 0);
11015 arg2
= CALL_EXPR_ARG (exp
, 1);
11016 arg0
= CALL_EXPR_ARG (exp
, 2);
11017 op0
= expand_normal (arg0
);
11018 op1
= expand_normal (arg1
);
11019 op2
= expand_normal (arg2
);
11020 mode0
= insn_data
[icode
].operand
[0].mode
;
11021 mode1
= insn_data
[icode
].operand
[1].mode
;
11022 mode2
= insn_data
[icode
].operand
[2].mode
;
11024 op0
= ix86_zero_extend_to_Pmode (op0
);
11025 op0
= gen_rtx_MEM (mode1
, op0
);
11027 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
11028 op0
= copy_to_mode_reg (mode0
, op0
);
11029 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
11030 op1
= copy_to_mode_reg (mode1
, op1
);
11031 if (!insn_data
[icode
].operand
[2].predicate (op2
, mode2
))
11032 op2
= copy_to_mode_reg (mode2
, op2
);
11033 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
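      /* For illustration: the maskmov builtins take (data, mask, address)
         while the insn operands are (mem, data, mask), which is why arg0,
         arg1 and arg2 above are taken from CALL_EXPR_ARG positions 2, 0
         and 1 respectively before being expanded.  */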
    case IX86_BUILTIN_LDMXCSR:
      op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
      target = assign_386_stack_local (SImode, SLOT_TEMP);
      emit_move_insn (target, op0);
      emit_insn (gen_sse_ldmxcsr (target));
      return 0;

    case IX86_BUILTIN_STMXCSR:
      target = assign_386_stack_local (SImode, SLOT_TEMP);
      emit_insn (gen_sse_stmxcsr (target));
      return copy_to_mode_reg (SImode, target);
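      /* For illustration: ldmxcsr and stmxcsr only accept a memory
         operand, so both builtins bounce the value through the SLOT_TEMP
         stack slot; the load variant stores its argument there before
         loading MXCSR, and the store variant writes MXCSR to the slot and
         returns a register copy of it.  */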
    case IX86_BUILTIN_CLFLUSH:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_sse2_clflush;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
        op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_sse2_clflush (op0));
      return 0;

    case IX86_BUILTIN_CLWB:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_clwb;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
        op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_clwb (op0));
      return 0;

    case IX86_BUILTIN_CLFLUSHOPT:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_clflushopt;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
        op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_clflushopt (op0));
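      /* For illustration: clflush, clwb and clflushopt all take a plain
         pointer argument; when the expanded address does not already
         satisfy the pattern's address predicate it is zero-extended to
         Pmode first, so an address value narrower than Pmode is widened
         before being used as a memory operand.  */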
11081 case IX86_BUILTIN_MONITOR
:
11082 case IX86_BUILTIN_MONITORX
:
11083 arg0
= CALL_EXPR_ARG (exp
, 0);
11084 arg1
= CALL_EXPR_ARG (exp
, 1);
11085 arg2
= CALL_EXPR_ARG (exp
, 2);
11086 op0
= expand_normal (arg0
);
11087 op1
= expand_normal (arg1
);
11088 op2
= expand_normal (arg2
);
11090 op0
= ix86_zero_extend_to_Pmode (op0
);
11092 op1
= copy_to_mode_reg (SImode
, op1
);
11094 op2
= copy_to_mode_reg (SImode
, op2
);
11096 emit_insn (fcode
== IX86_BUILTIN_MONITOR
11097 ? gen_sse3_monitor (Pmode
, op0
, op1
, op2
)
11098 : gen_monitorx (Pmode
, op0
, op1
, op2
));
11101 case IX86_BUILTIN_MWAIT
:
11102 arg0
= CALL_EXPR_ARG (exp
, 0);
11103 arg1
= CALL_EXPR_ARG (exp
, 1);
11104 op0
= expand_normal (arg0
);
11105 op1
= expand_normal (arg1
);
11107 op0
= copy_to_mode_reg (SImode
, op0
);
11109 op1
= copy_to_mode_reg (SImode
, op1
);
11110 emit_insn (gen_sse3_mwait (op0
, op1
));
11113 case IX86_BUILTIN_MWAITX
:
11114 arg0
= CALL_EXPR_ARG (exp
, 0);
11115 arg1
= CALL_EXPR_ARG (exp
, 1);
11116 arg2
= CALL_EXPR_ARG (exp
, 2);
11117 op0
= expand_normal (arg0
);
11118 op1
= expand_normal (arg1
);
11119 op2
= expand_normal (arg2
);
11121 op0
= copy_to_mode_reg (SImode
, op0
);
11123 op1
= copy_to_mode_reg (SImode
, op1
);
11125 op2
= copy_to_mode_reg (SImode
, op2
);
11126 emit_insn (gen_mwaitx (op0
, op1
, op2
));
11129 case IX86_BUILTIN_UMONITOR
:
11130 arg0
= CALL_EXPR_ARG (exp
, 0);
11131 op0
= expand_normal (arg0
);
11133 op0
= ix86_zero_extend_to_Pmode (op0
);
11134 emit_insn (gen_umonitor (Pmode
, op0
));
11137 case IX86_BUILTIN_UMWAIT
:
11138 case IX86_BUILTIN_TPAUSE
:
11139 arg0
= CALL_EXPR_ARG (exp
, 0);
11140 arg1
= CALL_EXPR_ARG (exp
, 1);
11141 op0
= expand_normal (arg0
);
11142 op1
= expand_normal (arg1
);
11145 op0
= copy_to_mode_reg (SImode
, op0
);
11147 op1
= force_reg (DImode
, op1
);
11151 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
11152 NULL
, 1, OPTAB_DIRECT
);
11155 case IX86_BUILTIN_UMWAIT
:
11156 icode
= CODE_FOR_umwait_rex64
;
11158 case IX86_BUILTIN_TPAUSE
:
11159 icode
= CODE_FOR_tpause_rex64
;
11162 gcc_unreachable ();
11165 op2
= gen_lowpart (SImode
, op2
);
11166 op1
= gen_lowpart (SImode
, op1
);
11167 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
11173 case IX86_BUILTIN_UMWAIT
:
11174 icode
= CODE_FOR_umwait
;
11176 case IX86_BUILTIN_TPAUSE
:
11177 icode
= CODE_FOR_tpause
;
11180 gcc_unreachable ();
11182 pat
= GEN_FCN (icode
) (op0
, op1
);
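      /* For illustration: the second argument of umwait/tpause is a
         64-bit deadline presented to the instruction in EDX:EAX.  On the
         path that uses the *_rex64 patterns above, op1 is shifted right
         by 32 to recover the high half and both halves are passed as
         SImode lowparts; the other patterns take the DImode value
         directly.  */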
11191 || !register_operand (target
, QImode
))
11192 target
= gen_reg_rtx (QImode
);
11194 pat
= gen_rtx_EQ (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
11196 emit_insn (gen_rtx_SET (target
, pat
));
11200 case IX86_BUILTIN_CLZERO
:
11201 arg0
= CALL_EXPR_ARG (exp
, 0);
11202 op0
= expand_normal (arg0
);
11204 op0
= ix86_zero_extend_to_Pmode (op0
);
11205 emit_insn (gen_clzero (Pmode
, op0
));
11208 case IX86_BUILTIN_CLDEMOTE
:
11209 arg0
= CALL_EXPR_ARG (exp
, 0);
11210 op0
= expand_normal (arg0
);
11211 icode
= CODE_FOR_cldemote
;
11212 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
11213 op0
= ix86_zero_extend_to_Pmode (op0
);
11215 emit_insn (gen_cldemote (op0
));
11218 case IX86_BUILTIN_VEC_INIT_V2SI
:
11219 case IX86_BUILTIN_VEC_INIT_V4HI
:
11220 case IX86_BUILTIN_VEC_INIT_V8QI
:
11221 return ix86_expand_vec_init_builtin (TREE_TYPE (exp
), exp
, target
);
11223 case IX86_BUILTIN_VEC_EXT_V2DF
:
11224 case IX86_BUILTIN_VEC_EXT_V2DI
:
11225 case IX86_BUILTIN_VEC_EXT_V4SF
:
11226 case IX86_BUILTIN_VEC_EXT_V4SI
:
11227 case IX86_BUILTIN_VEC_EXT_V8HI
:
11228 case IX86_BUILTIN_VEC_EXT_V2SI
:
11229 case IX86_BUILTIN_VEC_EXT_V4HI
:
11230 case IX86_BUILTIN_VEC_EXT_V16QI
:
11231 return ix86_expand_vec_ext_builtin (exp
, target
);
11233 case IX86_BUILTIN_VEC_SET_V2DI
:
11234 case IX86_BUILTIN_VEC_SET_V4SF
:
11235 case IX86_BUILTIN_VEC_SET_V4SI
:
11236 case IX86_BUILTIN_VEC_SET_V8HI
:
11237 case IX86_BUILTIN_VEC_SET_V4HI
:
11238 case IX86_BUILTIN_VEC_SET_V16QI
:
11239 return ix86_expand_vec_set_builtin (exp
);
11241 case IX86_BUILTIN_NANQ
:
11242 case IX86_BUILTIN_NANSQ
:
11243 return expand_call (exp
, target
, ignore
);
11245 case IX86_BUILTIN_RDPID
:
11247 op0
= gen_reg_rtx (word_mode
);
11251 insn
= gen_rdpid_rex64 (op0
);
11252 op0
= convert_to_mode (SImode
, op0
, 1);
11255 insn
= gen_rdpid (op0
);
11260 || !register_operand (target
, SImode
))
11261 target
= gen_reg_rtx (SImode
);
11263 emit_move_insn (target
, op0
);
11266 case IX86_BUILTIN_2INTERSECTD512
:
11267 case IX86_BUILTIN_2INTERSECTQ512
:
11268 case IX86_BUILTIN_2INTERSECTD256
:
11269 case IX86_BUILTIN_2INTERSECTQ256
:
11270 case IX86_BUILTIN_2INTERSECTD128
:
11271 case IX86_BUILTIN_2INTERSECTQ128
:
11272 arg0
= CALL_EXPR_ARG (exp
, 0);
11273 arg1
= CALL_EXPR_ARG (exp
, 1);
11274 arg2
= CALL_EXPR_ARG (exp
, 2);
11275 arg3
= CALL_EXPR_ARG (exp
, 3);
11276 op0
= expand_normal (arg0
);
11277 op1
= expand_normal (arg1
);
11278 op2
= expand_normal (arg2
);
11279 op3
= expand_normal (arg3
);
11281 if (!address_operand (op0
, VOIDmode
))
11283 op0
= convert_memory_address (Pmode
, op0
);
11284 op0
= copy_addr_to_reg (op0
);
11286 if (!address_operand (op1
, VOIDmode
))
11288 op1
= convert_memory_address (Pmode
, op1
);
11289 op1
= copy_addr_to_reg (op1
);
11294 case IX86_BUILTIN_2INTERSECTD512
:
11296 icode
= CODE_FOR_avx512vp2intersect_2intersectv16si
;
11298 case IX86_BUILTIN_2INTERSECTQ512
:
11300 icode
= CODE_FOR_avx512vp2intersect_2intersectv8di
;
11302 case IX86_BUILTIN_2INTERSECTD256
:
11304 icode
= CODE_FOR_avx512vp2intersect_2intersectv8si
;
11306 case IX86_BUILTIN_2INTERSECTQ256
:
11308 icode
= CODE_FOR_avx512vp2intersect_2intersectv4di
;
11310 case IX86_BUILTIN_2INTERSECTD128
:
11312 icode
= CODE_FOR_avx512vp2intersect_2intersectv4si
;
11314 case IX86_BUILTIN_2INTERSECTQ128
:
11316 icode
= CODE_FOR_avx512vp2intersect_2intersectv2di
;
11319 gcc_unreachable ();
11322 mode2
= insn_data
[icode
].operand
[1].mode
;
11323 mode3
= insn_data
[icode
].operand
[2].mode
;
11324 if (!insn_data
[icode
].operand
[1].predicate (op2
, mode2
))
11325 op2
= copy_to_mode_reg (mode2
, op2
);
11326 if (!insn_data
[icode
].operand
[2].predicate (op3
, mode3
))
11327 op3
= copy_to_mode_reg (mode3
, op3
);
11329 op4
= gen_reg_rtx (mode4
);
11330 emit_insn (GEN_FCN (icode
) (op4
, op2
, op3
));
11331 mode0
= mode4
== P2HImode
? HImode
: QImode
;
11332 emit_move_insn (gen_rtx_MEM (mode0
, op0
),
11333 gen_lowpart (mode0
, op4
));
11334 emit_move_insn (gen_rtx_MEM (mode0
, op1
),
11335 gen_highpart (mode0
, op4
));
11339 case IX86_BUILTIN_RDPMC
:
11340 case IX86_BUILTIN_RDTSC
:
11341 case IX86_BUILTIN_RDTSCP
:
11342 case IX86_BUILTIN_XGETBV
:
11344 op0
= gen_reg_rtx (DImode
);
11345 op1
= gen_reg_rtx (DImode
);
11347 if (fcode
== IX86_BUILTIN_RDPMC
)
11349 arg0
= CALL_EXPR_ARG (exp
, 0);
11350 op2
= expand_normal (arg0
);
11351 if (!register_operand (op2
, SImode
))
11352 op2
= copy_to_mode_reg (SImode
, op2
);
11354 insn
= (TARGET_64BIT
11355 ? gen_rdpmc_rex64 (op0
, op1
, op2
)
11356 : gen_rdpmc (op0
, op2
));
11359 else if (fcode
== IX86_BUILTIN_XGETBV
)
11361 arg0
= CALL_EXPR_ARG (exp
, 0);
11362 op2
= expand_normal (arg0
);
11363 if (!register_operand (op2
, SImode
))
11364 op2
= copy_to_mode_reg (SImode
, op2
);
11366 insn
= (TARGET_64BIT
11367 ? gen_xgetbv_rex64 (op0
, op1
, op2
)
11368 : gen_xgetbv (op0
, op2
));
11371 else if (fcode
== IX86_BUILTIN_RDTSC
)
11373 insn
= (TARGET_64BIT
11374 ? gen_rdtsc_rex64 (op0
, op1
)
11375 : gen_rdtsc (op0
));
11380 op2
= gen_reg_rtx (SImode
);
11382 insn
= (TARGET_64BIT
11383 ? gen_rdtscp_rex64 (op0
, op1
, op2
)
11384 : gen_rdtscp (op0
, op2
));
11387 arg0
= CALL_EXPR_ARG (exp
, 0);
11388 op4
= expand_normal (arg0
);
11389 if (!address_operand (op4
, VOIDmode
))
11391 op4
= convert_memory_address (Pmode
, op4
);
11392 op4
= copy_addr_to_reg (op4
);
11394 emit_move_insn (gen_rtx_MEM (SImode
, op4
), op2
);
11398 || !register_operand (target
, DImode
))
11399 target
= gen_reg_rtx (DImode
);
11403 op1
= expand_simple_binop (DImode
, ASHIFT
, op1
, GEN_INT (32),
11404 op1
, 1, OPTAB_DIRECT
);
11405 op0
= expand_simple_binop (DImode
, IOR
, op0
, op1
,
11406 op0
, 1, OPTAB_DIRECT
);
11409 emit_move_insn (target
, op0
);
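      /* For illustration: when the rex64 variants of
         rdpmc/rdtsc/rdtscp/xgetbv are used, op0 and op1 receive the two
         32-bit halves of the result, and the ASHIFT/IOR pair above
         reassembles them as op0 | (op1 << 32) before the value is copied
         into the DImode target.  */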
11412 case IX86_BUILTIN_ENQCMD
:
11413 case IX86_BUILTIN_ENQCMDS
:
11414 case IX86_BUILTIN_MOVDIR64B
:
11416 arg0
= CALL_EXPR_ARG (exp
, 0);
11417 arg1
= CALL_EXPR_ARG (exp
, 1);
11418 op0
= expand_normal (arg0
);
11419 op1
= expand_normal (arg1
);
11421 op0
= ix86_zero_extend_to_Pmode (op0
);
11422 if (!address_operand (op1
, VOIDmode
))
11424 op1
= convert_memory_address (Pmode
, op1
);
11425 op1
= copy_addr_to_reg (op1
);
11427 op1
= gen_rtx_MEM (XImode
, op1
);
11429 if (fcode
== IX86_BUILTIN_MOVDIR64B
)
11431 emit_insn (gen_movdir64b (Pmode
, op0
, op1
));
11438 target
= gen_reg_rtx (SImode
);
11439 emit_move_insn (target
, const0_rtx
);
11440 target
= gen_rtx_SUBREG (QImode
, target
, 0);
11442 if (fcode
== IX86_BUILTIN_ENQCMD
)
11443 pat
= gen_enqcmd (UNSPECV_ENQCMD
, Pmode
, op0
, op1
);
11445 pat
= gen_enqcmd (UNSPECV_ENQCMDS
, Pmode
, op0
, op1
);
11449 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
11450 gen_rtx_fmt_ee (EQ
, QImode
,
11454 return SUBREG_REG (target
);
11457 case IX86_BUILTIN_FXSAVE
:
11458 case IX86_BUILTIN_FXRSTOR
:
11459 case IX86_BUILTIN_FXSAVE64
:
11460 case IX86_BUILTIN_FXRSTOR64
:
11461 case IX86_BUILTIN_FNSTENV
:
11462 case IX86_BUILTIN_FLDENV
:
11466 case IX86_BUILTIN_FXSAVE
:
11467 icode
= CODE_FOR_fxsave
;
11469 case IX86_BUILTIN_FXRSTOR
:
11470 icode
= CODE_FOR_fxrstor
;
11472 case IX86_BUILTIN_FXSAVE64
:
11473 icode
= CODE_FOR_fxsave64
;
11475 case IX86_BUILTIN_FXRSTOR64
:
11476 icode
= CODE_FOR_fxrstor64
;
11478 case IX86_BUILTIN_FNSTENV
:
11479 icode
= CODE_FOR_fnstenv
;
11481 case IX86_BUILTIN_FLDENV
:
11482 icode
= CODE_FOR_fldenv
;
11485 gcc_unreachable ();
11488 arg0
= CALL_EXPR_ARG (exp
, 0);
11489 op0
= expand_normal (arg0
);
11491 if (!address_operand (op0
, VOIDmode
))
11493 op0
= convert_memory_address (Pmode
, op0
);
11494 op0
= copy_addr_to_reg (op0
);
11496 op0
= gen_rtx_MEM (mode0
, op0
);
11498 pat
= GEN_FCN (icode
) (op0
);
11503 case IX86_BUILTIN_XSETBV
:
11504 arg0
= CALL_EXPR_ARG (exp
, 0);
11505 arg1
= CALL_EXPR_ARG (exp
, 1);
11506 op0
= expand_normal (arg0
);
11507 op1
= expand_normal (arg1
);
11510 op0
= copy_to_mode_reg (SImode
, op0
);
11512 op1
= force_reg (DImode
, op1
);
11516 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
11517 NULL
, 1, OPTAB_DIRECT
);
11519 icode
= CODE_FOR_xsetbv_rex64
;
11521 op2
= gen_lowpart (SImode
, op2
);
11522 op1
= gen_lowpart (SImode
, op1
);
11523 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
11527 icode
= CODE_FOR_xsetbv
;
11529 pat
= GEN_FCN (icode
) (op0
, op1
);
11535 case IX86_BUILTIN_XSAVE
:
11536 case IX86_BUILTIN_XRSTOR
:
11537 case IX86_BUILTIN_XSAVE64
:
11538 case IX86_BUILTIN_XRSTOR64
:
11539 case IX86_BUILTIN_XSAVEOPT
:
11540 case IX86_BUILTIN_XSAVEOPT64
:
11541 case IX86_BUILTIN_XSAVES
:
11542 case IX86_BUILTIN_XRSTORS
:
11543 case IX86_BUILTIN_XSAVES64
:
11544 case IX86_BUILTIN_XRSTORS64
:
11545 case IX86_BUILTIN_XSAVEC
:
11546 case IX86_BUILTIN_XSAVEC64
:
11547 arg0
= CALL_EXPR_ARG (exp
, 0);
11548 arg1
= CALL_EXPR_ARG (exp
, 1);
11549 op0
= expand_normal (arg0
);
11550 op1
= expand_normal (arg1
);
11552 if (!address_operand (op0
, VOIDmode
))
11554 op0
= convert_memory_address (Pmode
, op0
);
11555 op0
= copy_addr_to_reg (op0
);
11557 op0
= gen_rtx_MEM (BLKmode
, op0
);
11559 op1
= force_reg (DImode
, op1
);
11563 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
11564 NULL
, 1, OPTAB_DIRECT
);
11567 case IX86_BUILTIN_XSAVE
:
11568 icode
= CODE_FOR_xsave_rex64
;
11570 case IX86_BUILTIN_XRSTOR
:
11571 icode
= CODE_FOR_xrstor_rex64
;
11573 case IX86_BUILTIN_XSAVE64
:
11574 icode
= CODE_FOR_xsave64
;
11576 case IX86_BUILTIN_XRSTOR64
:
11577 icode
= CODE_FOR_xrstor64
;
11579 case IX86_BUILTIN_XSAVEOPT
:
11580 icode
= CODE_FOR_xsaveopt_rex64
;
11582 case IX86_BUILTIN_XSAVEOPT64
:
11583 icode
= CODE_FOR_xsaveopt64
;
11585 case IX86_BUILTIN_XSAVES
:
11586 icode
= CODE_FOR_xsaves_rex64
;
11588 case IX86_BUILTIN_XRSTORS
:
11589 icode
= CODE_FOR_xrstors_rex64
;
11591 case IX86_BUILTIN_XSAVES64
:
11592 icode
= CODE_FOR_xsaves64
;
11594 case IX86_BUILTIN_XRSTORS64
:
11595 icode
= CODE_FOR_xrstors64
;
11597 case IX86_BUILTIN_XSAVEC
:
11598 icode
= CODE_FOR_xsavec_rex64
;
11600 case IX86_BUILTIN_XSAVEC64
:
11601 icode
= CODE_FOR_xsavec64
;
11604 gcc_unreachable ();
11607 op2
= gen_lowpart (SImode
, op2
);
11608 op1
= gen_lowpart (SImode
, op1
);
11609 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
11615 case IX86_BUILTIN_XSAVE
:
11616 icode
= CODE_FOR_xsave
;
11618 case IX86_BUILTIN_XRSTOR
:
11619 icode
= CODE_FOR_xrstor
;
11621 case IX86_BUILTIN_XSAVEOPT
:
11622 icode
= CODE_FOR_xsaveopt
;
11624 case IX86_BUILTIN_XSAVES
:
11625 icode
= CODE_FOR_xsaves
;
11627 case IX86_BUILTIN_XRSTORS
:
11628 icode
= CODE_FOR_xrstors
;
11630 case IX86_BUILTIN_XSAVEC
:
11631 icode
= CODE_FOR_xsavec
;
11634 gcc_unreachable ();
11636 pat
= GEN_FCN (icode
) (op0
, op1
);
11643 case IX86_BUILTIN_LLWPCB
:
11644 arg0
= CALL_EXPR_ARG (exp
, 0);
11645 op0
= expand_normal (arg0
);
11646 icode
= CODE_FOR_lwp_llwpcb
;
11647 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
11648 op0
= ix86_zero_extend_to_Pmode (op0
);
11649 emit_insn (gen_lwp_llwpcb (op0
));
11652 case IX86_BUILTIN_SLWPCB
:
11653 icode
= CODE_FOR_lwp_slwpcb
;
11655 || !insn_data
[icode
].operand
[0].predicate (target
, Pmode
))
11656 target
= gen_reg_rtx (Pmode
);
11657 emit_insn (gen_lwp_slwpcb (target
));
11660 case IX86_BUILTIN_BEXTRI32
:
11661 case IX86_BUILTIN_BEXTRI64
:
11662 arg0
= CALL_EXPR_ARG (exp
, 0);
11663 arg1
= CALL_EXPR_ARG (exp
, 1);
11664 op0
= expand_normal (arg0
);
11665 op1
= expand_normal (arg1
);
11666 icode
= (fcode
== IX86_BUILTIN_BEXTRI32
11667 ? CODE_FOR_tbm_bextri_si
11668 : CODE_FOR_tbm_bextri_di
);
11669 if (!CONST_INT_P (op1
))
11671 error ("last argument must be an immediate");
11676 unsigned char length
= (INTVAL (op1
) >> 8) & 0xFF;
11677 unsigned char lsb_index
= INTVAL (op1
) & 0xFF;
11678 op1
= GEN_INT (length
);
11679 op2
= GEN_INT (lsb_index
);
11681 mode1
= insn_data
[icode
].operand
[1].mode
;
11682 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode1
))
11683 op0
= copy_to_mode_reg (mode1
, op0
);
11685 mode0
= insn_data
[icode
].operand
[0].mode
;
11687 || !register_operand (target
, mode0
))
11688 target
= gen_reg_rtx (mode0
);
11690 pat
= GEN_FCN (icode
) (target
, op0
, op1
, op2
);
11696 case IX86_BUILTIN_RDRAND16_STEP
:
11697 icode
= CODE_FOR_rdrandhi_1
;
11701 case IX86_BUILTIN_RDRAND32_STEP
:
11702 icode
= CODE_FOR_rdrandsi_1
;
11706 case IX86_BUILTIN_RDRAND64_STEP
:
11707 icode
= CODE_FOR_rdranddi_1
;
11711 arg0
= CALL_EXPR_ARG (exp
, 0);
11712 op1
= expand_normal (arg0
);
11713 if (!address_operand (op1
, VOIDmode
))
11715 op1
= convert_memory_address (Pmode
, op1
);
11716 op1
= copy_addr_to_reg (op1
);
11719 op0
= gen_reg_rtx (mode0
);
11720 emit_insn (GEN_FCN (icode
) (op0
));
11722 emit_move_insn (gen_rtx_MEM (mode0
, op1
), op0
);
11724 op1
= gen_reg_rtx (SImode
);
11725 emit_move_insn (op1
, CONST1_RTX (SImode
));
11727 /* Emit SImode conditional move. */
11728 if (mode0
== HImode
)
11730 if (TARGET_ZERO_EXTEND_WITH_AND
11731 && optimize_function_for_speed_p (cfun
))
11733 op2
= force_reg (SImode
, const0_rtx
);
11735 emit_insn (gen_movstricthi
11736 (gen_lowpart (HImode
, op2
), op0
));
11740 op2
= gen_reg_rtx (SImode
);
11742 emit_insn (gen_zero_extendhisi2 (op2
, op0
));
11745 else if (mode0
== SImode
)
11748 op2
= gen_rtx_SUBREG (SImode
, op0
, 0);
11751 || !register_operand (target
, SImode
))
11752 target
= gen_reg_rtx (SImode
);
11754 pat
= gen_rtx_GEU (VOIDmode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
11756 emit_insn (gen_rtx_SET (target
,
11757 gen_rtx_IF_THEN_ELSE (SImode
, pat
, op2
, op1
)));
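      /* For illustration: rdrand sets the carry flag when a random value
         was delivered and, per the ISA definition, clears it and zeroes
         the destination otherwise.  The conditional move above therefore
         yields op1 (the constant 1) on success and op2 (the zero-extended
         destination, i.e. 0) on failure; the random word itself has
         already been stored through the pointer argument.  */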
11760 case IX86_BUILTIN_RDSEED16_STEP
:
11761 icode
= CODE_FOR_rdseedhi_1
;
11765 case IX86_BUILTIN_RDSEED32_STEP
:
11766 icode
= CODE_FOR_rdseedsi_1
;
11770 case IX86_BUILTIN_RDSEED64_STEP
:
11771 icode
= CODE_FOR_rdseeddi_1
;
11775 arg0
= CALL_EXPR_ARG (exp
, 0);
11776 op1
= expand_normal (arg0
);
11777 if (!address_operand (op1
, VOIDmode
))
11779 op1
= convert_memory_address (Pmode
, op1
);
11780 op1
= copy_addr_to_reg (op1
);
11783 op0
= gen_reg_rtx (mode0
);
11784 emit_insn (GEN_FCN (icode
) (op0
));
11786 emit_move_insn (gen_rtx_MEM (mode0
, op1
), op0
);
11788 op2
= gen_reg_rtx (QImode
);
11790 pat
= gen_rtx_LTU (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
11792 emit_insn (gen_rtx_SET (op2
, pat
));
11795 || !register_operand (target
, SImode
))
11796 target
= gen_reg_rtx (SImode
);
11798 emit_insn (gen_zero_extendqisi2 (target
, op2
));
11801 case IX86_BUILTIN_SBB32
:
11802 icode
= CODE_FOR_subborrowsi
;
11803 icode2
= CODE_FOR_subborrowsi_0
;
11809 case IX86_BUILTIN_SBB64
:
11810 icode
= CODE_FOR_subborrowdi
;
11811 icode2
= CODE_FOR_subborrowdi_0
;
11817 case IX86_BUILTIN_ADDCARRYX32
:
11818 icode
= CODE_FOR_addcarrysi
;
11819 icode2
= CODE_FOR_addcarrysi_0
;
11825 case IX86_BUILTIN_ADDCARRYX64
:
11826 icode
= CODE_FOR_addcarrydi
;
11827 icode2
= CODE_FOR_addcarrydi_0
;
11833 arg0
= CALL_EXPR_ARG (exp
, 0); /* unsigned char c_in. */
11834 arg1
= CALL_EXPR_ARG (exp
, 1); /* unsigned int src1. */
11835 arg2
= CALL_EXPR_ARG (exp
, 2); /* unsigned int src2. */
11836 arg3
= CALL_EXPR_ARG (exp
, 3); /* unsigned int *sum_out. */
11838 op1
= expand_normal (arg0
);
11839 if (!integer_zerop (arg0
))
11840 op1
= copy_to_mode_reg (QImode
, convert_to_mode (QImode
, op1
, 1));
11842 op2
= expand_normal (arg1
);
11843 if (!register_operand (op2
, mode0
))
11844 op2
= copy_to_mode_reg (mode0
, op2
);
11846 op3
= expand_normal (arg2
);
11847 if (!register_operand (op3
, mode0
))
11848 op3
= copy_to_mode_reg (mode0
, op3
);
11850 op4
= expand_normal (arg3
);
11851 if (!address_operand (op4
, VOIDmode
))
11853 op4
= convert_memory_address (Pmode
, op4
);
11854 op4
= copy_addr_to_reg (op4
);
11857 op0
= gen_reg_rtx (mode0
);
11858 if (integer_zerop (arg0
))
11860 /* If arg0 is 0, optimize right away into add or sub
11861 instruction that sets CCCmode flags. */
11862 op1
= gen_rtx_REG (mode2
, FLAGS_REG
);
11863 emit_insn (GEN_FCN (icode2
) (op0
, op2
, op3
));
11867 /* Generate CF from input operand. */
11868 emit_insn (gen_addqi3_cconly_overflow (op1
, constm1_rtx
));
11870 /* Generate instruction that consumes CF. */
11871 op1
= gen_rtx_REG (CCCmode
, FLAGS_REG
);
11872 pat
= gen_rtx_LTU (mode1
, op1
, const0_rtx
);
11873 pat2
= gen_rtx_LTU (mode0
, op1
, const0_rtx
);
11874 emit_insn (GEN_FCN (icode
) (op0
, op2
, op3
, op1
, pat
, pat2
));
11877 /* Return current CF value. */
11879 target
= gen_reg_rtx (QImode
);
11881 pat
= gen_rtx_LTU (QImode
, op1
, const0_rtx
);
11882 emit_insn (gen_rtx_SET (target
, pat
));
11884 /* Store the result. */
11885 emit_move_insn (gen_rtx_MEM (mode0
, op4
), op0
);
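      /* For illustration: this implements the carry chain of
         __builtin_ia32_addcarryx_u32 and the related sbb builtins.  A
         nonzero c_in is first converted into CF via
         addqi3_cconly_overflow, the add/sub-with-carry pattern then both
         consumes and regenerates CF, the wide sum is stored through
         *sum_out, and the QImode value returned to the caller is the
         resulting carry.  */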
11889 case IX86_BUILTIN_READ_FLAGS
:
11890 emit_insn (gen_push (gen_rtx_REG (word_mode
, FLAGS_REG
)));
11893 || target
== NULL_RTX
11894 || !nonimmediate_operand (target
, word_mode
)
11895 || GET_MODE (target
) != word_mode
)
11896 target
= gen_reg_rtx (word_mode
);
11898 emit_insn (gen_pop (target
));
11901 case IX86_BUILTIN_WRITE_FLAGS
:
11903 arg0
= CALL_EXPR_ARG (exp
, 0);
11904 op0
= expand_normal (arg0
);
11905 if (!general_no_elim_operand (op0
, word_mode
))
11906 op0
= copy_to_mode_reg (word_mode
, op0
);
11908 emit_insn (gen_push (op0
));
11909 emit_insn (gen_pop (gen_rtx_REG (word_mode
, FLAGS_REG
)));
11912 case IX86_BUILTIN_KTESTC8
:
11913 icode
= CODE_FOR_ktestqi
;
11917 case IX86_BUILTIN_KTESTZ8
:
11918 icode
= CODE_FOR_ktestqi
;
11922 case IX86_BUILTIN_KTESTC16
:
11923 icode
= CODE_FOR_ktesthi
;
11927 case IX86_BUILTIN_KTESTZ16
:
11928 icode
= CODE_FOR_ktesthi
;
11932 case IX86_BUILTIN_KTESTC32
:
11933 icode
= CODE_FOR_ktestsi
;
11937 case IX86_BUILTIN_KTESTZ32
:
11938 icode
= CODE_FOR_ktestsi
;
11942 case IX86_BUILTIN_KTESTC64
:
11943 icode
= CODE_FOR_ktestdi
;
11947 case IX86_BUILTIN_KTESTZ64
:
11948 icode
= CODE_FOR_ktestdi
;
11952 case IX86_BUILTIN_KORTESTC8
:
11953 icode
= CODE_FOR_kortestqi
;
11957 case IX86_BUILTIN_KORTESTZ8
:
11958 icode
= CODE_FOR_kortestqi
;
11962 case IX86_BUILTIN_KORTESTC16
:
11963 icode
= CODE_FOR_kortesthi
;
11967 case IX86_BUILTIN_KORTESTZ16
:
11968 icode
= CODE_FOR_kortesthi
;
11972 case IX86_BUILTIN_KORTESTC32
:
11973 icode
= CODE_FOR_kortestsi
;
11977 case IX86_BUILTIN_KORTESTZ32
:
11978 icode
= CODE_FOR_kortestsi
;
11982 case IX86_BUILTIN_KORTESTC64
:
11983 icode
= CODE_FOR_kortestdi
;
11987 case IX86_BUILTIN_KORTESTZ64
:
11988 icode
= CODE_FOR_kortestdi
;
11992 arg0
= CALL_EXPR_ARG (exp
, 0); /* Mask reg src1. */
11993 arg1
= CALL_EXPR_ARG (exp
, 1); /* Mask reg src2. */
11994 op0
= expand_normal (arg0
);
11995 op1
= expand_normal (arg1
);
11997 mode0
= insn_data
[icode
].operand
[0].mode
;
11998 mode1
= insn_data
[icode
].operand
[1].mode
;
12000 if (GET_MODE (op0
) != VOIDmode
)
12001 op0
= force_reg (GET_MODE (op0
), op0
);
12003 op0
= gen_lowpart (mode0
, op0
);
12005 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
12006 op0
= copy_to_mode_reg (mode0
, op0
);
12008 if (GET_MODE (op1
) != VOIDmode
)
12009 op1
= force_reg (GET_MODE (op1
), op1
);
12011 op1
= gen_lowpart (mode1
, op1
);
12013 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
12014 op1
= copy_to_mode_reg (mode1
, op1
);
12016 target
= gen_reg_rtx (QImode
);
12018 /* Emit kortest. */
12019 emit_insn (GEN_FCN (icode
) (op0
, op1
));
12020 /* And use setcc to return result from flags. */
12021 ix86_expand_setcc (target
, EQ
,
12022 gen_rtx_REG (mode3
, FLAGS_REG
), const0_rtx
);
12025 case IX86_BUILTIN_GATHERSIV2DF
:
12026 icode
= CODE_FOR_avx2_gathersiv2df
;
12028 case IX86_BUILTIN_GATHERSIV4DF
:
12029 icode
= CODE_FOR_avx2_gathersiv4df
;
12031 case IX86_BUILTIN_GATHERDIV2DF
:
12032 icode
= CODE_FOR_avx2_gatherdiv2df
;
12034 case IX86_BUILTIN_GATHERDIV4DF
:
12035 icode
= CODE_FOR_avx2_gatherdiv4df
;
12037 case IX86_BUILTIN_GATHERSIV4SF
:
12038 icode
= CODE_FOR_avx2_gathersiv4sf
;
12040 case IX86_BUILTIN_GATHERSIV8SF
:
12041 icode
= CODE_FOR_avx2_gathersiv8sf
;
12043 case IX86_BUILTIN_GATHERDIV4SF
:
12044 icode
= CODE_FOR_avx2_gatherdiv4sf
;
12046 case IX86_BUILTIN_GATHERDIV8SF
:
12047 icode
= CODE_FOR_avx2_gatherdiv8sf
;
12049 case IX86_BUILTIN_GATHERSIV2DI
:
12050 icode
= CODE_FOR_avx2_gathersiv2di
;
12052 case IX86_BUILTIN_GATHERSIV4DI
:
12053 icode
= CODE_FOR_avx2_gathersiv4di
;
12055 case IX86_BUILTIN_GATHERDIV2DI
:
12056 icode
= CODE_FOR_avx2_gatherdiv2di
;
12058 case IX86_BUILTIN_GATHERDIV4DI
:
12059 icode
= CODE_FOR_avx2_gatherdiv4di
;
12061 case IX86_BUILTIN_GATHERSIV4SI
:
12062 icode
= CODE_FOR_avx2_gathersiv4si
;
12064 case IX86_BUILTIN_GATHERSIV8SI
:
12065 icode
= CODE_FOR_avx2_gathersiv8si
;
12067 case IX86_BUILTIN_GATHERDIV4SI
:
12068 icode
= CODE_FOR_avx2_gatherdiv4si
;
12070 case IX86_BUILTIN_GATHERDIV8SI
:
12071 icode
= CODE_FOR_avx2_gatherdiv8si
;
12073 case IX86_BUILTIN_GATHERALTSIV4DF
:
12074 icode
= CODE_FOR_avx2_gathersiv4df
;
12076 case IX86_BUILTIN_GATHERALTDIV8SF
:
12077 icode
= CODE_FOR_avx2_gatherdiv8sf
;
12079 case IX86_BUILTIN_GATHERALTSIV4DI
:
12080 icode
= CODE_FOR_avx2_gathersiv4di
;
12082 case IX86_BUILTIN_GATHERALTDIV8SI
:
12083 icode
= CODE_FOR_avx2_gatherdiv8si
;
12085 case IX86_BUILTIN_GATHER3SIV16SF
:
12086 icode
= CODE_FOR_avx512f_gathersiv16sf
;
12088 case IX86_BUILTIN_GATHER3SIV8DF
:
12089 icode
= CODE_FOR_avx512f_gathersiv8df
;
12091 case IX86_BUILTIN_GATHER3DIV16SF
:
12092 icode
= CODE_FOR_avx512f_gatherdiv16sf
;
12094 case IX86_BUILTIN_GATHER3DIV8DF
:
12095 icode
= CODE_FOR_avx512f_gatherdiv8df
;
12097 case IX86_BUILTIN_GATHER3SIV16SI
:
12098 icode
= CODE_FOR_avx512f_gathersiv16si
;
12100 case IX86_BUILTIN_GATHER3SIV8DI
:
12101 icode
= CODE_FOR_avx512f_gathersiv8di
;
12103 case IX86_BUILTIN_GATHER3DIV16SI
:
12104 icode
= CODE_FOR_avx512f_gatherdiv16si
;
12106 case IX86_BUILTIN_GATHER3DIV8DI
:
12107 icode
= CODE_FOR_avx512f_gatherdiv8di
;
12109 case IX86_BUILTIN_GATHER3ALTSIV8DF
:
12110 icode
= CODE_FOR_avx512f_gathersiv8df
;
12112 case IX86_BUILTIN_GATHER3ALTDIV16SF
:
12113 icode
= CODE_FOR_avx512f_gatherdiv16sf
;
12115 case IX86_BUILTIN_GATHER3ALTSIV8DI
:
12116 icode
= CODE_FOR_avx512f_gathersiv8di
;
12118 case IX86_BUILTIN_GATHER3ALTDIV16SI
:
12119 icode
= CODE_FOR_avx512f_gatherdiv16si
;
12121 case IX86_BUILTIN_GATHER3SIV2DF
:
12122 icode
= CODE_FOR_avx512vl_gathersiv2df
;
12124 case IX86_BUILTIN_GATHER3SIV4DF
:
12125 icode
= CODE_FOR_avx512vl_gathersiv4df
;
12127 case IX86_BUILTIN_GATHER3DIV2DF
:
12128 icode
= CODE_FOR_avx512vl_gatherdiv2df
;
12130 case IX86_BUILTIN_GATHER3DIV4DF
:
12131 icode
= CODE_FOR_avx512vl_gatherdiv4df
;
12133 case IX86_BUILTIN_GATHER3SIV4SF
:
12134 icode
= CODE_FOR_avx512vl_gathersiv4sf
;
12136 case IX86_BUILTIN_GATHER3SIV8SF
:
12137 icode
= CODE_FOR_avx512vl_gathersiv8sf
;
12139 case IX86_BUILTIN_GATHER3DIV4SF
:
12140 icode
= CODE_FOR_avx512vl_gatherdiv4sf
;
12142 case IX86_BUILTIN_GATHER3DIV8SF
:
12143 icode
= CODE_FOR_avx512vl_gatherdiv8sf
;
12145 case IX86_BUILTIN_GATHER3SIV2DI
:
12146 icode
= CODE_FOR_avx512vl_gathersiv2di
;
12148 case IX86_BUILTIN_GATHER3SIV4DI
:
12149 icode
= CODE_FOR_avx512vl_gathersiv4di
;
12151 case IX86_BUILTIN_GATHER3DIV2DI
:
12152 icode
= CODE_FOR_avx512vl_gatherdiv2di
;
12154 case IX86_BUILTIN_GATHER3DIV4DI
:
12155 icode
= CODE_FOR_avx512vl_gatherdiv4di
;
12157 case IX86_BUILTIN_GATHER3SIV4SI
:
12158 icode
= CODE_FOR_avx512vl_gathersiv4si
;
12160 case IX86_BUILTIN_GATHER3SIV8SI
:
12161 icode
= CODE_FOR_avx512vl_gathersiv8si
;
12163 case IX86_BUILTIN_GATHER3DIV4SI
:
12164 icode
= CODE_FOR_avx512vl_gatherdiv4si
;
12166 case IX86_BUILTIN_GATHER3DIV8SI
:
12167 icode
= CODE_FOR_avx512vl_gatherdiv8si
;
12169 case IX86_BUILTIN_GATHER3ALTSIV4DF
:
12170 icode
= CODE_FOR_avx512vl_gathersiv4df
;
12172 case IX86_BUILTIN_GATHER3ALTDIV8SF
:
12173 icode
= CODE_FOR_avx512vl_gatherdiv8sf
;
12175 case IX86_BUILTIN_GATHER3ALTSIV4DI
:
12176 icode
= CODE_FOR_avx512vl_gathersiv4di
;
12178 case IX86_BUILTIN_GATHER3ALTDIV8SI
:
12179 icode
= CODE_FOR_avx512vl_gatherdiv8si
;
12181 case IX86_BUILTIN_SCATTERSIV16SF
:
12182 icode
= CODE_FOR_avx512f_scattersiv16sf
;
12184 case IX86_BUILTIN_SCATTERSIV8DF
:
12185 icode
= CODE_FOR_avx512f_scattersiv8df
;
12187 case IX86_BUILTIN_SCATTERDIV16SF
:
12188 icode
= CODE_FOR_avx512f_scatterdiv16sf
;
12190 case IX86_BUILTIN_SCATTERDIV8DF
:
12191 icode
= CODE_FOR_avx512f_scatterdiv8df
;
12193 case IX86_BUILTIN_SCATTERSIV16SI
:
12194 icode
= CODE_FOR_avx512f_scattersiv16si
;
12196 case IX86_BUILTIN_SCATTERSIV8DI
:
12197 icode
= CODE_FOR_avx512f_scattersiv8di
;
12199 case IX86_BUILTIN_SCATTERDIV16SI
:
12200 icode
= CODE_FOR_avx512f_scatterdiv16si
;
12202 case IX86_BUILTIN_SCATTERDIV8DI
:
12203 icode
= CODE_FOR_avx512f_scatterdiv8di
;
12205 case IX86_BUILTIN_SCATTERSIV8SF
:
12206 icode
= CODE_FOR_avx512vl_scattersiv8sf
;
12208 case IX86_BUILTIN_SCATTERSIV4SF
:
12209 icode
= CODE_FOR_avx512vl_scattersiv4sf
;
12211 case IX86_BUILTIN_SCATTERSIV4DF
:
12212 icode
= CODE_FOR_avx512vl_scattersiv4df
;
12214 case IX86_BUILTIN_SCATTERSIV2DF
:
12215 icode
= CODE_FOR_avx512vl_scattersiv2df
;
12217 case IX86_BUILTIN_SCATTERDIV8SF
:
12218 icode
= CODE_FOR_avx512vl_scatterdiv8sf
;
12220 case IX86_BUILTIN_SCATTERDIV4SF
:
12221 icode
= CODE_FOR_avx512vl_scatterdiv4sf
;
12223 case IX86_BUILTIN_SCATTERDIV4DF
:
12224 icode
= CODE_FOR_avx512vl_scatterdiv4df
;
12226 case IX86_BUILTIN_SCATTERDIV2DF
:
12227 icode
= CODE_FOR_avx512vl_scatterdiv2df
;
12229 case IX86_BUILTIN_SCATTERSIV8SI
:
12230 icode
= CODE_FOR_avx512vl_scattersiv8si
;
12232 case IX86_BUILTIN_SCATTERSIV4SI
:
12233 icode
= CODE_FOR_avx512vl_scattersiv4si
;
12235 case IX86_BUILTIN_SCATTERSIV4DI
:
12236 icode
= CODE_FOR_avx512vl_scattersiv4di
;
12238 case IX86_BUILTIN_SCATTERSIV2DI
:
12239 icode
= CODE_FOR_avx512vl_scattersiv2di
;
12241 case IX86_BUILTIN_SCATTERDIV8SI
:
12242 icode
= CODE_FOR_avx512vl_scatterdiv8si
;
12244 case IX86_BUILTIN_SCATTERDIV4SI
:
12245 icode
= CODE_FOR_avx512vl_scatterdiv4si
;
12247 case IX86_BUILTIN_SCATTERDIV4DI
:
12248 icode
= CODE_FOR_avx512vl_scatterdiv4di
;
12250 case IX86_BUILTIN_SCATTERDIV2DI
:
12251 icode
= CODE_FOR_avx512vl_scatterdiv2di
;
12253 case IX86_BUILTIN_GATHERPFDPD
:
12254 icode
= CODE_FOR_avx512pf_gatherpfv8sidf
;
12255 goto vec_prefetch_gen
;
12256 case IX86_BUILTIN_SCATTERALTSIV8DF
:
12257 icode
= CODE_FOR_avx512f_scattersiv8df
;
12259 case IX86_BUILTIN_SCATTERALTDIV16SF
:
12260 icode
= CODE_FOR_avx512f_scatterdiv16sf
;
12262 case IX86_BUILTIN_SCATTERALTSIV8DI
:
12263 icode
= CODE_FOR_avx512f_scattersiv8di
;
12265 case IX86_BUILTIN_SCATTERALTDIV16SI
:
12266 icode
= CODE_FOR_avx512f_scatterdiv16si
;
12268 case IX86_BUILTIN_SCATTERALTSIV4DF
:
12269 icode
= CODE_FOR_avx512vl_scattersiv4df
;
12271 case IX86_BUILTIN_SCATTERALTDIV8SF
:
12272 icode
= CODE_FOR_avx512vl_scatterdiv8sf
;
12274 case IX86_BUILTIN_SCATTERALTSIV4DI
:
12275 icode
= CODE_FOR_avx512vl_scattersiv4di
;
12277 case IX86_BUILTIN_SCATTERALTDIV8SI
:
12278 icode
= CODE_FOR_avx512vl_scatterdiv8si
;
12280 case IX86_BUILTIN_SCATTERALTSIV2DF
:
12281 icode
= CODE_FOR_avx512vl_scattersiv2df
;
12283 case IX86_BUILTIN_SCATTERALTDIV4SF
:
12284 icode
= CODE_FOR_avx512vl_scatterdiv4sf
;
12286 case IX86_BUILTIN_SCATTERALTSIV2DI
:
12287 icode
= CODE_FOR_avx512vl_scattersiv2di
;
12289 case IX86_BUILTIN_SCATTERALTDIV4SI
:
12290 icode
= CODE_FOR_avx512vl_scatterdiv4si
;
12292 case IX86_BUILTIN_GATHERPFDPS
:
12293 icode
= CODE_FOR_avx512pf_gatherpfv16sisf
;
12294 goto vec_prefetch_gen
;
12295 case IX86_BUILTIN_GATHERPFQPD
:
12296 icode
= CODE_FOR_avx512pf_gatherpfv8didf
;
12297 goto vec_prefetch_gen
;
12298 case IX86_BUILTIN_GATHERPFQPS
:
12299 icode
= CODE_FOR_avx512pf_gatherpfv8disf
;
12300 goto vec_prefetch_gen
;
12301 case IX86_BUILTIN_SCATTERPFDPD
:
12302 icode
= CODE_FOR_avx512pf_scatterpfv8sidf
;
12303 goto vec_prefetch_gen
;
12304 case IX86_BUILTIN_SCATTERPFDPS
:
12305 icode
= CODE_FOR_avx512pf_scatterpfv16sisf
;
12306 goto vec_prefetch_gen
;
12307 case IX86_BUILTIN_SCATTERPFQPD
:
12308 icode
= CODE_FOR_avx512pf_scatterpfv8didf
;
12309 goto vec_prefetch_gen
;
12310 case IX86_BUILTIN_SCATTERPFQPS
:
12311 icode
= CODE_FOR_avx512pf_scatterpfv8disf
;
12312 goto vec_prefetch_gen
;
12316 rtx (*gen
) (rtx
, rtx
);
12318 arg0
= CALL_EXPR_ARG (exp
, 0);
12319 arg1
= CALL_EXPR_ARG (exp
, 1);
12320 arg2
= CALL_EXPR_ARG (exp
, 2);
12321 arg3
= CALL_EXPR_ARG (exp
, 3);
12322 arg4
= CALL_EXPR_ARG (exp
, 4);
12323 op0
= expand_normal (arg0
);
12324 op1
= expand_normal (arg1
);
12325 op2
= expand_normal (arg2
);
12326 op3
= expand_normal (arg3
);
12327 op4
= expand_normal (arg4
);
12328 /* Note the arg order is different from the operand order. */
12329 mode0
= insn_data
[icode
].operand
[1].mode
;
12330 mode2
= insn_data
[icode
].operand
[3].mode
;
12331 mode3
= insn_data
[icode
].operand
[4].mode
;
12332 mode4
= insn_data
[icode
].operand
[5].mode
;
12334 if (target
== NULL_RTX
12335 || GET_MODE (target
) != insn_data
[icode
].operand
[0].mode
12336 || !insn_data
[icode
].operand
[0].predicate (target
,
12337 GET_MODE (target
)))
12338 subtarget
= gen_reg_rtx (insn_data
[icode
].operand
[0].mode
);
12340 subtarget
= target
;
12344 case IX86_BUILTIN_GATHER3ALTSIV8DF
:
12345 case IX86_BUILTIN_GATHER3ALTSIV8DI
:
12346 half
= gen_reg_rtx (V8SImode
);
12347 if (!nonimmediate_operand (op2
, V16SImode
))
12348 op2
= copy_to_mode_reg (V16SImode
, op2
);
12349 emit_insn (gen_vec_extract_lo_v16si (half
, op2
));
12352 case IX86_BUILTIN_GATHER3ALTSIV4DF
:
12353 case IX86_BUILTIN_GATHER3ALTSIV4DI
:
12354 case IX86_BUILTIN_GATHERALTSIV4DF
:
12355 case IX86_BUILTIN_GATHERALTSIV4DI
:
12356 half
= gen_reg_rtx (V4SImode
);
12357 if (!nonimmediate_operand (op2
, V8SImode
))
12358 op2
= copy_to_mode_reg (V8SImode
, op2
);
12359 emit_insn (gen_vec_extract_lo_v8si (half
, op2
));
12362 case IX86_BUILTIN_GATHER3ALTDIV16SF
:
12363 case IX86_BUILTIN_GATHER3ALTDIV16SI
:
12364 half
= gen_reg_rtx (mode0
);
12365 if (mode0
== V8SFmode
)
12366 gen
= gen_vec_extract_lo_v16sf
;
12368 gen
= gen_vec_extract_lo_v16si
;
12369 if (!nonimmediate_operand (op0
, GET_MODE (op0
)))
12370 op0
= copy_to_mode_reg (GET_MODE (op0
), op0
);
12371 emit_insn (gen (half
, op0
));
12373 op3
= lowpart_subreg (QImode
, op3
, HImode
);
12375 case IX86_BUILTIN_GATHER3ALTDIV8SF
:
12376 case IX86_BUILTIN_GATHER3ALTDIV8SI
:
12377 case IX86_BUILTIN_GATHERALTDIV8SF
:
12378 case IX86_BUILTIN_GATHERALTDIV8SI
:
12379 half
= gen_reg_rtx (mode0
);
12380 if (mode0
== V4SFmode
)
12381 gen
= gen_vec_extract_lo_v8sf
;
12383 gen
= gen_vec_extract_lo_v8si
;
12384 if (!nonimmediate_operand (op0
, GET_MODE (op0
)))
12385 op0
= copy_to_mode_reg (GET_MODE (op0
), op0
);
12386 emit_insn (gen (half
, op0
));
12388 if (VECTOR_MODE_P (GET_MODE (op3
)))
12390 half
= gen_reg_rtx (mode0
);
12391 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
12392 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
12393 emit_insn (gen (half
, op3
));
      /* Force memory operand only with base register here.  But we
         don't want to do it on memory operand for other builtin
         functions.  */
12404 op1
= ix86_zero_extend_to_Pmode (op1
);
12406 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
12407 op0
= copy_to_mode_reg (mode0
, op0
);
12408 if (!insn_data
[icode
].operand
[2].predicate (op1
, Pmode
))
12409 op1
= copy_to_mode_reg (Pmode
, op1
);
12410 if (!insn_data
[icode
].operand
[3].predicate (op2
, mode2
))
12411 op2
= copy_to_mode_reg (mode2
, op2
);
12413 op3
= fixup_modeless_constant (op3
, mode3
);
12415 if (GET_MODE (op3
) == mode3
|| GET_MODE (op3
) == VOIDmode
)
12417 if (!insn_data
[icode
].operand
[4].predicate (op3
, mode3
))
12418 op3
= copy_to_mode_reg (mode3
, op3
);
12422 op3
= copy_to_reg (op3
);
12423 op3
= lowpart_subreg (mode3
, op3
, GET_MODE (op3
));
12425 if (!insn_data
[icode
].operand
[5].predicate (op4
, mode4
))
12427 error ("the last argument must be scale 1, 2, 4, 8");
          /* Optimize.  If mask is known to have all high bits set,
             replace op0 with pc_rtx to signal that the instruction
             overwrites the whole destination and doesn't use its
             previous contents.  */
12437 if (TREE_CODE (arg3
) == INTEGER_CST
)
12439 if (integer_all_onesp (arg3
))
12442 else if (TREE_CODE (arg3
) == VECTOR_CST
)
12444 unsigned int negative
= 0;
12445 for (i
= 0; i
< VECTOR_CST_NELTS (arg3
); ++i
)
12447 tree cst
= VECTOR_CST_ELT (arg3
, i
);
12448 if (TREE_CODE (cst
) == INTEGER_CST
12449 && tree_int_cst_sign_bit (cst
))
12451 else if (TREE_CODE (cst
) == REAL_CST
12452 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst
)))
12455 if (negative
== TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3
)))
12458 else if (TREE_CODE (arg3
) == SSA_NAME
12459 && TREE_CODE (TREE_TYPE (arg3
)) == VECTOR_TYPE
)
              /* Recognize also when mask is like:
                 __v2df src = _mm_setzero_pd ();
                 __v2df mask = _mm_cmpeq_pd (src, src);
                 or
                 __v8sf src = _mm256_setzero_ps ();
                 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
                 as that is a cheaper way to load all ones into
                 a register than having to load a constant from
                 memory.  */
              gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
12471 if (is_gimple_call (def_stmt
))
12473 tree fndecl
= gimple_call_fndecl (def_stmt
);
12475 && fndecl_built_in_p (fndecl
, BUILT_IN_MD
))
12476 switch (DECL_MD_FUNCTION_CODE (fndecl
))
12478 case IX86_BUILTIN_CMPPD
:
12479 case IX86_BUILTIN_CMPPS
:
12480 case IX86_BUILTIN_CMPPD256
:
12481 case IX86_BUILTIN_CMPPS256
:
12482 if (!integer_zerop (gimple_call_arg (def_stmt
, 2)))
12485 case IX86_BUILTIN_CMPEQPD
:
12486 case IX86_BUILTIN_CMPEQPS
:
12487 if (initializer_zerop (gimple_call_arg (def_stmt
, 0))
12488 && initializer_zerop (gimple_call_arg (def_stmt
,
12499 pat
= GEN_FCN (icode
) (subtarget
, op0
, op1
, op2
, op3
, op4
);
12506 case IX86_BUILTIN_GATHER3DIV16SF
:
12507 if (target
== NULL_RTX
)
12508 target
= gen_reg_rtx (V8SFmode
);
12509 emit_insn (gen_vec_extract_lo_v16sf (target
, subtarget
));
12511 case IX86_BUILTIN_GATHER3DIV16SI
:
12512 if (target
== NULL_RTX
)
12513 target
= gen_reg_rtx (V8SImode
);
12514 emit_insn (gen_vec_extract_lo_v16si (target
, subtarget
));
12516 case IX86_BUILTIN_GATHER3DIV8SF
:
12517 case IX86_BUILTIN_GATHERDIV8SF
:
12518 if (target
== NULL_RTX
)
12519 target
= gen_reg_rtx (V4SFmode
);
12520 emit_insn (gen_vec_extract_lo_v8sf (target
, subtarget
));
12522 case IX86_BUILTIN_GATHER3DIV8SI
:
12523 case IX86_BUILTIN_GATHERDIV8SI
:
12524 if (target
== NULL_RTX
)
12525 target
= gen_reg_rtx (V4SImode
);
12526 emit_insn (gen_vec_extract_lo_v8si (target
, subtarget
));
12529 target
= subtarget
;
12535 arg0
= CALL_EXPR_ARG (exp
, 0);
12536 arg1
= CALL_EXPR_ARG (exp
, 1);
12537 arg2
= CALL_EXPR_ARG (exp
, 2);
12538 arg3
= CALL_EXPR_ARG (exp
, 3);
12539 arg4
= CALL_EXPR_ARG (exp
, 4);
12540 op0
= expand_normal (arg0
);
12541 op1
= expand_normal (arg1
);
12542 op2
= expand_normal (arg2
);
12543 op3
= expand_normal (arg3
);
12544 op4
= expand_normal (arg4
);
12545 mode1
= insn_data
[icode
].operand
[1].mode
;
12546 mode2
= insn_data
[icode
].operand
[2].mode
;
12547 mode3
= insn_data
[icode
].operand
[3].mode
;
12548 mode4
= insn_data
[icode
].operand
[4].mode
;
      /* Scatter instruction stores operand op3 to memory with
         indices from op2 and scale from op4 under writemask op1.
         If index operand op2 has more elements than source operand
         op3, one needs to use only its low half.  And vice versa.  */
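      /* For illustration: in the IX86_BUILTIN_SCATTERALTSIV8DF case the
         index vector is V16SI while the stored data is V8DF, so only the
         low eight indices matter and the code below extracts the low half
         of op2; the *ALTDIV* cases are the mirror image and halve the
         source operand op3 instead.  */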
12556 case IX86_BUILTIN_SCATTERALTSIV8DF
:
12557 case IX86_BUILTIN_SCATTERALTSIV8DI
:
12558 half
= gen_reg_rtx (V8SImode
);
12559 if (!nonimmediate_operand (op2
, V16SImode
))
12560 op2
= copy_to_mode_reg (V16SImode
, op2
);
12561 emit_insn (gen_vec_extract_lo_v16si (half
, op2
));
12564 case IX86_BUILTIN_SCATTERALTDIV16SF
:
12565 case IX86_BUILTIN_SCATTERALTDIV16SI
:
12566 half
= gen_reg_rtx (mode3
);
12567 if (mode3
== V8SFmode
)
12568 gen
= gen_vec_extract_lo_v16sf
;
12570 gen
= gen_vec_extract_lo_v16si
;
12571 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
12572 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
12573 emit_insn (gen (half
, op3
));
12576 case IX86_BUILTIN_SCATTERALTSIV4DF
:
12577 case IX86_BUILTIN_SCATTERALTSIV4DI
:
12578 half
= gen_reg_rtx (V4SImode
);
12579 if (!nonimmediate_operand (op2
, V8SImode
))
12580 op2
= copy_to_mode_reg (V8SImode
, op2
);
12581 emit_insn (gen_vec_extract_lo_v8si (half
, op2
));
12584 case IX86_BUILTIN_SCATTERALTDIV8SF
:
12585 case IX86_BUILTIN_SCATTERALTDIV8SI
:
12586 half
= gen_reg_rtx (mode3
);
12587 if (mode3
== V4SFmode
)
12588 gen
= gen_vec_extract_lo_v8sf
;
12590 gen
= gen_vec_extract_lo_v8si
;
12591 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
12592 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
12593 emit_insn (gen (half
, op3
));
12596 case IX86_BUILTIN_SCATTERALTSIV2DF
:
12597 case IX86_BUILTIN_SCATTERALTSIV2DI
:
12598 if (!nonimmediate_operand (op2
, V4SImode
))
12599 op2
= copy_to_mode_reg (V4SImode
, op2
);
12601 case IX86_BUILTIN_SCATTERALTDIV4SF
:
12602 case IX86_BUILTIN_SCATTERALTDIV4SI
:
12603 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
12604 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
      /* Force memory operand only with base register here.  But we
         don't want to do it on memory operand for other builtin
         functions.  */
12613 op0
= force_reg (Pmode
, convert_to_mode (Pmode
, op0
, 1));
12615 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
12616 op0
= copy_to_mode_reg (Pmode
, op0
);
12618 op1
= fixup_modeless_constant (op1
, mode1
);
12620 if (GET_MODE (op1
) == mode1
|| GET_MODE (op1
) == VOIDmode
)
12622 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
12623 op1
= copy_to_mode_reg (mode1
, op1
);
12627 op1
= copy_to_reg (op1
);
12628 op1
= lowpart_subreg (mode1
, op1
, GET_MODE (op1
));
12631 if (!insn_data
[icode
].operand
[2].predicate (op2
, mode2
))
12632 op2
= copy_to_mode_reg (mode2
, op2
);
12634 if (!insn_data
[icode
].operand
[3].predicate (op3
, mode3
))
12635 op3
= copy_to_mode_reg (mode3
, op3
);
12637 if (!insn_data
[icode
].operand
[4].predicate (op4
, mode4
))
12639 error ("the last argument must be scale 1, 2, 4, 8");
12643 pat
= GEN_FCN (icode
) (op0
, op1
, op2
, op3
, op4
);
12651 arg0
= CALL_EXPR_ARG (exp
, 0);
12652 arg1
= CALL_EXPR_ARG (exp
, 1);
12653 arg2
= CALL_EXPR_ARG (exp
, 2);
12654 arg3
= CALL_EXPR_ARG (exp
, 3);
12655 arg4
= CALL_EXPR_ARG (exp
, 4);
12656 op0
= expand_normal (arg0
);
12657 op1
= expand_normal (arg1
);
12658 op2
= expand_normal (arg2
);
12659 op3
= expand_normal (arg3
);
12660 op4
= expand_normal (arg4
);
12661 mode0
= insn_data
[icode
].operand
[0].mode
;
12662 mode1
= insn_data
[icode
].operand
[1].mode
;
12663 mode3
= insn_data
[icode
].operand
[3].mode
;
12664 mode4
= insn_data
[icode
].operand
[4].mode
;
12666 op0
= fixup_modeless_constant (op0
, mode0
);
12668 if (GET_MODE (op0
) == mode0
|| GET_MODE (op0
) == VOIDmode
)
12670 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
12671 op0
= copy_to_mode_reg (mode0
, op0
);
12675 op0
= copy_to_reg (op0
);
12676 op0
= lowpart_subreg (mode0
, op0
, GET_MODE (op0
));
12679 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
12680 op1
= copy_to_mode_reg (mode1
, op1
);
      /* Force memory operand only with base register here.  But we
         don't want to do it on memory operand for other builtin
         functions.  */
12685 op2
= force_reg (Pmode
, convert_to_mode (Pmode
, op2
, 1));
12687 if (!insn_data
[icode
].operand
[2].predicate (op2
, Pmode
))
12688 op2
= copy_to_mode_reg (Pmode
, op2
);
12690 if (!insn_data
[icode
].operand
[3].predicate (op3
, mode3
))
          error ("the fourth argument must be scale 1, 2, 4, 8");
12696 if (!insn_data
[icode
].operand
[4].predicate (op4
, mode4
))
12698 error ("incorrect hint operand");
12702 pat
= GEN_FCN (icode
) (op0
, op1
, op2
, op3
, op4
);
12710 case IX86_BUILTIN_XABORT
:
12711 icode
= CODE_FOR_xabort
;
12712 arg0
= CALL_EXPR_ARG (exp
, 0);
12713 op0
= expand_normal (arg0
);
12714 mode0
= insn_data
[icode
].operand
[0].mode
;
12715 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
12717 error ("the argument to %<xabort%> intrinsic must "
12718 "be an 8-bit immediate");
12721 emit_insn (gen_xabort (op0
));
12724 case IX86_BUILTIN_RSTORSSP
:
12725 case IX86_BUILTIN_CLRSSBSY
:
12726 arg0
= CALL_EXPR_ARG (exp
, 0);
12727 op0
= expand_normal (arg0
);
12728 icode
= (fcode
== IX86_BUILTIN_RSTORSSP
12729 ? CODE_FOR_rstorssp
12730 : CODE_FOR_clrssbsy
);
12731 if (!address_operand (op0
, VOIDmode
))
12733 op1
= convert_memory_address (Pmode
, op0
);
12734 op0
= copy_addr_to_reg (op1
);
12736 emit_insn (GEN_FCN (icode
) (gen_rtx_MEM (Pmode
, op0
)));
12739 case IX86_BUILTIN_WRSSD
:
12740 case IX86_BUILTIN_WRSSQ
:
12741 case IX86_BUILTIN_WRUSSD
:
12742 case IX86_BUILTIN_WRUSSQ
:
12743 arg0
= CALL_EXPR_ARG (exp
, 0);
12744 op0
= expand_normal (arg0
);
12745 arg1
= CALL_EXPR_ARG (exp
, 1);
12746 op1
= expand_normal (arg1
);
12749 case IX86_BUILTIN_WRSSD
:
12750 icode
= CODE_FOR_wrsssi
;
12753 case IX86_BUILTIN_WRSSQ
:
12754 icode
= CODE_FOR_wrssdi
;
12757 case IX86_BUILTIN_WRUSSD
:
12758 icode
= CODE_FOR_wrusssi
;
12761 case IX86_BUILTIN_WRUSSQ
:
12762 icode
= CODE_FOR_wrussdi
;
12766 op0
= force_reg (mode
, op0
);
12767 if (!address_operand (op1
, VOIDmode
))
12769 op2
= convert_memory_address (Pmode
, op1
);
12770 op1
= copy_addr_to_reg (op2
);
12772 emit_insn (GEN_FCN (icode
) (op0
, gen_rtx_MEM (mode
, op1
)));
12779 if (fcode
>= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
12780 && fcode
<= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST
)
12782 i
= fcode
- IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
;
12783 return ix86_expand_special_args_builtin (bdesc_special_args
+ i
, exp
,
12787 if (fcode
>= IX86_BUILTIN__BDESC_ARGS_FIRST
12788 && fcode
<= IX86_BUILTIN__BDESC_ARGS_LAST
)
12790 i
= fcode
- IX86_BUILTIN__BDESC_ARGS_FIRST
;
12791 rtx (*fcn
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
12792 rtx (*fcn_mask
) (rtx
, rtx
, rtx
, rtx
, rtx
);
12793 rtx (*fcn_maskz
) (rtx
, rtx
, rtx
, rtx
, rtx
, rtx
);
12795 machine_mode mode
, wide_mode
, nar_mode
;
12797 nar_mode
= V4SFmode
;
12799 wide_mode
= V64SFmode
;
12800 fcn_mask
= gen_avx5124fmaddps_4fmaddps_mask
;
12801 fcn_maskz
= gen_avx5124fmaddps_4fmaddps_maskz
;
12805 case IX86_BUILTIN_4FMAPS
:
12806 fcn
= gen_avx5124fmaddps_4fmaddps
;
12810 case IX86_BUILTIN_4DPWSSD
:
12811 nar_mode
= V4SImode
;
12813 wide_mode
= V64SImode
;
12814 fcn
= gen_avx5124vnniw_vp4dpwssd
;
12818 case IX86_BUILTIN_4DPWSSDS
:
12819 nar_mode
= V4SImode
;
12821 wide_mode
= V64SImode
;
12822 fcn
= gen_avx5124vnniw_vp4dpwssds
;
12826 case IX86_BUILTIN_4FNMAPS
:
12827 fcn
= gen_avx5124fmaddps_4fnmaddps
;
12831 case IX86_BUILTIN_4FNMAPS_MASK
:
12832 fcn_mask
= gen_avx5124fmaddps_4fnmaddps_mask
;
12833 fcn_maskz
= gen_avx5124fmaddps_4fnmaddps_maskz
;
12836 case IX86_BUILTIN_4DPWSSD_MASK
:
12837 nar_mode
= V4SImode
;
12839 wide_mode
= V64SImode
;
12840 fcn_mask
= gen_avx5124vnniw_vp4dpwssd_mask
;
12841 fcn_maskz
= gen_avx5124vnniw_vp4dpwssd_maskz
;
12844 case IX86_BUILTIN_4DPWSSDS_MASK
:
12845 nar_mode
= V4SImode
;
12847 wide_mode
= V64SImode
;
12848 fcn_mask
= gen_avx5124vnniw_vp4dpwssds_mask
;
12849 fcn_maskz
= gen_avx5124vnniw_vp4dpwssds_maskz
;
12852 case IX86_BUILTIN_4FMAPS_MASK
:
12862 wide_reg
= gen_reg_rtx (wide_mode
);
12863 for (i
= 0; i
< 4; i
++)
12865 args
[i
] = CALL_EXPR_ARG (exp
, i
);
12866 ops
[i
] = expand_normal (args
[i
]);
12868 emit_move_insn (gen_rtx_SUBREG (mode
, wide_reg
, i
* 64),
12872 accum
= expand_normal (CALL_EXPR_ARG (exp
, 4));
12873 accum
= force_reg (mode
, accum
);
12875 addr
= expand_normal (CALL_EXPR_ARG (exp
, 5));
12876 addr
= force_reg (Pmode
, addr
);
12878 mem
= gen_rtx_MEM (nar_mode
, addr
);
12880 target
= gen_reg_rtx (mode
);
12882 emit_move_insn (target
, accum
);
12885 emit_insn (fcn (target
, accum
, wide_reg
, mem
));
12889 merge
= expand_normal (CALL_EXPR_ARG (exp
, 6));
12891 mask
= expand_normal (CALL_EXPR_ARG (exp
, 7));
12893 if (CONST_INT_P (mask
))
12894 mask
= fixup_modeless_constant (mask
, HImode
);
12896 mask
= force_reg (HImode
, mask
);
12898 if (GET_MODE (mask
) != HImode
)
12899 mask
= gen_rtx_SUBREG (HImode
, mask
, 0);
12901 /* If merge is 0 then we're about to emit z-masked variant. */
12902 if (const0_operand (merge
, mode
))
12903 emit_insn (fcn_maskz (target
, accum
, wide_reg
, mem
, merge
, mask
));
12904 /* If merge is the same as accum then emit merge-masked variant. */
12905 else if (CALL_EXPR_ARG (exp
, 6) == CALL_EXPR_ARG (exp
, 4))
12907 merge
= force_reg (mode
, merge
);
12908 emit_insn (fcn_mask (target
, wide_reg
, mem
, merge
, mask
));
12910 /* Merge with something unknown might happen if we z-mask w/ -O0. */
12913 target
= gen_reg_rtx (mode
);
12914 emit_move_insn (target
, merge
);
12915 emit_insn (fcn_mask (target
, wide_reg
, mem
, target
, mask
));
12921 case IX86_BUILTIN_4FNMASS
:
12922 fcn
= gen_avx5124fmaddps_4fnmaddss
;
12926 case IX86_BUILTIN_4FMASS
:
12927 fcn
= gen_avx5124fmaddps_4fmaddss
;
12931 case IX86_BUILTIN_4FNMASS_MASK
:
12932 fcn_mask
= gen_avx5124fmaddps_4fnmaddss_mask
;
12933 fcn_maskz
= gen_avx5124fmaddps_4fnmaddss_maskz
;
12936 case IX86_BUILTIN_4FMASS_MASK
:
12945 fcn_mask
= gen_avx5124fmaddps_4fmaddss_mask
;
12946 fcn_maskz
= gen_avx5124fmaddps_4fmaddss_maskz
;
12950 wide_reg
= gen_reg_rtx (V64SFmode
);
12951 for (i
= 0; i
< 4; i
++)
12954 args
[i
] = CALL_EXPR_ARG (exp
, i
);
12955 ops
[i
] = expand_normal (args
[i
]);
12957 tmp
= gen_reg_rtx (SFmode
);
12958 emit_move_insn (tmp
, gen_rtx_SUBREG (SFmode
, ops
[i
], 0));
12960 emit_move_insn (gen_rtx_SUBREG (V16SFmode
, wide_reg
, i
* 64),
12961 gen_rtx_SUBREG (V16SFmode
, tmp
, 0));
12964 accum
= expand_normal (CALL_EXPR_ARG (exp
, 4));
12965 accum
= force_reg (V4SFmode
, accum
);
12967 addr
= expand_normal (CALL_EXPR_ARG (exp
, 5));
12968 addr
= force_reg (Pmode
, addr
);
12970 mem
= gen_rtx_MEM (V4SFmode
, addr
);
12972 target
= gen_reg_rtx (V4SFmode
);
12974 emit_move_insn (target
, accum
);
12977 emit_insn (fcn (target
, accum
, wide_reg
, mem
));
12981 merge
= expand_normal (CALL_EXPR_ARG (exp
, 6));
12983 mask
= expand_normal (CALL_EXPR_ARG (exp
, 7));
12985 if (CONST_INT_P (mask
))
12986 mask
= fixup_modeless_constant (mask
, QImode
);
12988 mask
= force_reg (QImode
, mask
);
12990 if (GET_MODE (mask
) != QImode
)
12991 mask
= gen_rtx_SUBREG (QImode
, mask
, 0);
12993 /* If merge is 0 then we're about to emit z-masked variant. */
12994 if (const0_operand (merge
, mode
))
12995 emit_insn (fcn_maskz (target
, accum
, wide_reg
, mem
, merge
, mask
));
12996 /* If merge is the same as accum then emit merge-masked
12998 else if (CALL_EXPR_ARG (exp
, 6) == CALL_EXPR_ARG (exp
, 4))
13000 merge
= force_reg (mode
, merge
);
13001 emit_insn (fcn_mask (target
, wide_reg
, mem
, merge
, mask
));
13003 /* Merge with something unknown might happen if we z-mask
13007 target
= gen_reg_rtx (mode
);
13008 emit_move_insn (target
, merge
);
13009 emit_insn (fcn_mask (target
, wide_reg
, mem
, target
, mask
));
13014 case IX86_BUILTIN_RDPID
:
13015 return ix86_expand_special_args_builtin (bdesc_args
+ i
, exp
,
13017 case IX86_BUILTIN_FABSQ
:
13018 case IX86_BUILTIN_COPYSIGNQ
:
13020 /* Emit a normal call if SSE isn't available. */
13021 return expand_call (exp
, target
, ignore
);
13024 return ix86_expand_args_builtin (bdesc_args
+ i
, exp
, target
);
13028 if (fcode
>= IX86_BUILTIN__BDESC_COMI_FIRST
13029 && fcode
<= IX86_BUILTIN__BDESC_COMI_LAST
)
13031 i
= fcode
- IX86_BUILTIN__BDESC_COMI_FIRST
;
13032 return ix86_expand_sse_comi (bdesc_comi
+ i
, exp
, target
);
13035 if (fcode
>= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
13036 && fcode
<= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST
)
13038 i
= fcode
- IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
;
13039 return ix86_expand_round_builtin (bdesc_round_args
+ i
, exp
, target
);
13042 if (fcode
>= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
13043 && fcode
<= IX86_BUILTIN__BDESC_PCMPESTR_LAST
)
13045 i
= fcode
- IX86_BUILTIN__BDESC_PCMPESTR_FIRST
;
13046 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr
+ i
, exp
, target
);
13049 if (fcode
>= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
13050 && fcode
<= IX86_BUILTIN__BDESC_PCMPISTR_LAST
)
13052 i
= fcode
- IX86_BUILTIN__BDESC_PCMPISTR_FIRST
;
13053 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr
+ i
, exp
, target
);
13056 if (fcode
>= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
13057 && fcode
<= IX86_BUILTIN__BDESC_MULTI_ARG_LAST
)
13059 i
= fcode
- IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
;
13060 const struct builtin_description
*d
= bdesc_multi_arg
+ i
;
13061 return ix86_expand_multi_arg_builtin (d
->icode
, exp
, target
,
13062 (enum ix86_builtin_func_type
)
13063 d
->flag
, d
->comparison
);
13066 if (fcode
>= IX86_BUILTIN__BDESC_CET_FIRST
13067 && fcode
<= IX86_BUILTIN__BDESC_CET_LAST
)
13069 i
= fcode
- IX86_BUILTIN__BDESC_CET_FIRST
;
13070 return ix86_expand_special_args_builtin (bdesc_cet
+ i
, exp
,
13074 if (fcode
>= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
13075 && fcode
<= IX86_BUILTIN__BDESC_CET_NORMAL_LAST
)
13077 i
= fcode
- IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
;
13078 return ix86_expand_special_args_builtin (bdesc_cet_rdssp
+ i
, exp
,
13082 gcc_unreachable ();
13085 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
13086 fill target with val via vec_duplicate. */
13089 ix86_vector_duplicate_value (machine_mode mode
, rtx target
, rtx val
)
13095 /* First attempt to recognize VAL as-is. */
13096 dup
= gen_vec_duplicate (mode
, val
);
13097 insn
= emit_insn (gen_rtx_SET (target
, dup
));
13098 if (recog_memoized (insn
) < 0)
13101 machine_mode innermode
= GET_MODE_INNER (mode
);
13104 /* If that fails, force VAL into a register. */
13107 reg
= force_reg (innermode
, val
);
13108 if (GET_MODE (reg
) != innermode
)
13109 reg
= gen_lowpart (innermode
, reg
);
13110 SET_SRC (PATTERN (insn
)) = gen_vec_duplicate (mode
, reg
);
13111 seq
= get_insns ();
13114 emit_insn_before (seq
, insn
);
13116 ok
= recog_memoized (insn
) >= 0;
/* Get a vector mode of the same size as the original but with elements
   twice as wide.  This is only guaranteed to apply to integral vectors.  */

static machine_mode
get_mode_wider_vector (machine_mode o)
{
  /* ??? Rely on the ordering that genmodes.c gives to vectors.  */
  machine_mode n = GET_MODE_WIDER_MODE (o).require ();
  gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
  gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
  return n;
}
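/* For illustration: relying on that ordering, a mode such as V16QImode is
   mapped to V8HImode here: the same vector size, half as many elements,
   each element twice as wide, which is exactly what the two asserts
   check.  */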
13135 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d
*d
);
13136 static bool expand_vec_perm_1 (struct expand_vec_perm_d
*d
);
13138 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13139 with all elements equal to VAR. Return true if successful. */
13142 ix86_expand_vector_init_duplicate (bool mmx_ok
, machine_mode mode
,
13143 rtx target
, rtx val
)
13167 return ix86_vector_duplicate_value (mode
, target
, val
);
13172 if (TARGET_SSE
|| TARGET_3DNOW_A
)
13176 val
= gen_lowpart (SImode
, val
);
13177 x
= gen_rtx_TRUNCATE (HImode
, val
);
13178 x
= gen_rtx_VEC_DUPLICATE (mode
, x
);
13179 emit_insn (gen_rtx_SET (target
, x
));
13191 return ix86_vector_duplicate_value (mode
, target
, val
);
13195 struct expand_vec_perm_d dperm
;
13199 memset (&dperm
, 0, sizeof (dperm
));
13200 dperm
.target
= target
;
13201 dperm
.vmode
= mode
;
13202 dperm
.nelt
= GET_MODE_NUNITS (mode
);
13203 dperm
.op0
= dperm
.op1
= gen_reg_rtx (mode
);
13204 dperm
.one_operand_p
= true;
13206 /* Extend to SImode using a paradoxical SUBREG. */
13207 tmp1
= gen_reg_rtx (SImode
);
13208 emit_move_insn (tmp1
, gen_lowpart (SImode
, val
));
13210 /* Insert the SImode value as low element of a V4SImode vector. */
13211 tmp2
= gen_reg_rtx (V4SImode
);
13212 emit_insn (gen_vec_setv4si_0 (tmp2
, CONST0_RTX (V4SImode
), tmp1
));
13213 emit_move_insn (dperm
.op0
, gen_lowpart (mode
, tmp2
));
13215 ok
= (expand_vec_perm_1 (&dperm
)
13216 || expand_vec_perm_broadcast_1 (&dperm
));
13224 return ix86_vector_duplicate_value (mode
, target
, val
);
      /* Replicate the value once into the next wider mode and recurse.  */
      {
        machine_mode smode, wsmode, wvmode;
        rtx x;

        smode = GET_MODE_INNER (mode);
        wvmode = get_mode_wider_vector (mode);
        wsmode = GET_MODE_INNER (wvmode);

        val = convert_modes (wsmode, smode, val, true);
        x = expand_simple_binop (wsmode, ASHIFT, val,
                                 GEN_INT (GET_MODE_BITSIZE (smode)),
                                 NULL_RTX, 1, OPTAB_LIB_WIDEN);
        val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);

        x = gen_reg_rtx (wvmode);
        ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
        gcc_assert (ok);

        emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
        return ok;
      }
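      /* For illustration: for a V16QImode broadcast this widens VAL to
         HImode as (val << 8) | val and recurses with V8HImode; the
         resulting register is then reinterpreted in the original vector
         mode via gen_lowpart.  */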
13256 return ix86_vector_duplicate_value (mode
, target
, val
);
13259 machine_mode hvmode
= (mode
== V16HImode
? V8HImode
: V16QImode
);
13260 rtx x
= gen_reg_rtx (hvmode
);
13262 ok
= ix86_expand_vector_init_duplicate (false, hvmode
, x
, val
);
13265 x
= gen_rtx_VEC_CONCAT (mode
, x
, x
);
13266 emit_insn (gen_rtx_SET (target
, x
));
      if (TARGET_AVX512BW)
	return ix86_vector_duplicate_value (mode, target, val);

      machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
      rtx x = gen_reg_rtx (hvmode);

      ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);

      x = gen_rtx_VEC_CONCAT (mode, x, x);
      emit_insn (gen_rtx_SET (target, x));
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   whose ONE_VAR element is VAR, and other elements are zero.  Return true
   if successful.  */

static bool
ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
				     rtx target, rtx var, int one_var)
{
  machine_mode vsimode;
  rtx new_target;
  rtx x, tmp;
  bool use_vector_set = false;
  rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
      /* For SSE4.1, we normally use vector set.  But if the second
	 element is zero and inter-unit moves are OK, we use movq
	 instead.  */
      use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
			&& !(TARGET_INTER_UNIT_MOVES_TO_VEC
			     && one_var == 0));
13319 use_vector_set
= TARGET_SSE4_1
;
13322 use_vector_set
= TARGET_SSE2
;
13325 use_vector_set
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
13328 use_vector_set
= TARGET_SSE
|| TARGET_3DNOW_A
;
13332 use_vector_set
= TARGET_AVX
;
13335 use_vector_set
= TARGET_AVX
;
13336 gen_vec_set_0
= gen_vec_setv8si_0
;
13339 use_vector_set
= TARGET_AVX
;
13340 gen_vec_set_0
= gen_vec_setv8sf_0
;
13343 use_vector_set
= TARGET_AVX
;
13344 gen_vec_set_0
= gen_vec_setv4df_0
;
13347 /* Use ix86_expand_vector_set in 64bit mode only. */
13348 use_vector_set
= TARGET_AVX
&& TARGET_64BIT
;
13349 gen_vec_set_0
= gen_vec_setv4di_0
;
13352 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
13353 gen_vec_set_0
= gen_vec_setv16si_0
;
13356 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
13357 gen_vec_set_0
= gen_vec_setv16sf_0
;
13360 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
13361 gen_vec_set_0
= gen_vec_setv8df_0
;
13364 /* Use ix86_expand_vector_set in 64bit mode only. */
13365 use_vector_set
= TARGET_AVX512F
&& TARGET_64BIT
&& one_var
== 0;
13366 gen_vec_set_0
= gen_vec_setv8di_0
;
13372 if (use_vector_set
)
13374 if (gen_vec_set_0
&& one_var
== 0)
13376 var
= force_reg (GET_MODE_INNER (mode
), var
);
13377 emit_insn (gen_vec_set_0 (target
, CONST0_RTX (mode
), var
));
13380 emit_insn (gen_rtx_SET (target
, CONST0_RTX (mode
)));
13381 var
= force_reg (GET_MODE_INNER (mode
), var
);
13382 ix86_expand_vector_set (mmx_ok
, target
, var
, one_var
);
13398 var
= force_reg (GET_MODE_INNER (mode
), var
);
13399 x
= gen_rtx_VEC_CONCAT (mode
, var
, CONST0_RTX (GET_MODE_INNER (mode
)));
13400 emit_insn (gen_rtx_SET (target
, x
));
13405 if (!REG_P (target
) || REGNO (target
) < FIRST_PSEUDO_REGISTER
)
13406 new_target
= gen_reg_rtx (mode
);
13408 new_target
= target
;
13409 var
= force_reg (GET_MODE_INNER (mode
), var
);
13410 x
= gen_rtx_VEC_DUPLICATE (mode
, var
);
13411 x
= gen_rtx_VEC_MERGE (mode
, x
, CONST0_RTX (mode
), const1_rtx
);
13412 emit_insn (gen_rtx_SET (new_target
, x
));
13415 /* We need to shuffle the value to the correct position, so
13416 create a new pseudo to store the intermediate result. */
13418 /* With SSE2, we can use the integer shuffle insns. */
13419 if (mode
!= V4SFmode
&& TARGET_SSE2
)
13421 emit_insn (gen_sse2_pshufd_1 (new_target
, new_target
,
13423 GEN_INT (one_var
== 1 ? 0 : 1),
13424 GEN_INT (one_var
== 2 ? 0 : 1),
13425 GEN_INT (one_var
== 3 ? 0 : 1)));
13426 if (target
!= new_target
)
13427 emit_move_insn (target
, new_target
);
13431 /* Otherwise convert the intermediate result to V4SFmode and
13432 use the SSE1 shuffle instructions. */
13433 if (mode
!= V4SFmode
)
13435 tmp
= gen_reg_rtx (V4SFmode
);
13436 emit_move_insn (tmp
, gen_lowpart (V4SFmode
, new_target
));
13441 emit_insn (gen_sse_shufps_v4sf (tmp
, tmp
, tmp
,
13443 GEN_INT (one_var
== 1 ? 0 : 1),
13444 GEN_INT (one_var
== 2 ? 0+4 : 1+4),
13445 GEN_INT (one_var
== 3 ? 0+4 : 1+4)));
13447 if (mode
!= V4SFmode
)
13448 emit_move_insn (target
, gen_lowpart (V4SImode
, tmp
));
13449 else if (tmp
!= target
)
13450 emit_move_insn (target
, tmp
);
13452 else if (target
!= new_target
)
13453 emit_move_insn (target
, new_target
);
13458 vsimode
= V4SImode
;
13464 vsimode
= V2SImode
;
13470 /* Zero extend the variable element to SImode and recurse. */
13471 var
= convert_modes (SImode
, GET_MODE_INNER (mode
), var
, true);
13473 x
= gen_reg_rtx (vsimode
);
13474 if (!ix86_expand_vector_init_one_nonzero (mmx_ok
, vsimode
, x
,
13476 gcc_unreachable ();
13478 emit_move_insn (target
, gen_lowpart (mode
, x
));
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   consisting of the values in VALS.  It is known that all elements
   except ONE_VAR are constants.  Return true if successful.  */

static bool
ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals, int one_var)
{
  rtx var = XVECEXP (vals, 0, one_var);
  machine_mode wmode;
  rtx const_vec, x;

  const_vec = copy_rtx (vals);
  XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
  const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
13508 /* For the two element vectors, it's just as easy to use
13509 the general case. */
13513 /* Use ix86_expand_vector_set in 64bit mode only. */
13534 if (TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
)
13539 /* There's no way to set one QImode entry easily. Combine
13540 the variable value with its adjacent constant value, and
13541 promote to an HImode set. */
13542 x
= XVECEXP (vals
, 0, one_var
^ 1);
13545 var
= convert_modes (HImode
, QImode
, var
, true);
13546 var
= expand_simple_binop (HImode
, ASHIFT
, var
, GEN_INT (8),
13547 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
13548 x
= GEN_INT (INTVAL (x
) & 0xff);
13552 var
= convert_modes (HImode
, QImode
, var
, true);
13553 x
= gen_int_mode (UINTVAL (x
) << 8, HImode
);
13555 if (x
!= const0_rtx
)
13556 var
= expand_simple_binop (HImode
, IOR
, var
, x
, var
,
13557 1, OPTAB_LIB_WIDEN
);
13559 x
= gen_reg_rtx (wmode
);
13560 emit_move_insn (x
, gen_lowpart (wmode
, const_vec
));
13561 ix86_expand_vector_set (mmx_ok
, x
, var
, one_var
>> 1);
13563 emit_move_insn (target
, gen_lowpart (mode
, x
));
  emit_move_insn (target, const_vec);
  ix86_expand_vector_set (mmx_ok, target, var, one_var);

  return true;
}
/* A subroutine of ix86_expand_vector_init_general.  Use vector
   concatenate to handle the most general case: all values variable,
   and none identical.  */

static void
ix86_expand_vector_init_concat (machine_mode mode,
				rtx target, rtx *ops, int n)
{
  machine_mode half_mode = VOIDmode;
13594 half_mode
= V8SImode
;
13597 half_mode
= V8SFmode
;
13600 half_mode
= V4DImode
;
13603 half_mode
= V4DFmode
;
13606 half_mode
= V4SImode
;
13609 half_mode
= V4SFmode
;
13612 half_mode
= V2DImode
;
13615 half_mode
= V2DFmode
;
13618 half_mode
= V2SImode
;
13621 half_mode
= V2SFmode
;
13624 half_mode
= DImode
;
13627 half_mode
= SImode
;
13630 half_mode
= DFmode
;
13633 half_mode
= SFmode
;
13636 gcc_unreachable ();
13639 if (!register_operand (ops
[1], half_mode
))
13640 ops
[1] = force_reg (half_mode
, ops
[1]);
13641 if (!register_operand (ops
[0], half_mode
))
13642 ops
[0] = force_reg (half_mode
, ops
[0]);
13643 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, ops
[0],
13651 half_mode
= V2DImode
;
13654 half_mode
= V2DFmode
;
13657 half_mode
= V2SImode
;
13660 half_mode
= V2SFmode
;
13663 gcc_unreachable ();
13671 half_mode
= V4DImode
;
13674 half_mode
= V4DFmode
;
13677 half_mode
= V4SImode
;
13680 half_mode
= V4SFmode
;
13683 gcc_unreachable ();
13691 half_mode
= V8SImode
;
13694 half_mode
= V8SFmode
;
13697 gcc_unreachable ();
13702 /* FIXME: We process inputs backward to help RA. PR 36222. */
13704 for (j
= 1; j
!= -1; j
--)
13706 half
[j
] = gen_reg_rtx (half_mode
);
13710 v
= gen_rtvec (2, ops
[i
-1], ops
[i
]);
13714 v
= gen_rtvec (4, ops
[i
-3], ops
[i
-2], ops
[i
-1], ops
[i
]);
13718 v
= gen_rtvec (8, ops
[i
-7], ops
[i
-6], ops
[i
-5], ops
[i
-4],
13719 ops
[i
-3], ops
[i
-2], ops
[i
-1], ops
[i
]);
13723 gcc_unreachable ();
13725 ix86_expand_vector_init (false, half
[j
],
13726 gen_rtx_PARALLEL (half_mode
, v
));
13729 ix86_expand_vector_init_concat (mode
, target
, half
, 2);
13733 gcc_unreachable ();
/* A subroutine of ix86_expand_vector_init_general.  Use vector
   interleave to handle the most general case: all values variable,
   and none identical.  */

static void
ix86_expand_vector_init_interleave (machine_mode mode,
				    rtx target, rtx *ops, int n)
{
  machine_mode first_imode, second_imode, third_imode, inner_mode;
  int i, j;
  rtx op0, op1;
  rtx (*gen_load_even) (rtx, rtx, rtx);
  rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
  rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
13755 gen_load_even
= gen_vec_setv8hi
;
13756 gen_interleave_first_low
= gen_vec_interleave_lowv4si
;
13757 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
13758 inner_mode
= HImode
;
13759 first_imode
= V4SImode
;
13760 second_imode
= V2DImode
;
13761 third_imode
= VOIDmode
;
13764 gen_load_even
= gen_vec_setv16qi
;
13765 gen_interleave_first_low
= gen_vec_interleave_lowv8hi
;
13766 gen_interleave_second_low
= gen_vec_interleave_lowv4si
;
13767 inner_mode
= QImode
;
13768 first_imode
= V8HImode
;
13769 second_imode
= V4SImode
;
13770 third_imode
= V2DImode
;
13773 gcc_unreachable ();
13776 for (i
= 0; i
< n
; i
++)
      /* Extend the odd element to SImode using a paradoxical SUBREG.  */
13779 op0
= gen_reg_rtx (SImode
);
13780 emit_move_insn (op0
, gen_lowpart (SImode
, ops
[i
+ i
]));
13782 /* Insert the SImode value as low element of V4SImode vector. */
13783 op1
= gen_reg_rtx (V4SImode
);
13784 op0
= gen_rtx_VEC_MERGE (V4SImode
,
13785 gen_rtx_VEC_DUPLICATE (V4SImode
,
13787 CONST0_RTX (V4SImode
),
13789 emit_insn (gen_rtx_SET (op1
, op0
));
      /* Cast the V4SImode vector back to a vector in original mode.  */
13792 op0
= gen_reg_rtx (mode
);
13793 emit_move_insn (op0
, gen_lowpart (mode
, op1
));
13795 /* Load even elements into the second position. */
13796 emit_insn (gen_load_even (op0
,
13797 force_reg (inner_mode
,
13801 /* Cast vector to FIRST_IMODE vector. */
13802 ops
[i
] = gen_reg_rtx (first_imode
);
13803 emit_move_insn (ops
[i
], gen_lowpart (first_imode
, op0
));
13806 /* Interleave low FIRST_IMODE vectors. */
13807 for (i
= j
= 0; i
< n
; i
+= 2, j
++)
13809 op0
= gen_reg_rtx (first_imode
);
13810 emit_insn (gen_interleave_first_low (op0
, ops
[i
], ops
[i
+ 1]));
13812 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
13813 ops
[j
] = gen_reg_rtx (second_imode
);
13814 emit_move_insn (ops
[j
], gen_lowpart (second_imode
, op0
));
13817 /* Interleave low SECOND_IMODE vectors. */
13818 switch (second_imode
)
13821 for (i
= j
= 0; i
< n
/ 2; i
+= 2, j
++)
13823 op0
= gen_reg_rtx (second_imode
);
13824 emit_insn (gen_interleave_second_low (op0
, ops
[i
],
13827 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
13829 ops
[j
] = gen_reg_rtx (third_imode
);
13830 emit_move_insn (ops
[j
], gen_lowpart (third_imode
, op0
));
13832 second_imode
= V2DImode
;
13833 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
13837 op0
= gen_reg_rtx (second_imode
);
13838 emit_insn (gen_interleave_second_low (op0
, ops
[0],
13841 /* Cast the SECOND_IMODE vector back to a vector on original
13843 emit_insn (gen_rtx_SET (target
, gen_lowpart (mode
, op0
)));
13847 gcc_unreachable ();
/* A subroutine of ix86_expand_vector_init.  Handle the most general case:
   all values variable, and none identical.  */

static void
ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals)
{
  rtx ops[64], op0, op1, op2, op3, op4, op5;
  machine_mode half_mode = VOIDmode;
  machine_mode quarter_mode = VOIDmode;
  int n, i;
13867 if (!mmx_ok
&& !TARGET_SSE
)
13883 n
= GET_MODE_NUNITS (mode
);
13884 for (i
= 0; i
< n
; i
++)
13885 ops
[i
] = XVECEXP (vals
, 0, i
);
13886 ix86_expand_vector_init_concat (mode
, target
, ops
, n
);
13890 for (i
= 0; i
< 2; i
++)
13891 ops
[i
] = gen_lowpart (V2DImode
, XVECEXP (vals
, 0, i
));
13892 op0
= gen_reg_rtx (V4DImode
);
13893 ix86_expand_vector_init_concat (V4DImode
, op0
, ops
, 2);
13894 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), op0
));
13898 for (i
= 0; i
< 4; i
++)
13899 ops
[i
] = gen_lowpart (V2DImode
, XVECEXP (vals
, 0, i
));
13900 ops
[4] = gen_reg_rtx (V4DImode
);
13901 ix86_expand_vector_init_concat (V4DImode
, ops
[4], ops
, 2);
13902 ops
[5] = gen_reg_rtx (V4DImode
);
13903 ix86_expand_vector_init_concat (V4DImode
, ops
[5], ops
+ 2, 2);
13904 op0
= gen_reg_rtx (V8DImode
);
13905 ix86_expand_vector_init_concat (V8DImode
, op0
, ops
+ 4, 2);
13906 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), op0
));
13910 half_mode
= V16QImode
;
13914 half_mode
= V8HImode
;
13918 n
= GET_MODE_NUNITS (mode
);
13919 for (i
= 0; i
< n
; i
++)
13920 ops
[i
] = XVECEXP (vals
, 0, i
);
13921 op0
= gen_reg_rtx (half_mode
);
13922 op1
= gen_reg_rtx (half_mode
);
13923 ix86_expand_vector_init_interleave (half_mode
, op0
, ops
,
13925 ix86_expand_vector_init_interleave (half_mode
, op1
,
13926 &ops
[n
>> 1], n
>> 2);
13927 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, op0
, op1
)));
13931 quarter_mode
= V16QImode
;
13932 half_mode
= V32QImode
;
13936 quarter_mode
= V8HImode
;
13937 half_mode
= V16HImode
;
13941 n
= GET_MODE_NUNITS (mode
);
13942 for (i
= 0; i
< n
; i
++)
13943 ops
[i
] = XVECEXP (vals
, 0, i
);
13944 op0
= gen_reg_rtx (quarter_mode
);
13945 op1
= gen_reg_rtx (quarter_mode
);
13946 op2
= gen_reg_rtx (quarter_mode
);
13947 op3
= gen_reg_rtx (quarter_mode
);
13948 op4
= gen_reg_rtx (half_mode
);
13949 op5
= gen_reg_rtx (half_mode
);
13950 ix86_expand_vector_init_interleave (quarter_mode
, op0
, ops
,
13952 ix86_expand_vector_init_interleave (quarter_mode
, op1
,
13953 &ops
[n
>> 2], n
>> 3);
13954 ix86_expand_vector_init_interleave (quarter_mode
, op2
,
13955 &ops
[n
>> 1], n
>> 3);
13956 ix86_expand_vector_init_interleave (quarter_mode
, op3
,
13957 &ops
[(n
>> 1) | (n
>> 2)], n
>> 3);
13958 emit_insn (gen_rtx_SET (op4
, gen_rtx_VEC_CONCAT (half_mode
, op0
, op1
)));
13959 emit_insn (gen_rtx_SET (op5
, gen_rtx_VEC_CONCAT (half_mode
, op2
, op3
)));
13960 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, op4
, op5
)));
13964 if (!TARGET_SSE4_1
)
13972 /* Don't use ix86_expand_vector_init_interleave if we can't
13973 move from GPR to SSE register directly. */
13974 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
)
13977 n
= GET_MODE_NUNITS (mode
);
13978 for (i
= 0; i
< n
; i
++)
13979 ops
[i
] = XVECEXP (vals
, 0, i
);
13980 ix86_expand_vector_init_interleave (mode
, target
, ops
, n
>> 1);
13988 gcc_unreachable ();
13992 int i
, j
, n_elts
, n_words
, n_elt_per_word
;
13993 machine_mode inner_mode
;
13994 rtx words
[4], shift
;
13996 inner_mode
= GET_MODE_INNER (mode
);
13997 n_elts
= GET_MODE_NUNITS (mode
);
13998 n_words
= GET_MODE_SIZE (mode
) / UNITS_PER_WORD
;
13999 n_elt_per_word
= n_elts
/ n_words
;
14000 shift
= GEN_INT (GET_MODE_BITSIZE (inner_mode
));
14002 for (i
= 0; i
< n_words
; ++i
)
14004 rtx word
= NULL_RTX
;
14006 for (j
= 0; j
< n_elt_per_word
; ++j
)
14008 rtx elt
= XVECEXP (vals
, 0, (i
+1)*n_elt_per_word
- j
- 1);
14009 elt
= convert_modes (word_mode
, inner_mode
, elt
, true);
14015 word
= expand_simple_binop (word_mode
, ASHIFT
, word
, shift
,
14016 word
, 1, OPTAB_LIB_WIDEN
);
14017 word
= expand_simple_binop (word_mode
, IOR
, word
, elt
,
14018 word
, 1, OPTAB_LIB_WIDEN
);
14026 emit_move_insn (target
, gen_lowpart (mode
, words
[0]));
14027 else if (n_words
== 2)
14029 rtx tmp
= gen_reg_rtx (mode
);
14030 emit_clobber (tmp
);
14031 emit_move_insn (gen_lowpart (word_mode
, tmp
), words
[0]);
14032 emit_move_insn (gen_highpart (word_mode
, tmp
), words
[1]);
14033 emit_move_insn (target
, tmp
);
14035 else if (n_words
== 4)
14037 rtx tmp
= gen_reg_rtx (V4SImode
);
14038 gcc_assert (word_mode
== SImode
);
14039 vals
= gen_rtx_PARALLEL (V4SImode
, gen_rtvec_v (4, words
));
14040 ix86_expand_vector_init_general (false, V4SImode
, tmp
, vals
);
14041 emit_move_insn (target
, gen_lowpart (mode
, tmp
));
14044 gcc_unreachable ();
/* Initialize vector TARGET via VALS.  Suppress the use of MMX
   instructions unless MMX_OK is true.  */

void
ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int n_elts = GET_MODE_NUNITS (mode);
  int n_var = 0, one_var = -1;
  bool all_same = true, all_const_zero = true;
  int i;
  rtx x;
14062 /* Handle first initialization from vector elts. */
14063 if (n_elts
!= XVECLEN (vals
, 0))
14065 rtx subtarget
= target
;
14066 x
= XVECEXP (vals
, 0, 0);
14067 gcc_assert (GET_MODE_INNER (GET_MODE (x
)) == inner_mode
);
14068 if (GET_MODE_NUNITS (GET_MODE (x
)) * 2 == n_elts
)
14070 rtx ops
[2] = { XVECEXP (vals
, 0, 0), XVECEXP (vals
, 0, 1) };
14071 if (inner_mode
== QImode
|| inner_mode
== HImode
)
14073 unsigned int n_bits
= n_elts
* GET_MODE_SIZE (inner_mode
);
14074 mode
= mode_for_vector (SImode
, n_bits
/ 4).require ();
14075 inner_mode
= mode_for_vector (SImode
, n_bits
/ 8).require ();
14076 ops
[0] = gen_lowpart (inner_mode
, ops
[0]);
14077 ops
[1] = gen_lowpart (inner_mode
, ops
[1]);
14078 subtarget
= gen_reg_rtx (mode
);
14080 ix86_expand_vector_init_concat (mode
, subtarget
, ops
, 2);
14081 if (subtarget
!= target
)
14082 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), subtarget
));
14085 gcc_unreachable ();
  for (i = 0; i < n_elts; ++i)
    {
      x = XVECEXP (vals, 0, i);
      if (!(CONST_SCALAR_INT_P (x)
	    || CONST_DOUBLE_P (x)
	    || CONST_FIXED_P (x)))
	n_var++, one_var = i;
      else if (x != CONST0_RTX (inner_mode))
	all_const_zero = false;
      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
	all_same = false;
    }
  /* Constants are best loaded from the constant pool.  */
  if (n_var == 0)
    {
      emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
      return;
    }

  /* If all values are identical, broadcast the value.  */
  if (all_same
      && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
					    XVECEXP (vals, 0, 0)))
    return;
  /* Values where only one field is non-constant are best loaded from
     the pool and overwritten via move later.  */
  if (n_var == 1)
    {
      if (all_const_zero
	  && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
						  XVECEXP (vals, 0, one_var),
						  one_var))
	return;

      if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
	return;
    }

  ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
}
void
ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  machine_mode half_mode;
  bool use_vec_merge = false;
  rtx tmp;
14139 static rtx (*gen_extract
[6][2]) (rtx
, rtx
)
14141 { gen_vec_extract_lo_v32qi
, gen_vec_extract_hi_v32qi
},
14142 { gen_vec_extract_lo_v16hi
, gen_vec_extract_hi_v16hi
},
14143 { gen_vec_extract_lo_v8si
, gen_vec_extract_hi_v8si
},
14144 { gen_vec_extract_lo_v4di
, gen_vec_extract_hi_v4di
},
14145 { gen_vec_extract_lo_v8sf
, gen_vec_extract_hi_v8sf
},
14146 { gen_vec_extract_lo_v4df
, gen_vec_extract_hi_v4df
}
14148 static rtx (*gen_insert
[6][2]) (rtx
, rtx
, rtx
)
14150 { gen_vec_set_lo_v32qi
, gen_vec_set_hi_v32qi
},
14151 { gen_vec_set_lo_v16hi
, gen_vec_set_hi_v16hi
},
14152 { gen_vec_set_lo_v8si
, gen_vec_set_hi_v8si
},
14153 { gen_vec_set_lo_v4di
, gen_vec_set_hi_v4di
},
14154 { gen_vec_set_lo_v8sf
, gen_vec_set_hi_v8sf
},
14155 { gen_vec_set_lo_v4df
, gen_vec_set_hi_v4df
}
14158 machine_mode mmode
= VOIDmode
;
14159 rtx (*gen_blendm
) (rtx
, rtx
, rtx
, rtx
);
14164 use_vec_merge
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
14172 tmp
= gen_reg_rtx (GET_MODE_INNER (mode
));
14173 ix86_expand_vector_extract (true, tmp
, target
, 1 - elt
);
14175 tmp
= gen_rtx_VEC_CONCAT (mode
, val
, tmp
);
14177 tmp
= gen_rtx_VEC_CONCAT (mode
, tmp
, val
);
14178 emit_insn (gen_rtx_SET (target
, tmp
));
14184 use_vec_merge
= TARGET_SSE4_1
&& TARGET_64BIT
;
14188 tmp
= gen_reg_rtx (GET_MODE_INNER (mode
));
14189 ix86_expand_vector_extract (false, tmp
, target
, 1 - elt
);
14191 tmp
= gen_rtx_VEC_CONCAT (mode
, val
, tmp
);
14193 tmp
= gen_rtx_VEC_CONCAT (mode
, tmp
, val
);
14194 emit_insn (gen_rtx_SET (target
, tmp
));
14198 /* NB: For ELT == 0, use standard scalar operation patterns which
14199 preserve the rest of the vector for combiner:
14202 (vec_duplicate:V2DF (reg:DF))
14212 /* For the two element vectors, we implement a VEC_CONCAT with
14213 the extraction of the other element. */
14215 tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, GEN_INT (1 - elt
)));
14216 tmp
= gen_rtx_VEC_SELECT (inner_mode
, target
, tmp
);
14219 op0
= val
, op1
= tmp
;
14221 op0
= tmp
, op1
= val
;
14223 tmp
= gen_rtx_VEC_CONCAT (mode
, op0
, op1
);
14224 emit_insn (gen_rtx_SET (target
, tmp
));
14229 use_vec_merge
= TARGET_SSE4_1
;
14236 use_vec_merge
= true;
14240 /* tmp = target = A B C D */
14241 tmp
= copy_to_reg (target
);
14242 /* target = A A B B */
14243 emit_insn (gen_vec_interleave_lowv4sf (target
, target
, target
));
14244 /* target = X A B B */
14245 ix86_expand_vector_set (false, target
, val
, 0);
14246 /* target = A X C D */
14247 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
14248 const1_rtx
, const0_rtx
,
14249 GEN_INT (2+4), GEN_INT (3+4)));
14253 /* tmp = target = A B C D */
14254 tmp
= copy_to_reg (target
);
14255 /* tmp = X B C D */
14256 ix86_expand_vector_set (false, tmp
, val
, 0);
14257 /* target = A B X D */
14258 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
14259 const0_rtx
, const1_rtx
,
14260 GEN_INT (0+4), GEN_INT (3+4)));
14264 /* tmp = target = A B C D */
14265 tmp
= copy_to_reg (target
);
14266 /* tmp = X B C D */
14267 ix86_expand_vector_set (false, tmp
, val
, 0);
14268 /* target = A B X D */
14269 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
14270 const0_rtx
, const1_rtx
,
14271 GEN_INT (2+4), GEN_INT (0+4)));
14275 gcc_unreachable ();
14280 use_vec_merge
= TARGET_SSE4_1
;
14284 /* Element 0 handled by vec_merge below. */
14287 use_vec_merge
= true;
14293 /* With SSE2, use integer shuffles to swap element 0 and ELT,
14294 store into element 0, then shuffle them back. */
14298 order
[0] = GEN_INT (elt
);
14299 order
[1] = const1_rtx
;
14300 order
[2] = const2_rtx
;
14301 order
[3] = GEN_INT (3);
14302 order
[elt
] = const0_rtx
;
14304 emit_insn (gen_sse2_pshufd_1 (target
, target
, order
[0],
14305 order
[1], order
[2], order
[3]));
14307 ix86_expand_vector_set (false, target
, val
, 0);
14309 emit_insn (gen_sse2_pshufd_1 (target
, target
, order
[0],
14310 order
[1], order
[2], order
[3]));
14314 /* For SSE1, we have to reuse the V4SF code. */
14315 rtx t
= gen_reg_rtx (V4SFmode
);
14316 emit_move_insn (t
, gen_lowpart (V4SFmode
, target
));
14317 ix86_expand_vector_set (false, t
, gen_lowpart (SFmode
, val
), elt
);
14318 emit_move_insn (target
, gen_lowpart (mode
, t
));
14323 use_vec_merge
= TARGET_SSE2
;
14326 use_vec_merge
= mmx_ok
&& (TARGET_SSE
|| TARGET_3DNOW_A
);
14330 use_vec_merge
= TARGET_SSE4_1
;
14334 use_vec_merge
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
14338 half_mode
= V16QImode
;
14344 half_mode
= V8HImode
;
14350 half_mode
= V4SImode
;
14356 half_mode
= V2DImode
;
14362 half_mode
= V4SFmode
;
14368 half_mode
= V2DFmode
;
14374 /* Compute offset. */
14378 gcc_assert (i
<= 1);
14380 /* Extract the half. */
14381 tmp
= gen_reg_rtx (half_mode
);
14382 emit_insn (gen_extract
[j
][i
] (tmp
, target
));
14384 /* Put val in tmp at elt. */
14385 ix86_expand_vector_set (false, tmp
, val
, elt
);
14388 emit_insn (gen_insert
[j
][i
] (target
, target
, tmp
));
14392 if (TARGET_AVX512F
)
14395 gen_blendm
= gen_avx512f_blendmv8df
;
14400 if (TARGET_AVX512F
)
14403 gen_blendm
= gen_avx512f_blendmv8di
;
14408 if (TARGET_AVX512F
)
14411 gen_blendm
= gen_avx512f_blendmv16sf
;
14416 if (TARGET_AVX512F
)
14419 gen_blendm
= gen_avx512f_blendmv16si
;
14424 if (TARGET_AVX512BW
)
14427 gen_blendm
= gen_avx512bw_blendmv32hi
;
14429 else if (TARGET_AVX512F
)
14431 half_mode
= E_V8HImode
;
14438 if (TARGET_AVX512BW
)
14441 gen_blendm
= gen_avx512bw_blendmv64qi
;
14443 else if (TARGET_AVX512F
)
14445 half_mode
= E_V16QImode
;
14452 /* Compute offset. */
14456 gcc_assert (i
<= 3);
14459 /* Extract the quarter. */
14460 tmp
= gen_reg_rtx (V4SImode
);
14461 rtx tmp2
= gen_lowpart (V16SImode
, target
);
14462 rtx mask
= gen_reg_rtx (QImode
);
14464 emit_move_insn (mask
, constm1_rtx
);
14465 emit_insn (gen_avx512f_vextracti32x4_mask (tmp
, tmp2
, GEN_INT (i
),
14468 tmp2
= gen_reg_rtx (half_mode
);
14469 emit_move_insn (tmp2
, gen_lowpart (half_mode
, tmp
));
14472 /* Put val in tmp at elt. */
14473 ix86_expand_vector_set (false, tmp
, val
, elt
);
14476 tmp2
= gen_reg_rtx (V16SImode
);
14477 rtx tmp3
= gen_lowpart (V16SImode
, target
);
14478 mask
= gen_reg_rtx (HImode
);
14479 emit_move_insn (mask
, constm1_rtx
);
14480 tmp
= gen_lowpart (V4SImode
, tmp
);
14481 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2
, tmp3
, tmp
, GEN_INT (i
),
14483 emit_move_insn (target
, gen_lowpart (mode
, tmp2
));
14491 if (mmode
!= VOIDmode
)
14493 tmp
= gen_reg_rtx (mode
);
14494 emit_insn (gen_rtx_SET (tmp
, gen_rtx_VEC_DUPLICATE (mode
, val
)));
14495 /* The avx512*_blendm<mode> expanders have different operand order
14496 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
14497 elements where the mask is set and second input operand otherwise,
14498 in {sse,avx}*_*blend* the first input operand is used for elements
14499 where the mask is clear and second input operand otherwise. */
14500 emit_insn (gen_blendm (target
, target
, tmp
,
14502 gen_int_mode (HOST_WIDE_INT_1U
<< elt
,
14505 else if (use_vec_merge
)
14508 tmp
= gen_rtx_VEC_DUPLICATE (mode
, val
);
14509 tmp
= gen_rtx_VEC_MERGE (mode
, tmp
, target
,
14510 GEN_INT (HOST_WIDE_INT_1U
<< elt
));
14511 emit_insn (gen_rtx_SET (target
, tmp
));
14515 rtx mem
= assign_stack_temp (mode
, GET_MODE_SIZE (mode
));
14517 emit_move_insn (mem
, target
);
14519 tmp
= adjust_address (mem
, inner_mode
, elt
* GET_MODE_SIZE (inner_mode
));
14520 emit_move_insn (tmp
, val
);
14522 emit_move_insn (target
, mem
);
void
ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
{
  machine_mode mode = GET_MODE (vec);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  bool use_vec_extr = false;
  rtx tmp;
14537 use_vec_extr
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
14551 use_vec_extr
= true;
14555 use_vec_extr
= TARGET_SSE4_1
;
14567 tmp
= gen_reg_rtx (mode
);
14568 emit_insn (gen_sse_shufps_v4sf (tmp
, vec
, vec
,
14569 GEN_INT (elt
), GEN_INT (elt
),
14570 GEN_INT (elt
+4), GEN_INT (elt
+4)));
14574 tmp
= gen_reg_rtx (mode
);
14575 emit_insn (gen_vec_interleave_highv4sf (tmp
, vec
, vec
));
14579 gcc_unreachable ();
14582 use_vec_extr
= true;
14587 use_vec_extr
= TARGET_SSE4_1
;
14601 tmp
= gen_reg_rtx (mode
);
14602 emit_insn (gen_sse2_pshufd_1 (tmp
, vec
,
14603 GEN_INT (elt
), GEN_INT (elt
),
14604 GEN_INT (elt
), GEN_INT (elt
)));
14608 tmp
= gen_reg_rtx (mode
);
14609 emit_insn (gen_vec_interleave_highv4si (tmp
, vec
, vec
));
14613 gcc_unreachable ();
14616 use_vec_extr
= true;
14621 /* For SSE1, we have to reuse the V4SF code. */
14622 ix86_expand_vector_extract (false, gen_lowpart (SFmode
, target
),
14623 gen_lowpart (V4SFmode
, vec
), elt
);
14629 use_vec_extr
= TARGET_SSE2
;
14632 use_vec_extr
= mmx_ok
&& (TARGET_SSE
|| TARGET_3DNOW_A
);
14636 use_vec_extr
= TARGET_SSE4_1
;
14640 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC
))
14642 tmp
= gen_reg_rtx (SImode
);
14643 ix86_expand_vector_extract (false, tmp
, gen_lowpart (V4SImode
, vec
),
14645 emit_insn (gen_rtx_SET (target
, gen_lowpart (QImode
, tmp
)));
14653 tmp
= gen_reg_rtx (V4SFmode
);
14655 emit_insn (gen_vec_extract_lo_v8sf (tmp
, vec
));
14657 emit_insn (gen_vec_extract_hi_v8sf (tmp
, vec
));
14658 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
14666 tmp
= gen_reg_rtx (V2DFmode
);
14668 emit_insn (gen_vec_extract_lo_v4df (tmp
, vec
));
14670 emit_insn (gen_vec_extract_hi_v4df (tmp
, vec
));
14671 ix86_expand_vector_extract (false, target
, tmp
, elt
& 1);
14679 tmp
= gen_reg_rtx (V16QImode
);
14681 emit_insn (gen_vec_extract_lo_v32qi (tmp
, vec
));
14683 emit_insn (gen_vec_extract_hi_v32qi (tmp
, vec
));
14684 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
14692 tmp
= gen_reg_rtx (V8HImode
);
14694 emit_insn (gen_vec_extract_lo_v16hi (tmp
, vec
));
14696 emit_insn (gen_vec_extract_hi_v16hi (tmp
, vec
));
14697 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
14705 tmp
= gen_reg_rtx (V4SImode
);
14707 emit_insn (gen_vec_extract_lo_v8si (tmp
, vec
));
14709 emit_insn (gen_vec_extract_hi_v8si (tmp
, vec
));
14710 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
14718 tmp
= gen_reg_rtx (V2DImode
);
14720 emit_insn (gen_vec_extract_lo_v4di (tmp
, vec
));
14722 emit_insn (gen_vec_extract_hi_v4di (tmp
, vec
));
14723 ix86_expand_vector_extract (false, target
, tmp
, elt
& 1);
14729 if (TARGET_AVX512BW
)
14731 tmp
= gen_reg_rtx (V16HImode
);
14733 emit_insn (gen_vec_extract_lo_v32hi (tmp
, vec
));
14735 emit_insn (gen_vec_extract_hi_v32hi (tmp
, vec
));
14736 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
14742 if (TARGET_AVX512BW
)
14744 tmp
= gen_reg_rtx (V32QImode
);
14746 emit_insn (gen_vec_extract_lo_v64qi (tmp
, vec
));
14748 emit_insn (gen_vec_extract_hi_v64qi (tmp
, vec
));
14749 ix86_expand_vector_extract (false, target
, tmp
, elt
& 31);
14755 tmp
= gen_reg_rtx (V8SFmode
);
14757 emit_insn (gen_vec_extract_lo_v16sf (tmp
, vec
));
14759 emit_insn (gen_vec_extract_hi_v16sf (tmp
, vec
));
14760 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
14764 tmp
= gen_reg_rtx (V4DFmode
);
14766 emit_insn (gen_vec_extract_lo_v8df (tmp
, vec
));
14768 emit_insn (gen_vec_extract_hi_v8df (tmp
, vec
));
14769 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
14773 tmp
= gen_reg_rtx (V8SImode
);
14775 emit_insn (gen_vec_extract_lo_v16si (tmp
, vec
));
14777 emit_insn (gen_vec_extract_hi_v16si (tmp
, vec
));
14778 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
14782 tmp
= gen_reg_rtx (V4DImode
);
14784 emit_insn (gen_vec_extract_lo_v8di (tmp
, vec
));
14786 emit_insn (gen_vec_extract_hi_v8di (tmp
, vec
));
14787 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
14791 use_vec_extr
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
14792 /* ??? Could extract the appropriate HImode element and shift. */
14801 tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, GEN_INT (elt
)));
14802 tmp
= gen_rtx_VEC_SELECT (inner_mode
, vec
, tmp
);
14804 /* Let the rtl optimizers know about the zero extension performed. */
14805 if (inner_mode
== QImode
|| inner_mode
== HImode
)
14807 tmp
= gen_rtx_ZERO_EXTEND (SImode
, tmp
);
14808 target
= gen_lowpart (SImode
, target
);
14811 emit_insn (gen_rtx_SET (target
, tmp
));
14815 rtx mem
= assign_stack_temp (mode
, GET_MODE_SIZE (mode
));
14817 emit_move_insn (mem
, vec
);
14819 tmp
= adjust_address (mem
, inner_mode
, elt
*GET_MODE_SIZE (inner_mode
));
14820 emit_move_insn (target
, tmp
);
/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
   to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
   The upper bits of DEST are undefined, though they shouldn't cause
   exceptions (some bits from src or all zeros are ok).  */

static void
emit_reduc_half (rtx dest, rtx src, int i)
14833 switch (GET_MODE (src
))
14837 tem
= gen_sse_movhlps (dest
, src
, src
);
14839 tem
= gen_sse_shufps_v4sf (dest
, src
, src
, const1_rtx
, const1_rtx
,
14840 GEN_INT (1 + 4), GEN_INT (1 + 4));
14843 tem
= gen_vec_interleave_highv2df (dest
, src
, src
);
14849 d
= gen_reg_rtx (V1TImode
);
14850 tem
= gen_sse2_lshrv1ti3 (d
, gen_lowpart (V1TImode
, src
),
14855 tem
= gen_avx_vperm2f128v8sf3 (dest
, src
, src
, const1_rtx
);
14857 tem
= gen_avx_shufps256 (dest
, src
, src
,
14858 GEN_INT (i
== 128 ? 2 + (3 << 2) : 1));
14862 tem
= gen_avx_vperm2f128v4df3 (dest
, src
, src
, const1_rtx
);
14864 tem
= gen_avx_shufpd256 (dest
, src
, src
, const1_rtx
);
14872 if (GET_MODE (dest
) != V4DImode
)
14873 d
= gen_reg_rtx (V4DImode
);
14874 tem
= gen_avx2_permv2ti (d
, gen_lowpart (V4DImode
, src
),
14875 gen_lowpart (V4DImode
, src
),
14880 d
= gen_reg_rtx (V2TImode
);
14881 tem
= gen_avx2_lshrv2ti3 (d
, gen_lowpart (V2TImode
, src
),
14889 d
= gen_reg_rtx (V4TImode
);
14890 tem
= gen_avx512bw_lshrv4ti3 (d
, gen_lowpart (V4TImode
, src
),
14900 tem
= gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode
, dest
),
14901 gen_lowpart (V16SImode
, src
),
14902 gen_lowpart (V16SImode
, src
),
14903 GEN_INT (0x4 + (i
== 512 ? 4 : 0)),
14904 GEN_INT (0x5 + (i
== 512 ? 4 : 0)),
14905 GEN_INT (0x6 + (i
== 512 ? 4 : 0)),
14906 GEN_INT (0x7 + (i
== 512 ? 4 : 0)),
14907 GEN_INT (0xC), GEN_INT (0xD),
14908 GEN_INT (0xE), GEN_INT (0xF),
14909 GEN_INT (0x10), GEN_INT (0x11),
14910 GEN_INT (0x12), GEN_INT (0x13),
14911 GEN_INT (0x14), GEN_INT (0x15),
14912 GEN_INT (0x16), GEN_INT (0x17));
14914 tem
= gen_avx512f_pshufd_1 (gen_lowpart (V16SImode
, dest
),
14915 gen_lowpart (V16SImode
, src
),
14916 GEN_INT (i
== 128 ? 0x2 : 0x1),
14920 GEN_INT (i
== 128 ? 0x6 : 0x5),
14924 GEN_INT (i
== 128 ? 0xA : 0x9),
14928 GEN_INT (i
== 128 ? 0xE : 0xD),
14934 gcc_unreachable ();
14938 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), d
));
/* Expand a vector reduction.  FN is the binary pattern to reduce;
   DEST is the destination; IN is the input vector.  */

void
ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
{
  rtx half, dst, vec = in;
  machine_mode mode = GET_MODE (in);
  int i;

  /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
  if (TARGET_SSE4_1
      && mode == V8HImode
      && fn == gen_uminv8hi3)
    {
      emit_insn (gen_sse4_1_phminposuw (dest, in));
      return;
    }

  for (i = GET_MODE_BITSIZE (mode);
       i > GET_MODE_UNIT_BITSIZE (mode);
       i >>= 1)
    {
      half = gen_reg_rtx (mode);
      emit_reduc_half (half, vec, i);
      if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
	dst = dest;
      else
	dst = gen_reg_rtx (mode);
      emit_insn (fn (dst, half, vec));
      vec = dst;
    }
}
/* Output code to perform a conditional jump to LABEL, if C2 flag in
   FP status register is set.  */

void
ix86_emit_fp_unordered_jump (rtx label)
{
  rtx reg = gen_reg_rtx (HImode);
  rtx_insn *insn;
  rtx temp;

  emit_insn (gen_x86_fnstsw_1 (reg));
14987 if (TARGET_SAHF
&& (TARGET_USE_SAHF
|| optimize_insn_for_size_p ()))
14989 emit_insn (gen_x86_sahf_1 (reg
));
14991 temp
= gen_rtx_REG (CCmode
, FLAGS_REG
);
14992 temp
= gen_rtx_UNORDERED (VOIDmode
, temp
, const0_rtx
);
14996 emit_insn (gen_testqi_ext_1_ccno (reg
, GEN_INT (0x04)));
14998 temp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
14999 temp
= gen_rtx_NE (VOIDmode
, temp
, const0_rtx
);
15002 temp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, temp
,
15003 gen_rtx_LABEL_REF (VOIDmode
, label
),
15005 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, temp
));
15006 predict_jump (REG_BR_PROB_BASE
* 10 / 100);
15007 JUMP_LABEL (insn
) = label
;
/* Output code to perform a sinh XFmode calculation.  */

void ix86_emit_i387_sinh (rtx op0, rtx op1)
{
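  /* Computed as copysign (0.5 * (e + e / (e + 1.0)), op1) with
     e = expm1 (|op1|), which stays accurate for small |op1|.  */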
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (|op1|) */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 1.0) + e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_divxf3 (e2, e1, e2));
  emit_insn (gen_addxf3 (e2, e2, e1));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
/* Output code to perform a cosh XFmode calculation.  */

void ix86_emit_i387_cosh (rtx op0, rtx op1)
{
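  /* Computed as 0.5 * (e + 1.0 / e) with e = exp (op1).  */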
15062 rtx e1
= gen_reg_rtx (XFmode
);
15063 rtx e2
= gen_reg_rtx (XFmode
);
15064 rtx half
= const_double_from_real_value (dconsthalf
, XFmode
);
15067 /* e1 = exp (op1) */
15068 emit_insn (gen_expxf2 (e1
, op1
));
15070 /* e2 = e1 + 1.0 / e1 */
15071 cst1
= force_reg (XFmode
, CONST1_RTX (XFmode
));
15072 emit_insn (gen_divxf3 (e2
, cst1
, e1
));
15073 emit_insn (gen_addxf3 (e2
, e1
, e2
));
15075 /* op0 = 0.5 * e2 */
15076 half
= force_reg (XFmode
, half
);
15077 emit_insn (gen_mulxf3 (op0
, e2
, half
));
/* Output code to perform a tanh XFmode calculation.  */

void ix86_emit_i387_tanh (rtx op0, rtx op1)
{
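  /* Computed as copysign (-e / (e + 2.0), op1) with e = expm1 (-|2 * op1|),
     which avoids cancellation for large |op1|.  */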
15084 rtx e1
= gen_reg_rtx (XFmode
);
15085 rtx e2
= gen_reg_rtx (XFmode
);
15086 rtx scratch
= gen_reg_rtx (HImode
);
15087 rtx flags
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
15089 rtx_code_label
*jump_label
= gen_label_rtx ();
15092 /* scratch = fxam (op1) */
15093 emit_insn (gen_fxamxf2_i387 (scratch
, op1
));
15095 /* e1 = expm1 (-|2 * op1|) */
15096 emit_insn (gen_addxf3 (e2
, op1
, op1
));
15097 emit_insn (gen_absxf2 (e2
, e2
));
15098 emit_insn (gen_negxf2 (e2
, e2
));
15099 emit_insn (gen_expm1xf2 (e1
, e2
));
15101 /* e2 = e1 / (e1 + 2.0) */
15102 cst2
= force_reg (XFmode
, CONST2_RTX (XFmode
));
15103 emit_insn (gen_addxf3 (e2
, e1
, cst2
));
15104 emit_insn (gen_divxf3 (e2
, e1
, e2
));
15106 /* flags = signbit (op1) */
15107 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
15109 /* if (!flags) then e2 = -e2 */
15110 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
15111 gen_rtx_NE (VOIDmode
, flags
, const0_rtx
),
15112 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
15114 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
15115 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
15116 JUMP_LABEL (insn
) = jump_label
;
15118 emit_insn (gen_negxf2 (e2
, e2
));
15120 emit_label (jump_label
);
15121 LABEL_NUSES (jump_label
) = 1;
15123 emit_move_insn (op0
, e2
);
/* Output code to perform an asinh XFmode calculation.  */

void ix86_emit_i387_asinh (rtx op0, rtx op1)
{
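  /* Computed as copysign (log1p (|op1| + op1*op1
				  / (1.0 + sqrt (op1*op1 + 1.0))), op1),
     which stays accurate for small |op1|.  */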
15130 rtx e1
= gen_reg_rtx (XFmode
);
15131 rtx e2
= gen_reg_rtx (XFmode
);
15132 rtx scratch
= gen_reg_rtx (HImode
);
15133 rtx flags
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
15135 rtx_code_label
*jump_label
= gen_label_rtx ();
15138 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
15139 emit_insn (gen_mulxf3 (e1
, op1
, op1
));
15140 cst1
= force_reg (XFmode
, CONST1_RTX (XFmode
));
15141 emit_insn (gen_addxf3 (e2
, e1
, cst1
));
15142 emit_insn (gen_sqrtxf2 (e2
, e2
));
15143 emit_insn (gen_addxf3 (e2
, e2
, cst1
));
15146 emit_insn (gen_divxf3 (e1
, e1
, e2
));
15148 /* scratch = fxam (op1) */
15149 emit_insn (gen_fxamxf2_i387 (scratch
, op1
));
15151 /* e1 = e1 + |op1| */
15152 emit_insn (gen_absxf2 (e2
, op1
));
15153 emit_insn (gen_addxf3 (e1
, e1
, e2
));
15155 /* e2 = log1p (e1) */
15156 ix86_emit_i387_log1p (e2
, e1
);
15158 /* flags = signbit (op1) */
15159 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
15161 /* if (flags) then e2 = -e2 */
15162 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
15163 gen_rtx_EQ (VOIDmode
, flags
, const0_rtx
),
15164 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
15166 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
15167 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
15168 JUMP_LABEL (insn
) = jump_label
;
15170 emit_insn (gen_negxf2 (e2
, e2
));
15172 emit_label (jump_label
);
15173 LABEL_NUSES (jump_label
) = 1;
15175 emit_move_insn (op0
, e2
);
/* Output code to perform an acosh XFmode calculation.  */

void ix86_emit_i387_acosh (rtx op0, rtx op1)
{
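  /* Computed as log (op1 + sqrt (op1 - 1.0) * sqrt (op1 + 1.0)).  */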
15182 rtx e1
= gen_reg_rtx (XFmode
);
15183 rtx e2
= gen_reg_rtx (XFmode
);
15184 rtx cst1
= force_reg (XFmode
, CONST1_RTX (XFmode
));
15186 /* e2 = sqrt (op1 + 1.0) */
15187 emit_insn (gen_addxf3 (e2
, op1
, cst1
));
15188 emit_insn (gen_sqrtxf2 (e2
, e2
));
15190 /* e1 = sqrt (op1 - 1.0) */
15191 emit_insn (gen_subxf3 (e1
, op1
, cst1
));
15192 emit_insn (gen_sqrtxf2 (e1
, e1
));
15195 emit_insn (gen_mulxf3 (e1
, e1
, e2
));
15197 /* e1 = e1 + op1 */
15198 emit_insn (gen_addxf3 (e1
, e1
, op1
));
15200 /* op0 = log (e1) */
15201 emit_insn (gen_logxf2 (op0
, e1
));
/* Output code to perform an atanh XFmode calculation.  */

void ix86_emit_i387_atanh (rtx op0, rtx op1)
{
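  /* Computed as copysign (-0.5 * log1p (-(|op1| + |op1|) / (|op1| + 1.0)),
     op1), which stays accurate for small |op1|.  */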
15208 rtx e1
= gen_reg_rtx (XFmode
);
15209 rtx e2
= gen_reg_rtx (XFmode
);
15210 rtx scratch
= gen_reg_rtx (HImode
);
15211 rtx flags
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
15212 rtx half
= const_double_from_real_value (dconsthalf
, XFmode
);
15214 rtx_code_label
*jump_label
= gen_label_rtx ();
15217 /* scratch = fxam (op1) */
15218 emit_insn (gen_fxamxf2_i387 (scratch
, op1
));
15221 emit_insn (gen_absxf2 (e2
, op1
));
15223 /* e1 = -(e2 + e2) / (e2 + 1.0) */
15224 cst1
= force_reg (XFmode
, CONST1_RTX (XFmode
));
15225 emit_insn (gen_addxf3 (e1
, e2
, cst1
));
15226 emit_insn (gen_addxf3 (e2
, e2
, e2
));
15227 emit_insn (gen_negxf2 (e2
, e2
));
15228 emit_insn (gen_divxf3 (e1
, e2
, e1
));
15230 /* e2 = log1p (e1) */
15231 ix86_emit_i387_log1p (e2
, e1
);
15233 /* flags = signbit (op1) */
15234 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
15236 /* if (!flags) then e2 = -e2 */
15237 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
15238 gen_rtx_NE (VOIDmode
, flags
, const0_rtx
),
15239 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
15241 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
15242 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
15243 JUMP_LABEL (insn
) = jump_label
;
15245 emit_insn (gen_negxf2 (e2
, e2
));
15247 emit_label (jump_label
);
15248 LABEL_NUSES (jump_label
) = 1;
15250 /* op0 = 0.5 * e2 */
15251 half
= force_reg (XFmode
, half
);
15252 emit_insn (gen_mulxf3 (op0
, e2
, half
));
/* Output code to perform a log1p XFmode calculation.  */

void ix86_emit_i387_log1p (rtx op0, rtx op1)
{
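  /* Uses fyl2xp1 when |op1| < 1 - sqrt (0.5) (about 0.2929), where that
     instruction is accurate, and falls back to fyl2x on 1.0 + op1
     otherwise.  */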
15259 rtx_code_label
*label1
= gen_label_rtx ();
15260 rtx_code_label
*label2
= gen_label_rtx ();
15262 rtx tmp
= gen_reg_rtx (XFmode
);
15263 rtx res
= gen_reg_rtx (XFmode
);
15264 rtx cst
, cstln2
, cst1
;
15267 cst
= const_double_from_real_value
15268 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode
), XFmode
);
15269 cstln2
= force_reg (XFmode
, standard_80387_constant_rtx (4)); /* fldln2 */
15271 emit_insn (gen_absxf2 (tmp
, op1
));
15273 cst
= force_reg (XFmode
, cst
);
15274 ix86_expand_branch (GE
, tmp
, cst
, label1
);
15275 predict_jump (REG_BR_PROB_BASE
* 10 / 100);
15276 insn
= get_last_insn ();
15277 JUMP_LABEL (insn
) = label1
;
15279 emit_insn (gen_fyl2xp1xf3_i387 (res
, op1
, cstln2
));
15280 emit_jump (label2
);
15282 emit_label (label1
);
15283 LABEL_NUSES (label1
) = 1;
15285 cst1
= force_reg (XFmode
, CONST1_RTX (XFmode
));
15286 emit_insn (gen_rtx_SET (tmp
, gen_rtx_PLUS (XFmode
, op1
, cst1
)));
15287 emit_insn (gen_fyl2xxf3_i387 (res
, tmp
, cstln2
));
15289 emit_label (label2
);
15290 LABEL_NUSES (label2
) = 1;
15292 emit_move_insn (op0
, res
);
15295 /* Emit code for round calculation. */
15296 void ix86_emit_i387_round (rtx op0
, rtx op1
)
15298 machine_mode inmode
= GET_MODE (op1
);
15299 machine_mode outmode
= GET_MODE (op0
);
15300 rtx e1
= gen_reg_rtx (XFmode
);
15301 rtx e2
= gen_reg_rtx (XFmode
);
15302 rtx scratch
= gen_reg_rtx (HImode
);
15303 rtx flags
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
15304 rtx half
= const_double_from_real_value (dconsthalf
, XFmode
);
15305 rtx res
= gen_reg_rtx (outmode
);
15306 rtx_code_label
*jump_label
= gen_label_rtx ();
15307 rtx (*floor_insn
) (rtx
, rtx
);
15308 rtx (*neg_insn
) (rtx
, rtx
);
15316 tmp
= gen_reg_rtx (XFmode
);
15318 emit_insn (gen_rtx_SET (tmp
, gen_rtx_FLOAT_EXTEND (XFmode
, op1
)));
15324 gcc_unreachable ();
15330 floor_insn
= gen_frndintxf2_floor
;
15331 neg_insn
= gen_negsf2
;
15334 floor_insn
= gen_frndintxf2_floor
;
15335 neg_insn
= gen_negdf2
;
15338 floor_insn
= gen_frndintxf2_floor
;
15339 neg_insn
= gen_negxf2
;
15342 floor_insn
= gen_lfloorxfhi2
;
15343 neg_insn
= gen_neghi2
;
15346 floor_insn
= gen_lfloorxfsi2
;
15347 neg_insn
= gen_negsi2
;
15350 floor_insn
= gen_lfloorxfdi2
;
15351 neg_insn
= gen_negdi2
;
15354 gcc_unreachable ();
15357 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
15359 /* scratch = fxam(op1) */
15360 emit_insn (gen_fxamxf2_i387 (scratch
, op1
));
15362 /* e1 = fabs(op1) */
15363 emit_insn (gen_absxf2 (e1
, op1
));
15365 /* e2 = e1 + 0.5 */
15366 half
= force_reg (XFmode
, half
);
15367 emit_insn (gen_rtx_SET (e2
, gen_rtx_PLUS (XFmode
, e1
, half
)));
15369 /* res = floor(e2) */
15375 tmp
= gen_reg_rtx (XFmode
);
15377 emit_insn (floor_insn (tmp
, e2
));
15378 emit_insn (gen_rtx_SET (res
,
15379 gen_rtx_UNSPEC (outmode
, gen_rtvec (1, tmp
),
15380 UNSPEC_TRUNC_NOOP
)));
15384 emit_insn (floor_insn (res
, e2
));
15387 /* flags = signbit(a) */
15388 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
15390 /* if (flags) then res = -res */
15391 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
15392 gen_rtx_EQ (VOIDmode
, flags
, const0_rtx
),
15393 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
15395 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
15396 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
15397 JUMP_LABEL (insn
) = jump_label
;
15399 emit_insn (neg_insn (res
, res
));
15401 emit_label (jump_label
);
15402 LABEL_NUSES (jump_label
) = 1;
15404 emit_move_insn (op0
, res
);
/* Output code to perform a Newton-Raphson approximation of a single precision
   floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */

void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
{
  rtx x0, x1, e0, e1;
15414 x0
= gen_reg_rtx (mode
);
15415 e0
= gen_reg_rtx (mode
);
15416 e1
= gen_reg_rtx (mode
);
15417 x1
= gen_reg_rtx (mode
);
15419 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
15421 b
= force_reg (mode
, b
);
15423 /* x0 = rcp(b) estimate */
15424 if (mode
== V16SFmode
|| mode
== V8DFmode
)
15426 if (TARGET_AVX512ER
)
15428 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, b
),
15431 emit_insn (gen_rtx_SET (res
, gen_rtx_MULT (mode
, a
, x0
)));
15435 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, b
),
15439 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, b
),
15443 emit_insn (gen_rtx_SET (e0
, gen_rtx_MULT (mode
, x0
, b
)));
15446 emit_insn (gen_rtx_SET (e0
, gen_rtx_MULT (mode
, x0
, e0
)));
15449 emit_insn (gen_rtx_SET (e1
, gen_rtx_PLUS (mode
, x0
, x0
)));
15452 emit_insn (gen_rtx_SET (x1
, gen_rtx_MINUS (mode
, e1
, e0
)));
15455 emit_insn (gen_rtx_SET (res
, gen_rtx_MULT (mode
, a
, x1
)));
/* Output code to perform a Newton-Raphson approximation of a
   single precision floating point [reciprocal] square root.  */

void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
{
15463 rtx x0
, e0
, e1
, e2
, e3
, mthree
, mhalf
;
15467 x0
= gen_reg_rtx (mode
);
15468 e0
= gen_reg_rtx (mode
);
15469 e1
= gen_reg_rtx (mode
);
15470 e2
= gen_reg_rtx (mode
);
15471 e3
= gen_reg_rtx (mode
);
15473 if (TARGET_AVX512ER
&& mode
== V16SFmode
)
15476 /* res = rsqrt28(a) estimate */
15477 emit_insn (gen_rtx_SET (res
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, a
),
15481 /* x0 = rsqrt28(a) estimate */
15482 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, a
),
15484 /* res = rcp28(x0) estimate */
15485 emit_insn (gen_rtx_SET (res
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, x0
),
15491 real_from_integer (&r
, VOIDmode
, -3, SIGNED
);
15492 mthree
= const_double_from_real_value (r
, SFmode
);
15494 real_arithmetic (&r
, NEGATE_EXPR
, &dconsthalf
, NULL
);
15495 mhalf
= const_double_from_real_value (r
, SFmode
);
15496 unspec
= UNSPEC_RSQRT
;
15498 if (VECTOR_MODE_P (mode
))
15500 mthree
= ix86_build_const_vector (mode
, true, mthree
);
15501 mhalf
= ix86_build_const_vector (mode
, true, mhalf
);
15502 /* There is no 512-bit rsqrt. There is however rsqrt14. */
15503 if (GET_MODE_SIZE (mode
) == 64)
15504 unspec
= UNSPEC_RSQRT14
;
15507 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
15508 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
15510 a
= force_reg (mode
, a
);
15512 /* x0 = rsqrt(a) estimate */
15513 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, a
),
15516 /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */
15519 rtx zero
= force_reg (mode
, CONST0_RTX(mode
));
15522 /* Handle masked compare. */
15523 if (VECTOR_MODE_P (mode
) && GET_MODE_SIZE (mode
) == 64)
15525 mask
= gen_reg_rtx (HImode
);
15526 /* Imm value 0x4 corresponds to not-equal comparison. */
15527 emit_insn (gen_avx512f_cmpv16sf3 (mask
, zero
, a
, GEN_INT (0x4)));
15528 emit_insn (gen_avx512f_blendmv16sf (x0
, zero
, x0
, mask
));
15532 mask
= gen_reg_rtx (mode
);
15533 emit_insn (gen_rtx_SET (mask
, gen_rtx_NE (mode
, zero
, a
)));
15534 emit_insn (gen_rtx_SET (x0
, gen_rtx_AND (mode
, x0
, mask
)));
15538 mthree
= force_reg (mode
, mthree
);
15541 emit_insn (gen_rtx_SET (e0
, gen_rtx_MULT (mode
, x0
, a
)));
15543 unsigned vector_size
= GET_MODE_SIZE (mode
);
15545 || (TARGET_AVX512F
&& vector_size
== 64)
15546 || (TARGET_AVX512VL
&& (vector_size
== 32 || vector_size
== 16)))
15547 emit_insn (gen_rtx_SET (e2
,
15548 gen_rtx_FMA (mode
, e0
, x0
, mthree
)));
15552 emit_insn (gen_rtx_SET (e1
, gen_rtx_MULT (mode
, e0
, x0
)));
15555 emit_insn (gen_rtx_SET (e2
, gen_rtx_PLUS (mode
, e1
, mthree
)));
15558 mhalf
= force_reg (mode
, mhalf
);
15560 /* e3 = -.5 * x0 */
15561 emit_insn (gen_rtx_SET (e3
, gen_rtx_MULT (mode
, x0
, mhalf
)));
15563 /* e3 = -.5 * e0 */
15564 emit_insn (gen_rtx_SET (e3
, gen_rtx_MULT (mode
, e0
, mhalf
)));
15565 /* ret = e2 * e3 */
15566 emit_insn (gen_rtx_SET (res
, gen_rtx_MULT (mode
, e2
, e3
)));
/* Expand fabs (OP0) and return a new rtx that holds the result.  The
   mask for masking out the sign-bit is stored in *SMASK, if that is
   non-null.  */

static rtx
ix86_expand_sse_fabs (rtx op0, rtx *smask)
15576 machine_mode vmode
, mode
= GET_MODE (op0
);
15579 xa
= gen_reg_rtx (mode
);
15580 if (mode
== SFmode
)
15582 else if (mode
== DFmode
)
15586 mask
= ix86_build_signbit_mask (vmode
, VECTOR_MODE_P (mode
), true);
15587 if (!VECTOR_MODE_P (mode
))
15589 /* We need to generate a scalar mode mask in this case. */
15590 rtx tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, const0_rtx
));
15591 tmp
= gen_rtx_VEC_SELECT (mode
, mask
, tmp
);
15592 mask
= gen_reg_rtx (mode
);
15593 emit_insn (gen_rtx_SET (mask
, tmp
));
15595 emit_insn (gen_rtx_SET (xa
, gen_rtx_AND (mode
, op0
, mask
)));
/* Expands a comparison of OP0 with OP1 using comparison code CODE,
   swapping the operands if SWAP_OPERANDS is true.  The expanded
   code is a forward jump to a newly created label in case the
   comparison is true.  The generated label rtx is returned.  */
static rtx_code_label *
ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
				  bool swap_operands)
15611 bool unordered_compare
= ix86_unordered_fp_compare (code
);
15612 rtx_code_label
*label
;
15616 std::swap (op0
, op1
);
15618 label
= gen_label_rtx ();
15619 tmp
= gen_rtx_COMPARE (CCFPmode
, op0
, op1
);
15620 if (unordered_compare
)
15621 tmp
= gen_rtx_UNSPEC (CCFPmode
, gen_rtvec (1, tmp
), UNSPEC_NOTRAP
);
15622 reg
= gen_rtx_REG (CCFPmode
, FLAGS_REG
);
15623 emit_insn (gen_rtx_SET (reg
, tmp
));
15624 tmp
= gen_rtx_fmt_ee (code
, VOIDmode
, reg
, const0_rtx
);
15625 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
15626 gen_rtx_LABEL_REF (VOIDmode
, label
), pc_rtx
);
15627 tmp
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
15628 JUMP_LABEL (tmp
) = label
;
15633 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
15634 using comparison code CODE. Operands are swapped for the comparison if
15635 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
15637 ix86_expand_sse_compare_mask (enum rtx_code code
, rtx op0
, rtx op1
,
15638 bool swap_operands
)
15640 rtx (*insn
)(rtx
, rtx
, rtx
, rtx
);
15641 machine_mode mode
= GET_MODE (op0
);
15642 rtx mask
= gen_reg_rtx (mode
);
15645 std::swap (op0
, op1
);
15647 insn
= mode
== DFmode
? gen_setcc_df_sse
: gen_setcc_sf_sse
;
15649 emit_insn (insn (mask
, op0
, op1
,
15650 gen_rtx_fmt_ee (code
, mode
, op0
, op1
)));
15654 /* Expand copysign from SIGN to the positive value ABS_VALUE
15655 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
15659 ix86_sse_copysign_to_positive (rtx result
, rtx abs_value
, rtx sign
, rtx mask
)
15661 machine_mode mode
= GET_MODE (sign
);
15662 rtx sgn
= gen_reg_rtx (mode
);
15663 if (mask
== NULL_RTX
)
15665 machine_mode vmode
;
15667 if (mode
== SFmode
)
15669 else if (mode
== DFmode
)
15674 mask
= ix86_build_signbit_mask (vmode
, VECTOR_MODE_P (mode
), false);
15675 if (!VECTOR_MODE_P (mode
))
15677 /* We need to generate a scalar mode mask in this case. */
15678 rtx tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, const0_rtx
));
15679 tmp
= gen_rtx_VEC_SELECT (mode
, mask
, tmp
);
15680 mask
= gen_reg_rtx (mode
);
15681 emit_insn (gen_rtx_SET (mask
, tmp
));
15685 mask
= gen_rtx_NOT (mode
, mask
);
15686 emit_insn (gen_rtx_SET (sgn
, gen_rtx_AND (mode
, mask
, sign
)));
15687 emit_insn (gen_rtx_SET (result
, gen_rtx_IOR (mode
, abs_value
, sgn
)));
/* Expand SSE sequence for computing lround from OP1 storing
   into OP0.  */

void
ix86_expand_lround (rtx op0, rtx op1)
15696 /* C code for the stuff we're doing below:
15697 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
15700 machine_mode mode
= GET_MODE (op1
);
15701 const struct real_format
*fmt
;
15702 REAL_VALUE_TYPE pred_half
, half_minus_pred_half
;
15705 /* load nextafter (0.5, 0.0) */
15706 fmt
= REAL_MODE_FORMAT (mode
);
15707 real_2expN (&half_minus_pred_half
, -(fmt
->p
) - 1, mode
);
15708 real_arithmetic (&pred_half
, MINUS_EXPR
, &dconsthalf
, &half_minus_pred_half
);
15710 /* adj = copysign (0.5, op1) */
15711 adj
= force_reg (mode
, const_double_from_real_value (pred_half
, mode
));
15712 ix86_sse_copysign_to_positive (adj
, adj
, force_reg (mode
, op1
), NULL_RTX
);
15714 /* adj = op1 + adj */
15715 adj
= expand_simple_binop (mode
, PLUS
, adj
, op1
, NULL_RTX
, 0, OPTAB_DIRECT
);
15717 /* op0 = (imode)adj */
15718 expand_fix (op0
, adj
, 0);
/* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1
   storing into OPERAND0.  */

void
ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
15727 /* C code for the stuff we're doing below (for do_floor):
15729 xi -= (double)xi > op1 ? 1 : 0;
15732 machine_mode fmode
= GET_MODE (op1
);
15733 machine_mode imode
= GET_MODE (op0
);
15734 rtx ireg
, freg
, tmp
;
15735 rtx_code_label
*label
;
15737 /* reg = (long)op1 */
15738 ireg
= gen_reg_rtx (imode
);
15739 expand_fix (ireg
, op1
, 0);
15741 /* freg = (double)reg */
15742 freg
= gen_reg_rtx (fmode
);
15743 expand_float (freg
, ireg
, 0);
15745 /* ireg = (freg > op1) ? ireg - 1 : ireg */
15746 label
= ix86_expand_sse_compare_and_jump (UNLE
,
15747 freg
, op1
, !do_floor
);
15748 tmp
= expand_simple_binop (imode
, do_floor
? MINUS
: PLUS
,
15749 ireg
, const1_rtx
, NULL_RTX
, 0, OPTAB_DIRECT
);
15750 emit_move_insn (ireg
, tmp
);
15752 emit_label (label
);
15753 LABEL_NUSES (label
) = 1;
15755 emit_move_insn (op0
, ireg
);
/* Generate and return a rtx of mode MODE for 2**n where n is the number
   of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */

static rtx
ix86_gen_TWO52 (machine_mode mode)
15764 REAL_VALUE_TYPE TWO52r
;
15767 real_ldexp (&TWO52r
, &dconst1
, mode
== DFmode
? 52 : 23);
15768 TWO52
= const_double_from_real_value (TWO52r
, mode
);
15769 TWO52
= force_reg (mode
, TWO52
);
/* Expand rint rounding OPERAND1 and storing the result in OPERAND0.  */
void
ix86_expand_rint (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	xa = fabs (operand1);
	if (!isless (xa, 2**52))
	  return operand1;
	two52 = 2**52;
	if (flag_rounding_math)
	  {
	    two52 = copysign (two52, operand1);
	    xa = operand1;
	  }
	xa = xa + two52 - two52;
	return copysign (xa, operand1);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, xa, TWO52, two52, mask;
  rtx_code_label *label;

  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  TWO52 = ix86_gen_TWO52 (mode);
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  two52 = TWO52;
  if (flag_rounding_math)
    {
      two52 = gen_reg_rtx (mode);
      ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
      xa = res;
    }

  xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);

  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE2 sequence for computing floor or ceil
   from OPERAND1 storing into OPERAND0.  */
void
ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	x2 = (double)(long)x;
     Compensate.  Floor:
	if (x2 > x)
	  x2 -= 1;
     Compensate.  Ceil:
	if (x2 < x)
	  x2 += 1;
	if (HONOR_SIGNED_ZEROS (mode))
	  return copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x */
  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (res, tmp);

  if (HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */
void
ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	xa = xa + TWO52 - TWO52;
	x2 = copysign (xa, x);
     Compensate.  Floor:
	if (x2 > x)
	  x2 -= 1;
     Compensate.  Ceil:
	if (x2 < x)
	  x2 += 1;
	if (HONOR_SIGNED_ZEROS (mode))
	  x2 = copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = xa + TWO52 - TWO52; */
  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* xa = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (xa, xa, res, mask);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (!do_floor && HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing trunc
   from OPERAND1 storing into OPERAND0.  */
void
ix86_expand_trunc (rtx operand0, rtx operand1)
{
  /* C code for SSE variant we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	x2 = (double)(long)x;
	if (HONOR_SIGNED_ZEROS (mode))
	  return copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* x = (double)(long)x */
  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
  expand_fix (xi, res, 0);
  expand_float (res, xi, 0);

  if (HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing trunc from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */
void
ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
{
  machine_mode mode = GET_MODE (operand0);
  rtx xa, mask, TWO52, one, res, smask, tmp;
  rtx_code_label *label;

  /* C code for SSE variant we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	xa2 = xa + TWO52 - TWO52;
     Compensate:
	if (xa2 > xa)
	  xa2 -= 1.0;
	x2 = copysign (xa2, x);
	return x2;
   */

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &smask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* res = xa + TWO52 - TWO52; */
  tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
  emit_move_insn (res, tmp);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
  mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
  emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
  tmp = expand_simple_binop (mode, MINUS,
			     res, mask, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (res, tmp);

  /* res = copysign (res, operand1) */
  ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing round
   from OPERAND1 storing into OPERAND0.  */
void
ix86_expand_round (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	double xa = fabs (x);
	if (!isless (xa, TWO52))
	  return x;
	xa = (double)(long)(xa + nextafter (0.5, 0.0));
	return copysign (xa, x);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, TWO52, xa, xi, half, mask;
  rtx_code_label *label;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  TWO52 = ix86_gen_TWO52 (mode);
  xa = ix86_expand_sse_fabs (res, &mask);
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* xa = xa + 0.5 */
  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
  xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);

  /* xa = (double)(int64_t)xa */
  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
  expand_fix (xi, xa, 0);
  expand_float (xa, xi, 0);

  /* res = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing round from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */
void
ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), xa2, x2;
	if (!isless (xa, TWO52))
	  return x;
     Using the absolute value and copying back sign makes
     -0.0 -> -0.0 correct.
	xa2 = xa + TWO52 - TWO52;
     Compensate.
	dxa = xa2 - xa;
	if (dxa <= -0.5)
	  xa2 += 1;
	else if (dxa > 0.5)
	  xa2 -= 1;
	x2 = copysign (xa2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa2 = xa + TWO52 - TWO52; */
  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);

  /* dxa = xa2 - xa; */
  dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);

  /* generate 0.5, 1.0 and -0.5 */
  half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
  one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
  mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
			       0, OPTAB_DIRECT);

  /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = copysign (xa2, operand1) */
  ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing round
   from OP1 storing into OP0 using sse4 round insn.  */
void
ix86_expand_round_sse4 (rtx op0, rtx op1)
{
  machine_mode mode = GET_MODE (op0);
  rtx e1, e2, res, half;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx (*gen_copysign) (rtx, rtx, rtx);
  rtx (*gen_round) (rtx, rtx, rtx);

  switch (mode)
    {
    case E_SFmode:
      gen_copysign = gen_copysignsf3;
      gen_round = gen_sse4_1_roundsf2;
      break;
    case E_DFmode:
      gen_copysign = gen_copysigndf3;
      gen_round = gen_sse4_1_rounddf2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round (a) = trunc (a + copysign (0.5, a)) */

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
  half = const_double_from_real_value (pred_half, mode);

  /* e1 = copysign (0.5, op1) */
  e1 = gen_reg_rtx (mode);
  emit_insn (gen_copysign (e1, half, op1));

  /* e2 = op1 + e1 */
  e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = trunc (e2) */
  res = gen_reg_rtx (mode);
  emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));

  emit_move_insn (op0, res);
}
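
/* The ROUND_TRUNC immediate makes the SSE4.1 round{ss,sd} instruction
   truncate, so the whole sequence computes
   trunc (op1 + copysign (nextafter (0.5, 0.0), op1)), i.e. it rounds
   halfway cases away from zero as round () requires.  */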
/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
   insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
   insn every time.  */

static GTY(()) rtx_insn *vselect_insn;

/* Initialize vselect_insn.  */

static void
init_vselect_insn (void)
{
  unsigned i;
  rtx x;

  x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
  for (i = 0; i < MAX_VECT_LEN; ++i)
    XVECEXP (x, 0, i) = const0_rtx;
  x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
							const0_rtx), x);
  x = gen_rtx_SET (const0_rtx, x);
  start_sequence ();
  vselect_insn = emit_insn (x);
  end_sequence ();
}
/* Construct (set target (vec_select op0 (parallel perm))) and
   return true if that's a valid instruction in the active ISA.  */

static bool
expand_vselect (rtx target, rtx op0, const unsigned char *perm,
		unsigned nelt, bool testing_p)
{
  unsigned int i;
  rtx x, save_vconcat;
  int icode;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
  PUT_NUM_ELEM (XVEC (x, 0), nelt);
  for (i = 0; i < nelt; ++i)
    XVECEXP (x, 0, i) = GEN_INT (perm[i]);
  save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
  PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
  SET_DEST (PATTERN (vselect_insn)) = target;
  icode = recog_memoized (vselect_insn);

  if (icode >= 0 && !testing_p)
    emit_insn (copy_rtx (PATTERN (vselect_insn)));

  SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
  INSN_CODE (vselect_insn) = -1;

  return icode >= 0;
}
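
/* Callers probe the ISA by passing testing_p = true: the pattern is built
   inside the cached insn and handed to recog_memoized, and a copy is only
   emitted when it matched and testing_p is false.  For instance, a caller
   wanting a V4SF high/low lane swap would try perm = { 2, 3, 0, 1 } with
   nelt = 4 (illustrative values, not taken from the original sources).  */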
/* Similar, but generate a vec_concat from op0 and op1 as well.  */

static bool
expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
			const unsigned char *perm, unsigned nelt,
			bool testing_p)
{
  machine_mode v2mode;
  rtx x;
  bool ok;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
    return false;
  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  PUT_MODE (x, v2mode);
  XEXP (x, 0) = op0;
  XEXP (x, 1) = op1;
  ok = expand_vselect (target, x, perm, nelt, testing_p);
  XEXP (x, 0) = const0_rtx;
  XEXP (x, 1) = const0_rtx;
  return ok;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   using movss or movsd.  */
static bool
expand_vec_perm_movs (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  rtx x;

  if (d->one_operand_p)
    return false;

  if (!(TARGET_SSE && vmode == V4SFmode)
      && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
      && !(TARGET_SSE2 && vmode == V2DFmode))
    return false;

  /* Only the first element is changed.  */
  if (d->perm[0] != nelt && d->perm[0] != 0)
    return false;
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != i + nelt - d->perm[0])
      return false;

  if (d->testing_p)
    return true;

  if (d->perm[0] == nelt)
    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
  else
    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));

  emit_insn (gen_rtx_SET (d->target, x));

  return true;
}
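
/* The accepted permutations are exactly the movss/movsd shape: only
   element 0 changes relative to one of the inputs, the remaining elements
   pass through unchanged.  For V4SF that means perm = { 4, 1, 2, 3 }
   (element 0 taken from op1) or { 0, 5, 6, 7 } (element 0 taken from
   op0).  */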
16345 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16346 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
16349 expand_vec_perm_blend (struct expand_vec_perm_d
*d
)
16351 machine_mode mmode
, vmode
= d
->vmode
;
16352 unsigned i
, nelt
= d
->nelt
;
16353 unsigned HOST_WIDE_INT mask
;
16354 rtx target
, op0
, op1
, maskop
, x
;
16355 rtx rperm
[32], vperm
;
16357 if (d
->one_operand_p
)
16359 if (TARGET_AVX512F
&& GET_MODE_SIZE (vmode
) == 64
16360 && (TARGET_AVX512BW
16361 || GET_MODE_UNIT_SIZE (vmode
) >= 4))
16363 else if (TARGET_AVX2
&& GET_MODE_SIZE (vmode
) == 32)
16365 else if (TARGET_AVX
&& (vmode
== V4DFmode
|| vmode
== V8SFmode
))
16367 else if (TARGET_SSE4_1
&& GET_MODE_SIZE (vmode
) == 16)
16372 /* This is a blend, not a permute. Elements must stay in their
16373 respective lanes. */
16374 for (i
= 0; i
< nelt
; ++i
)
16376 unsigned e
= d
->perm
[i
];
16377 if (!(e
== i
|| e
== i
+ nelt
))
16384 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
16385 decision should be extracted elsewhere, so that we only try that
16386 sequence once all budget==3 options have been tried. */
16387 target
= d
->target
;
16406 for (i
= 0; i
< nelt
; ++i
)
16407 mask
|= ((unsigned HOST_WIDE_INT
) (d
->perm
[i
] >= nelt
)) << i
;
16411 for (i
= 0; i
< 2; ++i
)
16412 mask
|= (d
->perm
[i
] >= 2 ? 15 : 0) << (i
* 4);
16417 for (i
= 0; i
< 4; ++i
)
16418 mask
|= (d
->perm
[i
] >= 4 ? 3 : 0) << (i
* 2);
16423 /* See if bytes move in pairs so we can use pblendw with
16424 an immediate argument, rather than pblendvb with a vector
16426 for (i
= 0; i
< 16; i
+= 2)
16427 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
16430 for (i
= 0; i
< nelt
; ++i
)
16431 rperm
[i
] = (d
->perm
[i
] < nelt
? const0_rtx
: constm1_rtx
);
16434 vperm
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
16435 vperm
= force_reg (vmode
, vperm
);
16437 if (GET_MODE_SIZE (vmode
) == 16)
16438 emit_insn (gen_sse4_1_pblendvb (target
, op0
, op1
, vperm
));
16440 emit_insn (gen_avx2_pblendvb (target
, op0
, op1
, vperm
));
16441 if (target
!= d
->target
)
16442 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
16446 for (i
= 0; i
< 8; ++i
)
16447 mask
|= (d
->perm
[i
* 2] >= 16) << i
;
16452 target
= gen_reg_rtx (vmode
);
16453 op0
= gen_lowpart (vmode
, op0
);
16454 op1
= gen_lowpart (vmode
, op1
);
16458 /* See if bytes move in pairs. If not, vpblendvb must be used. */
16459 for (i
= 0; i
< 32; i
+= 2)
16460 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
16462 /* See if bytes move in quadruplets. If yes, vpblendd
16463 with immediate can be used. */
16464 for (i
= 0; i
< 32; i
+= 4)
16465 if (d
->perm
[i
] + 2 != d
->perm
[i
+ 2])
16469 /* See if bytes move the same in both lanes. If yes,
16470 vpblendw with immediate can be used. */
16471 for (i
= 0; i
< 16; i
+= 2)
16472 if (d
->perm
[i
] + 16 != d
->perm
[i
+ 16])
16475 /* Use vpblendw. */
16476 for (i
= 0; i
< 16; ++i
)
16477 mask
|= (d
->perm
[i
* 2] >= 32) << i
;
16482 /* Use vpblendd. */
16483 for (i
= 0; i
< 8; ++i
)
16484 mask
|= (d
->perm
[i
* 4] >= 32) << i
;
16489 /* See if words move in pairs. If yes, vpblendd can be used. */
16490 for (i
= 0; i
< 16; i
+= 2)
16491 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
16495 /* See if words move the same in both lanes. If not,
16496 vpblendvb must be used. */
16497 for (i
= 0; i
< 8; i
++)
16498 if (d
->perm
[i
] + 8 != d
->perm
[i
+ 8])
16500 /* Use vpblendvb. */
16501 for (i
= 0; i
< 32; ++i
)
16502 rperm
[i
] = (d
->perm
[i
/ 2] < 16 ? const0_rtx
: constm1_rtx
);
16506 target
= gen_reg_rtx (vmode
);
16507 op0
= gen_lowpart (vmode
, op0
);
16508 op1
= gen_lowpart (vmode
, op1
);
16509 goto finish_pblendvb
;
16512 /* Use vpblendw. */
16513 for (i
= 0; i
< 16; ++i
)
16514 mask
|= (d
->perm
[i
] >= 16) << i
;
16518 /* Use vpblendd. */
16519 for (i
= 0; i
< 8; ++i
)
16520 mask
|= (d
->perm
[i
* 2] >= 16) << i
;
16525 /* Use vpblendd. */
16526 for (i
= 0; i
< 4; ++i
)
16527 mask
|= (d
->perm
[i
] >= 4 ? 3 : 0) << (i
* 2);
16532 gcc_unreachable ();
16555 if (mmode
!= VOIDmode
)
16556 maskop
= force_reg (mmode
, gen_int_mode (mask
, mmode
));
16558 maskop
= GEN_INT (mask
);
16560 /* This matches five different patterns with the different modes. */
16561 x
= gen_rtx_VEC_MERGE (vmode
, op1
, op0
, maskop
);
16562 x
= gen_rtx_SET (target
, x
);
16564 if (target
!= d
->target
)
16565 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of the variable form of vpermilps.

   Note that we will have already failed the immediate input vpermilps,
   which requires that the high and low part shuffle be identical; the
   variable form doesn't require that.  */

static bool
expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
{
  rtx rperm[8], vperm;
  unsigned i;

  if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
    return false;

  /* We can only permute within the 128-bit lane.  */
  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];
      if (i < 4 ? e >= 4 : e < 4)
	return false;
    }

  if (d->testing_p)
    return true;

  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];

      /* Within each 128-bit lane, the elements of op0 are numbered
	 from 0 and the elements of op1 are numbered from 4.  */
      if (e >= 8 + 4)
	e -= (8 + 4);
      else if (e >= 4)
	e -= 4;

      rperm[i] = GEN_INT (e);
    }

  vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
  vperm = force_reg (V8SImode, vperm);
  emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));

  return true;
}
/* Return true if permutation D can be performed as VMODE permutation
   instead.  */

static bool
valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
{
  unsigned int i, j, chunk;

  if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
      || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
      || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
    return false;

  if (GET_MODE_NUNITS (vmode) >= d->nelt)
    return true;

  chunk = d->nelt / GET_MODE_NUNITS (vmode);
  for (i = 0; i < d->nelt; i += chunk)
    if (d->perm[i] & (chunk - 1))
      return false;
    else
      for (j = 1; j < chunk; ++j)
	if (d->perm[i] + j != d->perm[i + j])
	  return false;

  return true;
}
16646 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16647 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
16650 expand_vec_perm_pshufb (struct expand_vec_perm_d
*d
)
16652 unsigned i
, nelt
, eltsz
, mask
;
16653 unsigned char perm
[64];
16654 machine_mode vmode
= V16QImode
;
16655 rtx rperm
[64], vperm
, target
, op0
, op1
;
16659 if (!d
->one_operand_p
)
16661 if (!TARGET_XOP
|| GET_MODE_SIZE (d
->vmode
) != 16)
16664 && valid_perm_using_mode_p (V2TImode
, d
))
16669 /* Use vperm2i128 insn. The pattern uses
16670 V4DImode instead of V2TImode. */
16671 target
= d
->target
;
16672 if (d
->vmode
!= V4DImode
)
16673 target
= gen_reg_rtx (V4DImode
);
16674 op0
= gen_lowpart (V4DImode
, d
->op0
);
16675 op1
= gen_lowpart (V4DImode
, d
->op1
);
16677 = GEN_INT ((d
->perm
[0] / (nelt
/ 2))
16678 | ((d
->perm
[nelt
/ 2] / (nelt
/ 2)) * 16));
16679 emit_insn (gen_avx2_permv2ti (target
, op0
, op1
, rperm
[0]));
16680 if (target
!= d
->target
)
16681 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
16689 if (GET_MODE_SIZE (d
->vmode
) == 16)
16694 else if (GET_MODE_SIZE (d
->vmode
) == 32)
16699 /* V4DImode should be already handled through
16700 expand_vselect by vpermq instruction. */
16701 gcc_assert (d
->vmode
!= V4DImode
);
16704 if (d
->vmode
== V8SImode
16705 || d
->vmode
== V16HImode
16706 || d
->vmode
== V32QImode
)
16708 /* First see if vpermq can be used for
16709 V8SImode/V16HImode/V32QImode. */
16710 if (valid_perm_using_mode_p (V4DImode
, d
))
16712 for (i
= 0; i
< 4; i
++)
16713 perm
[i
] = (d
->perm
[i
* nelt
/ 4] * 4 / nelt
) & 3;
16716 target
= gen_reg_rtx (V4DImode
);
16717 if (expand_vselect (target
, gen_lowpart (V4DImode
, d
->op0
),
16720 emit_move_insn (d
->target
,
16721 gen_lowpart (d
->vmode
, target
));
16727 /* Next see if vpermd can be used. */
16728 if (valid_perm_using_mode_p (V8SImode
, d
))
16731 /* Or if vpermps can be used. */
16732 else if (d
->vmode
== V8SFmode
)
16735 if (vmode
== V32QImode
)
16737 /* vpshufb only works intra lanes, it is not
16738 possible to shuffle bytes in between the lanes. */
16739 for (i
= 0; i
< nelt
; ++i
)
16740 if ((d
->perm
[i
] ^ i
) & (nelt
/ 2))
16744 else if (GET_MODE_SIZE (d
->vmode
) == 64)
16746 if (!TARGET_AVX512BW
)
16749 /* If vpermq didn't work, vpshufb won't work either. */
16750 if (d
->vmode
== V8DFmode
|| d
->vmode
== V8DImode
)
16754 if (d
->vmode
== V16SImode
16755 || d
->vmode
== V32HImode
16756 || d
->vmode
== V64QImode
)
16758 /* First see if vpermq can be used for
16759 V16SImode/V32HImode/V64QImode. */
16760 if (valid_perm_using_mode_p (V8DImode
, d
))
16762 for (i
= 0; i
< 8; i
++)
16763 perm
[i
] = (d
->perm
[i
* nelt
/ 8] * 8 / nelt
) & 7;
16766 target
= gen_reg_rtx (V8DImode
);
16767 if (expand_vselect (target
, gen_lowpart (V8DImode
, d
->op0
),
16770 emit_move_insn (d
->target
,
16771 gen_lowpart (d
->vmode
, target
));
16777 /* Next see if vpermd can be used. */
16778 if (valid_perm_using_mode_p (V16SImode
, d
))
16781 /* Or if vpermps can be used. */
16782 else if (d
->vmode
== V16SFmode
)
16784 if (vmode
== V64QImode
)
16786 /* vpshufb only works intra lanes, it is not
16787 possible to shuffle bytes in between the lanes. */
16788 for (i
= 0; i
< nelt
; ++i
)
16789 if ((d
->perm
[i
] ^ i
) & (3 * nelt
/ 4))
16800 if (vmode
== V8SImode
)
16801 for (i
= 0; i
< 8; ++i
)
16802 rperm
[i
] = GEN_INT ((d
->perm
[i
* nelt
/ 8] * 8 / nelt
) & 7);
16803 else if (vmode
== V16SImode
)
16804 for (i
= 0; i
< 16; ++i
)
16805 rperm
[i
] = GEN_INT ((d
->perm
[i
* nelt
/ 16] * 16 / nelt
) & 15);
16808 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
16809 if (!d
->one_operand_p
)
16810 mask
= 2 * nelt
- 1;
16811 else if (vmode
== V16QImode
)
16813 else if (vmode
== V64QImode
)
16814 mask
= nelt
/ 4 - 1;
16816 mask
= nelt
/ 2 - 1;
16818 for (i
= 0; i
< nelt
; ++i
)
16820 unsigned j
, e
= d
->perm
[i
] & mask
;
16821 for (j
= 0; j
< eltsz
; ++j
)
16822 rperm
[i
* eltsz
+ j
] = GEN_INT (e
* eltsz
+ j
);
16826 vperm
= gen_rtx_CONST_VECTOR (vmode
,
16827 gen_rtvec_v (GET_MODE_NUNITS (vmode
), rperm
));
16828 vperm
= force_reg (vmode
, vperm
);
16830 target
= d
->target
;
16831 if (d
->vmode
!= vmode
)
16832 target
= gen_reg_rtx (vmode
);
16833 op0
= gen_lowpart (vmode
, d
->op0
);
16834 if (d
->one_operand_p
)
16836 if (vmode
== V16QImode
)
16837 emit_insn (gen_ssse3_pshufbv16qi3 (target
, op0
, vperm
));
16838 else if (vmode
== V32QImode
)
16839 emit_insn (gen_avx2_pshufbv32qi3 (target
, op0
, vperm
));
16840 else if (vmode
== V64QImode
)
16841 emit_insn (gen_avx512bw_pshufbv64qi3 (target
, op0
, vperm
));
16842 else if (vmode
== V8SFmode
)
16843 emit_insn (gen_avx2_permvarv8sf (target
, op0
, vperm
));
16844 else if (vmode
== V8SImode
)
16845 emit_insn (gen_avx2_permvarv8si (target
, op0
, vperm
));
16846 else if (vmode
== V16SFmode
)
16847 emit_insn (gen_avx512f_permvarv16sf (target
, op0
, vperm
));
16848 else if (vmode
== V16SImode
)
16849 emit_insn (gen_avx512f_permvarv16si (target
, op0
, vperm
));
16851 gcc_unreachable ();
16855 op1
= gen_lowpart (vmode
, d
->op1
);
16856 emit_insn (gen_xop_pperm (target
, op0
, op1
, vperm
));
16858 if (target
!= d
->target
)
16859 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
16864 /* For V*[QHS]Imode permutations, check if the same permutation
16865 can't be performed in a 2x, 4x or 8x wider inner mode. */
16868 canonicalize_vector_int_perm (const struct expand_vec_perm_d
*d
,
16869 struct expand_vec_perm_d
*nd
)
16872 machine_mode mode
= VOIDmode
;
16876 case E_V16QImode
: mode
= V8HImode
; break;
16877 case E_V32QImode
: mode
= V16HImode
; break;
16878 case E_V64QImode
: mode
= V32HImode
; break;
16879 case E_V8HImode
: mode
= V4SImode
; break;
16880 case E_V16HImode
: mode
= V8SImode
; break;
16881 case E_V32HImode
: mode
= V16SImode
; break;
16882 case E_V4SImode
: mode
= V2DImode
; break;
16883 case E_V8SImode
: mode
= V4DImode
; break;
16884 case E_V16SImode
: mode
= V8DImode
; break;
16885 default: return false;
16887 for (i
= 0; i
< d
->nelt
; i
+= 2)
16888 if ((d
->perm
[i
] & 1) || d
->perm
[i
+ 1] != d
->perm
[i
] + 1)
16891 nd
->nelt
= d
->nelt
/ 2;
16892 for (i
= 0; i
< nd
->nelt
; i
++)
16893 nd
->perm
[i
] = d
->perm
[2 * i
] / 2;
16894 if (GET_MODE_INNER (mode
) != DImode
)
16895 canonicalize_vector_int_perm (nd
, nd
);
16898 nd
->one_operand_p
= d
->one_operand_p
;
16899 nd
->testing_p
= d
->testing_p
;
16900 if (d
->op0
== d
->op1
)
16901 nd
->op0
= nd
->op1
= gen_lowpart (nd
->vmode
, d
->op0
);
16904 nd
->op0
= gen_lowpart (nd
->vmode
, d
->op0
);
16905 nd
->op1
= gen_lowpart (nd
->vmode
, d
->op1
);
16908 nd
->target
= gen_raw_REG (nd
->vmode
, LAST_VIRTUAL_REGISTER
+ 1);
16910 nd
->target
= gen_reg_rtx (nd
->vmode
);
16915 /* Try to expand one-operand permutation with constant mask. */
16918 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d
*d
)
16920 machine_mode mode
= GET_MODE (d
->op0
);
16921 machine_mode maskmode
= mode
;
16922 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
16923 rtx target
, op0
, mask
;
16926 if (!rtx_equal_p (d
->op0
, d
->op1
))
16929 if (!TARGET_AVX512F
)
16935 gen
= gen_avx512f_permvarv16si
;
16938 gen
= gen_avx512f_permvarv16sf
;
16939 maskmode
= V16SImode
;
16942 gen
= gen_avx512f_permvarv8di
;
16945 gen
= gen_avx512f_permvarv8df
;
16946 maskmode
= V8DImode
;
16952 target
= d
->target
;
16954 for (int i
= 0; i
< d
->nelt
; ++i
)
16955 vec
[i
] = GEN_INT (d
->perm
[i
]);
16956 mask
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (d
->nelt
, vec
));
16957 emit_insn (gen (target
, op0
, force_reg (maskmode
, mask
)));
16961 static bool expand_vec_perm_palignr (struct expand_vec_perm_d
*d
, bool);
16963 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
16964 in a single instruction. */
16967 expand_vec_perm_1 (struct expand_vec_perm_d
*d
)
16969 unsigned i
, nelt
= d
->nelt
;
16970 struct expand_vec_perm_d nd
;
16972 /* Check plain VEC_SELECT first, because AVX has instructions that could
16973 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
16974 input where SEL+CONCAT may not. */
16975 if (d
->one_operand_p
)
16977 int mask
= nelt
- 1;
16978 bool identity_perm
= true;
16979 bool broadcast_perm
= true;
16981 for (i
= 0; i
< nelt
; i
++)
16983 nd
.perm
[i
] = d
->perm
[i
] & mask
;
16984 if (nd
.perm
[i
] != i
)
16985 identity_perm
= false;
16987 broadcast_perm
= false;
16993 emit_move_insn (d
->target
, d
->op0
);
16996 else if (broadcast_perm
&& TARGET_AVX2
)
16998 /* Use vpbroadcast{b,w,d}. */
16999 rtx (*gen
) (rtx
, rtx
) = NULL
;
17003 if (TARGET_AVX512BW
)
17004 gen
= gen_avx512bw_vec_dupv64qi_1
;
17007 gen
= gen_avx2_pbroadcastv32qi_1
;
17010 if (TARGET_AVX512BW
)
17011 gen
= gen_avx512bw_vec_dupv32hi_1
;
17014 gen
= gen_avx2_pbroadcastv16hi_1
;
17017 if (TARGET_AVX512F
)
17018 gen
= gen_avx512f_vec_dupv16si_1
;
17021 gen
= gen_avx2_pbroadcastv8si_1
;
17024 gen
= gen_avx2_pbroadcastv16qi
;
17027 gen
= gen_avx2_pbroadcastv8hi
;
17030 if (TARGET_AVX512F
)
17031 gen
= gen_avx512f_vec_dupv16sf_1
;
17034 gen
= gen_avx2_vec_dupv8sf_1
;
17037 if (TARGET_AVX512F
)
17038 gen
= gen_avx512f_vec_dupv8df_1
;
17041 if (TARGET_AVX512F
)
17042 gen
= gen_avx512f_vec_dupv8di_1
;
17044 /* For other modes prefer other shuffles this function creates. */
17050 emit_insn (gen (d
->target
, d
->op0
));
17055 if (expand_vselect (d
->target
, d
->op0
, nd
.perm
, nelt
, d
->testing_p
))
17058 /* There are plenty of patterns in sse.md that are written for
17059 SEL+CONCAT and are not replicated for a single op. Perhaps
17060 that should be changed, to avoid the nastiness here. */
17062 /* Recognize interleave style patterns, which means incrementing
17063 every other permutation operand. */
17064 for (i
= 0; i
< nelt
; i
+= 2)
17066 nd
.perm
[i
] = d
->perm
[i
] & mask
;
17067 nd
.perm
[i
+ 1] = (d
->perm
[i
+ 1] & mask
) + nelt
;
17069 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op0
, nd
.perm
, nelt
,
17073 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
17076 for (i
= 0; i
< nelt
; i
+= 4)
17078 nd
.perm
[i
+ 0] = d
->perm
[i
+ 0] & mask
;
17079 nd
.perm
[i
+ 1] = d
->perm
[i
+ 1] & mask
;
17080 nd
.perm
[i
+ 2] = (d
->perm
[i
+ 2] & mask
) + nelt
;
17081 nd
.perm
[i
+ 3] = (d
->perm
[i
+ 3] & mask
) + nelt
;
17084 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op0
, nd
.perm
, nelt
,
17090 /* Try movss/movsd instructions. */
17091 if (expand_vec_perm_movs (d
))
17094 /* Finally, try the fully general two operand permute. */
17095 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op1
, d
->perm
, nelt
,
17099 /* Recognize interleave style patterns with reversed operands. */
17100 if (!d
->one_operand_p
)
17102 for (i
= 0; i
< nelt
; ++i
)
17104 unsigned e
= d
->perm
[i
];
17112 if (expand_vselect_vconcat (d
->target
, d
->op1
, d
->op0
, nd
.perm
, nelt
,
17117 /* Try the SSE4.1 blend variable merge instructions. */
17118 if (expand_vec_perm_blend (d
))
17121 /* Try one of the AVX vpermil variable permutations. */
17122 if (expand_vec_perm_vpermil (d
))
17125 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
17126 vpshufb, vpermd, vpermps or vpermq variable permutation. */
17127 if (expand_vec_perm_pshufb (d
))
17130 /* Try the AVX2 vpalignr instruction. */
17131 if (expand_vec_perm_palignr (d
, true))
17134 /* Try the AVX512F vperm{s,d} instructions. */
17135 if (ix86_expand_vec_one_operand_perm_avx512 (d
))
17138 /* Try the AVX512F vpermt2/vpermi2 instructions. */
17139 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX
, NULL_RTX
, NULL_RTX
, NULL_RTX
, d
))
17142 /* See if we can get the same permutation in different vector integer
17144 if (canonicalize_vector_int_perm (d
, &nd
) && expand_vec_perm_1 (&nd
))
17147 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, nd
.target
));
17153 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17154 in terms of a pair of pshuflw + pshufhw instructions. */
17157 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d
*d
)
17159 unsigned char perm2
[MAX_VECT_LEN
];
17163 if (d
->vmode
!= V8HImode
|| !d
->one_operand_p
)
17166 /* The two permutations only operate in 64-bit lanes. */
17167 for (i
= 0; i
< 4; ++i
)
17168 if (d
->perm
[i
] >= 4)
17170 for (i
= 4; i
< 8; ++i
)
17171 if (d
->perm
[i
] < 4)
17177 /* Emit the pshuflw. */
17178 memcpy (perm2
, d
->perm
, 4);
17179 for (i
= 4; i
< 8; ++i
)
17181 ok
= expand_vselect (d
->target
, d
->op0
, perm2
, 8, d
->testing_p
);
17184 /* Emit the pshufhw. */
17185 memcpy (perm2
+ 4, d
->perm
+ 4, 4);
17186 for (i
= 0; i
< 4; ++i
)
17188 ok
= expand_vselect (d
->target
, d
->target
, perm2
, 8, d
->testing_p
);
17194 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17195 the permutation using the SSSE3 palignr instruction. This succeeds
17196 when all of the elements in PERM fit within one vector and we merely
17197 need to shift them down so that a single vector permutation has a
17198 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
17199 the vpalignr instruction itself can perform the requested permutation. */
17202 expand_vec_perm_palignr (struct expand_vec_perm_d
*d
, bool single_insn_only_p
)
17204 unsigned i
, nelt
= d
->nelt
;
17205 unsigned min
, max
, minswap
, maxswap
;
17206 bool in_order
, ok
, swap
= false;
17208 struct expand_vec_perm_d dcopy
;
17210 /* Even with AVX, palignr only operates on 128-bit vectors,
17211 in AVX2 palignr operates on both 128-bit lanes. */
17212 if ((!TARGET_SSSE3
|| GET_MODE_SIZE (d
->vmode
) != 16)
17213 && (!TARGET_AVX2
|| GET_MODE_SIZE (d
->vmode
) != 32))
17218 minswap
= 2 * nelt
;
17220 for (i
= 0; i
< nelt
; ++i
)
17222 unsigned e
= d
->perm
[i
];
17223 unsigned eswap
= d
->perm
[i
] ^ nelt
;
17224 if (GET_MODE_SIZE (d
->vmode
) == 32)
17226 e
= (e
& ((nelt
/ 2) - 1)) | ((e
& nelt
) >> 1);
17227 eswap
= e
^ (nelt
/ 2);
17233 if (eswap
< minswap
)
17235 if (eswap
> maxswap
)
17239 || max
- min
>= (GET_MODE_SIZE (d
->vmode
) == 32 ? nelt
/ 2 : nelt
))
17241 if (d
->one_operand_p
17243 || maxswap
- minswap
>= (GET_MODE_SIZE (d
->vmode
) == 32
17244 ? nelt
/ 2 : nelt
))
17251 /* Given that we have SSSE3, we know we'll be able to implement the
17252 single operand permutation after the palignr with pshufb for
17253 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
17255 if (d
->testing_p
&& GET_MODE_SIZE (d
->vmode
) == 16 && !single_insn_only_p
)
17261 dcopy
.op0
= d
->op1
;
17262 dcopy
.op1
= d
->op0
;
17263 for (i
= 0; i
< nelt
; ++i
)
17264 dcopy
.perm
[i
] ^= nelt
;
17268 for (i
= 0; i
< nelt
; ++i
)
17270 unsigned e
= dcopy
.perm
[i
];
17271 if (GET_MODE_SIZE (d
->vmode
) == 32
17273 && (e
& (nelt
/ 2 - 1)) < min
)
17274 e
= e
- min
- (nelt
/ 2);
17281 dcopy
.one_operand_p
= true;
17283 if (single_insn_only_p
&& !in_order
)
17286 /* For AVX2, test whether we can permute the result in one instruction. */
17291 dcopy
.op1
= dcopy
.op0
;
17292 return expand_vec_perm_1 (&dcopy
);
17295 shift
= GEN_INT (min
* GET_MODE_UNIT_BITSIZE (d
->vmode
));
17296 if (GET_MODE_SIZE (d
->vmode
) == 16)
17298 target
= gen_reg_rtx (TImode
);
17299 emit_insn (gen_ssse3_palignrti (target
, gen_lowpart (TImode
, dcopy
.op1
),
17300 gen_lowpart (TImode
, dcopy
.op0
), shift
));
17304 target
= gen_reg_rtx (V2TImode
);
17305 emit_insn (gen_avx2_palignrv2ti (target
,
17306 gen_lowpart (V2TImode
, dcopy
.op1
),
17307 gen_lowpart (V2TImode
, dcopy
.op0
),
17311 dcopy
.op0
= dcopy
.op1
= gen_lowpart (d
->vmode
, target
);
17313 /* Test for the degenerate case where the alignment by itself
17314 produces the desired permutation. */
17317 emit_move_insn (d
->target
, dcopy
.op0
);
17321 ok
= expand_vec_perm_1 (&dcopy
);
17322 gcc_assert (ok
|| GET_MODE_SIZE (d
->vmode
) == 32);
17327 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17328 the permutation using the SSE4_1 pblendv instruction. Potentially
17329 reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */
17332 expand_vec_perm_pblendv (struct expand_vec_perm_d
*d
)
17334 unsigned i
, which
, nelt
= d
->nelt
;
17335 struct expand_vec_perm_d dcopy
, dcopy1
;
17336 machine_mode vmode
= d
->vmode
;
17339 /* Use the same checks as in expand_vec_perm_blend. */
17340 if (d
->one_operand_p
)
17342 if (TARGET_AVX2
&& GET_MODE_SIZE (vmode
) == 32)
17344 else if (TARGET_AVX
&& (vmode
== V4DFmode
|| vmode
== V8SFmode
))
17346 else if (TARGET_SSE4_1
&& GET_MODE_SIZE (vmode
) == 16)
17351 /* Figure out where permutation elements stay not in their
17352 respective lanes. */
17353 for (i
= 0, which
= 0; i
< nelt
; ++i
)
17355 unsigned e
= d
->perm
[i
];
17357 which
|= (e
< nelt
? 1 : 2);
17359 /* We can pblend the part where elements stay not in their
17360 respective lanes only when these elements are all in one
17361 half of a permutation.
17362 {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective
17363 lanes, but both 8 and 9 >= 8
17364 {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
17365 respective lanes and 8 >= 8, but 2 not. */
17366 if (which
!= 1 && which
!= 2)
17368 if (d
->testing_p
&& GET_MODE_SIZE (vmode
) == 16)
17371 /* First we apply one operand permutation to the part where
17372 elements stay not in their respective lanes. */
17375 dcopy
.op0
= dcopy
.op1
= d
->op1
;
17377 dcopy
.op0
= dcopy
.op1
= d
->op0
;
17379 dcopy
.target
= gen_reg_rtx (vmode
);
17380 dcopy
.one_operand_p
= true;
17382 for (i
= 0; i
< nelt
; ++i
)
17383 dcopy
.perm
[i
] = d
->perm
[i
] & (nelt
- 1);
17385 ok
= expand_vec_perm_1 (&dcopy
);
17386 if (GET_MODE_SIZE (vmode
) != 16 && !ok
)
17393 /* Next we put permuted elements into their positions. */
17396 dcopy1
.op1
= dcopy
.target
;
17398 dcopy1
.op0
= dcopy
.target
;
17400 for (i
= 0; i
< nelt
; ++i
)
17401 dcopy1
.perm
[i
] = ((d
->perm
[i
] >= nelt
) ? (nelt
+ i
) : i
);
17403 ok
= expand_vec_perm_blend (&dcopy1
);
17409 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d
*d
);
17411 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17412 a two vector permutation into a single vector permutation by using
17413 an interleave operation to merge the vectors. */
17416 expand_vec_perm_interleave2 (struct expand_vec_perm_d
*d
)
17418 struct expand_vec_perm_d dremap
, dfinal
;
17419 unsigned i
, nelt
= d
->nelt
, nelt2
= nelt
/ 2;
17420 unsigned HOST_WIDE_INT contents
;
17421 unsigned char remap
[2 * MAX_VECT_LEN
];
17423 bool ok
, same_halves
= false;
17425 if (GET_MODE_SIZE (d
->vmode
) == 16)
17427 if (d
->one_operand_p
)
17430 else if (GET_MODE_SIZE (d
->vmode
) == 32)
17434 /* For 32-byte modes allow even d->one_operand_p.
17435 The lack of cross-lane shuffling in some instructions
17436 might prevent a single insn shuffle. */
17438 dfinal
.testing_p
= true;
17439 /* If expand_vec_perm_interleave3 can expand this into
17440 a 3 insn sequence, give up and let it be expanded as
17441 3 insn sequence. While that is one insn longer,
17442 it doesn't need a memory operand and in the common
17443 case that both interleave low and high permutations
17444 with the same operands are adjacent needs 4 insns
17445 for both after CSE. */
17446 if (expand_vec_perm_interleave3 (&dfinal
))
17452 /* Examine from whence the elements come. */
17454 for (i
= 0; i
< nelt
; ++i
)
17455 contents
|= HOST_WIDE_INT_1U
<< d
->perm
[i
];
17457 memset (remap
, 0xff, sizeof (remap
));
17460 if (GET_MODE_SIZE (d
->vmode
) == 16)
17462 unsigned HOST_WIDE_INT h1
, h2
, h3
, h4
;
17464 /* Split the two input vectors into 4 halves. */
17465 h1
= (HOST_WIDE_INT_1U
<< nelt2
) - 1;
17470 /* If the elements from the low halves use interleave low, and similarly
17471 for interleave high. If the elements are from mis-matched halves, we
17472 can use shufps for V4SF/V4SI or do a DImode shuffle. */
17473 if ((contents
& (h1
| h3
)) == contents
)
17476 for (i
= 0; i
< nelt2
; ++i
)
17479 remap
[i
+ nelt
] = i
* 2 + 1;
17480 dremap
.perm
[i
* 2] = i
;
17481 dremap
.perm
[i
* 2 + 1] = i
+ nelt
;
17483 if (!TARGET_SSE2
&& d
->vmode
== V4SImode
)
17484 dremap
.vmode
= V4SFmode
;
17486 else if ((contents
& (h2
| h4
)) == contents
)
17489 for (i
= 0; i
< nelt2
; ++i
)
17491 remap
[i
+ nelt2
] = i
* 2;
17492 remap
[i
+ nelt
+ nelt2
] = i
* 2 + 1;
17493 dremap
.perm
[i
* 2] = i
+ nelt2
;
17494 dremap
.perm
[i
* 2 + 1] = i
+ nelt
+ nelt2
;
17496 if (!TARGET_SSE2
&& d
->vmode
== V4SImode
)
17497 dremap
.vmode
= V4SFmode
;
17499 else if ((contents
& (h1
| h4
)) == contents
)
17502 for (i
= 0; i
< nelt2
; ++i
)
17505 remap
[i
+ nelt
+ nelt2
] = i
+ nelt2
;
17506 dremap
.perm
[i
] = i
;
17507 dremap
.perm
[i
+ nelt2
] = i
+ nelt
+ nelt2
;
17512 dremap
.vmode
= V2DImode
;
17514 dremap
.perm
[0] = 0;
17515 dremap
.perm
[1] = 3;
17518 else if ((contents
& (h2
| h3
)) == contents
)
17521 for (i
= 0; i
< nelt2
; ++i
)
17523 remap
[i
+ nelt2
] = i
;
17524 remap
[i
+ nelt
] = i
+ nelt2
;
17525 dremap
.perm
[i
] = i
+ nelt2
;
17526 dremap
.perm
[i
+ nelt2
] = i
+ nelt
;
17531 dremap
.vmode
= V2DImode
;
17533 dremap
.perm
[0] = 1;
17534 dremap
.perm
[1] = 2;
17542 unsigned int nelt4
= nelt
/ 4, nzcnt
= 0;
17543 unsigned HOST_WIDE_INT q
[8];
17544 unsigned int nonzero_halves
[4];
17546 /* Split the two input vectors into 8 quarters. */
17547 q
[0] = (HOST_WIDE_INT_1U
<< nelt4
) - 1;
17548 for (i
= 1; i
< 8; ++i
)
17549 q
[i
] = q
[0] << (nelt4
* i
);
17550 for (i
= 0; i
< 4; ++i
)
17551 if (((q
[2 * i
] | q
[2 * i
+ 1]) & contents
) != 0)
17553 nonzero_halves
[nzcnt
] = i
;
17559 gcc_assert (d
->one_operand_p
);
17560 nonzero_halves
[1] = nonzero_halves
[0];
17561 same_halves
= true;
17563 else if (d
->one_operand_p
)
17565 gcc_assert (nonzero_halves
[0] == 0);
17566 gcc_assert (nonzero_halves
[1] == 1);
17571 if (d
->perm
[0] / nelt2
== nonzero_halves
[1])
17573 /* Attempt to increase the likelihood that dfinal
17574 shuffle will be intra-lane. */
17575 std::swap (nonzero_halves
[0], nonzero_halves
[1]);
17578 /* vperm2f128 or vperm2i128. */
17579 for (i
= 0; i
< nelt2
; ++i
)
17581 remap
[i
+ nonzero_halves
[1] * nelt2
] = i
+ nelt2
;
17582 remap
[i
+ nonzero_halves
[0] * nelt2
] = i
;
17583 dremap
.perm
[i
+ nelt2
] = i
+ nonzero_halves
[1] * nelt2
;
17584 dremap
.perm
[i
] = i
+ nonzero_halves
[0] * nelt2
;
17587 if (d
->vmode
!= V8SFmode
17588 && d
->vmode
!= V4DFmode
17589 && d
->vmode
!= V8SImode
)
17591 dremap
.vmode
= V8SImode
;
17593 for (i
= 0; i
< 4; ++i
)
17595 dremap
.perm
[i
] = i
+ nonzero_halves
[0] * 4;
17596 dremap
.perm
[i
+ 4] = i
+ nonzero_halves
[1] * 4;
17600 else if (d
->one_operand_p
)
17602 else if (TARGET_AVX2
17603 && (contents
& (q
[0] | q
[2] | q
[4] | q
[6])) == contents
)
17606 for (i
= 0; i
< nelt4
; ++i
)
17609 remap
[i
+ nelt
] = i
* 2 + 1;
17610 remap
[i
+ nelt2
] = i
* 2 + nelt2
;
17611 remap
[i
+ nelt
+ nelt2
] = i
* 2 + nelt2
+ 1;
17612 dremap
.perm
[i
* 2] = i
;
17613 dremap
.perm
[i
* 2 + 1] = i
+ nelt
;
17614 dremap
.perm
[i
* 2 + nelt2
] = i
+ nelt2
;
17615 dremap
.perm
[i
* 2 + nelt2
+ 1] = i
+ nelt
+ nelt2
;
17618 else if (TARGET_AVX2
17619 && (contents
& (q
[1] | q
[3] | q
[5] | q
[7])) == contents
)
17622 for (i
= 0; i
< nelt4
; ++i
)
17624 remap
[i
+ nelt4
] = i
* 2;
17625 remap
[i
+ nelt
+ nelt4
] = i
* 2 + 1;
17626 remap
[i
+ nelt2
+ nelt4
] = i
* 2 + nelt2
;
17627 remap
[i
+ nelt
+ nelt2
+ nelt4
] = i
* 2 + nelt2
+ 1;
17628 dremap
.perm
[i
* 2] = i
+ nelt4
;
17629 dremap
.perm
[i
* 2 + 1] = i
+ nelt
+ nelt4
;
17630 dremap
.perm
[i
* 2 + nelt2
] = i
+ nelt2
+ nelt4
;
17631 dremap
.perm
[i
* 2 + nelt2
+ 1] = i
+ nelt
+ nelt2
+ nelt4
;
17638 /* Use the remapping array set up above to move the elements from their
17639 swizzled locations into their final destinations. */
17641 for (i
= 0; i
< nelt
; ++i
)
17643 unsigned e
= remap
[d
->perm
[i
]];
17644 gcc_assert (e
< nelt
);
17645 /* If same_halves is true, both halves of the remapped vector are the
17646 same. Avoid cross-lane accesses if possible. */
17647 if (same_halves
&& i
>= nelt2
)
17649 gcc_assert (e
< nelt2
);
17650 dfinal
.perm
[i
] = e
+ nelt2
;
17653 dfinal
.perm
[i
] = e
;
17657 dremap
.target
= gen_reg_rtx (dremap
.vmode
);
17658 dfinal
.op0
= gen_lowpart (dfinal
.vmode
, dremap
.target
);
17660 dfinal
.op1
= dfinal
.op0
;
17661 dfinal
.one_operand_p
= true;
17663 /* Test if the final remap can be done with a single insn. For V4SFmode or
17664 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
17666 ok
= expand_vec_perm_1 (&dfinal
);
17667 seq
= get_insns ();
17676 if (dremap
.vmode
!= dfinal
.vmode
)
17678 dremap
.op0
= gen_lowpart (dremap
.vmode
, dremap
.op0
);
17679 dremap
.op1
= gen_lowpart (dremap
.vmode
, dremap
.op1
);
17682 ok
= expand_vec_perm_1 (&dremap
);
17689 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17690 a single vector cross-lane permutation into vpermq followed
17691 by any of the single insn permutations. */
17694 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d
*d
)
17696 struct expand_vec_perm_d dremap
, dfinal
;
17697 unsigned i
, j
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, nelt4
= nelt
/ 4;
17698 unsigned contents
[2];
17702 && (d
->vmode
== V32QImode
|| d
->vmode
== V16HImode
)
17703 && d
->one_operand_p
))
17708 for (i
= 0; i
< nelt2
; ++i
)
17710 contents
[0] |= 1u << (d
->perm
[i
] / nelt4
);
17711 contents
[1] |= 1u << (d
->perm
[i
+ nelt2
] / nelt4
);
17714 for (i
= 0; i
< 2; ++i
)
17716 unsigned int cnt
= 0;
17717 for (j
= 0; j
< 4; ++j
)
17718 if ((contents
[i
] & (1u << j
)) != 0 && ++cnt
> 2)
17726 dremap
.vmode
= V4DImode
;
17728 dremap
.target
= gen_reg_rtx (V4DImode
);
17729 dremap
.op0
= gen_lowpart (V4DImode
, d
->op0
);
17730 dremap
.op1
= dremap
.op0
;
17731 dremap
.one_operand_p
= true;
17732 for (i
= 0; i
< 2; ++i
)
17734 unsigned int cnt
= 0;
17735 for (j
= 0; j
< 4; ++j
)
17736 if ((contents
[i
] & (1u << j
)) != 0)
17737 dremap
.perm
[2 * i
+ cnt
++] = j
;
17738 for (; cnt
< 2; ++cnt
)
17739 dremap
.perm
[2 * i
+ cnt
] = 0;
17743 dfinal
.op0
= gen_lowpart (dfinal
.vmode
, dremap
.target
);
17744 dfinal
.op1
= dfinal
.op0
;
17745 dfinal
.one_operand_p
= true;
17746 for (i
= 0, j
= 0; i
< nelt
; ++i
)
17750 dfinal
.perm
[i
] = (d
->perm
[i
] & (nelt4
- 1)) | (j
? nelt2
: 0);
17751 if ((d
->perm
[i
] / nelt4
) == dremap
.perm
[j
])
17753 else if ((d
->perm
[i
] / nelt4
) == dremap
.perm
[j
+ 1])
17754 dfinal
.perm
[i
] |= nelt4
;
17756 gcc_unreachable ();
17759 ok
= expand_vec_perm_1 (&dremap
);
17762 ok
= expand_vec_perm_1 (&dfinal
);
17768 static bool canonicalize_perm (struct expand_vec_perm_d
*d
);
17770 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
17771 a vector permutation using two instructions, vperm2f128 resp.
17772 vperm2i128 followed by any single in-lane permutation. */
17775 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d
*d
)
17777 struct expand_vec_perm_d dfirst
, dsecond
;
17778 unsigned i
, j
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, perm
;
17782 || GET_MODE_SIZE (d
->vmode
) != 32
17783 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
&& !TARGET_AVX2
))
17787 dsecond
.one_operand_p
= false;
17788 dsecond
.testing_p
= true;
17790 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
17791 immediate. For perm < 16 the second permutation uses
17792 d->op0 as first operand, for perm >= 16 it uses d->op1
17793 as first operand. The second operand is the result of
17795 for (perm
= 0; perm
< 32; perm
++)
17797 /* Ignore permutations which do not move anything cross-lane. */
17800 /* The second shuffle for e.g. V4DFmode has
17801 0123 and ABCD operands.
17802 Ignore AB23, as 23 is already in the second lane
17803 of the first operand. */
17804 if ((perm
& 0xc) == (1 << 2)) continue;
17805 /* And 01CD, as 01 is in the first lane of the first
17807 if ((perm
& 3) == 0) continue;
17808 /* And 4567, as then the vperm2[fi]128 doesn't change
17809 anything on the original 4567 second operand. */
17810 if ((perm
& 0xf) == ((3 << 2) | 2)) continue;
17814 /* The second shuffle for e.g. V4DFmode has
17815 4567 and ABCD operands.
17816 Ignore AB67, as 67 is already in the second lane
17817 of the first operand. */
17818 if ((perm
& 0xc) == (3 << 2)) continue;
17819 /* And 45CD, as 45 is in the first lane of the first
17821 if ((perm
& 3) == 2) continue;
17822 /* And 0123, as then the vperm2[fi]128 doesn't change
17823 anything on the original 0123 first operand. */
17824 if ((perm
& 0xf) == (1 << 2)) continue;
17827 for (i
= 0; i
< nelt
; i
++)
17829 j
= d
->perm
[i
] / nelt2
;
17830 if (j
== ((perm
>> (2 * (i
>= nelt2
))) & 3))
17831 dsecond
.perm
[i
] = nelt
+ (i
& nelt2
) + (d
->perm
[i
] & (nelt2
- 1));
17832 else if (j
== (unsigned) (i
>= nelt2
) + 2 * (perm
>= 16))
17833 dsecond
.perm
[i
] = d
->perm
[i
] & (nelt
- 1);
17841 ok
= expand_vec_perm_1 (&dsecond
);
17852 /* Found a usable second shuffle. dfirst will be
17853 vperm2f128 on d->op0 and d->op1. */
17854 dsecond
.testing_p
= false;
17856 dfirst
.target
= gen_reg_rtx (d
->vmode
);
17857 for (i
= 0; i
< nelt
; i
++)
17858 dfirst
.perm
[i
] = (i
& (nelt2
- 1))
17859 + ((perm
>> (2 * (i
>= nelt2
))) & 3) * nelt2
;
17861 canonicalize_perm (&dfirst
);
17862 ok
= expand_vec_perm_1 (&dfirst
);
17865 /* And dsecond is some single insn shuffle, taking
17866 d->op0 and result of vperm2f128 (if perm < 16) or
17867 d->op1 and result of vperm2f128 (otherwise). */
17869 dsecond
.op0
= dsecond
.op1
;
17870 dsecond
.op1
= dfirst
.target
;
17872 ok
= expand_vec_perm_1 (&dsecond
);
17878 /* For one operand, the only useful vperm2f128 permutation is 0x01
17880 if (d
->one_operand_p
)
17887 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17888 a two vector permutation using 2 intra-lane interleave insns
17889 and cross-lane shuffle for 32-byte vectors. */
17892 expand_vec_perm_interleave3 (struct expand_vec_perm_d
*d
)
17895 rtx (*gen
) (rtx
, rtx
, rtx
);
17897 if (d
->one_operand_p
)
17899 if (TARGET_AVX2
&& GET_MODE_SIZE (d
->vmode
) == 32)
17901 else if (TARGET_AVX
&& (d
->vmode
== V8SFmode
|| d
->vmode
== V4DFmode
))
17907 if (d
->perm
[0] != 0 && d
->perm
[0] != nelt
/ 2)
17909 for (i
= 0; i
< nelt
; i
+= 2)
17910 if (d
->perm
[i
] != d
->perm
[0] + i
/ 2
17911 || d
->perm
[i
+ 1] != d
->perm
[0] + i
/ 2 + nelt
)
17921 gen
= gen_vec_interleave_highv32qi
;
17923 gen
= gen_vec_interleave_lowv32qi
;
17927 gen
= gen_vec_interleave_highv16hi
;
17929 gen
= gen_vec_interleave_lowv16hi
;
17933 gen
= gen_vec_interleave_highv8si
;
17935 gen
= gen_vec_interleave_lowv8si
;
17939 gen
= gen_vec_interleave_highv4di
;
17941 gen
= gen_vec_interleave_lowv4di
;
17945 gen
= gen_vec_interleave_highv8sf
;
17947 gen
= gen_vec_interleave_lowv8sf
;
17951 gen
= gen_vec_interleave_highv4df
;
17953 gen
= gen_vec_interleave_lowv4df
;
17956 gcc_unreachable ();
17959 emit_insn (gen (d
->target
, d
->op0
, d
->op1
));
17963 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
17964 a single vector permutation using a single intra-lane vector
17965 permutation, vperm2f128 swapping the lanes and vblend* insn blending
17966 the non-swapped and swapped vectors together. */
17969 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d
*d
)
17971 struct expand_vec_perm_d dfirst
, dsecond
;
17972 unsigned i
, j
, msk
, nelt
= d
->nelt
, nelt2
= nelt
/ 2;
17975 rtx (*blend
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
17979 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
)
17980 || !d
->one_operand_p
)
17984 for (i
= 0; i
< nelt
; i
++)
17985 dfirst
.perm
[i
] = 0xff;
17986 for (i
= 0, msk
= 0; i
< nelt
; i
++)
17988 j
= (d
->perm
[i
] & nelt2
) ? i
| nelt2
: i
& ~nelt2
;
17989 if (dfirst
.perm
[j
] != 0xff && dfirst
.perm
[j
] != d
->perm
[i
])
17991 dfirst
.perm
[j
] = d
->perm
[i
];
17995 for (i
= 0; i
< nelt
; i
++)
17996 if (dfirst
.perm
[i
] == 0xff)
17997 dfirst
.perm
[i
] = i
;
18000 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
18003 ok
= expand_vec_perm_1 (&dfirst
);
18004 seq
= get_insns ();
18016 dsecond
.op0
= dfirst
.target
;
18017 dsecond
.op1
= dfirst
.target
;
18018 dsecond
.one_operand_p
= true;
18019 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
18020 for (i
= 0; i
< nelt
; i
++)
18021 dsecond
.perm
[i
] = i
^ nelt2
;
18023 ok
= expand_vec_perm_1 (&dsecond
);
18026 blend
= d
->vmode
== V8SFmode
? gen_avx_blendps256
: gen_avx_blendpd256
;
18027 emit_insn (blend (d
->target
, dfirst
.target
, dsecond
.target
, GEN_INT (msk
)));
18031 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
18032 permutation using two vperm2f128, followed by a vshufpd insn blending
18033 the two vectors together. */
18036 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d
*d
)
18038 struct expand_vec_perm_d dfirst
, dsecond
, dthird
;
18041 if (!TARGET_AVX
|| (d
->vmode
!= V4DFmode
))
18051 dfirst
.perm
[0] = (d
->perm
[0] & ~1);
18052 dfirst
.perm
[1] = (d
->perm
[0] & ~1) + 1;
18053 dfirst
.perm
[2] = (d
->perm
[2] & ~1);
18054 dfirst
.perm
[3] = (d
->perm
[2] & ~1) + 1;
18055 dsecond
.perm
[0] = (d
->perm
[1] & ~1);
18056 dsecond
.perm
[1] = (d
->perm
[1] & ~1) + 1;
18057 dsecond
.perm
[2] = (d
->perm
[3] & ~1);
18058 dsecond
.perm
[3] = (d
->perm
[3] & ~1) + 1;
18059 dthird
.perm
[0] = (d
->perm
[0] % 2);
18060 dthird
.perm
[1] = (d
->perm
[1] % 2) + 4;
18061 dthird
.perm
[2] = (d
->perm
[2] % 2) + 2;
18062 dthird
.perm
[3] = (d
->perm
[3] % 2) + 6;
18064 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
18065 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
18066 dthird
.op0
= dfirst
.target
;
18067 dthird
.op1
= dsecond
.target
;
18068 dthird
.one_operand_p
= false;
18070 canonicalize_perm (&dfirst
);
18071 canonicalize_perm (&dsecond
);
18073 ok
= expand_vec_perm_1 (&dfirst
)
18074 && expand_vec_perm_1 (&dsecond
)
18075 && expand_vec_perm_1 (&dthird
);
18082 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d
*);
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a two vector permutation using two intra-lane vector
   permutations, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */

static bool
expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond, dthird;
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
  rtx_insn *seq1, *seq2;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  if (!TARGET_AVX
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || d->one_operand_p)
    return false;

  dfirst = *d;
  dsecond = *d;
  for (i = 0; i < nelt; i++)
    {
      dfirst.perm[i] = 0xff;
      dsecond.perm[i] = 0xff;
    }
  for (i = 0, msk = 0; i < nelt; i++)
    {
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      if (j == i)
        {
          dfirst.perm[j] = d->perm[i];
          which1 |= (d->perm[i] < nelt ? 1 : 2);
        }
      else
        {
          dsecond.perm[j] = d->perm[i];
          which2 |= (d->perm[i] < nelt ? 1 : 2);
          msk |= (1U << i);
        }
    }
  if (msk == 0 || msk == (1U << nelt) - 1)
    return false;

  if (!d->testing_p)
    {
      dfirst.target = gen_reg_rtx (dfirst.vmode);
      dsecond.target = gen_reg_rtx (dsecond.vmode);
    }

  for (i = 0; i < nelt; i++)
    {
      if (dfirst.perm[i] == 0xff)
        dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
      if (dsecond.perm[i] == 0xff)
        dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
    }

  canonicalize_perm (&dfirst);
  start_sequence ();
  ok = ix86_expand_vec_perm_const_1 (&dfirst);
  seq1 = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  canonicalize_perm (&dsecond);
  start_sequence ();
  ok = ix86_expand_vec_perm_const_1 (&dsecond);
  seq2 = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq1);
  emit_insn (seq2);

  dthird = *d;
  dthird.op0 = dsecond.target;
  dthird.op1 = dsecond.target;
  dthird.one_operand_p = true;
  dthird.target = gen_reg_rtx (dthird.vmode);
  for (i = 0; i < nelt; i++)
    dthird.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dthird);
  gcc_assert (ok);

  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
  return true;
}
/* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
   permutation with two pshufb insns and an ior.  We should have already
   failed all two instruction sequences.  */

static bool
expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
{
  rtx rperm[2][16], vperm, l, h, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
    return false;
  gcc_assert (!d->one_operand_p);

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the given vector it is shuffled into the proper lane.  If the required
     element is in the other vector, force a zero into the lane by setting
     bit 7 in the permutation mask.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i];
      unsigned which = (e >= nelt);
      if (e >= nelt)
        e -= nelt;

      for (j = 0; j < eltsz; ++j)
        {
          rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
          rperm[1-which][i*eltsz + j] = m128;
        }
    }

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
  vperm = force_reg (V16QImode, vperm);

  l = gen_reg_rtx (V16QImode);
  op = gen_lowpart (V16QImode, d->op0);
  emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
  vperm = force_reg (V16QImode, vperm);

  h = gen_reg_rtx (V16QImode);
  op = gen_lowpart (V16QImode, d->op1);
  emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));

  op = d->target;
  if (d->vmode != V16QImode)
    op = gen_reg_rtx (V16QImode);
  emit_insn (gen_iorv16qi3 (op, l, h));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
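
/* Editorial illustration (not part of the original GCC sources, kept out of
   the build with "#if 0"): a plain-C model of the two-mask pshufb technique
   used above.  Each mask entry either selects a byte from its own operand or
   has bit 7 set so pshufb writes zero; OR-ing the two shuffled results then
   merges the two operands.  The function name and types are hypothetical.  */
#if 0
static void
model_two_mask_pshufb (const unsigned char op0[16], const unsigned char op1[16],
                       const unsigned char perm[16] /* values 0..31 */,
                       unsigned char out[16])
{
  unsigned char mask0[16], mask1[16];
  for (int i = 0; i < 16; i++)
    {
      unsigned e = perm[i];
      if (e < 16)
        { mask0[i] = e; mask1[i] = 0x80; }      /* take from op0, zero op1 lane */
      else
        { mask0[i] = 0x80; mask1[i] = e - 16; } /* zero op0 lane, take from op1 */
    }
  for (int i = 0; i < 16; i++)
    {
      /* pshufb semantics: bit 7 set yields 0, else select op[mask & 15].  */
      unsigned char l = (mask0[i] & 0x80) ? 0 : op0[mask0[i] & 15];
      unsigned char h = (mask1[i] & 0x80) ? 0 : op1[mask1[i] & 15];
      out[i] = l | h;                           /* por */
    }
}
#endif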
/* Implement arbitrary permutation of one V32QImode or V16HImode operand
   with two vpshufb insns, vpermq and vpor.  We should have already failed
   all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, hp, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || !d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the entries are non-negative if the element is
     requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;

      for (j = 0; j < eltsz; ++j)
        {
          rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
          rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
        }
    }

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  /* Swap the 128-bit lanes of h into hp.  */
  hp = gen_reg_rtx (V4DImode);
  op = gen_lowpart (V4DImode, h);
  emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
                                  const1_rtx));

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V32QImode or V16HImode operands
   with two vpshufb insns, vpor and vpermq.  We should have already
   failed all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, ior, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  for (i = 0; i < d->nelt; ++i)
    if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
      return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  In the first permutation mask
     the first quarter will contain indexes for the first half
     of the op0, the second quarter will contain bit 7 set, third quarter
     will contain indexes for the second half of the op0 and the
     last quarter bit 7 set.  In the second permutation mask
     the first quarter will contain bit 7 set, the second quarter
     indexes for the first half of the op1, the third quarter bit 7 set
     and last quarter indexes for the second half of the op1.
     I.e. the first mask e.g. for V32QImode extract even will be:
     0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
     (all values masked with 0xf except for -128) and second mask
     for extract even will be
     -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = d->perm[i] >= nelt;
      unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;

      for (j = 0; j < eltsz; ++j)
        {
          rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
          rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
        }
    }

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op1);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  ior = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (ior, l, h));

  /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
  op = gen_reg_rtx (V4DImode);
  ior = gen_lowpart (V4DImode, ior);
  emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
                                  const1_rtx, GEN_INT (3)));
  emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
   with two "and" and "pack" or two "shift" and "pack" insns.  We should
   have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
{
  rtx op, dop0, dop1, t;
  unsigned i, odd, c, s, nelt = d->nelt;
  bool end_perm = false;
  machine_mode half_mode;
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_pack) (rtx, rtx, rtx);
  rtx (*gen_shift) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;

  switch (d->vmode)
    {
    case E_V8HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
        return false;
      c = 0xffff;
      s = 16;
      half_mode = V4SImode;
      gen_and = gen_andv4si3;
      gen_pack = gen_sse4_1_packusdw;
      gen_shift = gen_lshrv4si3;
      break;
    case E_V16QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V8HImode;
      gen_and = gen_andv8hi3;
      gen_pack = gen_sse2_packuswb;
      gen_shift = gen_lshrv8hi3;
      break;
    case E_V16HImode:
      if (!TARGET_AVX2)
        return false;
      c = 0xffff;
      s = 16;
      half_mode = V8SImode;
      gen_and = gen_andv8si3;
      gen_pack = gen_avx2_packusdw;
      gen_shift = gen_lshrv8si3;
      end_perm = true;
      break;
    case E_V32QImode:
      if (!TARGET_AVX2)
        return false;
      c = 0xff;
      s = 8;
      half_mode = V16HImode;
      gen_and = gen_andv16hi3;
      gen_pack = gen_avx2_packuswb;
      gen_shift = gen_lshrv16hi3;
      end_perm = true;
      break;
    default:
      /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
         general shuffles.  */
      return false;
    }

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  dop0 = gen_reg_rtx (half_mode);
  dop1 = gen_reg_rtx (half_mode);
  if (odd == 0)
    {
      t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
      t = force_reg (half_mode, t);
      emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
      emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
    }
  else
    {
      emit_insn (gen_shift (dop0,
                            gen_lowpart (half_mode, d->op0),
                            GEN_INT (s)));
      emit_insn (gen_shift (dop1,
                            gen_lowpart (half_mode, d->op1),
                            GEN_INT (s)));
    }
  /* In AVX2 for 256 bit case we need to permute pack result.  */
  if (TARGET_AVX2 && end_perm)
    {
      op = gen_reg_rtx (d->vmode);
      t = gen_reg_rtx (V4DImode);
      emit_insn (gen_pack (op, dop0, dop1));
      emit_insn (gen_avx2_permv4di_1 (t,
                                      gen_lowpart (V4DImode, op),
                                      const0_rtx, const2_rtx,
                                      const1_rtx, GEN_INT (3)));
      emit_move_insn (d->target, gen_lowpart (d->vmode, t));
    }
  else
    emit_insn (gen_pack (d->target, dop0, dop1));

  return true;
}
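
/* Editorial illustration (not part of the original GCC sources, not
   compiled): a plain-C model of the and/shift + pack strategy above, for
   byte elements on a little-endian layout.  Extract-even keeps the low byte
   of every word with an AND, extract-odd moves the high byte down with a
   logical shift; an unsigned-saturating pack then narrows both operands
   into one vector.  The name and types are hypothetical.  */
#if 0
static void
model_even_odd_pack (const unsigned short op0[8], const unsigned short op1[8],
                     int odd, unsigned char out[16])
{
  unsigned short a[8], b[8];
  for (int i = 0; i < 8; i++)
    {
      a[i] = odd ? (unsigned short) (op0[i] >> 8) : (unsigned short) (op0[i] & 0xff);
      b[i] = odd ? (unsigned short) (op1[i] >> 8) : (unsigned short) (op1[i] & 0xff);
    }
  /* packuswb: both inputs are already in 0..255, so saturation is a no-op;
     the first operand fills the low half, the second the high half.  */
  for (int i = 0; i < 8; i++)
    {
      out[i] = (unsigned char) a[i];
      out[i + 8] = (unsigned char) b[i];
    }
}
#endif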
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V64QI operands
   with two "shifts", two "truncs" and one "concat" insns for "odd"
   and two "truncs" and one concat insn for "even."
   Have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
{
  rtx t1, t2, t3, t4;
  unsigned i, odd, nelt = d->nelt;

  if (!TARGET_AVX512BW
      || d->one_operand_p
      || d->vmode != V64QImode)
    return false;

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  if (odd)
    {
      t1 = gen_reg_rtx (V32HImode);
      t2 = gen_reg_rtx (V32HImode);
      emit_insn (gen_lshrv32hi3 (t1,
                                 gen_lowpart (V32HImode, d->op0),
                                 GEN_INT (8)));
      emit_insn (gen_lshrv32hi3 (t2,
                                 gen_lowpart (V32HImode, d->op1),
                                 GEN_INT (8)));
    }
  else
    {
      t1 = gen_lowpart (V32HImode, d->op0);
      t2 = gen_lowpart (V32HImode, d->op1);
    }

  t3 = gen_reg_rtx (V32QImode);
  t4 = gen_reg_rtx (V32QImode);
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
  emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));

  return true;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement extract-even
   and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
{
  rtx t1, t2, t3, t4, t5;

  switch (d->vmode)
    {
    case E_V4DFmode:
      if (d->testing_p)
        break;
      t1 = gen_reg_rtx (V4DFmode);
      t2 = gen_reg_rtx (V4DFmode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an unpck[lh]pd will produce the result required.  */
      if (odd)
        t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
      else
        t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SFmode:
      {
        int mask = odd ? 0xdd : 0x88;

        if (d->testing_p)
          break;
        t1 = gen_reg_rtx (V8SFmode);
        t2 = gen_reg_rtx (V8SFmode);
        t3 = gen_reg_rtx (V8SFmode);

        /* Shuffle within the 128-bit lanes to produce:
           { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
        emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
                                      GEN_INT (mask)));

        /* Shuffle the lanes around to produce:
           { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
        emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
                                            GEN_INT (0x1)));

        /* Shuffle within the 128-bit lanes to produce:
           { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
        emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));

        /* Shuffle within the 128-bit lanes to produce:
           { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
        emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));

        /* Shuffle the lanes around to produce:
           { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
        emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
                                            GEN_INT (0x20)));
      }
      break;

    case E_V2DFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V4SImode:
      /* These are always directly implementable by expand_vec_perm_1.  */
      gcc_unreachable ();

    case E_V2SImode:
      gcc_assert (TARGET_MMX_WITH_SSE);
      /* We have no suitable instructions.  */
      if (d->testing_p)
        return false;
      break;

    case E_V4HImode:
      if (d->testing_p)
        break;
      /* We need 2*log2(N)-1 operations to achieve odd/even
         with interleave. */
      t1 = gen_reg_rtx (V4HImode);
      emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
      emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
      if (odd)
        t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
      else
        t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
      emit_insn (t2);
      break;

    case E_V8HImode:
      if (TARGET_SSE4_1)
        return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
        return expand_vec_perm_pshufb2 (d);
      else
        {
          if (d->testing_p)
            break;
          /* We need 2*log2(N)-1 operations to achieve odd/even
             with interleave. */
          t1 = gen_reg_rtx (V8HImode);
          t2 = gen_reg_rtx (V8HImode);
          emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
          emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
          emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
          emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
          if (odd)
            t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
          else
            t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
          emit_insn (t3);
        }
      break;

    case E_V16QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V16HImode:
    case E_V32QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V64QImode:
      return expand_vec_perm_even_odd_trunc (d);

    case E_V4DImode:
      if (!TARGET_AVX2)
        {
          struct expand_vec_perm_d d_copy = *d;
          d_copy.vmode = V4DFmode;
          if (d->testing_p)
            d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
          else
            d_copy.target = gen_reg_rtx (V4DFmode);
          d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
          d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
          if (expand_vec_perm_even_odd_1 (&d_copy, odd))
            {
              if (!d->testing_p)
                emit_move_insn (d->target,
                                gen_lowpart (V4DImode, d_copy.target));
              return true;
            }
          return false;
        }

      if (d->testing_p)
        break;

      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an vpunpck[lh]qdq will produce the result required.  */
      if (odd)
        t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
      else
        t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SImode:
      if (!TARGET_AVX2)
        {
          struct expand_vec_perm_d d_copy = *d;
          d_copy.vmode = V8SFmode;
          if (d->testing_p)
            d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
          else
            d_copy.target = gen_reg_rtx (V8SFmode);
          d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
          d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
          if (expand_vec_perm_even_odd_1 (&d_copy, odd))
            {
              if (!d->testing_p)
                emit_move_insn (d->target,
                                gen_lowpart (V8SImode, d_copy.target));
              return true;
            }
          return false;
        }

      if (d->testing_p)
        break;

      t1 = gen_reg_rtx (V8SImode);
      t2 = gen_reg_rtx (V8SImode);
      t3 = gen_reg_rtx (V4DImode);
      t4 = gen_reg_rtx (V4DImode);
      t5 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into
         { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
      emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
                                    gen_lowpart (V4DImode, d->op1),
                                    GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
                                    gen_lowpart (V4DImode, d->op1),
                                    GEN_INT (0x31)));

      /* Swap the 2nd and 3rd position in each lane into
         { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
      emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
                                    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
      emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
                                    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));

      /* Now an vpunpck[lh]qdq will produce
         { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
      if (odd)
        t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
                                           gen_lowpart (V4DImode, t2));
      else
        t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
                                          gen_lowpart (V4DImode, t2));
      emit_insn (t3);
      emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
      break;

    default:
      gcc_unreachable ();
    }

  return true;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   extract-even and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
{
  unsigned i, odd, nelt = d->nelt;

  odd = d->perm[0];
  if (odd != 0 && odd != 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  return expand_vec_perm_even_odd_1 (d, odd);
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement broadcast
   permutations.  We assume that expand_vec_perm_1 has already failed.  */

static bool
expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
{
  unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
  machine_mode vmode = d->vmode;
  unsigned char perm2[4];
  rtx op0 = d->op0, dest;
  bool ok;

  switch (vmode)
    {
    case E_V4DFmode:
    case E_V8SFmode:
      /* These are special-cased in sse.md so that we can optionally
         use the vbroadcast instruction.  They expand to two insns
         if the input happens to be in a register.  */
      gcc_unreachable ();

    case E_V2DFmode:
    case E_V2DImode:
    case E_V4SFmode:
    case E_V4SImode:
      /* These are always implementable using standard shuffle patterns.  */
      gcc_unreachable ();

    case E_V8HImode:
    case E_V16QImode:
      /* These can be implemented via interleave.  We save one insn by
         stopping once we have promoted to V4SImode and then use pshufd.  */
      if (d->testing_p)
        return true;
      do
        {
          rtx dest;
          rtx (*gen) (rtx, rtx, rtx)
            = vmode == V16QImode ? gen_vec_interleave_lowv16qi
                                 : gen_vec_interleave_lowv8hi;

          if (elt >= nelt2)
            {
              gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
                                       : gen_vec_interleave_highv8hi;
              elt -= nelt2;
            }
          nelt2 /= 2;

          dest = gen_reg_rtx (vmode);
          emit_insn (gen (dest, op0, op0));
          vmode = get_mode_wider_vector (vmode);
          op0 = gen_lowpart (vmode, dest);
        }
      while (vmode != V4SImode);

      memset (perm2, elt, 4);
      dest = gen_reg_rtx (V4SImode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);
      if (!d->testing_p)
        emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V64QImode:
    case E_V32QImode:
    case E_V16HImode:
    case E_V8SImode:
    case E_V4DImode:
      /* For AVX2 broadcasts of the first element vpbroadcast* or
         vpermq should be used by expand_vec_perm_1.  */
      gcc_assert (!TARGET_AVX2 || d->perm[0]);
      return false;

    default:
      gcc_unreachable ();
    }
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   broadcast permutations.  */

static bool
expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
{
  unsigned i, elt, nelt = d->nelt;

  if (!d->one_operand_p)
    return false;

  elt = d->perm[0];
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != elt)
      return false;

  return expand_vec_perm_broadcast_1 (d);
}
/* Implement arbitrary permutations of two V64QImode operands
   with 2 vperm[it]2w, 2 vpshufb and one vpor instruction.  */
static bool
expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
{
  if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
    return false;

  if (d->testing_p)
    return true;

  struct expand_vec_perm_d ds[2];
  rtx rperm[128], vperm, target0, target1;
  unsigned int i, nelt;
  machine_mode vmode;

  nelt = d->nelt;
  vmode = V64QImode;

  for (i = 0; i < 2; i++)
    {
      ds[i] = *d;
      ds[i].vmode = V32HImode;
      ds[i].nelt = 32;
      ds[i].target = gen_reg_rtx (V32HImode);
      ds[i].op0 = gen_lowpart (V32HImode, d->op0);
      ds[i].op1 = gen_lowpart (V32HImode, d->op1);
    }

  /* Prepare permutations such that the first one takes care of
     putting the even bytes into the right positions or one higher
     positions (ds[0]) and the second one takes care of
     putting the odd bytes into the right positions or one below
     (ds[1]).  */

  for (i = 0; i < nelt; i++)
    {
      ds[i & 1].perm[i / 2] = d->perm[i] / 2;
      if (i & 1)
        {
          rperm[i] = constm1_rtx;
          rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
        }
      else
        {
          rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
          rperm[i + 64] = constm1_rtx;
        }
    }

  bool ok = expand_vec_perm_1 (&ds[0]);
  gcc_assert (ok);
  ds[0].target = gen_lowpart (V64QImode, ds[0].target);

  ok = expand_vec_perm_1 (&ds[1]);
  gcc_assert (ok);
  ds[1].target = gen_lowpart (V64QImode, ds[1].target);

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
  vperm = force_reg (vmode, vperm);
  target0 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
  vperm = force_reg (vmode, vperm);
  target1 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));

  emit_insn (gen_iorv64qi3 (d->target, target0, target1));
  return true;
}
/* Implement arbitrary permutation of two V32QImode or V16HImode operands
   with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
   all the shorter instruction sequences.  */

static bool
expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
{
  rtx rperm[4][32], vperm, l[2], h[2], op, m128;
  unsigned int i, nelt, eltsz;
  bool used[4];

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate 4 permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the entries are non-negative if the element is
     requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < 32; ++i)
    {
      rperm[0][i] = m128;
      rperm[1][i] = m128;
      rperm[2][i] = m128;
      rperm[3][i] = m128;
    }
  used[0] = false;
  used[1] = false;
  used[2] = false;
  used[3] = false;
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
      unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);

      for (j = 0; j < eltsz; ++j)
        rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
      used[which] = true;
    }

  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i + 1])
        {
          h[i] = NULL_RTX;
          continue;
        }
      vperm = gen_rtx_CONST_VECTOR (V32QImode,
                                    gen_rtvec_v (32, rperm[2 * i + 1]));
      vperm = force_reg (V32QImode, vperm);
      h[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
    }

  /* Swap the 128-bit lanes of h[X].  */
  for (i = 0; i < 2; ++i)
    {
      if (h[i] == NULL_RTX)
        continue;
      op = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
                                      const2_rtx, GEN_INT (3), const0_rtx,
                                      const1_rtx));
      h[i] = gen_lowpart (V32QImode, op);
    }

  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i])
        {
          l[i] = NULL_RTX;
          continue;
        }
      vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
      vperm = force_reg (V32QImode, vperm);
      l[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
    }

  for (i = 0; i < 2; ++i)
    {
      if (h[i] && l[i])
        {
          op = gen_reg_rtx (V32QImode);
          emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
          l[i] = op;
        }
      else if (h[i])
        l[i] = h[i];
    }

  gcc_assert (l[0] && l[1]);
  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
   taken care of, perform the expansion in D and return true on success.  */

static bool
ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* Try a single instruction expansion.  */
  if (expand_vec_perm_1 (d))
    return true;

  /* Try sequences of two instructions.  */

  if (expand_vec_perm_pshuflw_pshufhw (d))
    return true;

  if (expand_vec_perm_palignr (d, false))
    return true;

  if (expand_vec_perm_interleave2 (d))
    return true;

  if (expand_vec_perm_broadcast (d))
    return true;

  if (expand_vec_perm_vpermq_perm_1 (d))
    return true;

  if (expand_vec_perm_vperm2f128 (d))
    return true;

  if (expand_vec_perm_pblendv (d))
    return true;

  /* Try sequences of three instructions.  */

  if (expand_vec_perm_even_odd_pack (d))
    return true;

  if (expand_vec_perm_2vperm2f128_vshuf (d))
    return true;

  if (expand_vec_perm_pshufb2 (d))
    return true;

  if (expand_vec_perm_interleave3 (d))
    return true;

  if (expand_vec_perm_vperm2f128_vblend (d))
    return true;

  /* Try sequences of four instructions.  */

  if (expand_vec_perm_even_odd_trunc (d))
    return true;
  if (expand_vec_perm_vpshufb2_vpermq (d))
    return true;

  if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
    return true;

  if (expand_vec_perm_vpermt2_vpshub2 (d))
    return true;

  /* ??? Look for narrow permutations whose element orderings would
     allow the promotion to a wider mode.  */

  /* ??? Look for sequences of interleave or a wider permute that place
     the data into the correct lanes for a half-vector shuffle like
     pshuf[lh]w or vpermilps.  */

  /* ??? Look for sequences of interleave that produce the desired results.
     The combinatorics of punpck[lh] get pretty ugly... */

  if (expand_vec_perm_even_odd (d))
    return true;

  /* Even longer sequences.  */
  if (expand_vec_perm_vpshufb4_vpermq2 (d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode.  */
  struct expand_vec_perm_d nd;
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
        emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
  if (expand_vec_perm2_vperm2f128_vblend (d))
    return true;

  return false;
}
/* If a permutation only uses one operand, make it clear.  Returns true
   if the permutation references both operands.  */

static bool
canonicalize_perm (struct expand_vec_perm_d *d)
{
  int i, which, nelt = d->nelt;

  for (i = which = 0; i < nelt; ++i)
    which |= (d->perm[i] < nelt ? 1 : 2);

  d->one_operand_p = true;
  switch (which)
    {
    default:
      gcc_unreachable ();

    case 3:
      if (!rtx_equal_p (d->op0, d->op1))
        {
          d->one_operand_p = false;
          break;
        }
      /* The elements of PERM do not suggest that only the first operand
         is used, but both operands are identical.  Allow easier matching
         of the permutation by folding the permutation into the single
         input vector.  */
      /* FALLTHRU */

    case 2:
      for (i = 0; i < nelt; ++i)
        d->perm[i] &= nelt - 1;
      d->op0 = d->op1;
      break;

    case 1:
      d->op1 = d->op0;
      break;
    }

  return (which == 3);
}
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */

static bool
ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
                               rtx op1, const vec_perm_indices &sel)
{
  struct expand_vec_perm_d d;
  unsigned char perm[MAX_VECT_LEN];
  unsigned int i, nelt, which;
  bool two_args;

  d.target = target;
  d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
  d.op1 = op1 ? force_reg (vmode, op1) : NULL_RTX;
  d.vmode = vmode;
  gcc_assert (VECTOR_MODE_P (d.vmode));
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.testing_p = !target;

  gcc_assert (sel.length () == nelt);
  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));

  /* Given sufficient ISA support we can just return true here
     for selected vector modes.  */
  switch (d.vmode)
    {
    case E_V16SFmode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V8DFmode:
      if (!TARGET_AVX512F)
        return false;
      /* All implementable with a single vperm[it]2 insn.  */
      break;
    case E_V32HImode:
      if (!TARGET_AVX512BW)
        return false;
      /* All implementable with a single vperm[it]2 insn.  */
      break;
    case E_V64QImode:
      if (!TARGET_AVX512BW)
        return false;
      /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
      break;
    case E_V8SImode:
    case E_V8SFmode:
    case E_V4DFmode:
    case E_V4DImode:
      if (d.testing_p && TARGET_AVX512VL)
        /* All implementable with a single vperm[it]2 insn.  */
        return true;
      break;
    case E_V16HImode:
      if (d.testing_p && TARGET_AVX2)
        /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
        return true;
      break;
    case E_V32QImode:
      if (d.testing_p && TARGET_AVX2)
        /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
        return true;
      break;
    case E_V8HImode:
    case E_V16QImode:
      /* Fall through.  */
    case E_V4SImode:
    case E_V4SFmode:
      /* All implementable with a single vpperm insn.  */
      if (d.testing_p && TARGET_XOP)
        return true;
      /* All implementable with 2 pshufb + 1 ior.  */
      if (d.testing_p && TARGET_SSSE3)
        return true;
      break;
    case E_V2SFmode:
    case E_V2SImode:
    case E_V4HImode:
      if (!TARGET_MMX_WITH_SSE)
        return false;
      break;
    case E_V2DImode:
    case E_V2DFmode:
      /* All implementable with shufpd or unpck[lh]pd.  */
      if (d.testing_p)
        return true;
      break;
    default:
      return false;
    }

  for (i = which = 0; i < nelt; ++i)
    {
      unsigned char e = sel[i];
      gcc_assert (e < 2 * nelt);
      d.perm[i] = e;
      perm[i] = e;
      which |= (e < nelt ? 1 : 2);
    }

  /* For all elements from second vector, fold the elements to first.  */
  if (which == 2)
    for (i = 0; i < nelt; ++i)
      d.perm[i] -= nelt;

  /* Check whether the mask can be applied to the vector type.  */
  d.one_operand_p = (which != 3);

  /* Implementable with shufps or pshufd.  */
  if (d.one_operand_p
      && (d.vmode == V4SFmode || d.vmode == V2SFmode
          || d.vmode == V4SImode || d.vmode == V2SImode))
    return true;

  if (d.testing_p)
    {
      /* Otherwise we have to go through the motions and see if we can
         figure out how to generate the requested permutation.  */
      d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
      d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
      if (!d.one_operand_p)
        d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);

      start_sequence ();
      bool ret = ix86_expand_vec_perm_const_1 (&d);
      end_sequence ();

      return ret;
    }

  two_args = canonicalize_perm (&d);

  if (ix86_expand_vec_perm_const_1 (&d))
    return true;

  /* If the selector says both arguments are needed, but the operands are the
     same, the above tried to expand with one_operand_p and flattened selector.
     If that didn't work, retry without one_operand_p; we succeeded with that
     during testing.  */
  if (two_args && d.one_operand_p)
    {
      d.one_operand_p = false;
      memcpy (d.perm, perm, sizeof (perm));
      return ix86_expand_vec_perm_const_1 (&d);
    }

  return false;
}
void
ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  for (i = 0; i < nelt; ++i)
    d.perm[i] = i * 2 + odd;

  /* We'll either be able to implement the permutation directly...  */
  if (expand_vec_perm_1 (&d))
    return;

  /* ... or we use the special-case patterns.  */
  expand_vec_perm_even_odd_1 (&d, odd);
}
void
ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt, base;
  bool ok;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  base = high_p ? nelt / 2 : 0;
  for (i = 0; i < nelt / 2; ++i)
    {
      d.perm[i * 2] = i + base;
      d.perm[i * 2 + 1] = i + base + nelt;
    }

  /* Note that for AVX this isn't one instruction.  */
  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);
}
/* Optimize vector MUL generation for V8QI, V16QI and V32QI
   under TARGET_AVX512BW.  I.e. for v16qi a * b, it has

   vpmovzxbw ymm2, xmm0
   vpmovzxbw ymm3, xmm1
   vpmullw ymm4, ymm2, ymm3
   vpmovwb xmm0, ymm4

   which takes fewer instructions than ix86_expand_vecop_qihi.
   Return true if success.  */

bool
ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
{
  machine_mode himode, qimode = GET_MODE (dest);
  rtx hop1, hop2, hdest;
  rtx (*gen_extend) (rtx, rtx);
  rtx (*gen_truncate) (rtx, rtx);

  /* There's no V64HImode multiplication instruction.  */
  if (qimode == E_V64QImode)
    return false;

  /* vpmovwb only available under AVX512BW.  */
  if (!TARGET_AVX512BW)
    return false;
  if ((qimode == V8QImode || qimode == V16QImode)
      && !TARGET_AVX512VL)
    return false;
  /* Do not generate zmm instructions when 128/256 bit vector width is
     preferred.  */
  if (qimode == V32QImode
      && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
    return false;

  switch (qimode)
    {
    case E_V8QImode:
      himode = V8HImode;
      gen_extend = gen_zero_extendv8qiv8hi2;
      gen_truncate = gen_truncv8hiv8qi2;
      break;
    case E_V16QImode:
      himode = V16HImode;
      gen_extend = gen_zero_extendv16qiv16hi2;
      gen_truncate = gen_truncv16hiv16qi2;
      break;
    case E_V32QImode:
      himode = V32HImode;
      gen_extend = gen_zero_extendv32qiv32hi2;
      gen_truncate = gen_truncv32hiv32qi2;
      break;
    default:
      gcc_unreachable ();
    }

  hop1 = gen_reg_rtx (himode);
  hop2 = gen_reg_rtx (himode);
  hdest = gen_reg_rtx (himode);
  emit_insn (gen_extend (hop1, op1));
  emit_insn (gen_extend (hop2, op2));
  emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (MULT, himode,
                                                      hop1, hop2)));
  emit_insn (gen_truncate (dest, hdest));
  return true;
}
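
/* Editorial illustration (not part of the original GCC sources, not
   compiled): the widening strategy above in scalar form.  Each byte is
   zero-extended to 16 bits, multiplied there, and the low byte of the
   product is written back, which is what vpmovzxbw + vpmullw + vpmovwb
   compute per element.  The function name is hypothetical.  */
#if 0
static unsigned char
model_mul_qi_via_hi (unsigned char a, unsigned char b)
{
  unsigned short wa = a, wb = b;                        /* vpmovzxbw */
  unsigned short prod = (unsigned short) (wa * wb);     /* vpmullw */
  return (unsigned char) prod;                          /* vpmovwb: low byte */
}
#endif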
/* Expand a vector operation shift by constant for a V*QImode in terms of the
   same operation on V*HImode.  Return true if success.  */
bool
ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode, himode;
  unsigned int and_constant, xor_constant;
  HOST_WIDE_INT shift_amount;
  rtx vec_const_and, vec_const_xor;
  rtx tmp, op1_subreg;
  rtx (*gen_shift) (rtx, rtx, rtx);
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_xor) (rtx, rtx, rtx);
  rtx (*gen_sub) (rtx, rtx, rtx);

  /* Only optimize shift by constant.  */
  if (!CONST_INT_P (op2))
    return false;

  qimode = GET_MODE (dest);
  shift_amount = INTVAL (op2);
  /* Do nothing when the shift amount is greater than or equal to 8.  */
  if (shift_amount > 7)
    return false;

  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
  /* Record sign bit.  */
  xor_constant = 1 << (8 - shift_amount - 1);

  /* Zero out the bits shifted in from the neighbouring element for
     left/right shifts.  */
  and_constant
    = (code == ASHIFT ? 256 - (1 << shift_amount)
       : (1 << (8 - shift_amount)) - 1);

  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      gen_shift
        = ((code == ASHIFT) ? gen_ashlv8hi3
           : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
      gen_and = gen_andv16qi3;
      gen_xor = gen_xorv16qi3;
      gen_sub = gen_subv16qi3;
      break;
    case E_V32QImode:
      himode = V16HImode;
      gen_shift
        = ((code == ASHIFT) ? gen_ashlv16hi3
           : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
      gen_and = gen_andv32qi3;
      gen_xor = gen_xorv32qi3;
      gen_sub = gen_subv32qi3;
      break;
    case E_V64QImode:
      himode = V32HImode;
      gen_shift
        = ((code == ASHIFT) ? gen_ashlv32hi3
           : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
      gen_and = gen_andv64qi3;
      gen_xor = gen_xorv64qi3;
      gen_sub = gen_subv64qi3;
      break;
    default:
      gcc_unreachable ();
    }

  tmp = gen_reg_rtx (himode);
  vec_const_and = gen_reg_rtx (qimode);
  op1_subreg = lowpart_subreg (himode, op1, qimode);

  /* For ASHIFT and LSHIFTRT, perform operation like
     vpsllw/vpsrlw $shift_amount, %op1, %dest.
     vpand %vec_const_and, %dest.  */
  emit_insn (gen_shift (tmp, op1_subreg, op2));
  emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
  emit_move_insn (vec_const_and,
                  ix86_build_const_vector (qimode, true,
                                           GEN_INT (and_constant)));
  emit_insn (gen_and (dest, dest, vec_const_and));

  /* For ASHIFTRT, perform extra operation like
     vpxor %vec_const_xor, %dest, %dest
     vpsubb %vec_const_xor, %dest, %dest  */
  if (code == ASHIFTRT)
    {
      vec_const_xor = gen_reg_rtx (qimode);
      emit_move_insn (vec_const_xor,
                      ix86_build_const_vector (qimode, true,
                                               GEN_INT (xor_constant)));
      emit_insn (gen_xor (dest, dest, vec_const_xor));
      emit_insn (gen_sub (dest, dest, vec_const_xor));
    }

  return true;
}
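
/* Editorial illustration (not part of the original GCC sources, not
   compiled): the word-shift emulation above, modelled on one byte.  A byte
   shift is performed as the corresponding 16-bit shift followed by an AND
   that clears the bits that leaked in from the neighbouring byte; for an
   arithmetic right shift the sign is then restored with the xor/sub trick
   using xor_constant = 1 << (8 - shift - 1).  The name is hypothetical.  */
#if 0
static unsigned char
model_byte_shift (unsigned char x, int shift /* 1..7 */,
                  int code /* 0 = shl, 1 = lshr, 2 = ashr */)
{
  unsigned and_constant = (code == 0) ? (256 - (1u << shift)) & 0xff
                                      : (1u << (8 - shift)) - 1;
  unsigned xor_constant = 1u << (8 - shift - 1);
  unsigned r = (code == 0) ? (unsigned) (x << shift) : (unsigned) (x >> shift);

  r &= and_constant;            /* pand: drop bits from the adjacent byte */
  if (code == 2)                /* ashr: re-sign-extend the shifted sign bit */
    r = (r ^ xor_constant) - xor_constant;
  return (unsigned char) r;
}
#endif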
/* Expand a vector operation CODE for a V*QImode in terms of the
   same operation on V*HImode.  */

void
ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  machine_mode himode;
  rtx (*gen_il) (rtx, rtx, rtx);
  rtx (*gen_ih) (rtx, rtx, rtx);
  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
  struct expand_vec_perm_d d;
  bool ok, full_interleave;
  bool uns_p = false;
  int i;

  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      gen_il = gen_vec_interleave_lowv16qi;
      gen_ih = gen_vec_interleave_highv16qi;
      break;
    case E_V32QImode:
      himode = V16HImode;
      gen_il = gen_avx2_interleave_lowv32qi;
      gen_ih = gen_avx2_interleave_highv32qi;
      break;
    case E_V64QImode:
      himode = V32HImode;
      gen_il = gen_avx512bw_interleave_lowv64qi;
      gen_ih = gen_avx512bw_interleave_highv64qi;
      break;
    default:
      gcc_unreachable ();
    }

  op2_l = op2_h = op2;
  switch (code)
    {
    case MULT:
      /* Unpack data such that we've got a source byte in each low byte of
         each word.  We don't care what goes into the high byte of each word.
         Rather than trying to get zero in there, most convenient is to let
         it be a copy of the low byte.  */
      op2_l = gen_reg_rtx (qimode);
      op2_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op2_l, op2, op2));
      emit_insn (gen_ih (op2_h, op2, op2));

      op1_l = gen_reg_rtx (qimode);
      op1_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op1_l, op1, op1));
      emit_insn (gen_ih (op1_h, op1, op1));
      full_interleave = qimode == V16QImode;
      break;

    case ASHIFT:
    case LSHIFTRT:
      uns_p = true;
      /* FALLTHRU */
    case ASHIFTRT:
      op1_l = gen_reg_rtx (himode);
      op1_h = gen_reg_rtx (himode);
      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
      full_interleave = true;
      break;
    default:
      gcc_unreachable ();
    }

  /* Perform the operation.  */
  res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
                               1, OPTAB_DIRECT);
  res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
                               1, OPTAB_DIRECT);
  gcc_assert (res_l && res_h);

  /* Merge the data back into the right place.  */
  d.target = dest;
  d.op0 = gen_lowpart (qimode, res_l);
  d.op1 = gen_lowpart (qimode, res_h);
  d.vmode = qimode;
  d.nelt = GET_MODE_NUNITS (qimode);
  d.one_operand_p = false;
  d.testing_p = false;

  if (full_interleave)
    {
      /* For SSE2, we used a full interleave, so the desired
         results are in the even elements.  */
      for (i = 0; i < d.nelt; ++i)
        d.perm[i] = i * 2;
    }
  else
    {
      /* For AVX, the interleave used above was not cross-lane.  So the
         extraction is evens but with the second and third quarter swapped.
         Happily, that is even one insn shorter than even extraction.
         For AVX512BW we have 4 lanes.  We extract evens from within a lane,
         always first from the first and then from the second source operand,
         the index bits above the low 4 bits remain the same.
         Thus, for d.nelt == 32 we want permutation
         0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
         and for d.nelt == 64 we want permutation
         0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
         32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
      for (i = 0; i < d.nelt; ++i)
        d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
    }

  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
                       gen_rtx_fmt_ee (code, qimode, op1, op2));
}
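
/* Editorial illustration (not part of the original GCC sources, not
   compiled): a tiny driver for the index formula used above in the
   non-full-interleave case.  For nelt == 32 it prints the permutation
   quoted in the comment, i.e. evens of the first lane, then evens of the
   matching lane of the second operand, and so on.  The name is
   hypothetical.  */
#if 0
#include <stdio.h>

static void
model_qihi_merge_perm (unsigned nelt)
{
  for (unsigned i = 0; i < nelt; ++i)
    printf ("%u ", ((i * 2) & 14) + ((i & 8) ? nelt : 0) + (i & ~15u));
  printf ("\n");
}
#endif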
/* Helper function of ix86_expand_mul_widen_evenodd.  Return true
   if op is CONST_VECTOR with all odd elements equal to their
   preceding element.  */

static bool
const_vector_equal_evenodd_p (rtx op)
{
  machine_mode mode = GET_MODE (op);
  int i, nunits = GET_MODE_NUNITS (mode);
  if (GET_CODE (op) != CONST_VECTOR
      || nunits != CONST_VECTOR_NUNITS (op))
    return false;
  for (i = 0; i < nunits; i += 2)
    if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
      return false;
  return true;
}
void
ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
                               bool uns_p, bool odd_p)
{
  machine_mode mode = GET_MODE (op1);
  machine_mode wmode = GET_MODE (dest);
  rtx x;
  rtx orig_op1 = op1, orig_op2 = op2;

  if (!nonimmediate_operand (op1, mode))
    op1 = force_reg (mode, op1);
  if (!nonimmediate_operand (op2, mode))
    op2 = force_reg (mode, op2);

  /* We only play even/odd games with vectors of SImode.  */
  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);

  /* If we're looking for the odd results, shift those members down to
     the even slots.  For some cpus this is faster than a PSHUFD.  */
  if (odd_p)
    {
      /* For XOP use vpmacsdqh, but only for smult, as it is only
         signed.  */
      if (TARGET_XOP && mode == V4SImode && !uns_p)
        {
          x = force_reg (wmode, CONST0_RTX (wmode));
          emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
          return;
        }

      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
      if (!const_vector_equal_evenodd_p (orig_op1))
        op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
                            x, NULL, 1, OPTAB_DIRECT);
      if (!const_vector_equal_evenodd_p (orig_op2))
        op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
                            x, NULL, 1, OPTAB_DIRECT);
      op1 = gen_lowpart (mode, op1);
      op2 = gen_lowpart (mode, op2);
    }

  if (mode == V16SImode)
    {
      if (uns_p)
        x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
      else
        x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
    }
  else if (mode == V8SImode)
    {
      if (uns_p)
        x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
      else
        x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
    }
  else if (uns_p)
    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
  else if (TARGET_SSE4_1)
    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
  else
    {
      rtx s1, s2, t0, t1, t2;

      /* The easiest way to implement this without PMULDQ is to go through
         the motions as if we are performing a full 64-bit multiply.  With
         the exception that we need to do less shuffling of the elements.  */

      /* Compute the sign-extension, aka highparts, of the two operands.  */
      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
                                op1, pc_rtx, pc_rtx);
      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
                                op2, pc_rtx, pc_rtx);

      /* Multiply LO(A) * HI(B), and vice-versa.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));

      /* Multiply LO(A) * LO(B).  */
      t0 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));

      /* Combine and shift the highparts into place.  */
      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
                         1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
      return;
    }
  emit_insn (x);
}
void
ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
                            bool uns_p, bool high_p)
{
  machine_mode wmode = GET_MODE (dest);
  machine_mode mode = GET_MODE (op1);
  rtx t1, t2, t3, t4, mask;

  switch (mode)
    {
    case E_V4SImode:
      t1 = gen_reg_rtx (mode);
      t2 = gen_reg_rtx (mode);
      if (TARGET_XOP && !uns_p)
        {
          /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
             shuffle the elements once so that all elements are in the right
             place for immediate use: { A C B D }.  */
          emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
                                        const1_rtx, GEN_INT (3)));
          emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
                                        const1_rtx, GEN_INT (3)));
        }
      else
        {
          /* Put the elements into place for the multiply.  */
          ix86_expand_vec_interleave (t1, op1, op1, high_p);
          ix86_expand_vec_interleave (t2, op2, op2, high_p);
          high_p = false;
        }
      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
      break;

    case E_V8SImode:
      /* Shuffle the elements between the lanes.  After this we
         have { A B E F | C D G H } for each operand.  */
      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
                                      const0_rtx, const2_rtx,
                                      const1_rtx, GEN_INT (3)));
      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
                                      const0_rtx, const2_rtx,
                                      const1_rtx, GEN_INT (3)));

      /* Shuffle the elements within the lanes.  After this we
         have { A A B B | C C D D } or { E E F F | G G H H }.  */
      t3 = gen_reg_rtx (V8SImode);
      t4 = gen_reg_rtx (V8SImode);
      mask = GEN_INT (high_p
                      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
                      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));

      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
      break;

    case E_V8HImode:
    case E_V16HImode:
      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
                         uns_p, OPTAB_DIRECT);
      t2 = expand_binop (mode,
                         uns_p ? umul_highpart_optab : smul_highpart_optab,
                         op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
      gcc_assert (t1 && t2);

      t3 = gen_reg_rtx (mode);
      ix86_expand_vec_interleave (t3, t1, t2, high_p);
      emit_move_insn (dest, gen_lowpart (wmode, t3));
      break;

    case E_V16QImode:
    case E_V32QImode:
    case E_V32HImode:
    case E_V64QImode:
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);

      emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
      break;

    default:
      gcc_unreachable ();
    }
}
void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
  rtx res_1, res_2, res_3, res_4;

  res_1 = gen_reg_rtx (V4SImode);
  res_2 = gen_reg_rtx (V4SImode);
  res_3 = gen_reg_rtx (V2DImode);
  res_4 = gen_reg_rtx (V2DImode);
  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);

  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  Then we can merge the parts
     back together with an interleave.

     Note that two other sequences were tried:
     (1) Use interleaves at the start instead of psrldq, which allows
     us to use a single shufps to merge things back at the end.
     (2) Use shufps here to combine the two vectors, then pshufd to
     put the elements in the correct order.
     In both cases the cost of the reformatting stall was too high
     and the overall sequence slower.  */

  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
                                const0_rtx, const2_rtx,
                                const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
                                const0_rtx, const2_rtx,
                                const0_rtx, const0_rtx));
  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));

  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
}
void
ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
{
  machine_mode mode = GET_MODE (op0);
  rtx t1, t2, t3, t4, t5, t6;

  if (TARGET_AVX512DQ && mode == V8DImode)
    emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
    emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
    emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
  else if (TARGET_XOP && mode == V2DImode)
    {
      /* op1: A,B,C,D, op2: E,F,G,H */
      op1 = gen_lowpart (V4SImode, op1);
      op2 = gen_lowpart (V4SImode, op2);

      t1 = gen_reg_rtx (V4SImode);
      t2 = gen_reg_rtx (V4SImode);
      t3 = gen_reg_rtx (V2DImode);
      t4 = gen_reg_rtx (V2DImode);

      /* t1: B,A,D,C */
      emit_insn (gen_sse2_pshufd_1 (t1, op1,
                                    GEN_INT (1), GEN_INT (0),
                                    GEN_INT (3), GEN_INT (2)));

      /* t2: (B*E),(A*F),(D*G),(C*H) */
      emit_insn (gen_mulv4si3 (t2, t1, op2));

      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
      emit_insn (gen_xop_phadddq (t3, t2));

      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));

      /* Multiply lower parts and add all */
      t5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_widen_umult_even_v4si (t5,
                                                gen_lowpart (V4SImode, op1),
                                                gen_lowpart (V4SImode, op2)));
      force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
    }
  else
    {
      machine_mode nmode;
      rtx (*umul) (rtx, rtx, rtx);

      if (mode == V2DImode)
        {
          umul = gen_vec_widen_umult_even_v4si;
          nmode = V4SImode;
        }
      else if (mode == V4DImode)
        {
          umul = gen_vec_widen_umult_even_v8si;
          nmode = V8SImode;
        }
      else if (mode == V8DImode)
        {
          umul = gen_vec_widen_umult_even_v16si;
          nmode = V16SImode;
        }
      else
        gcc_unreachable ();

      /* Multiply low parts.  */
      t1 = gen_reg_rtx (mode);
      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));

      /* Shift input vectors right 32 bits so we can multiply high parts.  */
      t6 = GEN_INT (32);
      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);

      /* Multiply high parts by low parts.  */
      t4 = gen_reg_rtx (mode);
      t5 = gen_reg_rtx (mode);
      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));

      /* Combine and shift the highparts back.  */
      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
    }

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
                       gen_rtx_MULT (mode, op1, op2));
}
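
/* Editorial illustration (not part of the original GCC sources, not
   compiled): the 64x64 low-part multiply above built from 32x32->64
   widening multiplies, in scalar form.  lo(a)*lo(b) supplies the full low
   product; the two cross products only affect the upper 32 bits, so they
   are summed and shifted left by 32 before being added in.  The name is
   hypothetical.  */
#if 0
static unsigned long long
model_mul64_from_mul32 (unsigned long long a, unsigned long long b)
{
  unsigned long long lo  = (a & 0xffffffffu) * (b & 0xffffffffu); /* pmuludq */
  unsigned long long hi1 = (a >> 32) * (b & 0xffffffffu);
  unsigned long long hi2 = (b >> 32) * (a & 0xffffffffu);
  return lo + ((hi1 + hi2) << 32);      /* psllq by 32, then paddq */
}
#endif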
/* Return 1 if control transfer instruction INSN
   should be encoded with notrack prefix.  */

bool
ix86_notrack_prefixed_insn_p (rtx_insn *insn)
{
  if (!insn || !((flag_cf_protection & CF_BRANCH)))
    return false;

  if (CALL_P (insn))
    {
      rtx call = get_call_rtx_from (insn);
      gcc_assert (call != NULL_RTX);
      rtx addr = XEXP (call, 0);

      /* Do not emit 'notrack' if it's not an indirect call.  */
      if (MEM_P (addr)
          && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
        return false;
      else
        return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
    }

  if (JUMP_P (insn) && !flag_cet_switch)
    {
      rtx target = JUMP_LABEL (insn);
      if (target == NULL_RTX || ANY_RETURN_P (target))
        return false;

      /* Check the jump is a switch table.  */
      rtx_insn *label = as_a<rtx_insn *> (target);
      rtx_insn *table = next_insn (label);
      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
        return false;
      else
        return true;
    }
  return false;
}
20130 ix86_expand_sse2_abs (rtx target
, rtx input
)
20132 machine_mode mode
= GET_MODE (target
);
20139 /* For 64-bit signed integer X, with SSE4.2 use
20140 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
20141 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
20142 32 and use logical instead of arithmetic right shift (which is
20143 unimplemented) and subtract. */
20146 tmp0
= gen_reg_rtx (mode
);
20147 tmp1
= gen_reg_rtx (mode
);
20148 emit_move_insn (tmp1
, CONST0_RTX (mode
));
20149 if (mode
== E_V2DImode
)
20150 emit_insn (gen_sse4_2_gtv2di3 (tmp0
, tmp1
, input
));
20152 emit_insn (gen_avx2_gtv4di3 (tmp0
, tmp1
, input
));
20156 tmp0
= expand_simple_binop (mode
, LSHIFTRT
, input
,
20157 GEN_INT (GET_MODE_UNIT_BITSIZE (mode
)
20158 - 1), NULL
, 0, OPTAB_DIRECT
);
20159 tmp0
= expand_simple_unop (mode
, NEG
, tmp0
, NULL
, false);
20162 tmp1
= expand_simple_binop (mode
, XOR
, tmp0
, input
,
20163 NULL
, 0, OPTAB_DIRECT
);
20164 x
= expand_simple_binop (mode
, MINUS
, tmp1
, tmp0
,
20165 target
, 0, OPTAB_DIRECT
);
20169 /* For 32-bit signed integer X, the best way to calculate the absolute
20170 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
20171 tmp0
= expand_simple_binop (mode
, ASHIFTRT
, input
,
20172 GEN_INT (GET_MODE_UNIT_BITSIZE (mode
) - 1),
20173 NULL
, 0, OPTAB_DIRECT
);
20174 tmp1
= expand_simple_binop (mode
, XOR
, tmp0
, input
,
20175 NULL
, 0, OPTAB_DIRECT
);
20176 x
= expand_simple_binop (mode
, MINUS
, tmp1
, tmp0
,
20177 target
, 0, OPTAB_DIRECT
);
20181 /* For 16-bit signed integer X, the best way to calculate the absolute
20182 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
20183 tmp0
= expand_unop (mode
, neg_optab
, input
, NULL_RTX
, 0);
20185 x
= expand_simple_binop (mode
, SMAX
, tmp0
, input
,
20186 target
, 0, OPTAB_DIRECT
);
20190 /* For 8-bit signed integer X, the best way to calculate the absolute
20191 value of X is min ((unsigned char) X, (unsigned char) (-X)),
20192 as SSE2 provides the PMINUB insn. */
20193 tmp0
= expand_unop (mode
, neg_optab
, input
, NULL_RTX
, 0);
20195 x
= expand_simple_binop (V16QImode
, UMIN
, tmp0
, input
,
20196 target
, 0, OPTAB_DIRECT
);
20200 gcc_unreachable ();
20204 emit_move_insn (target
, x
);
20207 /* Expand an extract from a vector register through pextr insn.
20208 Return true if successful. */
20211 ix86_expand_pextr (rtx
*operands
)
20213 rtx dst
= operands
[0];
20214 rtx src
= operands
[1];
20216 unsigned int size
= INTVAL (operands
[2]);
20217 unsigned int pos
= INTVAL (operands
[3]);
20219 if (SUBREG_P (dst
))
20221 /* Reject non-lowpart subregs. */
20222 if (SUBREG_BYTE (dst
) > 0)
20224 dst
= SUBREG_REG (dst
);
20227 if (SUBREG_P (src
))
20229 pos
+= SUBREG_BYTE (src
) * BITS_PER_UNIT
;
20230 src
= SUBREG_REG (src
);
20233 switch (GET_MODE (src
))
20242 machine_mode srcmode
, dstmode
;
20245 if (!int_mode_for_size (size
, 0).exists (&dstmode
))
20251 if (!TARGET_SSE4_1
)
20253 srcmode
= V16QImode
;
20259 srcmode
= V8HImode
;
20263 if (!TARGET_SSE4_1
)
20265 srcmode
= V4SImode
;
20269 gcc_assert (TARGET_64BIT
);
20270 if (!TARGET_SSE4_1
)
20272 srcmode
= V2DImode
;
20279 /* Reject extractions from misaligned positions. */
20280 if (pos
& (size
-1))
20283 if (GET_MODE (dst
) == dstmode
)
20286 d
= gen_reg_rtx (dstmode
);
20288 /* Construct insn pattern. */
20289 pat
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, GEN_INT (pos
/ size
)));
20290 pat
= gen_rtx_VEC_SELECT (dstmode
, gen_lowpart (srcmode
, src
), pat
);
20292 /* Let the rtl optimizers know about the zero extension performed. */
20293 if (dstmode
== QImode
|| dstmode
== HImode
)
20295 pat
= gen_rtx_ZERO_EXTEND (SImode
, pat
);
20296 d
= gen_lowpart (SImode
, d
);
20299 emit_insn (gen_rtx_SET (d
, pat
));
20302 emit_move_insn (dst
, gen_lowpart (GET_MODE (dst
), d
));
/* Expand an insert into a vector register through pinsr insn.
   Return true if successful.  */

bool
ix86_expand_pinsr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[3];

  unsigned int size = INTVAL (operands[1]);
  unsigned int pos = INTVAL (operands[2]);

  if (SUBREG_P (dst))
    {
      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
      dst = SUBREG_REG (dst);
    }

  switch (GET_MODE (dst))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
    case E_TImode:
      {
        machine_mode srcmode, dstmode;
        rtx (*pinsr)(rtx, rtx, rtx, rtx);
        rtx d;

        if (!int_mode_for_size (size, 0).exists (&srcmode))
          return false;

        switch (srcmode)
          {
          case E_QImode:
            if (!TARGET_SSE4_1)
              return false;
            dstmode = V16QImode;
            pinsr = gen_sse4_1_pinsrb;
            break;

          case E_HImode:
            if (!TARGET_SSE2)
              return false;
            dstmode = V8HImode;
            pinsr = gen_sse2_pinsrw;
            break;

          case E_SImode:
            if (!TARGET_SSE4_1)
              return false;
            dstmode = V4SImode;
            pinsr = gen_sse4_1_pinsrd;
            break;

          case E_DImode:
            gcc_assert (TARGET_64BIT);
            if (!TARGET_SSE4_1)
              return false;
            dstmode = V2DImode;
            pinsr = gen_sse4_1_pinsrq;
            break;

          default:
            return false;
          }

        /* Reject insertions to misaligned positions.  */
        if (pos & (size-1))
          return false;

        if (SUBREG_P (src))
          {
            unsigned int srcpos = SUBREG_BYTE (src);

            if (srcpos > 0)
              {
                rtx extr_ops[4];

                extr_ops[0] = gen_reg_rtx (srcmode);
                extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
                extr_ops[2] = GEN_INT (size);
                extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);

                if (!ix86_expand_pextr (extr_ops))
                  return false;

                src = extr_ops[0];
              }
            else
              src = gen_lowpart (srcmode, SUBREG_REG (src));
          }

        if (GET_MODE (dst) == dstmode)
          d = dst;
        else
          d = gen_reg_rtx (dstmode);

        emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
                          gen_lowpart (srcmode, src),
                          GEN_INT (1 << (pos / size))));

        if (d != dst)
          emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
        return true;
      }

    default:
      return false;
    }
}
/* All CPUs prefer to avoid cross-lane operations so perform reductions
   upper against lower halves up to SSE reg size.  */

static machine_mode
ix86_split_reduction (machine_mode mode)
{
  /* Reduce lowpart against highpart until we reach SSE reg width to
     avoid cross-lane operations.  */
  switch (mode)
    {
    case E_V8DImode:
    case E_V4DImode:
      return V2DImode;
    case E_V16SImode:
    case E_V8SImode:
      return V4SImode;
    case E_V32HImode:
    case E_V16HImode:
      return V8HImode;
    case E_V64QImode:
    case E_V32QImode:
      return V16QImode;
    case E_V16SFmode:
    case E_V8SFmode:
      return V4SFmode;
    case E_V8DFmode:
    case E_V4DFmode:
      return V2DFmode;
    default:
      return mode;
    }
}
/* Generate call to __divmoddi4.  */

void
ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
                            rtx op0, rtx op1,
                            rtx *quot_p, rtx *rem_p)
{
  rtx rem = assign_386_stack_local (mode, SLOT_TEMP);

  rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
                                      mode, op0, mode, op1, mode,
                                      XEXP (rem, 0), Pmode);
  *quot_p = quot;
  *rem_p = rem;
}
#include "gt-i386-expand.h"